mwmbl/index.py

"""
Create a search index
"""
from collections import Counter
from itertools import islice
from typing import Iterator, Iterable
from urllib.parse import unquote

import pandas as pd

# NUM_PAGES = 8192
# PAGE_SIZE = 512
from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument

NUM_INITIAL_TOKENS = 50

HTTP_START = 'http://'
HTTPS_START = 'https://'

BATCH_SIZE = 100


def is_content_token(nlp, token):
    """Return True if the token is content-bearing: alphabetic or numeric and not a stop word."""
    lexeme = nlp.vocab[token.orth]
    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop


def tokenize(nlp, cleaned_text):
    """Tokenize the first NUM_INITIAL_TOKENS tokens of the text and return the lowercased content tokens as a set."""
    tokens = nlp.tokenizer(cleaned_text)
    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                      if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered


def prepare_url_for_tokenizing(url: str):
    """Strip the scheme and replace '/', '.' and '_' with spaces so the URL can be tokenized as plain text."""
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]
    elif url.startswith(HTTPS_START):
        url = url[len(HTTPS_START):]
    for c in '/._':
        if c in url:
            url = url.replace(c, ' ')
    return url


def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
    """Tokenize each page's title, URL and extract and yield a TokenizedDocument combining their tokens."""
    for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
        title_tokens = tokenize(nlp, title_cleaned)
        prepared_url = prepare_url_for_tokenizing(unquote(url))
        url_tokens = tokenize(nlp, prepared_url)
        extract_tokens = tokenize(nlp, extract)
        print("Extract tokens", extract_tokens)
        tokens = title_tokens | url_tokens | extract_tokens
        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)

        if i % 1000 == 0:
            print("Processed", i)


def grouper(n: int, iterator: Iterator):
    """Yield successive tuples of up to n items from the iterator until it is exhausted."""
    while True:
        chunk = tuple(islice(iterator, n))
        if not chunk:
            return
        yield chunk
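
# A small usage sketch (not in the original module): grouper() together with
# BATCH_SIZE could be used to feed pages to the indexer in fixed-size batches.
# The loop below is only illustrative; this module currently indexes pages one
# at a time in index_titles_urls_and_extracts().
#
#     for batch in grouper(BATCH_SIZE, iter(titles_urls_and_extracts)):
#         for title, url, extract in batch:
#             ...  # tokenize and index each page in the batch
#
# Note that grouper() expects an iterator, so a plain list must be wrapped
# with iter() before being passed in.
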
def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
    """Index each page under its tokens and write the overall term counts to a CSV file at terms_path."""
    indexer.create_if_not_exists()
    terms = Counter()
    pages = get_pages(nlp, titles_urls_and_extracts)
    for page in pages:
        for token in page.tokens:
            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
        terms.update([t.lower() for t in page.tokens])

    term_df = pd.DataFrame({
        'term': terms.keys(),
        'count': terms.values(),
    })
    term_df.to_csv(terms_path)
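
# Minimal end-to-end sketch (not part of the original module), assuming a spaCy
# model such as en_core_web_sm is installed. The TinyIndexer constructor
# arguments are defined in tinysearchengine.indexer and are not shown here, so
# the indexer line is only a placeholder.
#
#     import spacy
#
#     nlp = spacy.load("en_core_web_sm")
#     titles_urls_and_extracts = [
#         ("Python (programming language)",
#          "https://en.wikipedia.org/wiki/Python_(programming_language)",
#          "Python is a high-level, general-purpose programming language."),
#     ]
#     indexer = TinyIndexer(...)  # construct per tinysearchengine.indexer
#     index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, "terms.csv")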