mwmbl/index.py

82 lines
2.4 KiB
Python

"""
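Create a search index: tokenize page titles, URLs and extracts, add each page
to a TinyIndexer under every token, and write the term counts to a CSV file.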
"""
from collections import Counter
from itertools import islice
from typing import Iterator, Iterable
from urllib.parse import unquote
import pandas as pd
# NUM_PAGES = 8192
# PAGE_SIZE = 512
from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
# tokenize() only looks at this many leading tokens of any piece of text.
NUM_INITIAL_TOKENS = 50
# URL scheme prefixes that prepare_url_for_tokenizing() strips from the front of a URL.
HTTP_START = 'http://'
HTTPS_START = 'https://'
# NOTE(review): not referenced anywhere in the visible part of this file —
# presumably a chunk size intended for grouper(); confirm before relying on it.
BATCH_SIZE = 100
def is_content_token(nlp, token):
    """
    Decide whether a token is worth indexing.

    The token's vocabulary entry is looked up in ``nlp.vocab`` by
    ``token.orth``. The token qualifies when that entry is alphabetic or
    numeric and the token itself is not flagged as a stop word.
    """
    vocab_entry = nlp.vocab[token.orth]
    looks_like_word = vocab_entry.is_alpha or vocab_entry.is_digit
    return looks_like_word and not token.is_stop
def tokenize(nlp, cleaned_text):
    """
    Return the set of distinct, lower-cased content tokens found in the text.

    Only the first NUM_INITIAL_TOKENS tokens produced by ``nlp.tokenizer``
    are inspected; any token rejected by is_content_token() is dropped.
    """
    leading_tokens = nlp.tokenizer(cleaned_text)[:NUM_INITIAL_TOKENS]
    unique_terms = set()
    for candidate in leading_tokens:
        if is_content_token(nlp, candidate):
            # Lower-case the vocabulary entry's text so lookups are case-insensitive.
            unique_terms.add(nlp.vocab[candidate.orth].text.lower())
    return unique_terms
def prepare_url_for_tokenizing(url: str):
    """
    Turn a URL into space-separated text that the tokenizer can split.

    A leading HTTP_START or HTTPS_START prefix is removed (at most one of
    them), then every ``/``, ``.`` and ``_`` is replaced by a single space.
    """
    for scheme in (HTTP_START, HTTPS_START):
        if url.startswith(scheme):
            url = url[len(scheme):]
            # Only one scheme prefix is ever stripped.
            break
    return url.translate(str.maketrans('/._', '   '))
def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
    """
    Lazily convert (title, url, extract) records into tokenized documents.

    For each record the title, the unquoted and prepared URL, and the extract
    are tokenized separately. The union of the three token sets is attached
    to a TokenizedDocument, which is yielded together with the original url,
    title and extract. The extract tokens are printed for every record and a
    progress line is printed for every 1000th record (starting with index 0).
    """
    for position, record in enumerate(titles_urls_and_extracts):
        title_cleaned, url, extract = record

        from_title = tokenize(nlp, title_cleaned)
        from_url = tokenize(nlp, prepare_url_for_tokenizing(unquote(url)))
        from_extract = tokenize(nlp, extract)
        print("Extract tokens", from_extract)

        yield TokenizedDocument(
            tokens=list(from_title | from_url | from_extract),
            url=url,
            title=title_cleaned,
            extract=extract,
        )

        # Reached only once the consumer asks for the next page.
        if position % 1000 == 0:
            print("Processed", position)
def grouper(n: int, iterator: Iterator):
    """
    Yield successive tuples of up to ``n`` items taken from ``iterator``.

    The final tuple may be shorter than ``n``; nothing is yielded for an
    empty input.

    :param n: maximum number of items per chunk.
    :param iterator: source of items. Any iterable is accepted, not only a
        true iterator.
    """
    # Normalise to a single iterator up front. Without this, a re-iterable
    # argument such as a list makes islice() restart from the first element
    # on every pass, so the same chunk would be yielded forever.
    source = iter(iterator)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            return
        yield chunk
def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
    """
    Index every page under each of its tokens and write term counts to CSV.

    The index is created first if it does not exist. Each page produced by
    get_pages() is then added to ``indexer`` once per token, as a Document
    holding the page's url, title and extract. The lower-cased tokens are
    tallied across all pages, and the tally is written to ``terms_path`` as
    a CSV with ``term`` and ``count`` columns.
    """
    indexer.create_if_not_exists()

    term_counts = Counter()
    for page in get_pages(nlp, titles_urls_and_extracts):
        for token in page.tokens:
            document = Document(url=page.url, title=page.title, extract=page.extract)
            indexer.index(token, document)
        term_counts.update(token.lower() for token in page.tokens)

    columns = {
        'term': term_counts.keys(),
        'count': term_counts.values(),
    }
    pd.DataFrame(columns).to_csv(terms_path)