"""
|
|
Create a search index
|
|
"""
|
|
from collections import Counter
|
|
from itertools import islice
|
|
from typing import Iterator, Iterable
|
|
from urllib.parse import unquote
|
|
|
|
import pandas as pd
|
|
|
|
# NUM_PAGES = 8192
|
|
# PAGE_SIZE = 512
|
|
from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
|
|
|
|
# NUM_PAGES = 8192
# PAGE_SIZE = 512

# Only the first tokens of each text are considered for indexing
NUM_INITIAL_TOKENS = 50

HTTP_START = 'http://'
HTTPS_START = 'https://'
BATCH_SIZE = 100


def is_content_token(nlp, token):
    """Keep only alphabetic or numeric tokens that are not stopwords."""
    lexeme = nlp.vocab[token.orth]
    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop


def tokenize(nlp, cleaned_text):
    """Return the set of lowercased content tokens from the start of the text."""
    tokens = nlp.tokenizer(cleaned_text)
    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                      if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered

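
# A rough illustration of what tokenize returns (the spaCy model name here is
# an assumption; this module does not load a pipeline itself):
#
#   import spacy
#   nlp = spacy.load('en_core_web_sm')
#   tokenize(nlp, 'The quick brown fox jumps over the lazy dog')
#   # -> {'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'}
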
def prepare_url_for_tokenizing(url: str):
    """Strip the scheme and turn separator characters into spaces."""
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]
    elif url.startswith(HTTPS_START):
        url = url[len(HTTPS_START):]
    for c in '/._':
        if c in url:
            url = url.replace(c, ' ')
    return url

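
# For example (illustrative URL, not from the original file):
#
#   prepare_url_for_tokenizing('https://en.wikipedia.org/wiki/Python_(programming_language)')
#   # -> 'en wikipedia org wiki Python (programming language)'
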
def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
    """Tokenize each (title, url, extract) triple and yield TokenizedDocuments."""
    for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
        title_tokens = tokenize(nlp, title_cleaned)
        prepared_url = prepare_url_for_tokenizing(unquote(url))
        url_tokens = tokenize(nlp, prepared_url)
        extract_tokens = tokenize(nlp, extract)
        # A page is indexed under the union of its title, URL and extract tokens
        tokens = title_tokens | url_tokens | extract_tokens
        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)

        if i % 1000 == 0:
            print("Processed", i)


def grouper(n: int, iterator: Iterator):
    """Yield tuples of up to n items until the iterator is exhausted."""
    while True:
        chunk = tuple(islice(iterator, n))
        if not chunk:
            return
        yield chunk

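
# Usage sketch (illustrative; the argument must be an iterator rather than a
# plain iterable, otherwise each islice call would restart from the beginning):
#
#   for batch in grouper(BATCH_SIZE, iter(range(250))):
#       ...  # batches of 100, 100 and 50 items
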
def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
    """Index every page, then write the observed term frequencies to a CSV file."""
    indexer.create_if_not_exists()

    terms = Counter()
    pages = get_pages(nlp, titles_urls_and_extracts)
    for page in pages:
        for token in page.tokens:
            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
        terms.update([t.lower() for t in page.tokens])

    term_df = pd.DataFrame({
        'term': terms.keys(),
        'count': terms.values(),
    })
    term_df.to_csv(terms_path)
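
# End-to-end sketch (not part of the original module; the TinyIndexer
# constructor arguments are not shown in this file, so construction is elided):
#
#   import spacy
#   nlp = spacy.load('en_core_web_sm')
#   pages = [('Python (programming language)',
#             'https://en.wikipedia.org/wiki/Python_(programming_language)',
#             'Python is a high-level, general-purpose programming language.')]
#   indexer = TinyIndexer(...)  # see tinysearchengine.indexer for the signature
#   index_titles_urls_and_extracts(indexer, nlp, pages, 'terms.csv')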