diff --git a/index.py b/index.py index 32f4ea6..d8fda0a 100644 --- a/index.py +++ b/index.py @@ -3,6 +3,7 @@ Create a search index """ import json import os +from collections import Counter from dataclasses import dataclass from itertools import islice from mmap import mmap, PROT_READ @@ -11,6 +12,7 @@ from urllib.parse import unquote import justext import mmh3 +import pandas as pd from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError NUM_PAGES = 8192 @@ -184,10 +186,19 @@ def grouper(n: int, iterator: Iterator): yield chunk -def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls): +def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls, terms_path): indexer.create_if_not_exists() + terms = Counter() pages = get_pages(nlp, titles_and_urls) for chunk in grouper(BATCH_SIZE, pages): indexer.index(list(chunk)) + for page in chunk: + terms.update([t.lower() for t in page.tokens]) + + term_df = pd.DataFrame({ + 'term': list(terms.keys()), + 'count': list(terms.values()), + }) + term_df.to_csv(terms_path, index=False) diff --git a/paths.py b/paths.py index 4b39f20..88335df 100644 --- a/paths.py +++ b/paths.py @@ -7,6 +7,7 @@ CRAWL_PREFIX = 'crawl_' CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*") INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch') TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch') +TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms-test.csv') WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2') WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz') diff --git a/performance.py b/performance.py index b460718..8f0cb10 100644 --- a/performance.py +++ b/performance.py @@ -12,7 +12,7 @@ from starlette.testclient import TestClient from app import app from fsqueue import ZstdJsonSerializer from index import TinyIndexer, index_titles_and_urls -from paths import TEST_INDEX_PATH, DATA_DIR +from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH from wiki import 
get_wiki_titles_and_urls NUM_DOCUMENTS = 30000 @@ -84,7 +84,7 @@ def performance_test(): titles_and_urls = get_test_pages() start_time = datetime.now() - index_titles_and_urls(indexer, nlp, titles_and_urls) + index_titles_and_urls(indexer, nlp, titles_and_urls, TEST_TERMS_PATH) stop_time = datetime.now() index_time = (stop_time - start_time).total_seconds() @@ -111,6 +111,5 @@ def print_pages(pages): print() - if __name__ == '__main__': performance_test()