Count terms

This commit is contained in:
Daoud Clarke 2021-05-30 21:30:34 +01:00
parent 62d22d9d52
commit fb5b6ffd45
3 changed files with 15 additions and 4 deletions

View file

@@ -3,6 +3,7 @@ Create a search index
"""
import json
import os
from collections import Counter
from dataclasses import dataclass
from itertools import islice
from mmap import mmap, PROT_READ
@@ -11,6 +12,7 @@ from urllib.parse import unquote
import justext
import mmh3
import pandas as pd
from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
NUM_PAGES = 8192
@@ -184,10 +186,19 @@ def grouper(n: int, iterator: Iterator):
yield chunk
def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls, terms_path):
    """Index pages built from (title, url) pairs and dump term frequencies.

    Pages are produced by get_pages, indexed in batches of BATCH_SIZE, and a
    running Counter of lower-cased tokens across all pages is written out as a
    two-column ('term', 'count') CSV at terms_path once indexing finishes.
    """
    indexer.create_if_not_exists()
    term_counts = Counter()
    page_stream = get_pages(nlp, titles_and_urls)
    for batch in grouper(BATCH_SIZE, page_stream):
        indexer.index(list(batch))
        # Accumulate case-folded token frequencies over every page in the batch.
        for batch_page in batch:
            term_counts.update(token.lower() for token in batch_page.tokens)
    frequency_frame = pd.DataFrame({
        'term': list(term_counts.keys()),
        'count': list(term_counts.values()),
    })
    frequency_frame.to_csv(terms_path)

View file

@@ -7,6 +7,7 @@ CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

View file

@@ -12,7 +12,7 @@ from starlette.testclient import TestClient
from app import app
from fsqueue import ZstdJsonSerializer
from index import TinyIndexer, index_titles_and_urls
from paths import TEST_INDEX_PATH, DATA_DIR
from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
from wiki import get_wiki_titles_and_urls
NUM_DOCUMENTS = 30000
@@ -84,7 +84,7 @@ def performance_test():
titles_and_urls = get_test_pages()
start_time = datetime.now()
index_titles_and_urls(indexer, nlp, titles_and_urls)
index_titles_and_urls(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
stop_time = datetime.now()
index_time = (stop_time - start_time).total_seconds()
@@ -111,6 +111,5 @@ def print_pages(pages):
print()
if __name__ == '__main__':
performance_test()