Count terms

parent 62d22d9d52
commit fb5b6ffd45

3 changed files with 15 additions and 4 deletions
index.py (12 additions, 1 deletion)

@@ -3,6 +3,7 @@ Create a search index
 """
 import json
 import os
+from collections import Counter
 from dataclasses import dataclass
 from itertools import islice
 from mmap import mmap, PROT_READ
@@ -11,6 +12,7 @@ from urllib.parse import unquote
 
 import justext
 import mmh3
+import pandas as pd
 from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 NUM_PAGES = 8192
@@ -184,10 +186,19 @@ def grouper(n: int, iterator: Iterator):
         yield chunk
 
 
-def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
+def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls, terms_path):
     indexer.create_if_not_exists()
 
+    terms = Counter()
     pages = get_pages(nlp, titles_and_urls)
     for chunk in grouper(BATCH_SIZE, pages):
         indexer.index(list(chunk))
 
+        for page in chunk:
+            terms.update([t.lower() for t in page.tokens])
+
+    term_df = pd.DataFrame({
+        'term': terms.keys(),
+        'count': terms.values(),
+    })
+    term_df.to_csv(terms_path)
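In `index_titles_and_urls`, the `Counter` now accumulates lowercased tokens from every page as each batch is indexed, and the totals are written out as a two-column CSV once all batches are done. A minimal self-contained sketch of that pattern (the `Page` class and sample tokens below are stand-ins, not code from this repo):

from collections import Counter
from dataclasses import dataclass

import pandas as pd


@dataclass
class Page:
    """Stand-in for the indexer's page objects; only .tokens matters here."""
    tokens: list

# Two hypothetical batches, mirroring what grouper() yields.
chunks = [
    [Page(['Search', 'index']), Page(['tiny', 'Search'])],
    [Page(['index', 'terms'])],
]

terms = Counter()
for chunk in chunks:
    # indexer.index(list(chunk)) happens here in the real function
    for page in chunk:
        terms.update([t.lower() for t in page.tokens])

# Same construction as the commit: Counter's dict views become the columns.
term_df = pd.DataFrame({
    'term': terms.keys(),
    'count': terms.values(),
})
term_df.to_csv('index-terms.csv')
print(term_df.sort_values('count', ascending=False))

Note that `to_csv` without `index=False` also writes the positional row index, so the file gains an unnamed first column alongside `term` and `count`.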
paths.py (1 addition)

@@ -7,6 +7,7 @@ CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
 INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
 TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
+TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
 
(unnamed file: 2 additions, 3 deletions)

@@ -12,7 +12,7 @@ from starlette.testclient import TestClient
 from app import app
 from fsqueue import ZstdJsonSerializer
 from index import TinyIndexer, index_titles_and_urls
-from paths import TEST_INDEX_PATH, DATA_DIR
+from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 from wiki import get_wiki_titles_and_urls
 
 NUM_DOCUMENTS = 30000
@@ -84,7 +84,7 @@ def performance_test():
     titles_and_urls = get_test_pages()
 
     start_time = datetime.now()
-    index_titles_and_urls(indexer, nlp, titles_and_urls)
+    index_titles_and_urls(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
     stop_time = datetime.now()
 
     index_time = (stop_time - start_time).total_seconds()
@@ -111,6 +111,5 @@ def print_pages(pages):
         print()
 
 
-
 if __name__ == '__main__':
     performance_test()
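After `performance_test()` finishes, the term counts land at `TEST_TERMS_PATH`. A hedged read-back sketch, assuming only what paths.py above defines (`index-terms.csv` under `DATA_DIR`; the `data/` directory shown here is a guess, since `DATA_DIR`'s value is outside this diff):

import pandas as pd

# Assumed location: DATA_DIR is defined above the paths.py hunk and is not
# shown in the diff; 'data' is an illustrative placeholder.
TEST_TERMS_PATH = 'data/index-terms.csv'

# index_col=0 skips the unnamed row-index column that to_csv wrote.
term_df = pd.read_csv(TEST_TERMS_PATH, index_col=0)
print(term_df.sort_values('count', ascending=False).head(20))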