mwmbl/analyse/performance.py

"""
Test the performance of the search in terms of compression and speed.
"""
import os
from datetime import datetime

import numpy as np
from spacy.lang.en import English
from starlette.testclient import TestClient

from mwmbl.indexer.fsqueue import ZstdJsonSerializer
from mwmbl.indexer.index import index_titles_urls_and_extracts
from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
from mwmbl.tinysearchengine import create_app
from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document

NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10
TEST_PAGE_SIZE = 512
TEST_NUM_PAGES = 1024
TEST_DATA_PATH = os.path.join(DATA_DIR, 'test-urls.zstd')
RECALL_AT_K = 3  # a query counts as a hit if the expected URL is in the top K results
NUM_QUERY_CHARS = 10  # each query is the first N characters of a page title


def get_test_pages():
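    """Load (title, URL) pairs from the Zstandard-compressed JSON test data, skipping rows with no title."""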
    serializer = ZstdJsonSerializer()
    with open(TEST_DATA_PATH, 'rb') as data_file:
        data = serializer.deserialize(data_file.read())
    return [(row['title'], row['url']) for row in data if row['title'] is not None]


def query_test():
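    """Query the search app with a prefix of each test title and report recall@K and mean query time."""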
    titles_and_urls = get_test_pages()
    print(f"Got {len(titles_and_urls)} titles and URLs")

    tiny_index = TinyIndex(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
    app = create_app.create(tiny_index)
    client = TestClient(app)

    start = datetime.now()
    hits = 0
    count = 0
    for title, url in titles_and_urls:
        query = title[:NUM_QUERY_CHARS]
        response = client.get('/complete', params={'q': query})
        assert response.status_code == 200
        data = response.json()

        # A hit means the expected URL appears in one of the top K results.
        hit = False
        if data:
            for result in data[1][:RECALL_AT_K]:
                if url in result:
                    hit = True
                    break

        if hit:
            hits += 1
        else:
            print("Miss", data, title, url, sep='\n')
        count += 1

    end = datetime.now()
    print(f"Hits: {hits} out of {count}")
    print(f"Recall at {RECALL_AT_K}: {hits / count}")
    # Mean time per query: divide by the number of queries actually made.
    print("Query time:", (end - start).total_seconds() / count)


def page_stats(indexer: TinyIndexer):
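    """Return the mean and standard deviation of the document counts of the largest pages, plus those pages."""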
    pages_and_sizes = []
    for i in range(TEST_NUM_PAGES):
        page = indexer.get_page(i)
        if page is not None:
            pages_and_sizes.append((len(page), page))
    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])
    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages


def performance_test():
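    """Build a fresh test index, report indexing time and size statistics, then run the query test."""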
    nlp = English()

    # Remove any existing test index so the timings measure a build from scratch.
    try:
        os.remove(TEST_INDEX_PATH)
    except FileNotFoundError:
        print("No test index found, creating")

    with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
        titles_and_urls = get_test_pages()

        start_time = datetime.now()
        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
        stop_time = datetime.now()

        index_time = (stop_time - start_time).total_seconds()
        index_size = os.path.getsize(TEST_INDEX_PATH)
        page_size_mean, page_size_std, big_pages = page_stats(indexer)

    print("Indexed pages:", NUM_DOCUMENTS)
    print("Index time:", index_time)
    print("Index size:", index_size)
    print("Mean docs per page:", page_size_mean)
    print("Std dev of docs per page:", page_size_std)
    print("Big pages")
    print_pages(big_pages)
    # print("Num tokens", indexer.get_num_tokens())

    query_test()


def print_pages(pages):
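    """Print the documents in each page, one title/URL pair per line."""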
    for page in pages:
        print("Page", page)
        for title, url in page:
            print(title, url)
        print()


if __name__ == '__main__':
    performance_test()