mwmbl/performance.py

"""
Test the performance of the search in terms of compression and speed.
"""
import os
from datetime import datetime
import numpy as np
from spacy.lang.en import English
from starlette.testclient import TestClient

import create_app
from fsqueue import ZstdJsonSerializer
from index import TinyIndexer, index_titles_and_urls, Document, TinyIndex
from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH

NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10
TEST_PAGE_SIZE = 512
TEST_NUM_PAGES = 1024
TEST_DATA_PATH = os.path.join(DATA_DIR, 'test-urls.zstd')
RECALL_AT_K = 3

NUM_QUERY_CHARS = 10


def get_test_pages():
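    """Load (title, url) pairs from the zstd-compressed JSON test data, skipping rows without a title."""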
    serializer = ZstdJsonSerializer()
    with open(TEST_DATA_PATH, 'rb') as data_file:
        data = serializer.deserialize(data_file.read())
    return [(row['title'], row['url']) for row in data if row['title'] is not None]


def query_test():
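    """Query the /complete endpoint with the first NUM_QUERY_CHARS characters of each
    test title, and report recall at RECALL_AT_K and the mean time per query."""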
    titles_and_urls = get_test_pages()
    print(f"Got {len(titles_and_urls)} titles and URLs")

    tiny_index = TinyIndex(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
    app = create_app.create(tiny_index)
    client = TestClient(app)

    start = datetime.now()
    hits = 0
    count = 0
    for title, url in titles_and_urls:
        query = title[:NUM_QUERY_CHARS]
        result = client.get('/complete', params={'q': query})
        assert result.status_code == 200
        data = result.json()

        hit = False
        if data:
            # data[1] holds the completion strings; count a hit if the target
            # URL appears in any of the top RECALL_AT_K results
            for completion in data[1][:RECALL_AT_K]:
                if url in completion:
                    hit = True
                    break
        if hit:
            hits += 1
        else:
            print("Miss", data, title, url, sep='\n')
        count += 1

    end = datetime.now()
    print(f"Hits: {hits} out of {count}")
    print(f"Recall at {RECALL_AT_K}: {hits/count}")
    # Average over the number of queries actually run, not NUM_DOCUMENTS
    print("Query time:", (end - start).total_seconds() / count)


def page_stats(indexer: TinyIndexer):
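    """Return the mean and standard deviation of the sizes (document counts) of the
    NUM_PAGES_FOR_STATS largest index pages, along with those pages."""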
    pages_and_sizes = []
    for i in range(TEST_NUM_PAGES):
        page = indexer.get_page(i)
        if page is not None:
            pages_and_sizes.append((len(page), page))
    # Sort on size only, so ties don't fall back to comparing page contents
    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, key=lambda item: item[0], reverse=True)[:NUM_PAGES_FOR_STATS])
    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages


def performance_test():
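    """Rebuild the test index from scratch, report indexing time, index size and
    page statistics, then run the query benchmark."""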
    nlp = English()
    try:
        os.remove(TEST_INDEX_PATH)
    except FileNotFoundError:
        print("No test index found, creating")
    with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
        titles_and_urls = get_test_pages()

        start_time = datetime.now()
        index_titles_and_urls(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
        stop_time = datetime.now()

        index_time = (stop_time - start_time).total_seconds()
        index_size = os.path.getsize(TEST_INDEX_PATH)

        page_size_mean, page_size_std, big_pages = page_stats(indexer)

        print("Indexed pages:", NUM_DOCUMENTS)
        print("Index time:", index_time)
        print("Index size:", index_size)
        print("Mean docs per page:", page_size_mean)
        print("Std dev of docs per page:", page_size_std)
        print("Big pages")
        print_pages(big_pages)
        # print("Num tokens", indexer.get_num_tokens())

    query_test()


def print_pages(pages):
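    """Print each page followed by its (title, url) entries."""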
    for page in pages:
        print("Page", page)
        for title, url in page:
            print(title, url)
        print()


if __name__ == '__main__':
    performance_test()