Record docs per page

Daoud Clarke 2021-04-16 05:28:51 +01:00
parent fdb5cbbf3c
commit ced0fceae8
2 changed files with 21 additions and 5 deletions


@@ -70,13 +70,13 @@ class TinyIndexBase:
     def retrieve(self, token):
         index = self._get_token_page_index(token)
-        return self._get_page(index)
+        return self.get_page(index)

     def _get_token_page_index(self, token):
         token_hash = mmh3.hash(token, signed=False)
         return token_hash % self.num_pages

-    def _get_page(self, i):
+    def get_page(self, i):
         """
         Get the page at index i, decompress and deserialise it using JSON
         """
@@ -122,7 +122,7 @@ class TinyIndexer(TinyIndexBase):
     def _index_document(self, document: Document, token: str):
         page_index = self._get_token_page_index(token)
-        current_page = self._get_page(page_index)
+        current_page = self.get_page(page_index)
         if current_page is None:
             current_page = []
         current_page.append([document.title, document.url])
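
The substantive change in the first file is a rename: the private _get_page becomes the public get_page, and both internal call sites (retrieve and _index_document) are updated, so that code outside the class can read raw pages directly. A minimal usage sketch, assuming an already-constructed indexer and a known page count (neither the constructor nor a page-count constant is shown in this diff); the helper name is hypothetical:

# Hypothetical helper, not part of the commit: count the documents
# stored on each page via the now-public get_page.
def docs_per_page(indexer, num_pages):
    counts = []
    for i in range(num_pages):
        page = indexer.get_page(i)  # returns None for an empty page
        counts.append(0 if page is None else len(page))
    return counts

The second changed file, the performance test, does this same kind of page scan to record docs-per-page statistics: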


@@ -2,6 +2,7 @@
 Test the performance of the search in terms of compression and speed.
 """
 import json
+import numpy as np
 import os
 from datetime import datetime
 from itertools import islice
@@ -15,7 +16,8 @@
 from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls

-NUM_DOCUMENTS = 500
+NUM_DOCUMENTS = 30000
+NUM_PAGES_FOR_STATS = 10


 def query_test():
@@ -39,6 +41,16 @@ def query_test():
     print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)


+def page_stats(indexer: TinyIndexer):
+    page_sizes = []
+    for i in range(NUM_PAGES):
+        page = indexer.get_page(i)
+        if page is not None:
+            page_sizes.append(len(page))
+    big_page_sizes = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:]
+    return np.mean(big_page_sizes), np.std(big_page_sizes)
+
+
 def performance_test():
     nlp = English()
     try:
@@ -56,9 +68,13 @@ def performance_test():
     index_time = (stop_time - start_time).total_seconds()
     index_size = os.path.getsize(TEST_INDEX_PATH)
+    page_size_mean, page_size_std = page_stats(indexer)
+    print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
-    print("Index size", index_size)
+    print("Index size:", index_size)
+    print("Mean docs per page:", page_size_mean)
+    print("Std dev of docs per page:", page_size_std)
     # print("Num tokens", indexer.get_num_tokens())
     query_test()
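
page_stats reports the mean and standard deviation of document counts over only the NUM_PAGES_FOR_STATS fullest pages. Each token hashes to exactly one page, so the fullest pages are the ones most at risk of overflowing their (presumably fixed) capacity; their mean and spread give a quick read on how much headroom the index has. A back-of-envelope simulation of the same statistic, assuming postings are spread over pages by a uniform hash (the page and posting counts below are illustrative, not from the commit):

# Simulation sketch, not from the commit: distribute postings over
# pages with a uniform hash and report the mean and standard deviation
# of the document counts on the ten fullest pages.
import numpy as np

rng = np.random.default_rng(0)
num_pages = 25_600       # assumed page count
num_postings = 500_000   # assumed number of (document, token) postings

page_of_posting = rng.integers(0, num_pages, size=num_postings)
page_sizes = np.bincount(page_of_posting, minlength=num_pages)
biggest = np.sort(page_sizes)[-10:]
print("Mean docs per page (10 fullest):", biggest.mean())
print("Std dev of docs per page:", biggest.std())

Under uniform hashing the per-page counts are approximately Poisson distributed, so the fullest pages sit only a few standard deviations above the average load; a heavy skew in the real index would show up as a much larger gap.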