Record docs per page

parent fdb5cbbf3c
commit ced0fceae8

2 changed files with 21 additions and 5 deletions

index.py (6 changes)

@@ -70,13 +70,13 @@ class TinyIndexBase:
 
     def retrieve(self, token):
         index = self._get_token_page_index(token)
-        return self._get_page(index)
+        return self.get_page(index)
 
     def _get_token_page_index(self, token):
         token_hash = mmh3.hash(token, signed=False)
         return token_hash % self.num_pages
 
-    def _get_page(self, i):
+    def get_page(self, i):
         """
         Get the page at index i, decompress and deserialise it using JSON
         """
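
Since the hunk above is the heart of the index's addressing scheme, here is a minimal standalone sketch of it, assuming only the mmh3 package (the num_pages value is a placeholder, not the repository's configuration):

import mmh3

num_pages = 1024  # placeholder page count, not the repo's value

def token_page_index(token: str) -> int:
    # Hash the token to an unsigned 32-bit integer, then wrap it onto
    # the fixed set of pages: the same token always maps to the same page.
    token_hash = mmh3.hash(token, signed=False)
    return token_hash % num_pages

print(token_page_index("wikipedia"))  # a stable index in [0, num_pages)

The consequence is that retrieve() never searches: one hash and one modulo identify the single page that can hold a token's documents.
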
@@ -122,7 +122,7 @@ class TinyIndexer(TinyIndexBase):
 
     def _index_document(self, document: Document, token: str):
         page_index = self._get_token_page_index(token)
-        current_page = self._get_page(page_index)
+        current_page = self.get_page(page_index)
         if current_page is None:
             current_page = []
         current_page.append([document.title, document.url])
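
For context on what _index_document appends to: per the docstring above, a page is decompressed and deserialised from JSON, so its payload is just a list of [title, url] pairs. An illustration with made-up documents:

import json

# Illustration only: two made-up documents whose token hashed to the
# same page; the page payload is a JSON-serialisable list of pairs.
current_page = None
for title, url in [
    ("Hash function", "https://en.wikipedia.org/wiki/Hash_function"),
    ("Python (programming language)", "https://en.wikipedia.org/wiki/Python_(programming_language)"),
]:
    if current_page is None:
        current_page = []
    current_page.append([title, url])

print(json.dumps(current_page))
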
Second changed file (the performance test script):

@@ -2,6 +2,7 @@
 Test the performance of the search in terms of compression and speed.
 """
 import json
+import numpy as np
 import os
 from datetime import datetime
 from itertools import islice
@@ -15,7 +16,8 @@ from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls
 
 
-NUM_DOCUMENTS = 500
+NUM_DOCUMENTS = 30000
+NUM_PAGES_FOR_STATS = 10
 
 
 def query_test():
@@ -39,6 +41,16 @@ def query_test():
     print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
 
 
+def page_stats(indexer: TinyIndexer):
+    page_sizes = []
+    for i in range(NUM_PAGES):
+        page = indexer.get_page(i)
+        if page is not None:
+            page_sizes.append(len(page))
+    big_page_sizes = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:]
+    return np.mean(big_page_sizes), np.std(big_page_sizes)
+
+
 def performance_test():
     nlp = English()
     try:
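
The statistic page_stats returns is the mean and spread over only the NUM_PAGES_FOR_STATS fullest pages, i.e. a view of worst-case page occupancy rather than of the average page. A self-contained illustration with fabricated page sizes:

import numpy as np

NUM_PAGES_FOR_STATS = 10
page_sizes = [3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9]  # fabricated

big_page_sizes = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:]  # ten largest
print(np.mean(big_page_sizes), np.std(big_page_sizes))

One caveat for the labels in the next hunk: np.std computes a standard deviation, so the "Std err of docs per page" line actually prints a standard deviation, not a standard error (that would divide by sqrt(NUM_PAGES_FOR_STATS)).
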
@@ -56,9 +68,13 @@ def performance_test():
     index_time = (stop_time - start_time).total_seconds()
     index_size = os.path.getsize(TEST_INDEX_PATH)
 
+    page_size_mean, page_size_std = page_stats(indexer)
+
     print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
-    print("Index size", index_size)
+    print("Index size:", index_size)
+    print("Mean docs per page:", page_size_mean)
+    print("Std err of docs per page:", page_size_std)
     # print("Num tokens", indexer.get_num_tokens())
 
     query_test()
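
Put together, a run of performance_test() now reports output shaped like the following (values are placeholders, not measured results):

Indexed pages: 30000
Index time: <seconds to index>
Index size: <bytes on disk>
Mean docs per page: <mean size of the 10 fullest pages>
Std err of docs per page: <spread of the 10 fullest pages>
Query time: <seconds per query>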