From ced0fceae8210a0fbab7b83c2415532c432d5f10 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Fri, 16 Apr 2021 05:28:51 +0100
Subject: [PATCH] Record docs per page

---
 index.py       |  6 +++---
 performance.py | 20 ++++++++++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/index.py b/index.py
index d265822..830ec09 100644
--- a/index.py
+++ b/index.py
@@ -70,13 +70,13 @@ class TinyIndexBase:
 
     def retrieve(self, token):
         index = self._get_token_page_index(token)
-        return self._get_page(index)
+        return self.get_page(index)
 
     def _get_token_page_index(self, token):
         token_hash = mmh3.hash(token, signed=False)
         return token_hash % self.num_pages
 
-    def _get_page(self, i):
+    def get_page(self, i):
         """
         Get the page at index i, decompress and deserialise it using JSON
         """
@@ -122,7 +122,7 @@ class TinyIndexer(TinyIndexBase):
 
     def _index_document(self, document: Document, token: str):
         page_index = self._get_token_page_index(token)
-        current_page = self._get_page(page_index)
+        current_page = self.get_page(page_index)
         if current_page is None:
             current_page = []
         current_page.append([document.title, document.url])
diff --git a/performance.py b/performance.py
index 7997262..2d4280d 100644
--- a/performance.py
+++ b/performance.py
@@ -2,6 +2,7 @@
 Test the performance of the search in terms of compression and speed.
 """
 import json
+import numpy as np
 import os
 from datetime import datetime
 from itertools import islice
@@ -15,7 +16,8 @@
 from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls
 
-NUM_DOCUMENTS = 500
+NUM_DOCUMENTS = 30000
+NUM_PAGES_FOR_STATS = 10
 
 
 def query_test():
@@ -39,6 +41,16 @@
     print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
 
 
+def page_stats(indexer: TinyIndexer):
+    page_sizes = []
+    for i in range(NUM_PAGES):
+        page = indexer.get_page(i)
+        if page is not None:
+            page_sizes.append(len(page))
+    big_page_sizes = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:]
+    return np.mean(big_page_sizes), np.std(big_page_sizes)
+
+
 def performance_test():
     nlp = English()
     try:
@@ -56,9 +68,13 @@ def performance_test():
     index_time = (stop_time - start_time).total_seconds()
     index_size = os.path.getsize(TEST_INDEX_PATH)
 
+    page_size_mean, page_size_std = page_stats(indexer)
+
     print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
-    print("Index size", index_size)
+    print("Index size:", index_size)
+    print("Mean docs per page:", page_size_mean)
+    print("Std dev of docs per page:", page_size_std)
     # print("Num tokens", indexer.get_num_tokens())
 
     query_test()
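
For context on the pages being counted: each token maps to a fixed page of
the index by hashing, as the _get_token_page_index context lines above show.
A minimal standalone sketch of that scheme, using a hypothetical NUM_PAGES
value (the real constant is defined in index.py, outside these hunks):

    import mmh3  # pip install mmh3

    NUM_PAGES = 25600  # hypothetical; the real value lives in index.py

    def token_page_index(token: str) -> int:
        # MurmurHash3 the token, then wrap into the fixed number of pages,
        # mirroring TinyIndexBase._get_token_page_index in the patch above.
        token_hash = mmh3.hash(token, signed=False)
        return token_hash % NUM_PAGES

    print(token_page_index("jupiter"))  # some index in [0, NUM_PAGES)

Because the mapping is hash-modulo, many tokens share a page, which is why
docs-per-page is the quantity worth measuring here.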
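
The new page_stats reports the mean and spread of the NUM_PAGES_FOR_STATS
fullest pages; it also relies on a NUM_PAGES constant not visible in these
hunks (presumably imported from index.py). Note that np.std is the population
standard deviation, not the standard error. A sketch of both, on made-up
page-size data:

    import numpy as np

    NUM_PAGES_FOR_STATS = 10

    # Made-up docs-per-page counts, for illustration only.
    page_sizes = [3, 17, 9, 41, 28, 5, 33, 12, 26, 19, 38, 7, 22, 30]

    big = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:]  # the ten fullest pages
    mean = np.mean(big)
    std = np.std(big)                      # standard deviation
    std_err = std / np.sqrt(len(big))      # standard error of the mean

    print(mean, std, std_err)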
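
The get_page docstring says a page is decompressed and then JSON-deserialised.
The compressor isn't visible in these hunks, so the sketch below stands in
zlib purely as an assumption, with the [title, url] pair layout that
_index_document appends:

    import json
    import zlib  # assumption: the hunks don't show which compressor index.py uses

    def decode_page(raw: bytes):
        # Decompress the page bytes, then JSON-decode them into a list of
        # [title, url] pairs, per the get_page docstring above.
        return json.loads(zlib.decompress(raw).decode("utf8"))

    # Round-trip example with one [title, url] pair:
    page = [["Jupiter", "https://en.wikipedia.org/wiki/Jupiter"]]
    raw = zlib.compress(json.dumps(page).encode("utf8"))
    assert decode_page(raw) == page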