From ed90e49c5ef1e206e62d77351d0ab43b6ff04007 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sun, 18 Apr 2021 04:54:46 +0100 Subject: [PATCH] Print big pages --- performance.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/performance.py b/performance.py index 3376198..e83b68a 100644 --- a/performance.py +++ b/performance.py @@ -17,7 +17,7 @@ from wiki import get_wiki_titles_and_urls NUM_DOCUMENTS = 30000 NUM_PAGES_FOR_STATS = 10 TEST_PAGE_SIZE = 512 -TEST_NUM_PAGES = 32 +TEST_NUM_PAGES = 1024 def query_test(): @@ -42,13 +42,13 @@ def query_test(): def page_stats(indexer: TinyIndexer): - page_sizes = [] + pages_and_sizes = [] for i in range(TEST_NUM_PAGES): page = indexer.get_page(i) if page is not None: - page_sizes.append(len(page)) - big_page_sizes = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:] - return np.mean(big_page_sizes), np.std(big_page_sizes) + pages_and_sizes.append((len(page), page)) + big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS]) + return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages def performance_test(): @@ -68,16 +68,26 @@ def performance_test(): index_time = (stop_time - start_time).total_seconds() index_size = os.path.getsize(TEST_INDEX_PATH) - page_size_mean, page_size_std = page_stats(indexer) + page_size_mean, page_size_std, big_pages = page_stats(indexer) print("Indexed pages:", NUM_DOCUMENTS) print("Index time:", index_time) print("Index size:", index_size) print("Mean docs per page:", page_size_mean) print("Std err of docs per page:", page_size_std) + print("Big pages") + print_pages(big_pages) # print("Num tokens", indexer.get_num_tokens()) - query_test() + # query_test() + + +def print_pages(pages): + for page in pages: + for title, url in page: + print(title, url) + print() + if __name__ == '__main__':