Print big pages

This commit is contained in:
Daoud Clarke 2021-04-18 04:54:46 +01:00
parent c84eeba92e
commit ed90e49c5e

View file

@ -17,7 +17,7 @@ from wiki import get_wiki_titles_and_urls
# Document count reported as "Indexed pages" by performance_test
# (presumably also the number of documents fed to the indexer — confirm).
NUM_DOCUMENTS = 30000
# How many of the largest pages page_stats keeps for its statistics.
NUM_PAGES_FOR_STATS = 10
# Page size used for the test index (units not visible here — TODO confirm).
TEST_PAGE_SIZE = 512
# Number of page slots page_stats scans: indices 0 .. TEST_NUM_PAGES - 1.
TEST_NUM_PAGES = 1024
def query_test():
@ -42,13 +42,13 @@ def query_test():
def page_stats(indexer: TinyIndexer):
    """Summarise the sizes of the largest pages in the index.

    Requests page indices ``0 .. TEST_NUM_PAGES - 1`` from
    ``indexer.get_page``, ignores pages that come back as ``None`` and keeps
    the ``NUM_PAGES_FOR_STATS`` pages with the greatest ``len``.

    Returns:
        A ``(mean, std, big_pages)`` tuple: the mean and standard deviation
        of the sizes of the selected pages, plus the selected pages as a
        tuple ordered from largest to smallest. If no page is present the
        statistics are NaN and ``big_pages`` is empty.
    """
    candidates = (indexer.get_page(i) for i in range(TEST_NUM_PAGES))
    present_pages = [page for page in candidates if page is not None]

    # Rank on size alone. Sorting (size, page) tuples would fall back to
    # comparing page contents whenever two pages have the same size, which is
    # wasted work and can raise for contents that do not support ordering.
    # The sort is stable, so equally sized pages keep their index order.
    ranked = sorted(present_pages, key=len, reverse=True)
    big_pages = tuple(ranked[:NUM_PAGES_FOR_STATS])

    if not big_pages:
        # Nothing to measure: avoid unpacking an empty zip and NumPy's
        # empty-slice warnings.
        return float("nan"), float("nan"), big_pages

    big_page_sizes = [len(page) for page in big_pages]
    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages
def performance_test():
@ -68,16 +68,26 @@ def performance_test():
index_time = (stop_time - start_time).total_seconds()
index_size = os.path.getsize(TEST_INDEX_PATH)
page_size_mean, page_size_std = page_stats(indexer)
page_size_mean, page_size_std, big_pages = page_stats(indexer)
print("Indexed pages:", NUM_DOCUMENTS)
print("Index time:", index_time)
print("Index size:", index_size)
print("Mean docs per page:", page_size_mean)
print("Std err of docs per page:", page_size_std)
print("Big pages")
print_pages(big_pages)
# print("Num tokens", indexer.get_num_tokens())
query_test()
# query_test()
def print_pages(pages):
    """Print the entries of each page, one ``title url`` pair per line.

    Args:
        pages: Iterable of pages, where each page is an iterable of
            ``(title, url)`` pairs.

    A blank line is written after every page so page boundaries remain
    visible in the output.
    """
    for page in pages:
        for title, url in page:
            print(title, url)
        # Separator between consecutive pages.
        print()
if __name__ == '__main__':