Print big pages
parent c84eeba92e
commit ed90e49c5e

1 changed file with 17 additions and 7 deletions
@@ -17,7 +17,7 @@ from wiki import get_wiki_titles_and_urls
 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10
 TEST_PAGE_SIZE = 512
-TEST_NUM_PAGES = 32
+TEST_NUM_PAGES = 1024
 
 
 def query_test():
@@ -42,13 +42,13 @@ def query_test():
 
 
 def page_stats(indexer: TinyIndexer):
-    page_sizes = []
+    pages_and_sizes = []
     for i in range(TEST_NUM_PAGES):
         page = indexer.get_page(i)
         if page is not None:
-            page_sizes.append(len(page))
-    big_page_sizes = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:]
-    return np.mean(big_page_sizes), np.std(big_page_sizes)
+            pages_and_sizes.append((len(page), page))
+    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])
+    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages
 
 
 def performance_test():
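(Aside: the rewritten page_stats keeps each page alongside its size so the largest pages themselves can be returned, not just their size statistics. Below is a minimal, self-contained sketch of the zip(*sorted(...)) selection idiom used above; the sizes and page placeholders are made up for illustration and are not real indexer data.)

import numpy as np

NUM_PAGES_FOR_STATS = 2

# Toy (size, page) pairs standing in for what page_stats collects from the index.
pages_and_sizes = [(3, "page-a"), (9, "page-b"), (1, "page-c"), (7, "page-d")]

# Sort descending by size, keep the largest entries, then unzip back into
# parallel tuples of sizes and pages.
big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])

print(big_page_sizes)  # (9, 7)
print(big_pages)       # ('page-b', 'page-d')
print(np.mean(big_page_sizes), np.std(big_page_sizes))  # 8.0 1.0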
@@ -68,16 +68,26 @@ def performance_test():
     index_time = (stop_time - start_time).total_seconds()
     index_size = os.path.getsize(TEST_INDEX_PATH)
 
-    page_size_mean, page_size_std = page_stats(indexer)
+    page_size_mean, page_size_std, big_pages = page_stats(indexer)
 
     print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
     print("Index size:", index_size)
     print("Mean docs per page:", page_size_mean)
     print("Std err of docs per page:", page_size_std)
+    print("Big pages")
+    print_pages(big_pages)
     # print("Num tokens", indexer.get_num_tokens())
 
-    query_test()
+    # query_test()
+
+
+def print_pages(pages):
+    for page in pages:
+        for title, url in page:
+            print(title, url)
+        print()
 
 
 if __name__ == '__main__':
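(Aside: the new print_pages helper assumes each page yields (title, url) pairs, matching the inner loop in the diff. A small usage sketch with invented placeholder pages, not output from the real index:)

def print_pages(pages):
    for page in pages:
        for title, url in page:
            print(title, url)
        print()  # blank line between pages

# Hypothetical pages, each an iterable of (title, url) tuples.
sample_pages = [
    [("Example A", "https://example.com/a"), ("Example B", "https://example.com/b")],
    [("Example C", "https://example.com/c")],
]
print_pages(sample_pages)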