فهرست منبع

Print big pages

Daoud Clarke 4 سال پیش
والد
کامیت
ed90e49c5e
1 فایلهای تغییر یافته به همراه 17 افزوده شده و 7 حذف شده
  1. 17 7
      performance.py

+ 17 - 7
performance.py

@@ -17,7 +17,7 @@ from wiki import get_wiki_titles_and_urls
 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10
 TEST_PAGE_SIZE = 512
-TEST_NUM_PAGES = 32
+TEST_NUM_PAGES = 1024
 
 
 def query_test():
@@ -42,13 +42,13 @@ def query_test():
 
 
 def page_stats(indexer: TinyIndexer):
-    page_sizes = []
+    pages_and_sizes = []
     for i in range(TEST_NUM_PAGES):
         page = indexer.get_page(i)
         if page is not None:
-            page_sizes.append(len(page))
-    big_page_sizes = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:]
-    return np.mean(big_page_sizes), np.std(big_page_sizes)
+            pages_and_sizes.append((len(page), page))
+    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])
+    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages
 
 
 def performance_test():
@@ -68,16 +68,26 @@ def performance_test():
         index_time = (stop_time - start_time).total_seconds()
         index_size = os.path.getsize(TEST_INDEX_PATH)
 
-        page_size_mean, page_size_std = page_stats(indexer)
+        page_size_mean, page_size_std, big_pages = page_stats(indexer)
 
     print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
     print("Index size:", index_size)
     print("Mean docs per page:", page_size_mean)
     print("Std err of docs per page:", page_size_std)
+    print("Big pages")
+    print_pages(big_pages)
     # print("Num tokens", indexer.get_num_tokens())
 
-    query_test()
+    # query_test()
+
+
+def print_pages(pages):
+    for page in pages:
+        for title, url in page:
+            print(title, url)
+        print()
+
 
 
 if __name__ == '__main__':