performance.py

  1. """
  2. Test the performance of the search in terms of compression and speed.
  3. """
import os
from datetime import datetime

import numpy as np
from spacy.lang.en import English
from starlette.testclient import TestClient

from mwmbl.tinysearchengine import create_app
from mwmbl.indexer.fsqueue import ZstdJsonSerializer
from mwmbl.indexer.index import index_titles_urls_and_extracts
from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH

NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10
TEST_PAGE_SIZE = 512
TEST_NUM_PAGES = 1024
TEST_DATA_PATH = os.path.join(DATA_DIR, 'test-urls.zstd')
RECALL_AT_K = 3
NUM_QUERY_CHARS = 10


def get_test_pages():
    serializer = ZstdJsonSerializer()
    with open(TEST_DATA_PATH, 'rb') as data_file:
        data = serializer.deserialize(data_file.read())
    return [(row['title'], row['url']) for row in data if row['title'] is not None]
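

# Query-side benchmark: replays the first NUM_QUERY_CHARS characters of each
# title against the /complete endpoint and reports recall at RECALL_AT_K plus
# the mean time per query.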
def query_test():
    titles_and_urls = get_test_pages()
    print(f"Got {len(titles_and_urls)} titles and URLs")

    tiny_index = TinyIndex(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
    app = create_app.create(tiny_index)
    client = TestClient(app)

    start = datetime.now()
    hits = 0
    count = 0
    for title, url in titles_and_urls:
        query = title[:NUM_QUERY_CHARS]
        result = client.get('/complete', params={'q': query})
        assert result.status_code == 200
        data = result.json()

        # The completion endpoint returns [query, [suggestions]]; count a hit
        # if the expected URL appears in the top RECALL_AT_K suggestions.
        hit = False
        if data:
            for completion in data[1][:RECALL_AT_K]:
                if url in completion:
                    hit = True
                    break

        if hit:
            hits += 1
        else:
            print("Miss", data, title, url, sep='\n')
        count += 1

    end = datetime.now()
    print(f"Hits: {hits} out of {count}")
    print(f"Recall at {RECALL_AT_K}: {hits/count}")
    print("Query time:", (end - start).total_seconds() / count)
def page_stats(indexer: TinyIndexer):
    pages_and_sizes = []
    for i in range(TEST_NUM_PAGES):
        page = indexer.get_page(i)
        if page is not None:
            pages_and_sizes.append((len(page), page))
    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])
    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages
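

# End-to-end benchmark: rebuild the test index from scratch, report indexing
# time and page statistics, then run the query benchmark against it.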
def performance_test():
    nlp = English()
    try:
        os.remove(TEST_INDEX_PATH)
    except FileNotFoundError:
        print("No test index found, creating")
    with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
        titles_and_urls = get_test_pages()

        start_time = datetime.now()
        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
        stop_time = datetime.now()

        index_time = (stop_time - start_time).total_seconds()
        index_size = os.path.getsize(TEST_INDEX_PATH)
        # Page stats are computed while the index is still open
        page_size_mean, page_size_std, big_pages = page_stats(indexer)

    print("Indexed pages:", NUM_DOCUMENTS)
    print("Index time:", index_time)
    print("Index size:", index_size)
    print("Mean docs per page:", page_size_mean)
    print("Std dev of docs per page:", page_size_std)
    print("Big pages")
    print_pages(big_pages)
    # print("Num tokens", indexer.get_num_tokens())

    query_test()


def print_pages(pages):
    for page in pages:
        print("Page", page)
        for title, url in page:
            print(title, url)
        print()


if __name__ == '__main__':
    performance_test()