performance.py

  1. """
  2. Test the performance of the search in terms of compression and speed.
  3. """
  4. import os
  5. from datetime import datetime
  6. from itertools import islice
  7. import numpy as np
  8. from spacy.lang.en import English
  9. from starlette.testclient import TestClient
  10. from app import app
  11. from index import TinyIndexer, index_titles_and_urls
  12. from paths import TEST_INDEX_PATH
  13. from wiki import get_wiki_titles_and_urls
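
# Test-scale parameters: how many Wikipedia titles to index, how many of the
# fullest pages to report statistics on, and the size (presumably in bytes)
# and count of pages in the test index.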
NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10
TEST_PAGE_SIZE = 512
TEST_NUM_PAGES = 1024


def query_test():
    """Query the completion endpoint with each indexed title and count hits."""
    titles_and_urls = get_wiki_titles_and_urls()
    client = TestClient(app)

    start = datetime.now()
    hits = 0
    for title, url in islice(titles_and_urls, NUM_DOCUMENTS):
        result = client.get('/complete', params={'q': title})
        assert result.status_code == 200
        data = result.content.decode('utf8')
        # print("Data", data, url, sep='\n')

        # Count the query as a hit if its title appears in the response.
        if title in data:
            hits += 1

    end = datetime.now()
    print("Hits:", hits)
    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)


def page_stats(indexer: TinyIndexer):
    """Return the mean and standard deviation of the sizes of the fullest
    pages, along with the pages themselves."""
    pages_and_sizes = []
    for i in range(TEST_NUM_PAGES):
        page = indexer.get_page(i)
        if page is not None:
            pages_and_sizes.append((len(page), page))
    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])
    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages


def performance_test():
    nlp = English()
    try:
        os.remove(TEST_INDEX_PATH)
    except FileNotFoundError:
        print("No test index found, creating")
    with TinyIndexer(TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
        titles_and_urls = get_wiki_titles_and_urls()
        titles_and_urls_slice = islice(titles_and_urls, NUM_DOCUMENTS)

        # Time how long indexing the document slice takes.
        start_time = datetime.now()
        index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
        stop_time = datetime.now()

        index_time = (stop_time - start_time).total_seconds()
        index_size = os.path.getsize(TEST_INDEX_PATH)

        page_size_mean, page_size_std, big_pages = page_stats(indexer)

    print("Indexed pages:", NUM_DOCUMENTS)
    print("Index time:", index_time)
    print("Index size:", index_size)
    print("Mean docs per page:", page_size_mean)
    print("Std dev of docs per page:", page_size_std)
    print("Big pages")
    print_pages(big_pages)
    # print("Num tokens", indexer.get_num_tokens())
    # query_test()


def print_pages(pages):
    for page in pages:
        for title, url in page:
            print(title, url)
        print()


if __name__ == '__main__':
    performance_test()