performance.py

  1. """
  2. Test the performance of the search in terms of compression and speed.
  3. """
  4. import json
  5. import numpy as np
  6. import os
  7. from datetime import datetime
  8. from itertools import islice
  9. from spacy.lang.en import English
  10. from starlette.testclient import TestClient
  11. from app import app, complete
  12. from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
  13. from paths import TEST_INDEX_PATH
  14. from wiki import get_wiki_titles_and_urls
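
# NUM_DOCUMENTS is how many Wikipedia titles get indexed and then replayed
# as queries; NUM_PAGES_FOR_STATS is how many of the fullest index pages
# page_stats() averages over.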
NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10

def query_test():
    """Replay the indexed titles as queries against the app and report the
    hit rate and mean time per query."""
    titles_and_urls = get_wiki_titles_and_urls()
    client = TestClient(app)

    start = datetime.now()
    hits = 0
    for title, url in islice(titles_and_urls, NUM_DOCUMENTS):
        result = client.get('/complete', params={'q': title})
        assert result.status_code == 200
        data = result.content.decode('utf8')
        # print("Data", data, url, sep='\n')

        # Count a hit if the queried title appears anywhere in the response.
        if title in data:
            hits += 1
    end = datetime.now()

    print("Hits:", hits)
    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
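
# The substring check above can overcount, e.g. when one title is contained
# in another ("London" inside "London Bridge station"). A stricter check is
# sketched below as an assumption, not something this file confirms: it
# supposes the /complete endpoint returns a JSON list of completion strings,
# and falls back to substring matching if the body is not valid JSON.
def strict_hit(title: str, data: str) -> bool:
    try:
        return title in json.loads(data)
    except (ValueError, TypeError):
        return title in data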

def page_stats(indexer: TinyIndexer):
    """Return the mean and standard deviation of the document counts of the
    NUM_PAGES_FOR_STATS fullest pages in the index."""
    page_sizes = []
    for i in range(NUM_PAGES):
        page = indexer.get_page(i)
        if page is not None:
            page_sizes.append(len(page))
    big_page_sizes = sorted(page_sizes)[-NUM_PAGES_FOR_STATS:]
    return np.mean(big_page_sizes), np.std(big_page_sizes)
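
# A possible companion statistic, sketched under the same assumption about
# the TinyIndexer interface that page_stats relies on (get_page returning
# None for unused pages): the fraction of pages holding at least one
# document, a rough measure of how evenly tokens hash across the fixed-size
# index. This helper is illustrative and is not called by performance_test.
def page_fill_fraction(indexer: TinyIndexer) -> float:
    used = sum(1 for i in range(NUM_PAGES) if indexer.get_page(i) is not None)
    return used / NUM_PAGES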

def performance_test():
    """Build a fresh test index from the first NUM_DOCUMENTS Wikipedia titles,
    report indexing time and size, then run the query benchmark."""
    nlp = English()
    try:
        os.remove(TEST_INDEX_PATH)
    except FileNotFoundError:
        print("No test index found, creating")
    with TinyIndexer(TEST_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
        titles_and_urls = get_wiki_titles_and_urls()
        titles_and_urls_slice = islice(titles_and_urls, NUM_DOCUMENTS)

        start_time = datetime.now()
        index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
        stop_time = datetime.now()

        index_time = (stop_time - start_time).total_seconds()
        index_size = os.path.getsize(TEST_INDEX_PATH)
        page_size_mean, page_size_std = page_stats(indexer)

    print("Indexed documents:", NUM_DOCUMENTS)
    print("Index time:", index_time)
    print("Index size:", index_size)
    # page_stats covers only the NUM_PAGES_FOR_STATS fullest pages.
    print("Mean docs per page:", page_size_mean)
    print("Std dev of docs per page:", page_size_std)
    # print("Num tokens", indexer.get_num_tokens())

    query_test()

if __name__ == '__main__':
    performance_test()