# update_pages.py (2.0 KB)
"""
Iterate over each page in the index and update it based on what is in the index database.
"""
  4. import traceback
  5. from time import sleep
  6. from mwmbl.database import Database
  7. from mwmbl.indexer.indexdb import IndexDatabase
  8. from mwmbl.tinysearchengine.indexer import TinyIndex, Document
  9. def run_update(index_path):
  10. with Database() as db:
  11. index_db = IndexDatabase(db.connection)
  12. index_db.create_tables()
  13. with TinyIndex(Document, index_path, 'w') as indexer:
  14. with Database() as db:
  15. index_db = IndexDatabase(db.connection)
  16. pages_to_process = index_db.get_queued_pages()
  17. print(f"Got {len(pages_to_process)} pages to process")
  18. for i in pages_to_process:
  19. documents = index_db.get_queued_documents_for_page(i)
  20. print(f"Documents queued for page {i}: {len(documents)}")
  21. if len(documents) > 0:
  22. for j in range(3):
  23. try:
  24. indexer.add_to_page(i, documents)
  25. break
  26. except ValueError:
  27. documents = documents[:len(documents)//2]
  28. if len(documents) == 0:
  29. print("No more space")
  30. break
  31. print(f"Not enough space, adding {len(documents)}")
  32. index_db.clear_queued_documents_for_page(i)
  33. # All preprocessed documents should now have been indexed
  34. # Clear documents that have now been preprocessed and indexed
  35. num_cleared = index_db.clear_documents_for_preprocessing()
  36. print(f"Indexed {num_cleared} documents")
  37. def run(index_path):
  38. while True:
  39. try:
  40. run_update(index_path)
  41. except Exception as e:
  42. print("Exception updating pages in index")
  43. traceback.print_exception(type(e), e, e.__traceback__)
  44. sleep(10)
  45. if __name__ == '__main__':
  46. run_update('data/index.tinysearch')