index_queue.py 896 B

123456789101112131415161718192021222324252627282930
  1. """
  2. Index items in the file-system queue
  3. """
  4. from spacy.lang.en import English
  5. from fsqueue import FSQueue, ZstdJsonSerializer
  6. from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_urls_and_extracts
  7. from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
  8. def get_queue_items():
  9. titles_queue = FSQueue(DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer())
  10. titles_queue.unlock_all()
  11. while True:
  12. items_id, items = titles_queue.get()
  13. for item in items:
  14. if item['title'] is None:
  15. continue
  16. yield item['title'], item['url']
  17. def index_queue_items():
  18. nlp = English()
  19. with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
  20. titles_and_urls = get_queue_items()
  21. index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)
  22. if __name__ == '__main__':
  23. index_queue_items()