inspect_index.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. import logging
  2. import sys
  3. import spacy
  4. from mwmbl.indexer.index import tokenize_document
  5. from mwmbl.indexer.paths import INDEX_PATH
  6. from mwmbl.tinysearchengine.indexer import TinyIndex, Document
  # Route log output to stdout and let everything from DEBUG upwards through.
  logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

  # The spaCy pipeline is loaded once at import time; store() hands it to
  # tokenize_document().
  nlp = spacy.load("en_core_web_sm")
  def store():
      """Tokenize one hard-coded sample document and print the result.

      The index at ``INDEX_PATH`` is opened in ``'w'`` mode while the
      document is tokenized, but no indexing call is made inside the
      ``with`` block: the loop that would index each token is disabled.
      """
      sample = Document(
          title='A nation in search of the new black | Theatre | The Guardian',
          url='https://www.theguardian.com/stage/2007/nov/18/theatre',
          extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
          score=1.0,
      )

      with TinyIndex(Document, INDEX_PATH, 'w') as index:
          result = tokenize_document(
              sample.url,
              sample.title,
              sample.extract,
              1,
              nlp,
          )
          print("Tokenized", result)
          # Indexing is currently switched off. To store the document, call
          # index.index(token, sample) for every token in result.tokens.
  def get_items(key='wikipedia'):
      """Print every item the index at ``INDEX_PATH`` returns for ``key``.

      Args:
          key: Term passed to ``TinyIndex.retrieve``. Defaults to
              ``'wikipedia'``, the value that used to be hard-coded, so
              calling ``get_items()`` with no arguments behaves as before.
      """
      with TinyIndex(Document, INDEX_PATH) as tiny_index:
          # ``or ()`` keeps the original guard: a falsy result (for example
          # None or an empty list) prints nothing instead of failing.
          for item in tiny_index.retrieve(key) or ():
              print("Items", item)
  def run(num_pages=100000, needle=' search'):
      """Scan index pages and print each item whose title contains ``needle``.

      Pages ``0`` to ``num_pages - 1`` are fetched one at a time with
      ``TinyIndex.get_page``; every match is printed with its page number.

      Args:
          num_pages: How many page numbers to visit, starting at 0. Defaults
              to 100000, the value that used to be hard-coded.
          needle: Substring looked for in each item's ``title``. Defaults to
              ``' search'`` (note the leading space), as before.
      """
      # NOTE(review): the default page count presumably matches the size of
      # the index at INDEX_PATH -- confirm, or pass the real page count.
      with TinyIndex(Document, INDEX_PATH) as tiny_index:
          for page_number in range(num_pages):
              for item in tiny_index.get_page(page_number):
                  if needle in item.title:
                      print("Page", page_number, item)
  if __name__ == "__main__":
      # Only the lookup is active. Swap in store() or run() here to exercise
      # the other inspection helpers instead.
      get_items()