"""
wiki.py

Index Wikipedia article titles and their URLs with TinyIndexer.
"""
  4. import gzip
  5. import html
  6. from urllib.parse import quote
  7. from spacy.lang.en import English
  8. from index import index_titles_urls_and_extracts
  9. from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
  10. from paths import WIKI_TITLES_PATH, INDEX_PATH
  11. TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
  12. TITLE_START = '<title>Wikipedia: '
  13. TITLE_END = '</title>\n'
  14. def index_wiki():
  15. nlp = English()
  16. with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
  17. titles_and_urls = get_wiki_titles_and_urls()
  18. index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)
  19. def get_wiki_titles_and_urls():
  20. start_len = len(TITLE_START)
  21. end_len = len(TITLE_END)
  22. with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
  23. wiki_titles_file.readline()
  24. for raw_title in wiki_titles_file:
  25. assert raw_title.startswith(TITLE_START)
  26. assert raw_title.endswith(TITLE_END)
  27. title = raw_title[start_len:-end_len]
  28. unescaped_title = html.unescape(title)
  29. url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
  30. yield unescaped_title, url
  31. if __name__ == '__main__':
  32. index_wiki()