"""
Index Wikipedia
"""
import gzip
import html
from urllib.parse import quote
from spacy.lang.en import English
from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
from paths import WIKI_TITLES_PATH, INDEX_PATH
# Element names, outermost first.
# NOTE(review): presumably the nesting path to article text in a MediaWiki XML
# dump; nothing in the visible code references it - confirm it is still needed.
TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']

# Prefix and suffix that wrap every title line read by
# get_wiki_titles_and_urls(); the title is the text between them.
# NOTE(review): the TITLE_START literal was split across two physical lines
# (an unterminated string, i.e. a syntax error), which looks like markup was
# stripped from this file. The values below are reconstructed on the assumption
# that each line has the form '<title>Wikipedia: NAME</title>' as in the
# Wikipedia abstract dump - confirm against the file at WIKI_TITLES_PATH.
TITLE_START = '<title>Wikipedia: '
TITLE_END = '</title>\n'
def index_wiki():
    """
    Index the Wikipedia titles into the index stored at INDEX_PATH.

    Creates a spaCy ``English`` language object, opens a ``TinyIndexer`` on
    INDEX_PATH (configured with NUM_PAGES and PAGE_SIZE) and passes both,
    together with the (title, url) generator from
    ``get_wiki_titles_and_urls``, to ``index_titles_and_urls``.
    """
    english = English()
    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as wiki_indexer:
        # The generator is lazy: titles are read while indexing proceeds.
        index_titles_and_urls(wiki_indexer, english, get_wiki_titles_and_urls())
def get_wiki_titles_and_urls():
    """
    Yield a ``(title, url)`` pair for each title line in the gzipped file at
    WIKI_TITLES_PATH.

    The first line of the file is read and discarded. Every following line
    must start with TITLE_START and end with TITLE_END; the text in between is
    HTML-unescaped to give the title. The URL is the English Wikipedia article
    address for that title: spaces are replaced by underscores and the result
    is percent-encoded.

    Yields:
        tuple: ``(unescaped_title, url)``, both ``str``.

    Raises:
        ValueError: if a line does not have the expected prefix and suffix.
    """
    start_len = len(TITLE_START)
    end_len = len(TITLE_END)

    # Decode explicitly instead of relying on the locale's default encoding,
    # which varies by platform and can corrupt or reject non-ASCII titles.
    # NOTE(review): assumes the titles file is UTF-8, as Wikimedia dumps are.
    with gzip.open(WIKI_TITLES_PATH, 'rt', encoding='utf-8') as wiki_titles_file:
        # Skip the first line.
        # NOTE(review): presumably a header line rather than a title - confirm.
        wiki_titles_file.readline()

        for line_number, raw_title in enumerate(wiki_titles_file, start=2):
            # Explicit check rather than ``assert``: assertions are removed
            # under ``python -O``, which would let malformed lines be sliced
            # into garbage titles.
            if not (raw_title.startswith(TITLE_START)
                    and raw_title.endswith(TITLE_END)):
                raise ValueError(
                    'Unexpected title line %d in %s: %r'
                    % (line_number, WIKI_TITLES_PATH, raw_title)
                )

            # Explicit end index so an empty suffix cannot turn the slice
            # into ``[start:-0]`` (which would always be empty).
            title = raw_title[start_len:len(raw_title) - end_len]
            unescaped_title = html.unescape(title)
            url = ('https://en.wikipedia.org/wiki/'
                   + quote(unescaped_title.replace(' ', '_')))
            yield unescaped_title, url
# Run the Wikipedia indexing when this file is executed as a script.
if __name__ == '__main__':
    index_wiki()