2021-03-15 22:06:37 +00:00
|
|
|
"""
|
|
|
|
Index Wikipedia article titles and their URLs.
|
|
|
|
"""
|
2021-03-21 21:37:41 +00:00
|
|
|
import gzip
|
2021-03-23 22:03:48 +00:00
|
|
|
import html
|
|
|
|
from urllib.parse import quote
|
2021-03-15 22:06:37 +00:00
|
|
|
|
2021-03-21 21:37:41 +00:00
|
|
|
from spacy.lang.en import English
|
2021-03-15 22:06:37 +00:00
|
|
|
|
2021-04-12 17:37:33 +00:00
|
|
|
from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
|
2021-03-23 22:03:48 +00:00
|
|
|
from paths import WIKI_TITLES_PATH, INDEX_PATH
|
2021-03-15 22:06:37 +00:00
|
|
|
|
|
|
|
# Sequence of tag names: mediawiki, page, revision, text.
# NOTE(review): not referenced anywhere in the visible part of this module;
# presumably the XML element nesting of a full dump - confirm before removing.
TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
|
2021-03-23 22:03:48 +00:00
|
|
|
# Literal prefix that get_wiki_titles_and_urls() requires at the start of
# every title line; it is sliced off to obtain the bare title.
TITLE_START = '<title>Wikipedia: '
|
|
|
|
# Literal suffix (closing tag plus the line's trailing newline) that
# get_wiki_titles_and_urls() requires at the end of every title line.
TITLE_END = '</title>\n'
|
2021-03-15 22:06:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
def index_wiki():
    """Feed every Wikipedia (title, url) pair into the index at INDEX_PATH.

    Instantiates spaCy's ``English`` language class, opens a ``TinyIndexer``
    configured with ``INDEX_PATH``, ``NUM_PAGES`` and ``PAGE_SIZE`` as a
    context manager, and hands the indexer, the language object and the
    generator returned by ``get_wiki_titles_and_urls()`` to
    ``index_titles_and_urls``.
    """
    english = English()
    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as tiny_indexer:
        # The generator is lazy: titles are read from disk only as
        # index_titles_and_urls consumes them.
        index_titles_and_urls(
            tiny_indexer,
            english,
            get_wiki_titles_and_urls(),
        )
|
2021-03-21 21:37:41 +00:00
|
|
|
|
|
|
|
|
2021-03-23 22:03:48 +00:00
|
|
|
def get_wiki_titles_and_urls():
    """Yield ``(title, url)`` pairs from the gzipped file at WIKI_TITLES_PATH.

    The first line of the file is read and discarded (presumably a header -
    confirm against the dump format). Every following line must start with
    ``TITLE_START`` and end with ``TITLE_END``; the text in between is
    HTML-unescaped to give the title. The URL is the English Wikipedia
    article prefix followed by the title with spaces replaced by underscores
    and then percent-encoded.

    Yields:
        tuple[str, str]: the unescaped title and its article URL.

    Raises:
        ValueError: if a line does not have the expected prefix and suffix.
    """
    start_len = len(TITLE_START)
    end_len = len(TITLE_END)

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would break on non-ASCII titles on some systems.
    # Assumes the dump is UTF-8 encoded.
    with gzip.open(WIKI_TITLES_PATH, 'rt', encoding='utf-8') as wiki_titles_file:
        # Skip the first line; it is not parsed as a title.
        wiki_titles_file.readline()

        # Numbering starts at 2 because line 1 was consumed above.
        for line_number, raw_title in enumerate(wiki_titles_file, start=2):
            # Explicit check rather than `assert`, so the validation is not
            # stripped when Python runs with -O.
            if not (raw_title.startswith(TITLE_START)
                    and raw_title.endswith(TITLE_END)):
                raise ValueError(
                    'Unexpected title line %d in %s: %r'
                    % (line_number, WIKI_TITLES_PATH, raw_title)
                )

            title = raw_title[start_len:-end_len]
            unescaped_title = html.unescape(title)
            url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
            yield unescaped_title, url
|
2021-03-15 22:06:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Build the index only when this file is executed as a script, not on import.
if __name__ == "__main__":
    index_wiki()
|