Parse wiki (slowly)

This commit is contained in:
Daoud Clarke 2021-03-15 22:06:37 +00:00
parent f4215352c9
commit 8e6a67f31b
2 changed files with 65 additions and 0 deletions

View file

@ -6,3 +6,4 @@ HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
# Filename prefix used to build CRAWL_GLOB below (presumably the prefix of
# crawl output files -- confirm against the crawler).
CRAWL_PREFIX = 'crawl_'
# Glob pattern matching every entry in DATA_DIR whose name starts with
# CRAWL_PREFIX.
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
# Path of 'index.sqlite3' inside DATA_DIR (presumably the SQLite search index).
INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
# Path of the bz2-compressed English Wikipedia articles dump (first partition
# of the 2021-03-01 export, going by the filename) that wiki.py reads.
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')

64
wiki.py Normal file
View file

@ -0,0 +1,64 @@
"""
Index Wikipedia
"""
import bz2
from xml.dom import minidom
from xml.etree import ElementTree
from xml.etree.ElementTree import XMLParser
from mediawiki_parser import preprocessor, text
import wikitextparser as wtp
from paths import WIKI_DATA_PATH
# Element path (outermost first, namespaces stripped) of the element whose
# character data is collected and parsed. Kept as a list because it is
# compared for equality against the ``WikiIndexer.tags`` list.
TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']


class WikiIndexer:
    """
    Target object for ``xml.etree.ElementTree.XMLParser``.

    Tracks the stack of currently open elements and, for every element whose
    path equals ``TEXT_TAGS``, gathers its character data and hands the
    complete text to ``handle_data`` when the element closes.
    """

    def __init__(self):
        # Names of the currently open elements, outermost first.
        self.tags = []
        # Pieces of character data received for the element being collected.
        # The parser may deliver one element's text over many ``data`` calls,
        # so pieces are stored and joined once instead of re-copying a
        # growing string on every call.
        self._text_chunks = []
        # NOTE(review): the empty dict is presumably the template mapping
        # expected by mediawiki_parser -- confirm against that library.
        self.wiki_preprocessor = preprocessor.make_parser({})
        self.parser = text.make_parser()

    @property
    def current_data(self):
        """
        Text collected so far for the element currently being read.
        """
        return ''.join(self._text_chunks)

    @current_data.setter
    def current_data(self, value):
        # Assigning '' (as ``end`` does) clears the buffer.
        self._text_chunks = [value] if value else []

    def start(self, tag, attr):
        """
        Record that an element was opened.

        Any ``{namespace}`` prefix on ``tag`` is dropped so that the stack
        can be compared with the plain names in ``TEXT_TAGS``. ``attr`` is
        ignored.
        """
        self.tags.append(tag.split('}')[-1])

    def end(self, tag):
        """
        Record that the innermost open element was closed.

        If that element is the one described by ``TEXT_TAGS``, its collected
        text is passed to ``handle_data`` and the buffer is cleared first.
        """
        if self.tags == TEXT_TAGS:
            self.handle_data(self.current_data)
            self.current_data = ''
        self.tags.pop()

    def data(self, data):
        """
        Collect character data, but only while inside the ``TEXT_TAGS``
        element; text of every other element is discarded.
        """
        if self.tags == TEXT_TAGS:
            self._text_chunks.append(data)

    def close(self):
        """
        Called by the XML parser when parsing finishes; nothing to clean up.
        """
        pass

    def handle_data(self, data):
        """
        Preprocess and parse one page's wikitext, then print the result.

        ``data`` is the full text of a single ``TEXT_TAGS`` element.
        """
        preprocessed_text = self.wiki_preprocessor.parse(data)
        output = self.parser.parse(preprocessed_text.leaves())
        print("Data", output)
def index_wiki(path=None, target=None):
    """
    Stream a bz2-compressed MediaWiki XML dump through an XML parser target.

    path: location of the ``.bz2`` dump; defaults to ``WIKI_DATA_PATH``.
    target: object receiving the parser callbacks (``start``, ``end``,
        ``data``, ``close``); defaults to a new ``WikiIndexer``.

    Raises ``xml.etree.ElementTree.ParseError`` if the XML is malformed or
    the document ends before all elements are closed.
    """
    if path is None:
        path = WIKI_DATA_PATH
    if target is None:
        target = WikiIndexer()

    parser = XMLParser(target=target)
    chunk_size = 1 << 16

    # Read raw bytes and let the XML parser do the decoding: text mode would
    # decode with the locale's default encoding, which need not match the
    # encoding of the dump.
    with bz2.open(path, 'rb') as wiki_file:
        for chunk in iter(lambda: wiki_file.read(chunk_size), b''):
            parser.feed(chunk)

    # Finish the parse so a truncated document is reported and the target's
    # close() callback runs.
    parser.close()
# Run the indexer only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    index_wiki()