From 8e6a67f31bb2483b7138f93422d878597598ead2 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Mon, 15 Mar 2021 22:06:37 +0000
Subject: [PATCH] Parse wiki (slowly)

---
 paths.py |  1 +
 wiki.py  | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 wiki.py

diff --git a/paths.py b/paths.py
index 6fcbeee..bde5b31 100644
--- a/paths.py
+++ b/paths.py
@@ -6,3 +6,4 @@ HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
 INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
+WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
diff --git a/wiki.py b/wiki.py
new file mode 100644
index 0000000..2ae66d9
--- /dev/null
+++ b/wiki.py
@@ -0,0 +1,64 @@
+"""
+Index Wikipedia
+"""
+import bz2
+from xml.dom import minidom
+from xml.etree import ElementTree
+from xml.etree.ElementTree import XMLParser
+
+from mediawiki_parser import preprocessor, text
+
+import wikitextparser as wtp
+
+from paths import WIKI_DATA_PATH
+
+TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
+
+
+class WikiIndexer:
+    def __init__(self):
+        self.tags = []
+        self.current_data = ''
+
+        self.wiki_preprocessor = preprocessor.make_parser({})
+        self.parser = text.make_parser()
+
+
+    def start(self, tag, attr):
+        tagname = tag.split('}')[-1]
+        self.tags.append(tagname)
+        # print("Start", self.tags)
+
+    def end(self, tag):
+        if self.tags == TEXT_TAGS:
+            self.handle_data(self.current_data)
+            self.current_data = ''
+        self.tags.pop()
+        # print("End", tag)
+
+    def data(self, data):
+        # print("Data", self.tags)
+        if self.tags == TEXT_TAGS:
+            self.current_data += data
+        pass
+
+    def close(self):
+        pass
+
+    def handle_data(self, data):
+        preprocessed_text = self.wiki_preprocessor.parse(data)
+        output = self.parser.parse(preprocessed_text.leaves())
+
+        print("Data", output)
+
+
+def index_wiki():
+    target = WikiIndexer()
+    parser = XMLParser(target=target)
+    with bz2.open(WIKI_DATA_PATH, 'rt') as wiki_file:
+        for line in wiki_file:
+            parser.feed(line)
+
+
+if __name__ == '__main__':
+    index_wiki()