Parse wiki (slowly)
This commit is contained in:
parent
f4215352c9
commit
8e6a67f31b
2 changed files with 65 additions and 0 deletions
1
paths.py
1
paths.py
|
@ -6,3 +6,4 @@ HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
|
|||
# Name prefix used to build CRAWL_GLOB below.
CRAWL_PREFIX = 'crawl_'
|
||||
# Glob pattern for every entry directly under DATA_DIR whose name starts
# with CRAWL_PREFIX.
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
|
||||
# Path of the 'index.sqlite3' file inside DATA_DIR.
INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
|
||||
# Path of the bz2-compressed wiki XML file read by wiki.py.
# NOTE(review): the file name suggests the first part of the 2021-03-01
# English Wikipedia "pages-articles" dump -- confirm against the download.
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
|
||||
|
|
64
wiki.py
Normal file
64
wiki.py
Normal file
|
@ -0,0 +1,64 @@
|
|||
"""
|
||||
Index Wikipedia
|
||||
"""
|
||||
import bz2
|
||||
from xml.dom import minidom
|
||||
from xml.etree import ElementTree
|
||||
from xml.etree.ElementTree import XMLParser
|
||||
|
||||
from mediawiki_parser import preprocessor, text
|
||||
|
||||
import wikitextparser as wtp
|
||||
|
||||
from paths import WIKI_DATA_PATH
|
||||
|
||||
# Element path (outermost first, namespaces stripped) of the element whose
# character data holds a page's wikitext.  Kept as a list because it is
# compared for equality against the list-based tag stack in WikiIndexer.
TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']


class WikiIndexer:
    """Parser target that parses the wikitext of each page in a wiki XML file.

    An instance is meant to be passed as ``target`` to
    ``xml.etree.ElementTree.XMLParser``, which then calls ``start``, ``end``,
    ``data`` and ``close`` while it reads the document.  The character data
    of every element whose path equals ``TEXT_TAGS`` is collected and handed
    to ``handle_data`` when that element closes.
    """

    def __init__(self):
        """Create an empty tag stack, an empty text buffer and the parsers."""
        # Namespace-stripped names of the elements currently open,
        # outermost first.
        self.tags = []
        # Character-data chunks of the element being collected.  The XML
        # parser may deliver one element's text in several pieces; keeping
        # them in a list and joining once avoids re-copying the whole
        # string on every callback.
        self._chunks = []

        self.wiki_preprocessor = preprocessor.make_parser({})
        self.parser = text.make_parser()

    @property
    def current_data(self):
        """Text collected so far for the element currently being read."""
        return ''.join(self._chunks)

    @current_data.setter
    def current_data(self, value):
        # Assignment stays supported for code that resets or seeds the buffer.
        self._chunks = [value] if value else []

    def start(self, tag, attr):
        """Push the opened element's name, without its namespace, on the stack.

        ``tag`` may have the form ``{namespace}name``; only the part after
        the last ``}`` is kept.  ``attr`` is unused.
        """
        tagname = tag.split('}')[-1]
        self.tags.append(tagname)

    def end(self, tag):
        """Pop the closed element, first flushing its text if it matched.

        When the open-element stack equals ``TEXT_TAGS`` the collected text
        is passed to ``handle_data`` and the buffer is emptied.
        """
        if self.tags == TEXT_TAGS:
            self.handle_data(''.join(self._chunks))
            self._chunks.clear()
        self.tags.pop()

    def data(self, data):
        """Collect character data while inside the ``TEXT_TAGS`` element."""
        if self.tags == TEXT_TAGS:
            self._chunks.append(data)

    def close(self):
        """Called by the XML parser when parsing is finished; nothing to do."""
        pass

    def handle_data(self, data):
        """Preprocess and parse one page's wikitext, then print the result.

        ``data`` is the complete text of one ``TEXT_TAGS`` element.
        """
        preprocessed_text = self.wiki_preprocessor.parse(data)
        # NOTE(review): ``leaves()`` presumably flattens the preprocessor's
        # parse tree back into text for the second parser -- confirm against
        # the mediawiki_parser documentation.
        output = self.parser.parse(preprocessed_text.leaves())

        print("Data", output)
|
||||
|
||||
|
||||
def index_wiki():
    """Stream the compressed wiki file at WIKI_DATA_PATH through WikiIndexer.

    The file is decompressed and fed to the XML parser line by line, so the
    whole document is never held in memory at once.

    Raises:
        xml.etree.ElementTree.ParseError: if the XML is malformed or ends
            before the document is complete.
    """
    target = WikiIndexer()
    parser = XMLParser(target=target)
    # Binary mode: the XML parser decodes the bytes itself, so the result
    # does not depend on the locale's default text encoding.
    with bz2.open(WIKI_DATA_PATH, 'rb') as wiki_file:
        for line in wiki_file:
            parser.feed(line)
    # Finalise the parse: reports a truncated document and invokes
    # target.close().
    parser.close()


if __name__ == '__main__':
    index_wiki()
|
Loading…
Add table
Reference in a new issue