Index wiki

Daoud Clarke 2021-03-21 21:37:41 +00:00
parent 2eb6afc3fe
commit c17c10ac4c
3 changed files with 47 additions and 54 deletions

app.py (34 changes)

@@ -1,4 +1,6 @@
 import sqlite3
+from functools import lru_cache
 import pandas as pd
 from fastapi import FastAPI
@@ -19,6 +21,7 @@ def search(s: str):
     return RedirectResponse(url)
+@lru_cache()
 def complete_term(term):
     con = sqlite3.connect(INDEX_PATH)
     query = f"""
@@ -39,26 +42,43 @@ def complete_term(term):
 def complete(q: str):
     terms = [x.lower() for x in q.split()]
-    completed = complete_term(terms[-1])
-    terms = terms[:-1] + [completed]
+    # completed = complete_term(terms[-1])
+    # terms = terms[:-1] + [completed]
     con = sqlite3.connect(INDEX_PATH)
     in_part = ','.join('?'*len(terms))
     query = f"""
-    SELECT title, url, count(*)
+    SELECT title, url, count(*), length(title)
     FROM terms INNER JOIN pages
     ON terms.page_id = pages.id
     WHERE term IN ({in_part})
     GROUP BY title, url
-    ORDER BY 3 DESC
+    ORDER BY 3 DESC, 4
     LIMIT 20
     """
     data = pd.read_sql(query, con, params=terms)
-    results = data.apply(lambda row: f'{row.title}{row.url}', axis=1)
-    print("Results", results)
+    results = data.apply(lambda row: row.title.replace("\n", "") + '' +
+                         row.url.replace("\n", ""), axis=1)
     if len(results) == 0:
         return []
-    return [q, results.to_list()[:5]]
+    results_list = results.to_list()[:5]
+    results_list = [q, results_list]
+    # , [], [], {
+    #     'google:suggestdetail': [
+    #         {'a': 'A', 't': x, 'q': 'p=v'}
+    #         for x in results_list]
+    # }]
+    print("Results", results_list)
+    return results_list
+    # titles = [x.strip() for x in data['title'].to_list()[:5]]
+    # urls = [x.strip() for x in data['url'].to_list()[:5]]
+    #
+    # # result = [q, titles, ['asd'] * 5, urls]
+    # result = [q, titles]
+    # print("Returning", result)
+    # return result
 @app.get('/')
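
The /complete endpoint answers in the OpenSearch suggestions format, [query, [completions]], which browsers consume for search-box autocomplete; the commented-out google:suggestdetail block appears to be an experiment with Chrome's richer variant of that format. The new ORDER BY 3 DESC, 4 ranks candidate pages by how many of the query's terms they contain, breaking ties in favour of shorter titles. Below is a minimal in-memory sketch of that ranking; the terms/pages schema is inferred from the SQL above, and the sample rows are invented for illustration:

import sqlite3

con = sqlite3.connect(':memory:')
con.executescript("""
    CREATE TABLE pages (id INTEGER PRIMARY KEY, title TEXT, url TEXT);
    CREATE TABLE terms (term TEXT, page_id INTEGER);
""")
con.executemany("INSERT INTO pages VALUES (?, ?, ?)", [
    (1, 'Python (programming language)',
     'https://en.wikipedia.org/wiki/Python_(programming_language)'),
    (2, 'Python', 'https://en.wikipedia.org/wiki/Python'),
])
con.executemany("INSERT INTO terms VALUES (?, ?)", [
    ('python', 1), ('programming', 1), ('language', 1),
    ('python', 2),
])

terms = ['python', 'programming']
in_part = ','.join('?' * len(terms))
rows = con.execute(f"""
    SELECT title, url, count(*), length(title)
    FROM terms INNER JOIN pages ON terms.page_id = pages.id
    WHERE term IN ({in_part})
    GROUP BY title, url
    ORDER BY 3 DESC, 4
    LIMIT 20
""", terms).fetchall()

# 'Python (programming language)' matches two query terms, so it ranks
# first; among equal counts, length(title) puts shorter titles first.
print(rows)

Against the real index the endpoint runs the same query through pd.read_sql, binding the lowercased query terms to the IN placeholders.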

paths.py (1 change)

@@ -7,3 +7,4 @@ CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
 INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
+WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-all-titles-in-ns0.gz')
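
WIKI_TITLES_PATH points at Wikipedia's gzipped all-titles-in-ns0 dump (one article title per line), which the rewritten wiki.py below reads instead of the full article XML. For reference, a sketch of the whole module after this commit; only lines 7-10 appear in the hunk, so the DATA_DIR definition here is an assumption made to keep the sketch self-contained:

import os

# Assumption: DATA_DIR is defined earlier in the file; its real value is
# not shown in this diff.
DATA_DIR = os.path.expanduser('~/data')

CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-all-titles-in-ns0.gz')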

wiki.py (66 changes)

@@ -2,62 +2,34 @@
 Index Wikipedia
 """
-import bz2
-from xml.dom import minidom
-from xml.etree import ElementTree
+import gzip
-from xml.etree.ElementTree import XMLParser
-from mediawiki_parser import preprocessor, text
+from spacy.lang.en import English
-import wikitextparser as wtp
-from paths import WIKI_DATA_PATH
+from index import tokenize, index
+from paths import WIKI_DATA_PATH, WIKI_TITLES_PATH
-TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
-class WikiIndexer:
-    def __init__(self):
-        self.tags = []
-        self.current_data = ''
-        self.wiki_preprocessor = preprocessor.make_parser({})
-        self.parser = text.make_parser()
-    def start(self, tag, attr):
-        tagname = tag.split('}')[-1]
-        self.tags.append(tagname)
-        # print("Start", self.tags)
-    def end(self, tag):
-        if self.tags == TEXT_TAGS:
-            self.handle_data(self.current_data)
-            self.current_data = ''
-        self.tags.pop()
-        # print("End", tag)
-    def data(self, data):
-        # print("Data", self.tags)
-        if self.tags == TEXT_TAGS:
-            self.current_data += data
-        pass
-    def close(self):
-        pass
-    def handle_data(self, data):
-        preprocessed_text = self.wiki_preprocessor.parse(data)
-        output = self.parser.parse(preprocessed_text.leaves())
-        print("Data", output)
 def index_wiki():
-    target = WikiIndexer()
-    parser = XMLParser(target=target)
-    with bz2.open(WIKI_DATA_PATH, 'rt') as wiki_file:
-        for line in wiki_file:
-            parser.feed(line)
+    nlp = English()
+    indexed = 0
+    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
+        wiki_titles_file.readline()
+        for title in wiki_titles_file:
+            title_cleaned = title.replace('_', ' ')
+            tokens = tokenize(nlp, title_cleaned)
+            if len(tokens) > 0:
+                indexed += 1
+                url = 'https://en.wikipedia.org/wiki/' + title
+                index(tokens, url, title_cleaned)
+                if indexed % 1000 == 0:
+                    print("Indexed", indexed)
 if __name__ == '__main__':
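
The new index_wiki() drops the streaming XML parse of the article dump in favour of indexing article titles alone, delegating to tokenize and index from index.py, which this commit does not touch. A hypothetical sketch of those helpers, with signatures inferred from the call sites above and the terms/pages schema queried in app.py; the real implementations may differ:

import sqlite3

from spacy.lang.en import English

from paths import INDEX_PATH

def tokenize(nlp, text):
    # Lowercased alphabetic tokens, matching the lowercasing applied to
    # queries in app.py.
    return [token.text.lower() for token in nlp.tokenizer(text)
            if token.is_alpha]

def index(tokens, url, title):
    # Insert the page, then one terms row per token pointing back at it.
    con = sqlite3.connect(INDEX_PATH)
    with con:
        cursor = con.execute(
            "INSERT INTO pages (title, url) VALUES (?, ?)", (title, url))
        page_id = cursor.lastrowid
        con.executemany(
            "INSERT INTO terms (term, page_id) VALUES (?, ?)",
            [(token, page_id) for token in tokens])

if __name__ == '__main__':
    nlp = English()
    print(tokenize(nlp, 'Python (programming language)'))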