Index wiki
parent 2eb6afc3fe
commit c17c10ac4c
3 changed files with 47 additions and 54 deletions

app.py (34 lines changed)

@@ -1,4 +1,6 @@
import sqlite3
from functools import lru_cache

import pandas as pd

from fastapi import FastAPI

@@ -19,6 +21,7 @@ def search(s: str):
    return RedirectResponse(url)


@lru_cache()
def complete_term(term):
    con = sqlite3.connect(INDEX_PATH)
    query = f"""

@@ -39,26 +42,43 @@ def complete_term(term):
def complete(q: str):
    terms = [x.lower() for x in q.split()]

    completed = complete_term(terms[-1])
    terms = terms[:-1] + [completed]
    # completed = complete_term(terms[-1])
    # terms = terms[:-1] + [completed]

    con = sqlite3.connect(INDEX_PATH)
    in_part = ','.join('?'*len(terms))
    query = f"""
        SELECT title, url, count(*)
        SELECT title, url, count(*), length(title)
        FROM terms INNER JOIN pages
        ON terms.page_id = pages.id
        WHERE term IN ({in_part})
        GROUP BY title, url
        ORDER BY 3 DESC
        ORDER BY 3 DESC, 4
        LIMIT 20
    """

    data = pd.read_sql(query, con, params=terms)
    results = data.apply(lambda row: f'{row.title} — {row.url}', axis=1)
    print("Results", results)
    results = data.apply(lambda row: row.title.replace("\n", "") + ' — ' +
                                     row.url.replace("\n", ""), axis=1)
    if len(results) == 0:
        return []
    return [q, results.to_list()[:5]]
    results_list = results.to_list()[:5]
    results_list = [q, results_list]
    # , [], [], {
    #     'google:suggestdetail': [
    #         {'a': 'A', 't': x, 'q': 'p=v'}
    #         for x in results_list]
    # }]
    print("Results", results_list)
    return results_list

    # titles = [x.strip() for x in data['title'].to_list()[:5]]
    # urls = [x.strip() for x in data['url'].to_list()[:5]]
    #
    # # result = [q, titles, ['asd'] * 5, urls]
    # result = [q, titles]
    # print("Returning", result)
    # return result


@app.get('/')
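
For context on the return shape: `[q, results_list]` and the commented-out `google:suggestdetail` block point at the OpenSearch suggestions format used by browser search boxes, which expects an array of the form [query, completions, descriptions, urls]. A minimal sketch of that payload with made-up values rather than anything drawn from the index:

import json

# Hypothetical example values; the real endpoint builds these from the SQLite index.
query = "pyth"
suggestions = [
    "Python (programming language) — https://en.wikipedia.org/wiki/Python_(programming_language)",
    "Python (genus) — https://en.wikipedia.org/wiki/Python_(genus)",
]

# The full suggest payload is [query, completions, descriptions, urls];
# the complete() endpoint above returns only the first two elements.
payload = [query, suggestions]
print(json.dumps(payload, ensure_ascii=False))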

paths.py (1 line changed)

@@ -7,3 +7,4 @@ CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-all-titles-in-ns0.gz')
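
The new WIKI_TITLES_PATH constant points at the gzipped all-titles dump that wiki.py (below) iterates over: one title per line, underscores instead of spaces, preceded by a header line. A minimal sketch of reading it, assuming the dump has already been downloaded to DATA_DIR:

import gzip

from paths import WIKI_TITLES_PATH

# One title per line, underscores for spaces; the first line is a header,
# which wiki.py also skips with readline().
with gzip.open(WIKI_TITLES_PATH, 'rt') as titles_file:
    titles_file.readline()
    for i, title in enumerate(titles_file):
        print(title.strip().replace('_', ' '))
        if i >= 4:  # show only the first few titles
            break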

wiki.py (66 lines changed)

@@ -2,62 +2,34 @@
Index Wikipedia
"""
import bz2
from xml.dom import minidom
from xml.etree import ElementTree
import gzip
from xml.etree.ElementTree import XMLParser

from mediawiki_parser import preprocessor, text
from spacy.lang.en import English

import wikitextparser as wtp

from paths import WIKI_DATA_PATH
from index import tokenize, index
from paths import WIKI_DATA_PATH, WIKI_TITLES_PATH

TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']


class WikiIndexer:
    def __init__(self):
        self.tags = []
        self.current_data = ''

        self.wiki_preprocessor = preprocessor.make_parser({})
        self.parser = text.make_parser()


    def start(self, tag, attr):
        tagname = tag.split('}')[-1]
        self.tags.append(tagname)
        # print("Start", self.tags)

    def end(self, tag):
        if self.tags == TEXT_TAGS:
            self.handle_data(self.current_data)
            self.current_data = ''
        self.tags.pop()
        # print("End", tag)

    def data(self, data):
        # print("Data", self.tags)
        if self.tags == TEXT_TAGS:
            self.current_data += data
        pass

    def close(self):
        pass

    def handle_data(self, data):
        preprocessed_text = self.wiki_preprocessor.parse(data)
        output = self.parser.parse(preprocessed_text.leaves())

        print("Data", output)


def index_wiki():
    target = WikiIndexer()
    parser = XMLParser(target=target)
    with bz2.open(WIKI_DATA_PATH, 'rt') as wiki_file:
        for line in wiki_file:
            parser.feed(line)
    nlp = English()
    indexed = 0
    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
        wiki_titles_file.readline()
        for title in wiki_titles_file:
            title_cleaned = title.replace('_', ' ')
            tokens = tokenize(nlp, title_cleaned)

            if len(tokens) > 0:
                indexed += 1
                url = 'https://en.wikipedia.org/wiki/' + title
                index(tokens, url, title_cleaned)

                if indexed % 1000 == 0:
                    print("Indexed", indexed)


if __name__ == '__main__':
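
wiki.py imports `tokenize` and `index` from an `index` module that is not part of this diff, so their implementations are unknown here. Purely as an illustration, a sketch of what a spaCy-based `tokenize` helper matching the call `tokenize(nlp, title_cleaned)` could look like:

from spacy.lang.en import English


def tokenize(nlp, text):
    # Guesswork, not the project's index.py: the blank English pipeline
    # gives fast rule-based tokenization without loading trained models.
    doc = nlp(text)
    return [token.text.lower() for token in doc if not token.is_space]


if __name__ == '__main__':
    nlp = English()
    print(tokenize(nlp, "Python (programming language)"))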