From c17c10ac4c6942274375693da3614553e1373e94 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sun, 21 Mar 2021 21:37:41 +0000
Subject: [PATCH] Index wiki

---
 app.py   | 34 +++++++++++++++++++++++------
 paths.py |  1 +
 wiki.py  | 66 ++++++++++++++++----------------------------------------
 3 files changed, 47 insertions(+), 54 deletions(-)

diff --git a/app.py b/app.py
index 89fcae8..3cc2fe3 100644
--- a/app.py
+++ b/app.py
@@ -1,4 +1,6 @@
 import sqlite3
+from functools import lru_cache
+
 import pandas as pd
 from fastapi import FastAPI
 
@@ -19,6 +21,7 @@ def search(s: str):
     return RedirectResponse(url)
 
 
+@lru_cache()
 def complete_term(term):
     con = sqlite3.connect(INDEX_PATH)
     query = f"""
@@ -39,26 +42,43 @@ def complete_term(term):
 def complete(q: str):
     terms = [x.lower() for x in q.split()]
-    completed = complete_term(terms[-1])
-    terms = terms[:-1] + [completed]
+    # completed = complete_term(terms[-1])
+    # terms = terms[:-1] + [completed]
 
     con = sqlite3.connect(INDEX_PATH)
     in_part = ','.join('?'*len(terms))
     query = f"""
-      SELECT title, url, count(*)
+      SELECT title, url, count(*), length(title)
       FROM terms INNER JOIN pages
      ON terms.page_id = pages.id
      WHERE term IN ({in_part})
      GROUP BY title, url
-     ORDER BY 3 DESC
+     ORDER BY 3 DESC, 4
+     LIMIT 20
     """
     data = pd.read_sql(query, con, params=terms)
-    results = data.apply(lambda row: f'{row.title} — {row.url}', axis=1)
-    print("Results", results)
+    results = data.apply(lambda row: row.title.replace("\n", "") + ' — ' +
+                         row.url.replace("\n", ""), axis=1)
     if len(results) == 0:
         return []
-    return [q, results.to_list()[:5]]
+    results_list = results.to_list()[:5]
+    results_list = [q, results_list]
+    # , [], [], {
+    #     'google:suggestdetail': [
+    #         {'a': 'A', 't': x, 'q': 'p=v'}
+    #         for x in results_list]
+    # }]
+    print("Results", results_list)
+    return results_list
+
+    # titles = [x.strip() for x in data['title'].to_list()[:5]]
+    # urls = [x.strip() for x in data['url'].to_list()[:5]]
+    #
+    # # result = [q, titles, ['asd'] * 5, urls]
+    # result = [q, titles]
+    # print("Returning", result)
+    # return result
 
 
 @app.get('/')
diff --git a/paths.py b/paths.py
index bde5b31..b542392 100644
--- a/paths.py
+++ b/paths.py
@@ -7,3 +7,4 @@ CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
 INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
+WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-all-titles-in-ns0.gz')
diff --git a/wiki.py b/wiki.py
index 2ae66d9..f013e4b 100644
--- a/wiki.py
+++ b/wiki.py
@@ -2,62 +2,34 @@
 Index Wikipedia
 """
 import bz2
-from xml.dom import minidom
-from xml.etree import ElementTree
+import gzip
 from xml.etree.ElementTree import XMLParser
 
 from mediawiki_parser import preprocessor, text
+from spacy.lang.en import English
 
-import wikitextparser as wtp
-
-from paths import WIKI_DATA_PATH
+from index import tokenize, index
+from paths import WIKI_DATA_PATH, WIKI_TITLES_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
 
 
-class WikiIndexer:
-    def __init__(self):
-        self.tags = []
-        self.current_data = ''
-
-        self.wiki_preprocessor = preprocessor.make_parser({})
-        self.parser = text.make_parser()
-
-    def start(self, tag, attr):
-        tagname = tag.split('}')[-1]
-        self.tags.append(tagname)
-        # print("Start", self.tags)
-
-    def end(self, tag):
-        if self.tags == TEXT_TAGS:
-            self.handle_data(self.current_data)
-            self.current_data = ''
-        self.tags.pop()
-        # print("End", tag)
-
-    def data(self, data):
-        # print("Data", self.tags)
-        if self.tags == TEXT_TAGS:
-            self.current_data += data
-        pass
-
-    def close(self):
-        pass
-
-    def handle_data(self, data):
-        preprocessed_text = self.wiki_preprocessor.parse(data)
-        output = self.parser.parse(preprocessed_text.leaves())
-
-        print("Data", output)
-
-
 def index_wiki():
-    target = WikiIndexer()
-    parser = XMLParser(target=target)
-    with bz2.open(WIKI_DATA_PATH, 'rt') as wiki_file:
-        for line in wiki_file:
-            parser.feed(line)
+    nlp = English()
+    indexed = 0
+    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
+        wiki_titles_file.readline()
+        for title in wiki_titles_file:
+            title_cleaned = title.replace('_', ' ')
+            tokens = tokenize(nlp, title_cleaned)
+
+            if len(tokens) > 0:
+                indexed += 1
+                url = 'https://en.wikipedia.org/wiki/' + title
+                index(tokens, url, title_cleaned)
+
+            if indexed % 1000 == 0:
+                print("Indexed", indexed)
 
 
 if __name__ == '__main__':
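
Three quick notes on the patch.

1. The new return value from complete(), [q, results_list], appears to follow
the two-element OpenSearch suggestions shape that browsers expect from a
suggestion endpoint: the query echoed back, then up to five completion strings
(the commented-out blocks sketch the longer four-element form and Chrome's
richer google:suggestdetail variant). With illustrative values, the serialized
response would look like:

    ["south ko",
     ["South Korea — https://en.wikipedia.org/wiki/South_Korea",
      "South Korean won — https://en.wikipedia.org/wiki/South_Korean_won"]]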
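
2. index_wiki() now depends on tokenize() and index() from the local index
module, which this patch does not touch, so their behaviour has to be
inferred. Below is a minimal sketch of what they might look like, assuming a
pages(id, title, url) / terms(term, page_id) schema to match the JOIN in
app.py; the real helpers may well differ:

    import sqlite3

    from paths import INDEX_PATH


    def tokenize(nlp, text):
        # Tokenize with spaCy's English tokenizer; lower-case and drop
        # whitespace-only tokens so blank titles index nothing.
        return [token.text.lower() for token in nlp.tokenizer(text)
                if not token.is_space]


    def index(tokens, url, title):
        # Assumed schema: pages(id, title, url) and terms(term, page_id),
        # matching the JOIN in app.py (terms.page_id = pages.id).
        con = sqlite3.connect(INDEX_PATH)
        with con:
            cur = con.execute("INSERT INTO pages (title, url) VALUES (?, ?)",
                              (title, url))
            page_id = cur.lastrowid
            con.executemany("INSERT INTO terms (term, page_id) VALUES (?, ?)",
                            [(token, page_id) for token in tokens])
        con.close()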
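
3. One catch in the new index_wiki(): lines read from the titles file keep
their trailing newline, so both title_cleaned and the constructed URL end in
"\n", which is presumably why complete() now strips newlines out of titles and
URLs at query time. Stripping once at the source would fix both, e.g.:

    for raw_title in wiki_titles_file:
        title = raw_title.rstrip('\n')           # drop the trailing newline once
        title_cleaned = title.replace('_', ' ')  # underscores read as spaces
        url = 'https://en.wikipedia.org/wiki/' + title  # URL keeps underscores

Relatedly, the progress check "indexed % 1000 == 0" sits outside the
"if len(tokens) > 0" block, so it re-prints the same count for every title
that yields no tokens; moving it inside the block would quiet that.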