Index wiki
parent 2eb6afc3fe
commit c17c10ac4c
3 changed files with 47 additions and 54 deletions

app.py (34 lines changed)

@@ -1,4 +1,6 @@
import sqlite3
from functools import lru_cache

import pandas as pd

from fastapi import FastAPI

@@ -19,6 +21,7 @@ def search(s: str):
    return RedirectResponse(url)


@lru_cache()
def complete_term(term):
    con = sqlite3.connect(INDEX_PATH)
    query = f"""

@@ -39,26 +42,43 @@ def complete_term(term):
def complete(q: str):
    terms = [x.lower() for x in q.split()]

    completed = complete_term(terms[-1])
    terms = terms[:-1] + [completed]
    # completed = complete_term(terms[-1])
    # terms = terms[:-1] + [completed]

    con = sqlite3.connect(INDEX_PATH)
    in_part = ','.join('?'*len(terms))
    query = f"""
        SELECT title, url, count(*)
        SELECT title, url, count(*), length(title)
        FROM terms INNER JOIN pages
        ON terms.page_id = pages.id
        WHERE term IN ({in_part})
        GROUP BY title, url
        ORDER BY 3 DESC
        ORDER BY 3 DESC, 4
        LIMIT 20
    """

    data = pd.read_sql(query, con, params=terms)
    results = data.apply(lambda row: f'{row.title} — {row.url}', axis=1)
    print("Results", results)
    results = data.apply(lambda row: row.title.replace("\n", "") + ' — ' +
                                     row.url.replace("\n", ""), axis=1)
    if len(results) == 0:
        return []
    return [q, results.to_list()[:5]]
    results_list = results.to_list()[:5]
    results_list = [q, results_list]
    # , [], [], {
    #     'google:suggestdetail': [
    #         {'a': 'A', 't': x, 'q': 'p=v'}
    #         for x in results_list]
    # }]
    print("Results", results_list)
    return results_list

    # titles = [x.strip() for x in data['title'].to_list()[:5]]
    # urls = [x.strip() for x in data['url'].to_list()[:5]]
    #
    # # result = [q, titles, ['asd'] * 5, urls]
    # result = [q, titles]
    # print("Returning", result)
    # return result


@app.get('/')
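
For context on the return shape: `[q, results_list]` and the commented-out `google:suggestdetail` block point at the OpenSearch suggestions format used by browser search boxes, which expects an array of the form [query, completions, descriptions, urls]. A minimal sketch of that payload with made-up values rather than anything drawn from the index:

import json

# Hypothetical example values; the real endpoint builds these from the SQLite index.
query = "pyth"
suggestions = [
    "Python (programming language) — https://en.wikipedia.org/wiki/Python_(programming_language)",
    "Python (genus) — https://en.wikipedia.org/wiki/Python_(genus)",
]

# The full suggest payload is [query, completions, descriptions, urls];
# the complete() endpoint above returns only the first two elements.
payload = [query, suggestions]
print(json.dumps(payload, ensure_ascii=False))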

paths.py (1 line changed)

@@ -7,3 +7,4 @@ CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-all-titles-in-ns0.gz')
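
The new WIKI_TITLES_PATH constant points at the gzipped all-titles dump that wiki.py (below) iterates over: one title per line, underscores instead of spaces, preceded by a header line. A minimal sketch of reading it, assuming the dump has already been downloaded to DATA_DIR:

import gzip

from paths import WIKI_TITLES_PATH

# One title per line, underscores for spaces; the first line is a header,
# which wiki.py also skips with readline().
with gzip.open(WIKI_TITLES_PATH, 'rt') as titles_file:
    titles_file.readline()
    for i, title in enumerate(titles_file):
        print(title.strip().replace('_', ' '))
        if i >= 4:  # show only the first few titles
            break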

wiki.py (66 lines changed)

@@ -2,62 +2,34 @@
Index Wikipedia
"""
import bz2
from xml.dom import minidom
from xml.etree import ElementTree
import gzip
from xml.etree.ElementTree import XMLParser

from mediawiki_parser import preprocessor, text
from spacy.lang.en import English

import wikitextparser as wtp

from paths import WIKI_DATA_PATH
from index import tokenize, index
from paths import WIKI_DATA_PATH, WIKI_TITLES_PATH

TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']


class WikiIndexer:
    def __init__(self):
        self.tags = []
        self.current_data = ''

        self.wiki_preprocessor = preprocessor.make_parser({})
        self.parser = text.make_parser()


    def start(self, tag, attr):
        tagname = tag.split('}')[-1]
        self.tags.append(tagname)
        # print("Start", self.tags)

    def end(self, tag):
        if self.tags == TEXT_TAGS:
            self.handle_data(self.current_data)
            self.current_data = ''
        self.tags.pop()
        # print("End", tag)

    def data(self, data):
        # print("Data", self.tags)
        if self.tags == TEXT_TAGS:
            self.current_data += data
        pass

    def close(self):
        pass

    def handle_data(self, data):
        preprocessed_text = self.wiki_preprocessor.parse(data)
        output = self.parser.parse(preprocessed_text.leaves())

        print("Data", output)


def index_wiki():
    target = WikiIndexer()
    parser = XMLParser(target=target)
    with bz2.open(WIKI_DATA_PATH, 'rt') as wiki_file:
        for line in wiki_file:
            parser.feed(line)
    nlp = English()
    indexed = 0
    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
        wiki_titles_file.readline()
        for title in wiki_titles_file:
            title_cleaned = title.replace('_', ' ')
            tokens = tokenize(nlp, title_cleaned)

            if len(tokens) > 0:
                indexed += 1
                url = 'https://en.wikipedia.org/wiki/' + title
                index(tokens, url, title_cleaned)

                if indexed % 1000 == 0:
                    print("Indexed", indexed)


if __name__ == '__main__':
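
wiki.py imports `tokenize` and `index` from an `index` module that is not part of this diff, so their implementations are unknown here. Purely as an illustration, a sketch of what a spaCy-based `tokenize` helper matching the call `tokenize(nlp, title_cleaned)` could look like:

from spacy.lang.en import English


def tokenize(nlp, text):
    # Guesswork, not the project's index.py: the blank English pipeline
    # gives fast rule-based tokenization without loading trained models.
    doc = nlp(text)
    return [token.text.lower() for token in doc if not token.is_space]


if __name__ == '__main__':
    nlp = English()
    print(tokenize(nlp, "Python (programming language)"))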