@@ -4,6 +4,7 @@ Create a search index
 import gzip
 import sqlite3
 from glob import glob
+from urllib.parse import unquote
 
 import bs4
 import justext
@@ -13,10 +14,13 @@ from paths import CRAWL_GLOB, INDEX_PATH
 
 NUM_INITIAL_TOKENS = 50
 
+HTTP_START = 'http://'
+HTTPS_START = 'https://'
+
 
 def is_content_token(nlp, token):
     lexeme = nlp.vocab[token.orth]
-    return lexeme.is_alpha and not token.is_stop
+    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop
 
 
 def tokenize(nlp, cleaned_text):
@@ -34,58 +38,60 @@ def clean(content):
     return cleaned_text
 
 
-def index(tokens, url, title):
-    with sqlite3.connect(INDEX_PATH) as con:
+class Indexer:
+    def __init__(self, index_path):
+        self.index_path = index_path
+
+    def index(self, tokens, url, title):
+        with sqlite3.connect(self.index_path) as con:
+            con.execute("""
+                INSERT INTO pages (url, title)
+                VALUES (?, ?)
+            """, (url, title))
+
+            result = con.execute("""
+                SELECT last_insert_rowid()
+            """)
+            page_id = result.fetchone()[0]
+
+            con.executemany("""
+                INSERT INTO terms (term, page_id)
+                VALUES (?, ?)
+            """, [(term, page_id) for term in tokens])
+
+    def create_if_not_exists(self):
+        con = sqlite3.connect(self.index_path)
         con.execute("""
-            INSERT INTO pages (url, title)
-            VALUES (?, ?)
-        """, (url, title))
+            CREATE TABLE IF NOT EXISTS pages (
+                id INTEGER PRIMARY KEY,
+                url TEXT UNIQUE,
+                title TEXT
+            )
+        """)
 
-        result = con.execute("""
-            SELECT last_insert_rowid()
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS terms (
+                term TEXT,
+                page_id INTEGER
+            )
         """)
-        page_id = result.fetchone()[0]
-        print("Created page with id", page_id)
-
-        con.executemany("""
-            INSERT INTO terms (term, page_id)
-            VALUES (?, ?)
-        """, [(term, page_id) for term in tokens])
-
-
-def create_if_not_exists():
-    con = sqlite3.connect(INDEX_PATH)
-    con.execute("""
-        CREATE TABLE IF NOT EXISTS pages (
-            id INTEGER PRIMARY KEY,
-            url TEXT UNIQUE,
-            title TEXT
-        )
-    """)
-
-    con.execute("""
-        CREATE TABLE IF NOT EXISTS terms (
-            term TEXT,
-            page_id INTEGER
-        )
-    """)
-
-    con.execute("""
-        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-    """)
-
-
-def page_indexed(url):
-    con = sqlite3.connect(INDEX_PATH)
-    result = con.execute("""
-        SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-    """, (url,))
-    value = result.fetchone()[0]
-    return value == 1
+
+        con.execute("""
+            CREATE INDEX IF NOT EXISTS term_index ON terms (term)
+        """)
+
+    def page_indexed(self, url):
+        con = sqlite3.connect(self.index_path)
+        result = con.execute("""
+            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
+        """, (url,))
+        value = result.fetchone()[0]
+        return value == 1
 
 
 def run():
-    create_if_not_exists()
+    indexer = Indexer(INDEX_PATH)
+    indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
         print("Path", path)
@@ -93,7 +99,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()
 
-        if page_indexed(url):
+        if indexer.page_indexed(url):
            print("Page exists, skipping", url)
            continue
 
@@ -106,7 +112,33 @@ def run():
         print("URL", url)
         print("Tokens", tokens)
         print("Title", title)
-        index(tokens, url, title)
+        indexer.index(tokens, url, title)
+
+
+def prepare_url_for_tokenizing(url: str):
+    if url.startswith(HTTP_START):
+        url = url[len(HTTP_START):]
+    elif url.startswith(HTTPS_START):
+        url = url[len(HTTPS_START):]
+    for c in '/._':
+        if c in url:
+            url = url.replace(c, ' ')
+    return url
+
+
+def index_titles_and_urls(indexer, nlp, titles_and_urls):
+    indexer.create_if_not_exists()
+    for i, (title_cleaned, url) in enumerate(titles_and_urls):
+        title_tokens = tokenize(nlp, title_cleaned)
+        prepared_url = prepare_url_for_tokenizing(unquote(url))
+        url_tokens = tokenize(nlp, prepared_url)
+        tokens = title_tokens | url_tokens
+
+        if len(title_tokens) > 0:
+            indexer.index(tokens, url, title_cleaned)
+
+        if i % 1000 == 0:
+            print("Processed", i)
 
 
 if __name__ == '__main__':
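
For orientation only, a minimal sketch of how the new Indexer and index_titles_and_urls might be exercised; it is not part of the patch. The module name `index`, the index path, the sample data, and the `spacy.lang.en` import (inferred from run() calling English()) are all assumptions.

    # Usage sketch under the assumptions above; data and paths are hypothetical.
    from spacy.lang.en import English

    from index import Indexer, index_titles_and_urls  # assumed module name

    indexer = Indexer('/tmp/test-index.sqlite3')  # hypothetical throwaway path
    nlp = English()

    # unquote() plus prepare_url_for_tokenizing() turns the first URL into
    # "en wikipedia.org wiki Python (programming language)" with '/', '.' and
    # '_' replaced by spaces before tokenizing.
    titles_and_urls = [
        ('Python (programming language)',
         'https://en.wikipedia.org/wiki/Python_%28programming_language%29'),
        ('SQLite Home Page', 'https://sqlite.org/index.html'),
    ]

    index_titles_and_urls(indexer, nlp, titles_and_urls)
    print(indexer.page_indexed('https://sqlite.org/index.html'))  # expect True

Passing the index path into Indexer, rather than hard-coding INDEX_PATH at module level as before, is what makes a throwaway path like this possible in a test.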