Improve indexing; measure performance

Daoud Clarke 4 years ago
parent commit
14f820ff37
4 changed files with 141 additions and 67 deletions
  1. index.py (81 additions, 49 deletions)
  2. paths.py (2 additions, 1 deletion)
  3. performance.py (37 additions, 0 deletions)
  4. wiki.py (21 additions, 17 deletions)

index.py: +81 -49

@@ -4,6 +4,7 @@ Create a search index
 import gzip
 import sqlite3
 from glob import glob
+from urllib.parse import unquote
 
 import bs4
 import justext
@@ -13,10 +14,13 @@ from paths import CRAWL_GLOB, INDEX_PATH
 
 NUM_INITIAL_TOKENS = 50
 
+HTTP_START = 'http://'
+HTTPS_START = 'https://'
+
 
 def is_content_token(nlp, token):
     lexeme = nlp.vocab[token.orth]
-    return lexeme.is_alpha and not token.is_stop
+    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop
 
 
 def tokenize(nlp, cleaned_text):
@@ -34,58 +38,60 @@ def clean(content):
     return cleaned_text
 
 
-def index(tokens, url, title):
-    with sqlite3.connect(INDEX_PATH) as con:
+class Indexer:
+    def __init__(self, index_path):
+        self.index_path = index_path
+
+    def index(self, tokens, url, title):
+        with sqlite3.connect(self.index_path) as con:
+            con.execute("""
+                INSERT INTO pages (url, title)
+                VALUES (?, ?)
+            """, (url, title))
+
+            result = con.execute("""
+                SELECT last_insert_rowid()
+            """)
+            page_id = result.fetchone()[0]
+
+            con.executemany("""
+                INSERT INTO terms (term, page_id)
+                VALUES (?, ?)
+            """, [(term, page_id) for term in tokens])
+
+    def create_if_not_exists(self):
+        con = sqlite3.connect(self.index_path)
         con.execute("""
-            INSERT INTO pages (url, title)
-            VALUES (?, ?)
-        """, (url, title))
+        CREATE TABLE IF NOT EXISTS pages (
+          id INTEGER PRIMARY KEY,
+          url TEXT UNIQUE,
+          title TEXT
+        )
+        """)
 
-        result = con.execute("""
-            SELECT last_insert_rowid()
+        con.execute("""
+        CREATE TABLE IF NOT EXISTS terms (
+          term TEXT,
+          page_id INTEGER 
+        )
         """)
-        page_id = result.fetchone()[0]
-        print("Created page with id", page_id)
-
-        con.executemany("""
-            INSERT INTO terms (term, page_id)
-            VALUES (?, ?)
-        """, [(term, page_id) for term in tokens])
-
-
-def create_if_not_exists():
-    con = sqlite3.connect(INDEX_PATH)
-    con.execute("""
-    CREATE TABLE IF NOT EXISTS pages (
-      id INTEGER PRIMARY KEY,
-      url TEXT UNIQUE,
-      title TEXT
-    )
-    """)
-
-    con.execute("""
-    CREATE TABLE IF NOT EXISTS terms (
-      term TEXT,
-      page_id INTEGER 
-    )
-    """)
-
-    con.execute("""
-    CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-    """)
-
-
-def page_indexed(url):
-    con = sqlite3.connect(INDEX_PATH)
-    result = con.execute("""
-        SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-    """, (url,))
-    value = result.fetchone()[0]
-    return value == 1
+
+        con.execute("""
+        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
+        """)
+
+    def page_indexed(self, url):
+        con = sqlite3.connect(self.index_path)
+        result = con.execute("""
+            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
+        """, (url,))
+        value = result.fetchone()[0]
+        return value == 1
 
 
 def run():
-    create_if_not_exists()
+    indexer = Indexer(INDEX_PATH)
+    indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
         print("Path", path)
@@ -93,7 +99,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()
 
-        if page_indexed(url):
+        if indexer.page_indexed(url):
             print("Page exists, skipping", url)
             continue
 
@@ -106,7 +112,33 @@ def run():
         print("URL", url)
         print("Tokens", tokens)
         print("Title", title)
-        index(tokens, url, title)
+        indexer.index(tokens, url, title)
+
+
+def prepare_url_for_tokenizing(url: str):
+    if url.startswith(HTTP_START):
+        url = url[len(HTTP_START):]
+    elif url.startswith(HTTPS_START):
+        url = url[len(HTTPS_START):]
+    for c in '/._':
+        if c in url:
+            url = url.replace(c, ' ')
+    return url
+
+
+def index_titles_and_urls(indexer, nlp, titles_and_urls):
+    indexer.create_if_not_exists()
+    for i, (title_cleaned, url) in enumerate(titles_and_urls):
+        title_tokens = tokenize(nlp, title_cleaned)
+        prepared_url = prepare_url_for_tokenizing(unquote(url))
+        url_tokens = tokenize(nlp, prepared_url)
+        tokens = title_tokens | url_tokens
+
+        if len(title_tokens) > 0:
+            indexer.index(tokens, url, title_cleaned)
+
+        if i % 1000 == 0:
+            print("Processed", i)
 
 
 if __name__ == '__main__':
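
For reference, a minimal usage sketch of the reworked index.py API. The URL and title are illustrative values (not taken from the crawl data), and the inline comment shows the intermediate string that prepare_url_for_tokenizing is expected to produce:

    from spacy.lang.en import English

    from index import Indexer, prepare_url_for_tokenizing, tokenize
    from paths import INDEX_PATH

    nlp = English()
    indexer = Indexer(INDEX_PATH)
    indexer.create_if_not_exists()

    # The scheme is stripped and '/', '.' and '_' are replaced with spaces,
    # so URL path components become searchable terms alongside the title.
    url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
    title = 'Python (programming language)'
    prepared = prepare_url_for_tokenizing(url)  # 'en wikipedia org wiki Python (programming language)'
    tokens = tokenize(nlp, title) | tokenize(nlp, prepared)
    indexer.index(tokens, url, title)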

paths.py: +2 -1

@@ -6,5 +6,6 @@ HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
 INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
+TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.sqlite3')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
-WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-all-titles-in-ns0.gz')
+WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

performance.py: +37 -0

@@ -0,0 +1,37 @@
+"""
+Test the performance of the search in terms of compression and speed.
+"""
+import os
+from datetime import datetime
+from itertools import islice
+
+from spacy.lang.en import English
+
+from index import Indexer, index_titles_and_urls
+from paths import TEST_INDEX_PATH
+from wiki import get_wiki_titles_and_urls
+
+
+def performance_test():
+    nlp = English()
+    try:
+        os.remove(TEST_INDEX_PATH)
+    except FileNotFoundError:
+        print("No test index found, creating")
+    indexer = Indexer(TEST_INDEX_PATH)
+    titles_and_urls = get_wiki_titles_and_urls()
+    titles_and_urls_slice = islice(titles_and_urls, 1000)
+
+    start_time = datetime.now()
+    index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
+    stop_time = datetime.now()
+
+    index_time = (stop_time - start_time).total_seconds()
+    index_size = os.path.getsize(TEST_INDEX_PATH)
+
+    print("Index time:", index_time)
+    print("Index size", index_size)
+
+
+if __name__ == '__main__':
+    performance_test()
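
performance.py measures indexing time and on-disk size; query speed is not measured in this commit. A hypothetical helper for that, assuming the pages/terms schema created by Indexer.create_if_not_exists, could look like this:

    import sqlite3
    from datetime import datetime

    from paths import TEST_INDEX_PATH


    def time_term_lookup(term):
        # Time a single term lookup against the test index built above.
        con = sqlite3.connect(TEST_INDEX_PATH)
        start = datetime.now()
        con.execute("""
            SELECT pages.url, pages.title
            FROM terms JOIN pages ON terms.page_id = pages.id
            WHERE terms.term = ?
        """, (term,)).fetchall()
        return (datetime.now() - start).total_seconds()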

wiki.py: +21 -17

@@ -1,35 +1,39 @@
 """
 Index Wikipedia
 """
-import bz2
 import gzip
-from xml.etree.ElementTree import XMLParser
+import html
+from urllib.parse import quote
 
-from mediawiki_parser import preprocessor, text
 from spacy.lang.en import English
 
-from index import tokenize, index
-from paths import WIKI_DATA_PATH, WIKI_TITLES_PATH
+from index import Indexer, index_titles_and_urls
+from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
+TITLE_START = '<title>Wikipedia: '
+TITLE_END = '</title>\n'
 
 
 def index_wiki():
     nlp = English()
-    indexed = 0
-    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
-        wiki_titles_file.readline()
-        for title in wiki_titles_file:
-            title_cleaned = title.replace('_', ' ')
-            tokens = tokenize(nlp, title_cleaned)
+    indexer = Indexer(INDEX_PATH)
+    titles_and_urls = get_wiki_titles_and_urls()
+    index_titles_and_urls(indexer, nlp, titles_and_urls)
 
-            if len(tokens) > 0:
-                indexed += 1
-                url = 'https://en.wikipedia.org/wiki/' + title
-                index(tokens, url, title_cleaned)
 
-                if indexed % 1000 == 0:
-                    print("Indexed", indexed)
+def get_wiki_titles_and_urls():
+    start_len = len(TITLE_START)
+    end_len = len(TITLE_END)
+    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
+        wiki_titles_file.readline()
+        for raw_title in wiki_titles_file:
+            assert raw_title.startswith(TITLE_START)
+            assert raw_title.endswith(TITLE_END)
+            title = raw_title[start_len:-end_len]
+            unescaped_title = html.unescape(title)
+            url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
+            yield unescaped_title, url
 
 
 if __name__ == '__main__':
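
For illustration, here is what get_wiki_titles_and_urls does to a single line of the titles file. The sample line is made up, but follows the '<title>Wikipedia: ...</title>' format the parser expects:

    import html
    from urllib.parse import quote

    TITLE_START = '<title>Wikipedia: '
    TITLE_END = '</title>\n'

    raw_title = '<title>Wikipedia: Anarchism &amp; other essays</title>\n'
    title = raw_title[len(TITLE_START):-len(TITLE_END)]  # 'Anarchism &amp; other essays'
    unescaped_title = html.unescape(title)                # 'Anarchism & other essays'
    url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
    # url == 'https://en.wikipedia.org/wiki/Anarchism_%26_other_essays'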