@@ -2,24 +2,32 @@
 Create a search index
 """
 import gzip
+import json
+import os
 import sqlite3
 from dataclasses import dataclass
 from glob import glob
 from itertools import chain, count, islice
+from mmap import mmap
 from typing import List, Iterator
 from urllib.parse import unquote

 import bs4
 import justext
+import mmh3
 from spacy.lang.en import English
+from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

 from paths import CRAWL_GLOB, INDEX_PATH
 
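+# The index is one fixed-size, memory-mapped file: NUM_PAGES pages of
+# PAGE_SIZE bytes each (8192 * 512 bytes = 4 MiB)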
+NUM_PAGES = 8192
+PAGE_SIZE = 512
+
 NUM_INITIAL_TOKENS = 50

 HTTP_START = 'http://'
 HTTPS_START = 'https://'
-BATCH_SIZE = 10000
+BATCH_SIZE = 100


 def is_content_token(nlp, token):
@@ -43,91 +51,112 @@ def clean(content):


 @dataclass
-class Page:
-    tokens: List[str]
+class Document:
     url: str
     title: str


-class Indexer:
-    def __init__(self, index_path):
+@dataclass
+class TokenizedDocument(Document):
+    tokens: List[str]
+
+
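+# Hash-table-style index: a token's hash selects a page, and each page stores
+# a compressed JSON list of [title, url] entries for the tokens that map to it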
+class TinyIndexBase:
+    def __init__(self, num_pages, page_size):
+        self.num_pages = num_pages
+        self.page_size = page_size
+        self.decompressor = ZstdDecompressor()
+        self.mmap = None
+
+    def _get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
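+            # A page that has never been written is all zero bytes and fails
+            # to decompress, so treat it as empty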
+            return None
+        return json.loads(decompressed_data.decode('utf8'))
+
+
+class TinyIndex(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
         self.index_path = index_path
+        self.index_file = None
+        self.mmap = None

-    def index(self, pages: List[Page]):
-        with sqlite3.connect(self.index_path) as con:
-            cursor = con.execute("""
-                SELECT max(id) FROM pages
-            """)
-            current_id = cursor.fetchone()[0]
-            if current_id is None:
-                first_page_id = 1
-            else:
-                first_page_id = current_id + 1
-
-            page_ids = range(first_page_id, first_page_id + len(pages))
-            urls_titles_ids = ((page.url, page.title, page_id)
-                               for page, page_id in zip(pages, page_ids))
-            con.executemany("""
-                INSERT INTO pages (url, title, id)
-                VALUES (?, ?, ?)
-            """, urls_titles_ids)
-
-            tokens = chain(*([(term, page_id) for term in page.tokens]
-                             for page, page_id in zip(pages, page_ids)))
-            con.executemany("""
-                INSERT INTO terms (term, page_id)
-                VALUES (?, ?)
-            """, tokens)
+
+class TinyIndexer(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
+        self.index_path = index_path
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
+
+    def __enter__(self):
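+        # Make sure the index file exists, then memory-map it for random page access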
+        self.create_if_not_exists()
+        self.index_file = open(self.index_path, 'r+b')
+        self.mmap = mmap(self.index_file.fileno(), 0)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()
+
+    def index(self, documents: List[TokenizedDocument]):
+        for document in documents:
+            for token in document.tokens:
+                self._index_document(document, token)
+
+    def _index_document(self, document: Document, token: str):
+        page_index = self._get_token_page_index(token)
+        current_page = self._get_page(page_index)
+        if current_page is None:
+            current_page = []
+        current_page.append([document.title, document.url])
+        try:
+            self._write_page(current_page, page_index)
+        except ValueError:
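+            # The updated page no longer fits in page_size bytes, so the new
+            # entry is dropped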
+            pass
+
+    def _get_token_page_index(self, token):
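+        # An unsigned MurmurHash3 of the token, modulo the number of pages,
+        # selects the page for this token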
+        token_hash = mmh3.hash(token, signed=False)
+        return token_hash % self.num_pages
+
+    def _write_page(self, data, i):
+        """
+        Serialise the data using JSON, compress it and store it at index i.
+        If the data is too big, it will raise a ValueError and not store anything
+        """
+        serialised_data = json.dumps(data)
+        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
+        page_length = len(compressed_data)
+        if page_length > self.page_size:
+            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
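+        # Zero-pad so every page occupies exactly page_size bytes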
+        padding = b'\x00' * (self.page_size - page_length)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding

     def create_if_not_exists(self):
-        con = sqlite3.connect(self.index_path)
-        con.execute("""
-            CREATE TABLE IF NOT EXISTS pages (
-                id INTEGER PRIMARY KEY,
-                url TEXT UNIQUE,
-                title TEXT
-            )
-        """)
-
-        con.execute("""
-            CREATE TABLE IF NOT EXISTS terms (
-                term TEXT,
-                page_id INTEGER
-            )
-        """)
-
-        con.execute("""
-            CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-        """)
-
-    def page_indexed(self, url):
-        con = sqlite3.connect(self.index_path)
-        result = con.execute("""
-            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-        """, (url,))
-        value = result.fetchone()[0]
-        return value == 1
+        if not os.path.isfile(self.index_path):
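+            # Pre-allocate the whole index as a zero-filled file of
+            # num_pages * page_size bytes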
+            file_length = self.num_pages * self.page_size
+            with open(self.index_path, 'wb') as index_file:
+                index_file.write(b'\x00' * file_length)
+
+    def document_indexed(self, url):
+        raise NotImplementedError()

     def get_num_tokens(self):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT count(*) from terms
-        """)
-        num_terms = cursor.fetchone()[0]
-        return num_terms
+        raise NotImplementedError()

     def get_random_terms(self, n):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT DISTINCT term FROM terms
-            ORDER BY random() LIMIT ?
-        """)
-        terms = [t[0] for t in cursor.fetchall()]
-        return terms
+        raise NotImplementedError()


 def run():
-    indexer = Indexer(INDEX_PATH)
+    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
@@ -136,7 +165,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()

-        if indexer.page_indexed(url):
+        if indexer.document_indexed(url):
             print("Page exists, skipping", url)
             continue

@@ -169,7 +198,7 @@ def get_pages(nlp, titles_and_urls):
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
         tokens = title_tokens | url_tokens
-        yield Page(list(tokens), url, title_cleaned)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)

         if i % 1000 == 0:
             print("Processed", i)
@@ -183,7 +212,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk


-def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
+def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
     indexer.create_if_not_exists()

     pages = get_pages(nlp, titles_and_urls)