Index using compression

Daoud Clarke, 4 years ago
parent
commit acc2a9194e
4 changed files with 124 additions and 95 deletions
  1. index.py: +103 -74
  2. paths.py: +2 -2
  3. performance.py: +15 -15
  4. wiki.py: +4 -4
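The gist of the change: instead of SQLite tables (pages and terms), the index becomes a single fixed-size file of NUM_PAGES pages, PAGE_SIZE bytes each. A token's postings are JSON-serialised, zstd-compressed and stored on the page selected by hashing the token with mmh3; pages that would overflow are skipped. A minimal, self-contained sketch of that write/read round trip (illustrative only, using the constants and libraries this commit introduces, not the committed code):

import json

import mmh3
from zstandard import ZstdCompressor, ZstdDecompressor

NUM_PAGES = 8192   # number of fixed-size pages in the index file
PAGE_SIZE = 512    # bytes per page

# Choose the page for a token: unsigned murmur3 hash modulo the page count.
token = 'search'
page_index = mmh3.hash(token, signed=False) % NUM_PAGES

# Postings are [title, url] pairs; serialise, compress and pad to the page size.
postings = [['Example title', 'https://example.com/']]
compressed = ZstdCompressor().compress(json.dumps(postings).encode('utf8'))
assert len(compressed) <= PAGE_SIZE, 'postings too large for one page'
page = compressed + b'\x00' * (PAGE_SIZE - len(compressed))

# Reading back: decompress and parse the JSON (the committed _get_page hands the
# whole padded page to the decompressor and treats ZstdError as an empty page).
recovered = json.loads(ZstdDecompressor().decompress(page[:len(compressed)]).decode('utf8'))
assert recovered == postings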

index.py: +103 -74

@@ -2,24 +2,32 @@
 Create a search index
 """
 import gzip
+import json
+import os
 import sqlite3
 from dataclasses import dataclass
 from glob import glob
 from itertools import chain, count, islice
+from mmap import mmap
 from typing import List, Iterator
 from urllib.parse import unquote
 
 import bs4
 import justext
+import mmh3
 from spacy.lang.en import English
+from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 from paths import CRAWL_GLOB, INDEX_PATH
 
+NUM_PAGES = 8192
+PAGE_SIZE = 512
+
 NUM_INITIAL_TOKENS = 50
 
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
-BATCH_SIZE = 10000
+BATCH_SIZE = 100
 
 
 def is_content_token(nlp, token):
@@ -43,91 +51,112 @@ def clean(content):
 
 
 @dataclass
-class Page:
-    tokens: List[str]
+class Document:
     url: str
     title: str
 
 
-class Indexer:
-    def __init__(self, index_path):
+@dataclass
+class TokenizedDocument(Document):
+    tokens: List[str]
+
+
+class TinyIndexBase:
+    def __init__(self, num_pages, page_size):
+        self.num_pages = num_pages
+        self.page_size = page_size
+        self.decompressor = ZstdDecompressor()
+        self.mmap = None
+
+    def _get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
+            return None
+        return json.loads(decompressed_data.decode('utf8'))
+
+
+class TinyIndex(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
         self.index_path = index_path
+        self.index_file = None
+        self.mmap = None
 
-    def index(self, pages: List[Page]):
-        with sqlite3.connect(self.index_path) as con:
-            cursor = con.execute("""
-                    SELECT max(id) FROM pages
-                """)
-            current_id = cursor.fetchone()[0]
-            if current_id is None:
-                first_page_id = 1
-            else:
-                first_page_id = current_id + 1
-
-            page_ids = range(first_page_id, first_page_id + len(pages))
-            urls_titles_ids = ((page.url, page.title, page_id)
-                               for page, page_id in zip(pages, page_ids))
-            con.executemany("""
-                INSERT INTO pages (url, title, id)
-                VALUES (?, ?, ?)
-            """, urls_titles_ids)
-
-            tokens = chain(*([(term, page_id) for term in page.tokens]
-                             for page, page_id in zip(pages, page_ids)))
-            con.executemany("""
-                INSERT INTO terms (term, page_id)
-                VALUES (?, ?)
-            """, tokens)
+
+class TinyIndexer(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
+        self.index_path = index_path
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
+
+    def __enter__(self):
+        self.create_if_not_exists()
+        self.index_file = open(self.index_path, 'r+b')
+        self.mmap = mmap(self.index_file.fileno(), 0)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()
+
+    def index(self, documents: List[TokenizedDocument]):
+        for document in documents:
+            for token in document.tokens:
+                self._index_document(document, token)
+
+    def _index_document(self, document: Document, token: str):
+        page_index = self._get_token_page_index(token)
+        current_page = self._get_page(page_index)
+        if current_page is None:
+            current_page = []
+        current_page.append([document.title, document.url])
+        try:
+            self._write_page(current_page, page_index)
+        except ValueError:
+            pass
+
+    def _get_token_page_index(self, token):
+        token_hash = mmh3.hash(token, signed=False)
+        return token_hash % self.num_pages
+
+    def _write_page(self, data, i):
+        """
+        Serialise the data using JSON, compress it and store it at index i.
+        If the data is too big, it will raise a ValueError and not store anything
+        """
+        serialised_data = json.dumps(data)
+        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
+        page_length = len(compressed_data)
+        if page_length > self.page_size:
+            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
+        padding = b'\x00' * (self.page_size - page_length)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
 
     def create_if_not_exists(self):
-        con = sqlite3.connect(self.index_path)
-        con.execute("""
-        CREATE TABLE IF NOT EXISTS pages (
-          id INTEGER PRIMARY KEY,
-          url TEXT UNIQUE,
-          title TEXT
-        )
-        """)
-
-        con.execute("""
-        CREATE TABLE IF NOT EXISTS terms (
-          term TEXT,
-          page_id INTEGER 
-        )
-        """)
-
-        con.execute("""
-        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-        """)
-
-    def page_indexed(self, url):
-        con = sqlite3.connect(self.index_path)
-        result = con.execute("""
-            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-        """, (url,))
-        value = result.fetchone()[0]
-        return value == 1
+        if not os.path.isfile(self.index_path):
+            file_length = self.num_pages * self.page_size
+            with open(self.index_path, 'wb') as index_file:
+                index_file.write(b'\x00' * file_length)
+
+    def document_indexed(self, url):
+        raise NotImplementedError()
 
     def get_num_tokens(self):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT count(*) from terms
-        """)
-        num_terms = cursor.fetchone()[0]
-        return num_terms
+        raise NotImplementedError()
 
     def get_random_terms(self, n):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT DISTINCT term FROM terms
-            ORDER BY random() LIMIT ?
-        """)
-        terms = [t[0] for t in cursor.fetchall()]
-        return terms
+        raise NotImplementedError()
 
 
 def run():
-    indexer = Indexer(INDEX_PATH)
+    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
@@ -136,7 +165,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()
 
-        if indexer.page_indexed(url):
+        if indexer.document_indexed(url):
             print("Page exists, skipping", url)
             continue
 
@@ -169,7 +198,7 @@ def get_pages(nlp, titles_and_urls):
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
         tokens = title_tokens | url_tokens
-        yield Page(list(tokens), url, title_cleaned)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -183,7 +212,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk
 
 
-def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
+def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
     indexer.create_if_not_exists()
 
     pages = get_pages(nlp, titles_and_urls)
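To complement the write path above, here is a hypothetical read-side helper (not part of this commit: TinyIndex as added here does not yet open the index file) showing how a query could map a token to its page and recover the stored [title, url] pairs, reusing the same hashing and decompression as TinyIndexBase:

import json
from mmap import ACCESS_READ, mmap

import mmh3
from zstandard import ZstdDecompressor, ZstdError


def lookup(index_path, token, num_pages=8192, page_size=512):
    # Same page selection as TinyIndexer._get_token_page_index.
    page_index = mmh3.hash(token, signed=False) % num_pages
    with open(index_path, 'rb') as index_file:
        with mmap(index_file.fileno(), 0, access=ACCESS_READ) as index_mmap:
            page_data = index_mmap[page_index * page_size:(page_index + 1) * page_size]
    try:
        # As in TinyIndexBase._get_page: decompress the whole padded page,
        # treating ZstdError as "nothing indexed on this page yet".
        decompressed = ZstdDecompressor().decompress(page_data)
    except ZstdError:
        return []
    return json.loads(decompressed.decode('utf8'))

The trade-off is visible in _index_document: every lookup costs one hash and one page read, but colliding tokens share a page, and postings that no longer fit after compression are dropped (the except ValueError: pass).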

paths.py: +2 -2

@@ -5,7 +5,7 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
-INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
-TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.sqlite3')
+INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
+TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

performance.py: +15 -15

@@ -10,12 +10,12 @@ from spacy.lang.en import English
 from starlette.testclient import TestClient
 
 from app import app, complete
-from index import Indexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
 from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls
 
 
-NUM_PAGES = 500
+NUM_DOCUMENTS = 10000
 
 
 def query_test():
@@ -25,7 +25,7 @@ def query_test():
 
     start = datetime.now()
     hits = 0
-    for title, url in islice(titles_and_urls, NUM_PAGES):
+    for title, url in islice(titles_and_urls, NUM_DOCUMENTS):
         result = client.get('/complete', params={'q': title})
         assert result.status_code == 200
         data = result.content.decode('utf8')
@@ -36,7 +36,7 @@ def query_test():
 
     end = datetime.now()
     print("Hits:", hits)
-    print("Query time:", (end - start).total_seconds()/NUM_PAGES)
+    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
 
 
 def performance_test():
@@ -45,23 +45,23 @@ def performance_test():
         os.remove(TEST_INDEX_PATH)
     except FileNotFoundError:
         print("No test index found, creating")
-    indexer = Indexer(TEST_INDEX_PATH)
-    titles_and_urls = get_wiki_titles_and_urls()
-    titles_and_urls_slice = islice(titles_and_urls, NUM_PAGES)
+    with TinyIndexer(TEST_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_and_urls = get_wiki_titles_and_urls()
+        titles_and_urls_slice = islice(titles_and_urls, NUM_DOCUMENTS)
 
-    start_time = datetime.now()
-    index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
-    stop_time = datetime.now()
+        start_time = datetime.now()
+        index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
+        stop_time = datetime.now()
 
-    index_time = (stop_time - start_time).total_seconds()
-    index_size = os.path.getsize(TEST_INDEX_PATH)
+        index_time = (stop_time - start_time).total_seconds()
+        index_size = os.path.getsize(TEST_INDEX_PATH)
 
-    print("Indexed pages:", NUM_PAGES)
+    print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
     print("Index size", index_size)
-    print("Num tokens", indexer.get_num_tokens())
+    # print("Num tokens", indexer.get_num_tokens())
 
-    query_test()
+    # query_test()
 
 
 if __name__ == '__main__':

wiki.py: +4 -4

@@ -7,7 +7,7 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import Indexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
@@ -17,9 +17,9 @@ TITLE_END = '</title>\n'
 
 def index_wiki():
     nlp = English()
-    indexer = Indexer(INDEX_PATH)
-    titles_and_urls = get_wiki_titles_and_urls()
-    index_titles_and_urls(indexer, nlp, titles_and_urls)
+    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_and_urls = get_wiki_titles_and_urls()
+        index_titles_and_urls(indexer, nlp, titles_and_urls)
 
 
 def get_wiki_titles_and_urls():
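Taken together, the write path is now driven through the context manager, as wiki.py and performance.py show above. A minimal end-to-end sketch under the same assumptions (TEST_INDEX_PATH comes from paths.py; the tokens below are hand-picked for illustration rather than produced by the spaCy tokenizer):

from index import NUM_PAGES, PAGE_SIZE, TinyIndexer, TokenizedDocument
from paths import TEST_INDEX_PATH

documents = [
    TokenizedDocument(url='https://en.wikipedia.org/wiki/Python_(programming_language)',
                      title='Python (programming language)',
                      tokens=['python', 'programming', 'language']),
]

# __enter__ creates the zero-filled index file if it does not exist and mmaps it;
# __exit__ closes the mmap and the underlying file.
with TinyIndexer(TEST_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
    indexer.index(documents)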