
Index using compression

Daoud Clarke, 4 years ago
Commit acc2a9194e
4 changed files with 124 additions and 95 deletions
  1. index.py (+103 -74)
  2. paths.py (+2 -2)
  3. performance.py (+15 -15)
  4. wiki.py (+4 -4)
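
In this commit the SQLite-backed index is replaced by a single fixed-size file of NUM_PAGES * PAGE_SIZE = 8192 * 512 bytes (4 MiB), memory-mapped and divided into 512-byte pages. Each token is hashed with mmh3 to select a page, and each page stores a zstd-compressed JSON list of [title, url] pairs. A minimal sketch of the page addressing used in index.py below (the page_span_for helper is illustrative, not part of the commit):

    import mmh3

    NUM_PAGES = 8192   # values from index.py below
    PAGE_SIZE = 512

    def page_span_for(token):
        # Same scheme as TinyIndexer._get_token_page_index and _get_page in the diff:
        # hash the token, take it modulo the page count, and that page's byte range
        # is the slice of the mmap that holds its compressed JSON payload.
        page_index = mmh3.hash(token, signed=False) % NUM_PAGES
        start = page_index * PAGE_SIZE
        return page_index, (start, start + PAGE_SIZE)

    print(page_span_for('search'))  # -> (page number, (byte offset, byte offset + 512))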

+ 103 - 74
index.py

@@ -2,24 +2,32 @@
 Create a search index
 """
 import gzip
+import json
+import os
 import sqlite3
 from dataclasses import dataclass
 from glob import glob
 from itertools import chain, count, islice
+from mmap import mmap
 from typing import List, Iterator
 from urllib.parse import unquote
 
 import bs4
 import justext
+import mmh3
 from spacy.lang.en import English
+from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 from paths import CRAWL_GLOB, INDEX_PATH
 
+NUM_PAGES = 8192
+PAGE_SIZE = 512
+
 NUM_INITIAL_TOKENS = 50
 
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
-BATCH_SIZE = 10000
+BATCH_SIZE = 100
 
 
 def is_content_token(nlp, token):
@@ -43,91 +51,112 @@ def clean(content):
 
 
 @dataclass
-class Page:
-    tokens: List[str]
+class Document:
     url: str
     title: str
 
 
-class Indexer:
-    def __init__(self, index_path):
+@dataclass
+class TokenizedDocument(Document):
+    tokens: List[str]
+
+
+class TinyIndexBase:
+    def __init__(self, num_pages, page_size):
+        self.num_pages = num_pages
+        self.page_size = page_size
+        self.decompressor = ZstdDecompressor()
+        self.mmap = None
+
+    def _get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
+            return None
+        return json.loads(decompressed_data.decode('utf8'))
+
+
+class TinyIndex(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
         self.index_path = index_path
+        self.index_file = None
+        self.mmap = None
 
-    def index(self, pages: List[Page]):
-        with sqlite3.connect(self.index_path) as con:
-            cursor = con.execute("""
-                    SELECT max(id) FROM pages
-                """)
-            current_id = cursor.fetchone()[0]
-            if current_id is None:
-                first_page_id = 1
-            else:
-                first_page_id = current_id + 1
-
-            page_ids = range(first_page_id, first_page_id + len(pages))
-            urls_titles_ids = ((page.url, page.title, page_id)
-                               for page, page_id in zip(pages, page_ids))
-            con.executemany("""
-                INSERT INTO pages (url, title, id)
-                VALUES (?, ?, ?)
-            """, urls_titles_ids)
-
-            tokens = chain(*([(term, page_id) for term in page.tokens]
-                             for page, page_id in zip(pages, page_ids)))
-            con.executemany("""
-                INSERT INTO terms (term, page_id)
-                VALUES (?, ?)
-            """, tokens)
+
+class TinyIndexer(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
+        self.index_path = index_path
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
+
+    def __enter__(self):
+        self.create_if_not_exists()
+        self.index_file = open(self.index_path, 'r+b')
+        self.mmap = mmap(self.index_file.fileno(), 0)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()
+
+    def index(self, documents: List[TokenizedDocument]):
+        for document in documents:
+            for token in document.tokens:
+                self._index_document(document, token)
+
+    def _index_document(self, document: Document, token: str):
+        page_index = self._get_token_page_index(token)
+        current_page = self._get_page(page_index)
+        if current_page is None:
+            current_page = []
+        current_page.append([document.title, document.url])
+        try:
+            self._write_page(current_page, page_index)
+        except ValueError:
+            pass
+
+    def _get_token_page_index(self, token):
+        token_hash = mmh3.hash(token, signed=False)
+        return token_hash % self.num_pages
+
+    def _write_page(self, data, i):
+        """
+        Serialise the data using JSON, compress it and store it at index i.
+        If the data is too big, it will raise a ValueError and not store anything
+        """
+        serialised_data = json.dumps(data)
+        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
+        page_length = len(compressed_data)
+        if page_length > self.page_size:
+            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
+        padding = b'\x00' * (self.page_size - page_length)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
 
     def create_if_not_exists(self):
-        con = sqlite3.connect(self.index_path)
-        con.execute("""
-        CREATE TABLE IF NOT EXISTS pages (
-          id INTEGER PRIMARY KEY,
-          url TEXT UNIQUE,
-          title TEXT
-        )
-        """)
-
-        con.execute("""
-        CREATE TABLE IF NOT EXISTS terms (
-          term TEXT,
-          page_id INTEGER 
-        )
-        """)
-
-        con.execute("""
-        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-        """)
-
-    def page_indexed(self, url):
-        con = sqlite3.connect(self.index_path)
-        result = con.execute("""
-            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-        """, (url,))
-        value = result.fetchone()[0]
-        return value == 1
+        if not os.path.isfile(self.index_path):
+            file_length = self.num_pages * self.page_size
+            with open(self.index_path, 'wb') as index_file:
+                index_file.write(b'\x00' * file_length)
+
+    def document_indexed(self, url):
+        raise NotImplementedError()
 
     def get_num_tokens(self):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT count(*) from terms
-        """)
-        num_terms = cursor.fetchone()[0]
-        return num_terms
+        raise NotImplementedError()
 
     def get_random_terms(self, n):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT DISTINCT term FROM terms
-            ORDER BY random() LIMIT ?
-        """)
-        terms = [t[0] for t in cursor.fetchall()]
-        return terms
+        raise NotImplementedError()
 
 
 def run():
-    indexer = Indexer(INDEX_PATH)
+    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
@@ -136,7 +165,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()
 
-        if indexer.page_indexed(url):
+        if indexer.document_indexed(url):
             print("Page exists, skipping", url)
             continue
 
@@ -169,7 +198,7 @@ def get_pages(nlp, titles_and_urls):
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
         tokens = title_tokens | url_tokens
-        yield Page(list(tokens), url, title_cleaned)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -183,7 +212,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk
 
 
-def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
+def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
     indexer.create_if_not_exists()
 
     pages = get_pages(nlp, titles_and_urls)
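
This commit only adds the write side: document_indexed, get_num_tokens and get_random_terms now raise NotImplementedError, and TinyIndex has no retrieval method yet. As a rough sketch of how a reader could use the same page layout, a subclass might look like the following (TinyIndexReader, its __enter__/__exit__ and retrieve are assumptions for illustration, not part of the commit):

    from mmap import mmap

    import mmh3

    from index import TinyIndex, NUM_PAGES, PAGE_SIZE
    from paths import INDEX_PATH


    class TinyIndexReader(TinyIndex):
        def __enter__(self):
            # Mirror TinyIndexer: open the index file and map it into memory
            self.index_file = open(self.index_path, 'r+b')
            self.mmap = mmap(self.index_file.fileno(), 0)
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.mmap.close()
            self.index_file.close()

        def retrieve(self, token: str):
            # Hash the token to a page and decompress it via TinyIndexBase._get_page,
            # which returns None for an empty (all-zero) page
            page_index = mmh3.hash(token, signed=False) % self.num_pages
            items = self._get_page(page_index)
            return items or []


    with TinyIndexReader(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as index:
        print(index.retrieve('wikipedia'))  # e.g. [["Some title", "https://..."], ...]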

+ 2 - 2
paths.py

@@ -5,7 +5,7 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
-INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
-TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.sqlite3')
+INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
+TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

+ 15 - 15
performance.py

@@ -10,12 +10,12 @@ from spacy.lang.en import English
 from starlette.testclient import TestClient
 
 from app import app, complete
-from index import Indexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
 from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls
 
 
-NUM_PAGES = 500
+NUM_DOCUMENTS = 10000
 
 
 def query_test():
@@ -25,7 +25,7 @@ def query_test():
 
     start = datetime.now()
     hits = 0
-    for title, url in islice(titles_and_urls, NUM_PAGES):
+    for title, url in islice(titles_and_urls, NUM_DOCUMENTS):
         result = client.get('/complete', params={'q': title})
         assert result.status_code == 200
         data = result.content.decode('utf8')
@@ -36,7 +36,7 @@ def query_test():
 
     end = datetime.now()
     print("Hits:", hits)
-    print("Query time:", (end - start).total_seconds()/NUM_PAGES)
+    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
 
 
 def performance_test():
@@ -45,23 +45,23 @@ def performance_test():
         os.remove(TEST_INDEX_PATH)
     except FileNotFoundError:
         print("No test index found, creating")
-    indexer = Indexer(TEST_INDEX_PATH)
-    titles_and_urls = get_wiki_titles_and_urls()
-    titles_and_urls_slice = islice(titles_and_urls, NUM_PAGES)
+    with TinyIndexer(TEST_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_and_urls = get_wiki_titles_and_urls()
+        titles_and_urls_slice = islice(titles_and_urls, NUM_DOCUMENTS)
 
-    start_time = datetime.now()
-    index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
-    stop_time = datetime.now()
+        start_time = datetime.now()
+        index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
+        stop_time = datetime.now()
 
-    index_time = (stop_time - start_time).total_seconds()
-    index_size = os.path.getsize(TEST_INDEX_PATH)
+        index_time = (stop_time - start_time).total_seconds()
+        index_size = os.path.getsize(TEST_INDEX_PATH)
 
-    print("Indexed pages:", NUM_PAGES)
+    print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
     print("Index size", index_size)
-    print("Num tokens", indexer.get_num_tokens())
+    # print("Num tokens", indexer.get_num_tokens())
 
-    query_test()
+    # query_test()
 
 
 if __name__ == '__main__':

+ 4 - 4
wiki.py

@@ -7,7 +7,7 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import Indexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
@@ -17,9 +17,9 @@ TITLE_END = '</title>\n'
 
 def index_wiki():
     nlp = English()
-    indexer = Indexer(INDEX_PATH)
-    titles_and_urls = get_wiki_titles_and_urls()
-    index_titles_and_urls(indexer, nlp, titles_and_urls)
+    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_and_urls = get_wiki_titles_and_urls()
+        index_titles_and_urls(indexer, nlp, titles_and_urls)
 
 
 def get_wiki_titles_and_urls():