Index using compression

Daoud Clarke, 4 years ago
parent
commit acc2a9194e
4 changed files with 124 additions and 95 deletions
  1. index.py: +103 -74
  2. paths.py: +2 -2
  3. performance.py: +15 -15
  4. wiki.py: +4 -4
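The gist of the change: instead of SQLite tables (pages and terms), the index becomes a single fixed-size file of NUM_PAGES pages, PAGE_SIZE bytes each. A token's postings are JSON-serialised, zstd-compressed and stored on the page selected by hashing the token with mmh3; pages that would overflow are skipped. A minimal, self-contained sketch of that write/read round trip (illustrative only, using the constants and libraries this commit introduces, not the committed code):

import json

import mmh3
from zstandard import ZstdCompressor, ZstdDecompressor

NUM_PAGES = 8192   # number of fixed-size pages in the index file
PAGE_SIZE = 512    # bytes per page

# Choose the page for a token: unsigned murmur3 hash modulo the page count.
token = 'search'
page_index = mmh3.hash(token, signed=False) % NUM_PAGES

# Postings are [title, url] pairs; serialise, compress and pad to the page size.
postings = [['Example title', 'https://example.com/']]
compressed = ZstdCompressor().compress(json.dumps(postings).encode('utf8'))
assert len(compressed) <= PAGE_SIZE, 'postings too large for one page'
page = compressed + b'\x00' * (PAGE_SIZE - len(compressed))

# Reading back: decompress and parse the JSON (the committed _get_page hands the
# whole padded page to the decompressor and treats ZstdError as an empty page).
recovered = json.loads(ZstdDecompressor().decompress(page[:len(compressed)]).decode('utf8'))
assert recovered == postings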

index.py: +103 -74

@@ -2,24 +2,32 @@
 Create a search index
 """
 import gzip
+import json
+import os
 import sqlite3
 from dataclasses import dataclass
 from glob import glob
 from itertools import chain, count, islice
+from mmap import mmap
 from typing import List, Iterator
 from urllib.parse import unquote
 
 import bs4
 import justext
+import mmh3
 from spacy.lang.en import English
+from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 from paths import CRAWL_GLOB, INDEX_PATH
 
+NUM_PAGES = 8192
+PAGE_SIZE = 512
+
 NUM_INITIAL_TOKENS = 50
 
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
-BATCH_SIZE = 10000
+BATCH_SIZE = 100
 
 
 def is_content_token(nlp, token):
@@ -43,91 +51,112 @@ def clean(content):
 
 
 @dataclass
-class Page:
-    tokens: List[str]
+class Document:
     url: str
     title: str
 
 
-class Indexer:
-    def __init__(self, index_path):
+@dataclass
+class TokenizedDocument(Document):
+    tokens: List[str]
+
+
+class TinyIndexBase:
+    def __init__(self, num_pages, page_size):
+        self.num_pages = num_pages
+        self.page_size = page_size
+        self.decompressor = ZstdDecompressor()
+        self.mmap = None
+
+    def _get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
+            return None
+        return json.loads(decompressed_data.decode('utf8'))
+
+
+class TinyIndex(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
         self.index_path = index_path
+        self.index_file = None
+        self.mmap = None
 
-    def index(self, pages: List[Page]):
-        with sqlite3.connect(self.index_path) as con:
-            cursor = con.execute("""
-                    SELECT max(id) FROM pages
-                """)
-            current_id = cursor.fetchone()[0]
-            if current_id is None:
-                first_page_id = 1
-            else:
-                first_page_id = current_id + 1
-
-            page_ids = range(first_page_id, first_page_id + len(pages))
-            urls_titles_ids = ((page.url, page.title, page_id)
-                               for page, page_id in zip(pages, page_ids))
-            con.executemany("""
-                INSERT INTO pages (url, title, id)
-                VALUES (?, ?, ?)
-            """, urls_titles_ids)
-
-            tokens = chain(*([(term, page_id) for term in page.tokens]
-                             for page, page_id in zip(pages, page_ids)))
-            con.executemany("""
-                INSERT INTO terms (term, page_id)
-                VALUES (?, ?)
-            """, tokens)
+
+class TinyIndexer(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
+        self.index_path = index_path
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
+
+    def __enter__(self):
+        self.create_if_not_exists()
+        self.index_file = open(self.index_path, 'r+b')
+        self.mmap = mmap(self.index_file.fileno(), 0)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()
+
+    def index(self, documents: List[TokenizedDocument]):
+        for document in documents:
+            for token in document.tokens:
+                self._index_document(document, token)
+
+    def _index_document(self, document: Document, token: str):
+        page_index = self._get_token_page_index(token)
+        current_page = self._get_page(page_index)
+        if current_page is None:
+            current_page = []
+        current_page.append([document.title, document.url])
+        try:
+            self._write_page(current_page, page_index)
+        except ValueError:
+            pass
+
+    def _get_token_page_index(self, token):
+        token_hash = mmh3.hash(token, signed=False)
+        return token_hash % self.num_pages
+
+    def _write_page(self, data, i):
+        """
+        Serialise the data using JSON, compress it and store it at index i.
+        If the data is too big, it will raise a ValueError and not store anything
+        """
+        serialised_data = json.dumps(data)
+        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
+        page_length = len(compressed_data)
+        if page_length > self.page_size:
+            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
+        padding = b'\x00' * (self.page_size - page_length)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
 
     def create_if_not_exists(self):
-        con = sqlite3.connect(self.index_path)
-        con.execute("""
-        CREATE TABLE IF NOT EXISTS pages (
-          id INTEGER PRIMARY KEY,
-          url TEXT UNIQUE,
-          title TEXT
-        )
-        """)
-
-        con.execute("""
-        CREATE TABLE IF NOT EXISTS terms (
-          term TEXT,
-          page_id INTEGER 
-        )
-        """)
-
-        con.execute("""
-        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-        """)
-
-    def page_indexed(self, url):
-        con = sqlite3.connect(self.index_path)
-        result = con.execute("""
-            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-        """, (url,))
-        value = result.fetchone()[0]
-        return value == 1
+        if not os.path.isfile(self.index_path):
+            file_length = self.num_pages * self.page_size
+            with open(self.index_path, 'wb') as index_file:
+                index_file.write(b'\x00' * file_length)
+
+    def document_indexed(self, url):
+        raise NotImplementedError()
 
     def get_num_tokens(self):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT count(*) from terms
-        """)
-        num_terms = cursor.fetchone()[0]
-        return num_terms
+        raise NotImplementedError()
 
     def get_random_terms(self, n):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT DISTINCT term FROM terms
-            ORDER BY random() LIMIT ?
-        """)
-        terms = [t[0] for t in cursor.fetchall()]
-        return terms
+        raise NotImplementedError()
 
 
 def run():
-    indexer = Indexer(INDEX_PATH)
+    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
@@ -136,7 +165,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()
 
-        if indexer.page_indexed(url):
+        if indexer.document_indexed(url):
             print("Page exists, skipping", url)
             continue
 
@@ -169,7 +198,7 @@ def get_pages(nlp, titles_and_urls):
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
         tokens = title_tokens | url_tokens
-        yield Page(list(tokens), url, title_cleaned)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -183,7 +212,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk
 
 
-def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
+def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
     indexer.create_if_not_exists()
 
     pages = get_pages(nlp, titles_and_urls)
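To complement the write path above, here is a hypothetical read-side helper (not part of this commit: TinyIndex as added here does not yet open the index file) showing how a query could map a token to its page and recover the stored [title, url] pairs, reusing the same hashing and decompression as TinyIndexBase:

import json
from mmap import ACCESS_READ, mmap

import mmh3
from zstandard import ZstdDecompressor, ZstdError


def lookup(index_path, token, num_pages=8192, page_size=512):
    # Same page selection as TinyIndexer._get_token_page_index.
    page_index = mmh3.hash(token, signed=False) % num_pages
    with open(index_path, 'rb') as index_file:
        with mmap(index_file.fileno(), 0, access=ACCESS_READ) as index_mmap:
            page_data = index_mmap[page_index * page_size:(page_index + 1) * page_size]
    try:
        # As in TinyIndexBase._get_page: decompress the whole padded page,
        # treating ZstdError as "nothing indexed on this page yet".
        decompressed = ZstdDecompressor().decompress(page_data)
    except ZstdError:
        return []
    return json.loads(decompressed.decode('utf8'))

The trade-off is visible in _index_document: every lookup costs one hash and one page read, but colliding tokens share a page, and postings that no longer fit after compression are dropped (the except ValueError: pass).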

paths.py: +2 -2

@@ -5,7 +5,7 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
-INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
-TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.sqlite3')
+INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
+TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

performance.py: +15 -15

@@ -10,12 +10,12 @@ from spacy.lang.en import English
 from starlette.testclient import TestClient
 
 from app import app, complete
-from index import Indexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
 from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls
 
 
-NUM_PAGES = 500
+NUM_DOCUMENTS = 10000
 
 
 def query_test():
@@ -25,7 +25,7 @@ def query_test():
 
     start = datetime.now()
     hits = 0
-    for title, url in islice(titles_and_urls, NUM_PAGES):
+    for title, url in islice(titles_and_urls, NUM_DOCUMENTS):
         result = client.get('/complete', params={'q': title})
         assert result.status_code == 200
         data = result.content.decode('utf8')
@@ -36,7 +36,7 @@ def query_test():
 
     end = datetime.now()
     print("Hits:", hits)
-    print("Query time:", (end - start).total_seconds()/NUM_PAGES)
+    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
 
 
 def performance_test():
@@ -45,23 +45,23 @@ def performance_test():
         os.remove(TEST_INDEX_PATH)
     except FileNotFoundError:
         print("No test index found, creating")
-    indexer = Indexer(TEST_INDEX_PATH)
-    titles_and_urls = get_wiki_titles_and_urls()
-    titles_and_urls_slice = islice(titles_and_urls, NUM_PAGES)
+    with TinyIndexer(TEST_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_and_urls = get_wiki_titles_and_urls()
+        titles_and_urls_slice = islice(titles_and_urls, NUM_DOCUMENTS)
 
-    start_time = datetime.now()
-    index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
-    stop_time = datetime.now()
+        start_time = datetime.now()
+        index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
+        stop_time = datetime.now()
 
-    index_time = (stop_time - start_time).total_seconds()
-    index_size = os.path.getsize(TEST_INDEX_PATH)
+        index_time = (stop_time - start_time).total_seconds()
+        index_size = os.path.getsize(TEST_INDEX_PATH)
 
-    print("Indexed pages:", NUM_PAGES)
+    print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
     print("Index size", index_size)
-    print("Num tokens", indexer.get_num_tokens())
+    # print("Num tokens", indexer.get_num_tokens())
 
-    query_test()
+    # query_test()
 
 
 if __name__ == '__main__':

wiki.py: +4 -4

@@ -7,7 +7,7 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import Indexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
@@ -17,9 +17,9 @@ TITLE_END = '</title>\n'
 
 def index_wiki():
     nlp = English()
-    indexer = Indexer(INDEX_PATH)
-    titles_and_urls = get_wiki_titles_and_urls()
-    index_titles_and_urls(indexer, nlp, titles_and_urls)
+    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_and_urls = get_wiki_titles_and_urls()
+        index_titles_and_urls(indexer, nlp, titles_and_urls)
 
 
 def get_wiki_titles_and_urls():
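Taken together, the write path is now driven through the context manager, as wiki.py and performance.py show above. A minimal end-to-end sketch under the same assumptions (TEST_INDEX_PATH comes from paths.py; the tokens below are hand-picked for illustration rather than produced by the spaCy tokenizer):

from index import NUM_PAGES, PAGE_SIZE, TinyIndexer, TokenizedDocument
from paths import TEST_INDEX_PATH

documents = [
    TokenizedDocument(url='https://en.wikipedia.org/wiki/Python_(programming_language)',
                      title='Python (programming language)',
                      tokens=['python', 'programming', 'language']),
]

# __enter__ creates the zero-filled index file if it does not exist and mmaps it;
# __exit__ closes the mmap and the underlying file.
with TinyIndexer(TEST_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
    indexer.index(documents)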