From a2b872008f58daea836cfc4ce8a5cfc1bbf2c282 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Thu, 16 Nov 2023 17:42:18 +0000
Subject: [PATCH] Add a script to evaluate how much it costs to add the term
 to the index

Old sizes mean 33.3673 0.08148019988498635
New sizes mean 32.1322 0.07700185221489449
---
 analyse/add_term_info.py          | 60 +++++++++++++++++++++++++++++++
 mwmbl/indexer/index.py            |  4 +--
 mwmbl/indexer/index_batches.py    |  2 +-
 mwmbl/tinysearchengine/indexer.py |  3 ++
 4 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 analyse/add_term_info.py

diff --git a/analyse/add_term_info.py b/analyse/add_term_info.py
new file mode 100644
index 0000000..22864ab
--- /dev/null
+++ b/analyse/add_term_info.py
@@ -0,0 +1,60 @@
+"""
+Investigate adding term information to the database.
+
+How much extra space will it take?
+"""
+import os
+from pathlib import Path
+from random import Random
+
+import numpy as np
+from scipy.stats import sem
+
+from mwmbl.indexer.index import tokenize_document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
+
+from zstandard import ZstdCompressor
+
+random = Random(1)
+
+INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
+# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
+
+
+def add_term_info(document: Document, index: TinyIndex, page_index: int):
+    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
+    for token in tokenized.tokens:
+        token_page_index = index.get_key_page_index(token)
+        if token_page_index == page_index:
+            return Document(document.title, document.url, document.extract, document.score, token)
+    raise ValueError("Could not find token in page index")
+
+
+def run():
+    compressor = ZstdCompressor()
+    with TinyIndex(Document, INDEX_PATH) as index:
+        # Get some random integers between 0 and index.num_pages:
+        pages = random.sample(range(index.num_pages), 10000)
+
+        old_sizes = []
+        new_sizes = []
+
+        for i in pages:
+            page = index.get_page(i)
+            term_documents = []
+            for document in page:
+                term_document = add_term_info(document, index, i)
+                term_documents.append(term_document)
+
+            value_tuples = [astuple(value) for value in term_documents]
+            num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)
+
+            new_sizes.append(num_fitting)
+            old_sizes.append(len(page))
+
+        print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
+        print("New sizes mean", np.mean(new_sizes), sem(new_sizes))
+
+
+if __name__ == '__main__':
+    run()
diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py
index fb61405..ce94c5c 100644
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
 def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         score = link_counts.get(url, DEFAULT_SCORE)
-        yield tokenize_document(url, title_cleaned, extract, score, nlp)
+        yield tokenize_document(url, title_cleaned, extract, score)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
     return set(first_tokens + bigrams)
 
 
-def tokenize_document(url, title_cleaned, extract, score, nlp):
+def tokenize_document(url, title_cleaned, extract, score):
     title_tokens = tokenize(title_cleaned)
     prepared_url = prepare_url_for_tokenizing(unquote(url))
     url_tokens = tokenize(prepared_url)
diff --git a/mwmbl/indexer/index_batches.py b/mwmbl/indexer/index_batches.py
index a6e0488..926ed6e 100644
--- a/mwmbl/indexer/index_batches.py
+++ b/mwmbl/indexer/index_batches.py
@@ -71,7 +71,7 @@ def preprocess_documents(documents, index_path, nlp):
     page_documents = defaultdict(list)
     with TinyIndex(Document, index_path, 'w') as indexer:
         for document in documents:
-            tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
+            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
             # logger.debug(f"Tokenized: {tokenized}")
             page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
             for page in page_indexes:
diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index ffe508b..c8edba3 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -79,6 +79,7 @@ class TinyIndexMetadata:
         values = json.loads(data[constant_length:].decode('utf8'))
         return TinyIndexMetadata(**values)
 
+
 # Find the optimal amount of data that fits onto a page
 # We do this by leveraging binary search to quickly find the index where:
 # - index+1 cannot fit onto a page
@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
             # No better match, use our index
             return mid, compressed_data
 
+
 def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
     # Find max number of items that fit on a page
     return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
 
+
 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
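
For context on the figures in the commit message: the script prints the mean
number of documents that fit on a page, followed by the standard error, over
the 10,000 sampled pages, first as the pages are stored today and then with
the matching term added to each document. A rough sketch of the implied cost,
using only the two printed means (variable names here are illustrative, not
from the codebase, and the result assumes the sampled pages are
representative of the whole index):

    # Back-of-the-envelope cost estimate from the means printed by
    # analyse/add_term_info.py above.
    old_mean = 33.3673  # mean documents per page without the term field
    new_mean = 32.1322  # mean documents per page with the term stored as well
    relative_cost = (old_mean - new_mean) / old_mean
    print(f"Storing the term costs roughly {relative_cost:.1%} of page capacity")
    # Prints roughly 3.7%.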