Add a script to evaluate how much it costs to add the term to the index
Old sizes mean 33.3673 0.08148019988498635
New sizes mean 32.1322 0.07700185221489449
parent 8790d758a3
commit a2b872008f
4 changed files with 66 additions and 3 deletions
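
The second figure after each mean is the standard error of the mean (the script below prints np.mean followed by scipy.stats.sem). A quick check on the reported means, reproduced here for convenience, puts the cost of storing the term at roughly 3.7% fewer documents per page:

old_mean, new_mean = 33.3673, 32.1322
reduction = (old_mean - new_mean) / old_mean
print(f"{reduction:.1%}")  # -> 3.7%, the relative drop in documents per page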
analyse/add_term_info.py (new file, 60 lines)
@@ -0,0 +1,60 @@
"""
Investigate adding term information to the database.

How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random

import numpy as np
from scipy.stats import sem

from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple

from zstandard import ZstdCompressor

random = Random(1)

INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"


def add_term_info(document: Document, index: TinyIndex, page_index: int):
    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
    for token in tokenized.tokens:
        token_page_index = index.get_key_page_index(token)
        if token_page_index == page_index:
            return Document(document.title, document.url, document.extract, document.score, token)
    raise ValueError("Could not find token in page index")


def run():
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # Get some random integers between 0 and index.num_pages:
        pages = random.sample(range(index.num_pages), 10000)

        old_sizes = []
        new_sizes = []

        for i in pages:
            page = index.get_page(i)
            term_documents = []
            for document in page:
                term_document = add_term_info(document, index, i)
                term_documents.append(term_document)

            value_tuples = [astuple(value) for value in term_documents]
            num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)

            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

        print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
        print("New sizes mean", np.mean(new_sizes), sem(new_sizes))


if __name__ == '__main__':
    run()
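
Note that add_term_info above constructs Document with a fifth positional argument (the matched token), so the Document type imported from mwmbl.tinysearchengine.indexer is assumed to accept an optional term field. A minimal sketch of what such a record could look like, purely as an assumption about field order and default, not the repository's actual definition:

from dataclasses import dataclass
from typing import Optional


@dataclass
class Document:
    title: str
    url: str
    extract: str
    score: float
    term: Optional[str] = None  # assumed extra field: the index term under which this entry is stored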
@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
 def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         score = link_counts.get(url, DEFAULT_SCORE)
-        yield tokenize_document(url, title_cleaned, extract, score, nlp)
+        yield tokenize_document(url, title_cleaned, extract, score)

         if i % 1000 == 0:
             print("Processed", i)

@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
     return set(first_tokens + bigrams)


-def tokenize_document(url, title_cleaned, extract, score, nlp):
+def tokenize_document(url, title_cleaned, extract, score):
     title_tokens = tokenize(title_cleaned)
     prepared_url = prepare_url_for_tokenizing(unquote(url))
     url_tokens = tokenize(prepared_url)

@@ -71,7 +71,7 @@ def preprocess_documents(documents, index_path, nlp):
     page_documents = defaultdict(list)
     with TinyIndex(Document, index_path, 'w') as indexer:
         for document in documents:
-            tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
+            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
             # logger.debug(f"Tokenized: {tokenized}")
             page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
             for page in page_indexes:

@@ -79,6 +79,7 @@ class TinyIndexMetadata:
         values = json.loads(data[constant_length:].decode('utf8'))
         return TinyIndexMetadata(**values)


 # Find the optimal amount of data that fits onto a page
 # We do this by leveraging binary search to quickly find the index where:
 # - index+1 cannot fit onto a page

@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
         # No better match, use our index
         return mid, compressed_data


 def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
     # Find max number of items that fit on a page
     return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))


 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
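
The comments in the hunk above describe the page-packing strategy: binary search for the largest number of items whose compressed serialisation still fits within the page size. A minimal self-contained sketch of that idea, assuming JSON serialisation and an iterative search (the function name and details differ from the repository's _binary_search_fitting_size):

import json
from zstandard import ZstdCompressor


def max_items_fitting(compressor: ZstdCompressor, page_size: int, items: list) -> tuple[int, bytes]:
    # Binary search for the largest prefix of items whose compressed form fits in page_size.
    low, high = 0, len(items)
    best_count, best_data = 0, compressor.compress(json.dumps([]).encode("utf8"))
    while low <= high:
        mid = (low + high) // 2
        data = compressor.compress(json.dumps(items[:mid]).encode("utf8"))
        if len(data) <= page_size:
            best_count, best_data = mid, data  # mid items fit; try to fit more
            low = mid + 1
        else:
            high = mid - 1  # too big for the page; shrink the search range
    return best_count, best_data


# Example: how many short strings fit in a 4096-byte page
count, _ = max_items_fitting(ZstdCompressor(), 4096, [f"item-{i}" for i in range(10_000)])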