# mwmbl/analyse/add_term_info.py

"""
Investigate adding term information to the database.

How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random

import numpy as np
from scipy.stats import sem

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple

from zstandard import ZstdCompressor

from mwmbl.utils import add_term_info

# Fixed seed so the same pages are sampled on every run.
# NOTE: this instance shadows the stdlib ``random`` module name in this file.
random = Random(1)

# Index file inside "devdata", under the parent of this file's directory.
INDEX_PATH = Path(__file__).parent.parent.joinpath("devdata", "index-v2.tinysearch")


def run(num_samples=10000):
    """
    Compare per-page item counts before and after adding term info.

    Samples pages from the index at ``INDEX_PATH``. For each sampled page,
    every document is passed through ``add_term_info`` and the results are
    handed to ``_trim_items_to_page`` together with the index's page size.
    The first value it returns is recorded as the "new" size, and the number
    of documents currently on the page as the "old" size. The mean and the
    standard error of the mean of both series are printed.

    Args:
        num_samples: Maximum number of distinct pages to sample. The value is
            clamped to ``index.num_pages`` because ``random.sample`` raises
            ``ValueError`` when asked for more items than the population
            contains (e.g. with a small development index).
    """
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # Never request more pages than the index actually has.
        sample_size = min(num_samples, index.num_pages)
        pages = random.sample(range(index.num_pages), sample_size)

        old_sizes = []
        new_sizes = []

        for i in pages:
            page = index.get_page(i)
            term_documents = [add_term_info(document, index, i) for document in page]

            value_tuples = [astuple(value) for value in term_documents]
            # Only the count of fitting items is needed; the compressed
            # payload returned alongside it is discarded.
            num_fitting, _ = _trim_items_to_page(compressor, index.page_size, value_tuples)

            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

        print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
        print("New sizes mean", np.mean(new_sizes), sem(new_sizes))


# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    run()
Add a script to evaluate how much it costs to add the term to the index Old sizes mean 33.3673 0.08148019988498635 New sizes mean 32.1322 0.07700185221489449 2023-11-16 17:42:18 +00:00			`"""`
			`Investigate adding term information to the database.`

			`How much extra space will it take?`
			`"""`
			`import os`
			`from pathlib import Path`
			`from random import Random`

			`import numpy as np`
			`from scipy.stats import sem`

			`from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple`

			`from zstandard import ZstdCompressor`

Add term info to index 2023-11-18 18:49:41 +00:00			`from mwmbl.utils import add_term_info`
Add a script to evaluate how much it costs to add the term to the index Old sizes mean 33.3673 0.08148019988498635 New sizes mean 32.1322 0.07700185221489449 2023-11-16 17:42:18 +00:00
Add term info to index 2023-11-18 18:49:41 +00:00			`random = Random(1)`
Add a script to evaluate how much it costs to add the term to the index Old sizes mean 33.3673 0.08148019988498635 New sizes mean 32.1322 0.07700185221489449 2023-11-16 17:42:18 +00:00
Add term info to index 2023-11-18 18:49:41 +00:00			`INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"`
Add a script to evaluate how much it costs to add the term to the index Old sizes mean 33.3673 0.08148019988498635 New sizes mean 32.1322 0.07700185221489449 2023-11-16 17:42:18 +00:00

			`def run():`
			`compressor = ZstdCompressor()`
			`with TinyIndex(Document, INDEX_PATH) as index:`
			`# Get some random integers between 0 and index.num_pages:`
			`pages = random.sample(range(index.num_pages), 10000)`

			`old_sizes = []`
			`new_sizes = []`

			`for i in pages:`
			`page = index.get_page(i)`
			`term_documents = []`
			`for document in page:`
			`term_document = add_term_info(document, index, i)`
			`term_documents.append(term_document)`

			`value_tuples = [astuple(value) for value in term_documents]`
			`num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)`

			`new_sizes.append(num_fitting)`
			`old_sizes.append(len(page))`

			`print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))`
			`print("New sizes mean", np.mean(new_sizes), sem(new_sizes))`


			`if __name__ == '__main__':`
			`run()`