2023-11-16 17:42:18 +00:00
|
|
|
"""
|
|
|
|
Investigate adding term information to the database.
|
|
|
|
|
|
|
|
How much extra space will it take?
|
|
|
|
"""
|
|
|
|
import os
|
|
|
|
from pathlib import Path
|
|
|
|
from random import Random
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
from scipy.stats import sem
|
|
|
|
|
|
|
|
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
|
|
|
|
|
|
|
|
from zstandard import ZstdCompressor
|
|
|
|
|
2023-11-18 18:49:41 +00:00
|
|
|
from mwmbl.utils import add_term_info
|
2023-11-16 17:42:18 +00:00
|
|
|
|
2023-11-18 18:49:41 +00:00
|
|
|
random = Random(1)
|
2023-11-16 17:42:18 +00:00
|
|
|
|
2023-11-18 18:49:41 +00:00
|
|
|
INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
|
2023-11-16 17:42:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
def run():
|
|
|
|
compressor = ZstdCompressor()
|
|
|
|
with TinyIndex(Document, INDEX_PATH) as index:
|
|
|
|
# Get some random integers between 0 and index.num_pages:
|
|
|
|
pages = random.sample(range(index.num_pages), 10000)
|
|
|
|
|
|
|
|
old_sizes = []
|
|
|
|
new_sizes = []
|
|
|
|
|
|
|
|
for i in pages:
|
|
|
|
page = index.get_page(i)
|
|
|
|
term_documents = []
|
|
|
|
for document in page:
|
|
|
|
term_document = add_term_info(document, index, i)
|
|
|
|
term_documents.append(term_document)
|
|
|
|
|
|
|
|
value_tuples = [astuple(value) for value in term_documents]
|
|
|
|
num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)
|
|
|
|
|
|
|
|
new_sizes.append(num_fitting)
|
|
|
|
old_sizes.append(len(page))
|
|
|
|
|
|
|
|
print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
|
|
|
|
print("New sizes mean", np.mean(new_sizes), sem(new_sizes))
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
run()
|