Add a script to evaluate how much it costs to add the term to the index

Old sizes mean 33.3673 0.08148019988498635
New sizes mean 32.1322 0.07700185221489449
Daoud Clarke 2023-11-16 17:42:18 +00:00
parent 8790d758a3
commit a2b872008f
4 changed files with 66 additions and 3 deletions
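
The two numbers on each line are the mean number of documents per page over the 10,000 sampled pages and the standard error of that mean (scipy.stats.sem), as printed by the new script. Storing the matching term alongside each document therefore costs roughly (33.3673 - 32.1322) / 33.3673 ≈ 3.7% of the documents that currently fit on a page.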

analyse/add_term_info.py (new file, 60 lines)

@@ -0,0 +1,60 @@
"""
Investigate adding term information to the database.
How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random
import numpy as np
from scipy.stats import sem
from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
from zstandard import ZstdCompressor
random = Random(1)
INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
def add_term_info(document: Document, index: TinyIndex, page_index: int):
tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
for token in tokenized.tokens:
token_page_index = index.get_key_page_index(token)
if token_page_index == page_index:
return Document(document.title, document.url, document.extract, document.score, token)
raise ValueError("Could not find token in page index")
def run():
compressor = ZstdCompressor()
with TinyIndex(Document, INDEX_PATH) as index:
# Get some random integers between 0 and index.num_pages:
pages = random.sample(range(index.num_pages), 10000)
old_sizes = []
new_sizes = []
for i in pages:
page = index.get_page(i)
term_documents = []
for document in page:
term_document = add_term_info(document, index, i)
term_documents.append(term_document)
value_tuples = [astuple(value) for value in term_documents]
num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)
new_sizes.append(num_fitting)
old_sizes.append(len(page))
print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
print("New sizes mean", np.mean(new_sizes), sem(new_sizes))
if __name__ == '__main__':
run()
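
Assuming the index file exists at the configured INDEX_PATH (either the ~/Downloads copy or the commented-out devdata path) and mwmbl's dependencies are installed, the analysis can be run directly with: python analyse/add_term_info.py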

mwmbl/indexer/index.py

@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
    for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
        score = link_counts.get(url, DEFAULT_SCORE)
-        yield tokenize_document(url, title_cleaned, extract, score, nlp)
+        yield tokenize_document(url, title_cleaned, extract, score)
        if i % 1000 == 0:
            print("Processed", i)
@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
    return set(first_tokens + bigrams)


-def tokenize_document(url, title_cleaned, extract, score, nlp):
+def tokenize_document(url, title_cleaned, extract, score):
    title_tokens = tokenize(title_cleaned)
    prepared_url = prepare_url_for_tokenizing(unquote(url))
    url_tokens = tokenize(prepared_url)
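
Both hunks drop the nlp argument from tokenize_document and from its call site in get_pages; the tokenizer no longer takes an nlp object, which is what lets analyse/add_term_info.py above call tokenize_document directly.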

(third changed file)

@@ -71,7 +71,7 @@ def preprocess_documents(documents, index_path, nlp):
    page_documents = defaultdict(list)
    with TinyIndex(Document, index_path, 'w') as indexer:
        for document in documents:
-            tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
+            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
            # logger.debug(f"Tokenized: {tokenized}")
            page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
            for page in page_indexes:

mwmbl/tinysearchengine/indexer.py

@@ -79,6 +79,7 @@ class TinyIndexMetadata:
        values = json.loads(data[constant_length:].decode('utf8'))
        return TinyIndexMetadata(**values)


# Find the optimal amount of data that fits onto a page
# We do this by leveraging binary search to quickly find the index where:
# - index+1 cannot fit onto a page
@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
            # No better match, use our index
            return mid, compressed_data


def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
    # Find max number of items that fit on a page
    return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))


def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
    num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
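
The body of _binary_search_fitting_size is outside this diff, but the comments above describe its approach: binary-search for the largest prefix of items whose compressed serialisation still fits in one page. A minimal sketch of that idea, assuming JSON + zstd serialisation and using illustrative names rather than the project's actual code:

import json
from typing import TypeVar

from zstandard import ZstdCompressor

T = TypeVar("T")


def binary_search_fitting_size_sketch(compressor: ZstdCompressor, page_size: int,
                                      items: list[T], lo: int, hi: int):
    # Illustrative sketch, not mwmbl's actual implementation.
    # Base case: the search range is empty, so no prefix in it fits
    if lo > hi:
        return -1, None
    mid = (lo + hi) // 2
    # Compress the first `mid` items to check whether they fit on one page
    compressed = compressor.compress(json.dumps(items[:mid]).encode("utf8"))
    if len(compressed) > page_size:
        # Too big: only a shorter prefix can possibly fit
        return binary_search_fitting_size_sketch(compressor, page_size, items, lo, mid - 1)
    # `mid` items fit; see whether an even longer prefix also fits
    more_fitting, more_compressed = binary_search_fitting_size_sketch(
        compressor, page_size, items, mid + 1, hi)
    if more_fitting == -1:
        # No longer prefix fits, so `mid` is the answer
        return mid, compressed
    return more_fitting, more_compressed

Each probe is a full compression of a candidate prefix, so the binary search keeps that to O(log n) compressions per page, which matters when the analysis script above recompresses 10,000 sampled pages with the extra term field.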