From a2b872008f58daea836cfc4ce8a5cfc1bbf2c282 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Thu, 16 Nov 2023 17:42:18 +0000
Subject: [PATCH] Add a script to evaluate how much it costs to add the term
 to the index

Old sizes mean 33.3673 0.08148019988498635
New sizes mean 32.1322 0.07700185221489449
---
 analyse/add_term_info.py          | 60 +++++++++++++++++++++++++++++++
 mwmbl/indexer/index.py            |  4 +--
 mwmbl/indexer/index_batches.py    |  2 +-
 mwmbl/tinysearchengine/indexer.py |  3 ++
 4 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 analyse/add_term_info.py

diff --git a/analyse/add_term_info.py b/analyse/add_term_info.py
new file mode 100644
index 0000000..22864ab
--- /dev/null
+++ b/analyse/add_term_info.py
@@ -0,0 +1,60 @@
+"""
+Investigate adding term information to the database.
+
+How much extra space will it take?
+"""
+import os
+from pathlib import Path
+from random import Random
+
+import numpy as np
+from scipy.stats import sem
+
+from mwmbl.indexer.index import tokenize_document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
+
+from zstandard import ZstdCompressor
+
+random = Random(1)
+
+INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
+# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
+
+
+def add_term_info(document: Document, index: TinyIndex, page_index: int):
+    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
+    for token in tokenized.tokens:
+        token_page_index = index.get_key_page_index(token)
+        if token_page_index == page_index:
+            return Document(document.title, document.url, document.extract, document.score, token)
+    raise ValueError("Could not find token in page index")
+
+
+def run():
+    compressor = ZstdCompressor()
+    with TinyIndex(Document, INDEX_PATH) as index:
+        # Get some random integers between 0 and index.num_pages:
+        pages = random.sample(range(index.num_pages), 10000)
+
+        old_sizes = []
+        new_sizes = []
+
+        for i in pages:
+            page = index.get_page(i)
+            term_documents = []
+            for document in page:
+                term_document = add_term_info(document, index, i)
+                term_documents.append(term_document)
+
+            value_tuples = [astuple(value) for value in term_documents]
+            num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)
+
+            new_sizes.append(num_fitting)
+            old_sizes.append(len(page))
+
+        print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
+        print("New sizes mean", np.mean(new_sizes), sem(new_sizes))
+
+
+if __name__ == '__main__':
+    run()
diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py
index fb61405..ce94c5c 100644
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
 def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         score = link_counts.get(url, DEFAULT_SCORE)
-        yield tokenize_document(url, title_cleaned, extract, score, nlp)
+        yield tokenize_document(url, title_cleaned, extract, score)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
     return set(first_tokens + bigrams)
 
 
-def tokenize_document(url, title_cleaned, extract, score, nlp):
+def tokenize_document(url, title_cleaned, extract, score):
     title_tokens = tokenize(title_cleaned)
     prepared_url = prepare_url_for_tokenizing(unquote(url))
     url_tokens = tokenize(prepared_url)
diff --git a/mwmbl/indexer/index_batches.py b/mwmbl/indexer/index_batches.py
index a6e0488..926ed6e 100644
--- a/mwmbl/indexer/index_batches.py
+++ b/mwmbl/indexer/index_batches.py
@@ -71,7 +71,7 @@ def preprocess_documents(documents, index_path, nlp):
     page_documents = defaultdict(list)
     with TinyIndex(Document, index_path, 'w') as indexer:
         for document in documents:
-            tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
+            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
             # logger.debug(f"Tokenized: {tokenized}")
             page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
             for page in page_indexes:
diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index ffe508b..c8edba3 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -79,6 +79,7 @@ class TinyIndexMetadata:
         values = json.loads(data[constant_length:].decode('utf8'))
         return TinyIndexMetadata(**values)
 
+
 # Find the optimal amount of data that fits onto a page
 # We do this by leveraging binary search to quickly find the index where:
 # - index+1 cannot fit onto a page
@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
             # No better match, use our index
             return mid, compressed_data
 
+
 def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
     # Find max number of items that fit on a page
     return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
 
+
 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
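
For context on the figures in the commit message: the script prints the mean
number of documents that fit on a page, followed by the standard error, over
the 10,000 sampled pages, first as the pages are stored today and then with
the matching term added to each document. A rough sketch of the implied cost,
using only the two printed means (variable names here are illustrative, not
from the codebase, and the result assumes the sampled pages are
representative of the whole index):

    # Back-of-the-envelope cost estimate from the means printed by
    # analyse/add_term_info.py above.
    old_mean = 33.3673  # mean documents per page without the term field
    new_mean = 32.1322  # mean documents per page with the term stored as well
    relative_cost = (old_mean - new_mean) / old_mean
    print(f"Storing the term costs roughly {relative_cost:.1%} of page capacity")
    # Prints roughly 3.7%.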