Add a script to evaluate how much it costs to add the term to the index
Old sizes mean 33.3673 (SEM 0.08148019988498635); new sizes mean 32.1322 (SEM 0.07700185221489449).
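A back-of-the-envelope reading of these numbers (not part of the commit output; it uses only the two means quoted above): storing the matching term with each document costs roughly 3.7% of per-page capacity.

old_mean, new_mean = 33.3673, 32.1322
reduction = (old_mean - new_mean) / old_mean
print(f"Relative reduction in documents per page: {reduction:.1%}")  # ~3.7%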
parent 8790d758a3
commit a2b872008f
4 changed files with 66 additions and 3 deletions
analyse/add_term_info.py (Normal file, 60 additions)
@@ -0,0 +1,60 @@
"""
Investigate adding term information to the database.

How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random

import numpy as np
from scipy.stats import sem

from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple

from zstandard import ZstdCompressor

random = Random(1)

INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"


def add_term_info(document: Document, index: TinyIndex, page_index: int):
    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
    for token in tokenized.tokens:
        token_page_index = index.get_key_page_index(token)
        if token_page_index == page_index:
            return Document(document.title, document.url, document.extract, document.score, token)
    raise ValueError("Could not find token in page index")


def run():
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # Get some random integers between 0 and index.num_pages:
        pages = random.sample(range(index.num_pages), 10000)

        old_sizes = []
        new_sizes = []

        for i in pages:
            page = index.get_page(i)
            term_documents = []
            for document in page:
                term_document = add_term_info(document, index, i)
                term_documents.append(term_document)

            value_tuples = [astuple(value) for value in term_documents]
            num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)

            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

        print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
        print("New sizes mean", np.mean(new_sizes), sem(new_sizes))


if __name__ == '__main__':
    run()
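To reproduce the measurement, the script needs an index snapshot at INDEX_PATH (or the devdata path in the commented-out line) and the mwmbl package on the import path. Running it as a module from the repository root should work, though the exact invocation is not part of this commit:

python -m analyse.add_term_info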
@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
 def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         score = link_counts.get(url, DEFAULT_SCORE)
-        yield tokenize_document(url, title_cleaned, extract, score, nlp)
+        yield tokenize_document(url, title_cleaned, extract, score)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
     return set(first_tokens + bigrams)
 
 
-def tokenize_document(url, title_cleaned, extract, score, nlp):
+def tokenize_document(url, title_cleaned, extract, score):
     title_tokens = tokenize(title_cleaned)
     prepared_url = prepare_url_for_tokenizing(unquote(url))
     url_tokens = tokenize(prepared_url)
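Dropping the nlp parameter is consistent with how the new analysis script calls tokenize_document: it can now tokenize without first constructing the language pipeline that nlp held. An illustrative call under that assumption (the .tokens attribute name is taken from the script above):

tokenized = tokenize_document(url, title_cleaned, extract, score)
tokens = tokenized.tokens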
@@ -71,7 +71,7 @@ def preprocess_documents(documents, index_path, nlp):
     page_documents = defaultdict(list)
     with TinyIndex(Document, index_path, 'w') as indexer:
         for document in documents:
-            tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
+            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
             # logger.debug(f"Tokenized: {tokenized}")
             page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
             for page in page_indexes:
@@ -79,6 +79,7 @@ class TinyIndexMetadata:
        values = json.loads(data[constant_length:].decode('utf8'))
        return TinyIndexMetadata(**values)


# Find the optimal amount of data that fits onto a page
# We do this by leveraging binary search to quickly find the index where:
# - index+1 cannot fit onto a page
@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, items
        # No better match, use our index
        return mid, compressed_data


def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
    # Find max number of items that fit on a page
    return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))


def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
    num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
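The comments above describe the page-fitting idea: binary search for the largest prefix of items whose compressed serialisation still fits within page_size. A minimal self-contained sketch of that idea follows; the function name, the iterative structure, and the JSON serialisation are assumptions for illustration, and the real _binary_search_fitting_size in mwmbl/tinysearchengine/indexer.py differs in its details.

import json
from zstandard import ZstdCompressor

def largest_fitting_prefix(compressor: ZstdCompressor, page_size: int, items: list) -> tuple[int, bytes]:
    """Binary search for the largest n such that items[:n] compresses to at most page_size bytes."""
    lo, hi = 0, len(items)
    best_n, best_data = 0, compressor.compress(json.dumps([]).encode("utf8"))
    while lo <= hi:
        mid = (lo + hi) // 2
        data = compressor.compress(json.dumps(items[:mid]).encode("utf8"))
        if len(data) <= page_size:
            best_n, best_data = mid, data  # mid items fit; try to fit more
            lo = mid + 1
        else:
            hi = mid - 1  # too big; try fewer items
    return best_n, best_data

# Hypothetical usage with a 4096-byte page of JSON-serialisable tuples:
# num_fitting, page_bytes = largest_fitting_prefix(ZstdCompressor(), 4096, value_tuples)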