Add a script to evaluate how much it costs to add the term to the index
Old sizes mean 33.3673 0.08148019988498635
New sizes mean 32.1322 0.07700185221489449
parent 8790d758a3
commit a2b872008f
4 changed files with 66 additions and 3 deletions
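
The second figure after each mean is the standard error of the mean (the script below prints np.mean followed by scipy.stats.sem). A quick check on the reported means, reproduced here for convenience, puts the cost of storing the term at roughly 3.7% fewer documents per page:

old_mean, new_mean = 33.3673, 32.1322
reduction = (old_mean - new_mean) / old_mean
print(f"{reduction:.1%}")  # -> 3.7%, the relative drop in documents per page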
analyse/add_term_info.py (new file, 60 lines)
@@ -0,0 +1,60 @@
"""
Investigate adding term information to the database.

How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random

import numpy as np
from scipy.stats import sem

from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple

from zstandard import ZstdCompressor

random = Random(1)

INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"


def add_term_info(document: Document, index: TinyIndex, page_index: int):
    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
    for token in tokenized.tokens:
        token_page_index = index.get_key_page_index(token)
        if token_page_index == page_index:
            return Document(document.title, document.url, document.extract, document.score, token)
    raise ValueError("Could not find token in page index")


def run():
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # Get some random integers between 0 and index.num_pages:
        pages = random.sample(range(index.num_pages), 10000)

        old_sizes = []
        new_sizes = []

        for i in pages:
            page = index.get_page(i)
            term_documents = []
            for document in page:
                term_document = add_term_info(document, index, i)
                term_documents.append(term_document)

            value_tuples = [astuple(value) for value in term_documents]
            num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)

            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

        print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
        print("New sizes mean", np.mean(new_sizes), sem(new_sizes))


if __name__ == '__main__':
    run()
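
Note that add_term_info above constructs Document with a fifth positional argument (the matched token), so the Document type imported from mwmbl.tinysearchengine.indexer is assumed to accept an optional term field. A minimal sketch of what such a record could look like, purely as an assumption about field order and default, not the repository's actual definition:

from dataclasses import dataclass
from typing import Optional


@dataclass
class Document:
    title: str
    url: str
    extract: str
    score: float
    term: Optional[str] = None  # assumed extra field: the index term under which this entry is stored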
@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
 def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         score = link_counts.get(url, DEFAULT_SCORE)
-        yield tokenize_document(url, title_cleaned, extract, score, nlp)
+        yield tokenize_document(url, title_cleaned, extract, score)

         if i % 1000 == 0:
             print("Processed", i)

@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
     return set(first_tokens + bigrams)


-def tokenize_document(url, title_cleaned, extract, score, nlp):
+def tokenize_document(url, title_cleaned, extract, score):
     title_tokens = tokenize(title_cleaned)
     prepared_url = prepare_url_for_tokenizing(unquote(url))
     url_tokens = tokenize(prepared_url)

@@ -71,7 +71,7 @@ def preprocess_documents(documents, index_path, nlp):
     page_documents = defaultdict(list)
     with TinyIndex(Document, index_path, 'w') as indexer:
         for document in documents:
-            tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
+            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
             # logger.debug(f"Tokenized: {tokenized}")
             page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
             for page in page_indexes:

@@ -79,6 +79,7 @@ class TinyIndexMetadata:
         values = json.loads(data[constant_length:].decode('utf8'))
         return TinyIndexMetadata(**values)


 # Find the optimal amount of data that fits onto a page
 # We do this by leveraging binary search to quickly find the index where:
 # - index+1 cannot fit onto a page

@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
         # No better match, use our index
         return mid, compressed_data


 def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
     # Find max number of items that fit on a page
     return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))


 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
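
The comments in the hunk above describe the page-packing strategy: binary search for the largest number of items whose compressed serialisation still fits within the page size. A minimal self-contained sketch of that idea, assuming JSON serialisation and an iterative search (the function name and details differ from the repository's _binary_search_fitting_size):

import json
from zstandard import ZstdCompressor


def max_items_fitting(compressor: ZstdCompressor, page_size: int, items: list) -> tuple[int, bytes]:
    # Binary search for the largest prefix of items whose compressed form fits in page_size.
    low, high = 0, len(items)
    best_count, best_data = 0, compressor.compress(json.dumps([]).encode("utf8"))
    while low <= high:
        mid = (low + high) // 2
        data = compressor.compress(json.dumps(items[:mid]).encode("utf8"))
        if len(data) <= page_size:
            best_count, best_data = mid, data  # mid items fit; try to fit more
            low = mid + 1
        else:
            high = mid - 1  # too big for the page; shrink the search range
    return best_count, best_data


# Example: how many short strings fit in a 4096-byte page
count, _ = max_items_fitting(ZstdCompressor(), 4096, [f"item-{i}" for i in range(10_000)])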