From 8e197a09f9e654a43a46e34e233a21d93918bf04 Mon Sep 17 00:00:00 2001
From: Rishabh Singh Ahluwalia
Date: Sun, 26 Mar 2023 10:04:05 -0700
Subject: [PATCH 1/4] Fix trimming page size logic while adding to a page

---
 mwmbl/tinysearchengine/indexer.py |  42 ++++++++----
 test/test_indexer.py              | 105 +++++++++++++++++++++++++++++-
 2 files changed, 134 insertions(+), 13 deletions(-)

diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index 591817f..de7daf0 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -65,19 +65,39 @@ class TinyIndexMetadata:
         values = json.loads(data[constant_length:].decode('utf8'))
         return TinyIndexMetadata(**values)
 
+# Find the optimal amount of data that fits onto a page
+# We do this by leveraging binary search to quickly find the index where:
+# - index+1 cannot fit onto a page
+# - <=index can fit on a page
+def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, items:list[T], lo:int, hi:int):
+    # Base case: our binary search has gone too far
+    if lo > hi:
+        return -1, None
+    # Check the midpoint to see if it will fit onto a page
+    mid = (lo+hi)//2
+    compressed_data = compressor.compress(json.dumps(items[:mid]).encode('utf8'))
+    size = len(compressed_data)
+    if size > page_size:
+        # We cannot fit this much data into a page
+        # Reduce the hi boundary, and try again
+        return _binary_search_fitting_size(compressor,page_size,items,lo,mid-1)
+    else:
+        # We can fit this data into a page, but maybe we can fit more data
+        # Try to see if we have a better match
+        potential_target, potential_data = _binary_search_fitting_size(compressor,page_size,items,mid+1,hi)
+        if potential_target != -1:
+            # We found a larger index that can still fit onto a page, so use that
+            return potential_target, potential_data
+        else:
+            # No better match, use our index
+            return mid, compressed_data
+
+def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
+    # Find max number of items that fit on a page
+    return _binary_search_fitting_size(compressor,page_size,items,0,len(items))
 
 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
-    bytes_io = BytesIO()
-    stream_writer = compressor.stream_writer(bytes_io, write_size=128)
-
-    num_fitting = 0
-    for i, item in enumerate(items):
-        serialised_data = json.dumps(item) + '\n'
-        stream_writer.write(serialised_data.encode('utf8'))
-        stream_writer.flush()
-        if len(bytes_io.getvalue()) > page_size:
-            break
-        num_fitting = i + 1
+    num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
 
     compressed_data = compressor.compress(json.dumps(items[:num_fitting]).encode('utf8'))
     assert len(compressed_data) < page_size, "The data shouldn't get bigger"
diff --git a/test/test_indexer.py b/test/test_indexer.py
index 0ac1cb5..dd25b18 100644
--- a/test/test_indexer.py
+++ b/test/test_indexer.py
@@ -1,8 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-from mwmbl.tinysearchengine.indexer import Document, TinyIndex
-
+from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
+from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
+import json
 
 def test_create_index():
     num_pages = 10
@@ -14,3 +15,103 @@ def test_create_index():
     for i in range(num_pages):
         page = indexer.get_page(i)
         assert page == []
+
+def test_binary_search_fitting_size_all_fit():
+    items = [1,2,3,4,5,6,7,8,9]
+    compressor = ZstdCompressor()
+    page_size = 4096
+    count_fit, data = _binary_search_fitting_size(compressor,page_size,items,0,len(items))
+
+    # We should fit everything
+    assert count_fit == len(items)
+
+def test_binary_search_fitting_size_subset_fit():
+    items = [1,2,3,4,5,6,7,8,9]
+    compressor = ZstdCompressor()
+    page_size = 15
+    count_fit, data = _binary_search_fitting_size(compressor,page_size,items,0,len(items))
+
+    # We should not fit everything
+    assert count_fit < len(items)
+
+def test_binary_search_fitting_size_none_fit():
+    items = [1,2,3,4,5,6,7,8,9]
+    compressor = ZstdCompressor()
+    page_size = 5
+    count_fit, data = _binary_search_fitting_size(compressor,page_size,items,0,len(items))
+
+    # We should not fit anything
+    assert count_fit == -1
+    assert data is None
+
+def test_get_page_data_single_doc():
+    document1 = Document(title='title1',url='url1',extract='extract1',score=1.0)
+    documents = [document1]
+    items = [astuple(value) for value in documents]
+
+    compressor = ZstdCompressor()
+    page_size = 4096
+
+    # Trim data
+    num_fitting,trimmed_data = _trim_items_to_page(compressor,4096,items)
+
+    # We should be able to fit the 1 item into a page
+    assert num_fitting == 1
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+    serialized_data = _get_page_data(compressor,page_size,items)
+    assert serialized_data == padded_trimmed_data
+
+
+def test_get_page_data_many_docs_all_fit():
+    # Build a large list of documents
+    documents = []
+    documents_len = 500
+    page_size = 4096
+    for x in range(documents_len):
+        txt = 'text{}'.format(x)
+        document = Document(title=txt,url=txt,extract=txt,score=x)
+        documents.append(document)
+    items = [astuple(value) for value in documents]
+
+    # Trim the items
+    compressor = ZstdCompressor()
+    num_fitting,trimmed_data = _trim_items_to_page(compressor,page_size,items)
+
+    # We should be able to fit all items
+    assert num_fitting == documents_len
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    serialized_data = _get_page_data(compressor,page_size,items)
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+
+    assert serialized_data == padded_trimmed_data
+
+def test_get_page_data_many_docs_subset_fit():
+    # Build a large list of documents
+    documents = []
+    documents_len = 5000
+    page_size = 4096
+    for x in range(documents_len):
+        txt = 'text{}'.format(x)
+        document = Document(title=txt,url=txt,extract=txt,score=x)
+        documents.append(document)
+    items = [astuple(value) for value in documents]
+
+    # Trim the items
+    compressor = ZstdCompressor()
+    num_fitting,trimmed_data = _trim_items_to_page(compressor,page_size,items)
+
+    # We should be able to fit a subset of the items onto the page
+    assert num_fitting > 1
+    assert num_fitting < documents_len
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    serialized_data = _get_page_data(compressor,page_size,items)
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+
+    assert serialized_data == padded_trimmed_data
\ No newline at end of file

From f232badd675fd777bd47bc12f7d46994924a4e3b Mon Sep 17 00:00:00 2001
From: Rishabh Singh Ahluwalia
Date: Mon, 27 Mar 2023 22:18:10 -0700
Subject: [PATCH 2/4] fix comma formatting

---
 mwmbl/tinysearchengine/indexer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index de7daf0..e901441 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -80,11 +80,11 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
     if size > page_size:
         # We cannot fit this much data into a page
         # Reduce the hi boundary, and try again
-        return _binary_search_fitting_size(compressor,page_size,items,lo,mid-1)
+        return _binary_search_fitting_size(compressor, page_size, items, lo, mid-1)
     else:
         # We can fit this data into a page, but maybe we can fit more data
         # Try to see if we have a better match
-        potential_target, potential_data = _binary_search_fitting_size(compressor,page_size,items,mid+1,hi)
+        potential_target, potential_data = _binary_search_fitting_size(compressor, page_size, items, mid+1, hi)
         if potential_target != -1:
             # We found a larger index that can still fit onto a page, so use that
             return potential_target, potential_data
@@ -94,7 +94,7 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
 
 def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
     # Find max number of items that fit on a page
-    return _binary_search_fitting_size(compressor,page_size,items,0,len(items))
+    return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
 
 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)

From 91269d5100295b4a90bda342f855f6df9b618abd Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 1 Apr 2023 06:35:44 +0100
Subject: [PATCH 3/4] Handle a bad batch

---
 mwmbl/indexer/batch_cache.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mwmbl/indexer/batch_cache.py b/mwmbl/indexer/batch_cache.py
index 3001e81..e7af6db 100644
--- a/mwmbl/indexer/batch_cache.py
+++ b/mwmbl/indexer/batch_cache.py
@@ -39,7 +39,11 @@ class BatchCache:
             except FileNotFoundError:
                 logger.exception(f"Missing batch file: {path}")
                 continue
-            batch = HashedBatch.parse_raw(data)
+            try:
+                batch = HashedBatch.parse_raw(data)
+            except ValidationError:
+                logger.exception(f"Unable to parse batch, skipping: '{data}'")
+                continue
             batches[url] = batch
         return batches
 

From 3e1f5da28efa822957cd0bca10045b89cec15b95 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 1 Apr 2023 06:40:03 +0100
Subject: [PATCH 4/4] Off by one error with page size

---
 mwmbl/tinysearchengine/indexer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index caaadda..3e416b6 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -100,7 +100,7 @@ def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
 
     compressed_data = compressor.compress(json.dumps(items[:num_fitting]).encode('utf8'))
-    assert len(compressed_data) < page_size, "The data shouldn't get bigger"
+    assert len(compressed_data) <= page_size, "The data shouldn't get bigger"
 
     return _pad_to_page_size(compressed_data, page_size)
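
A quick way to sanity-check the trimming logic from PATCH 1/4 outside the test suite
is the small standalone sketch below. It is not part of the patches: the page size and
the sample items are made up for illustration, and largest_fitting_prefix is simply an
iterative rewrite of the recursive _binary_search_fitting_size helper. Like the patch,
it assumes the compressed size grows (roughly) monotonically with the length of the
prefix being compressed, and it returns (-1, None) when not even an empty prefix fits,
matching the sentinel used in the patch.

    # sketch.py - illustrative only, not part of the mwmbl patches
    import json
    from zstandard import ZstdCompressor

    PAGE_SIZE = 512  # assumed page size for the demo


    def largest_fitting_prefix(compressor: ZstdCompressor, page_size: int, items: list):
        # Binary search for the longest prefix of items whose compressed JSON
        # fits within page_size; returns (-1, None) if nothing fits.
        lo, hi = 0, len(items)
        best_count, best_data = -1, None
        while lo <= hi:
            mid = (lo + hi) // 2
            data = compressor.compress(json.dumps(items[:mid]).encode('utf8'))
            if len(data) > page_size:
                hi = mid - 1  # too big: try a shorter prefix
            else:
                best_count, best_data = mid, data  # fits: remember it, try a longer prefix
                lo = mid + 1
        return best_count, best_data


    if __name__ == '__main__':
        compressor = ZstdCompressor()
        items = [['url{}'.format(i), 'title{}'.format(i), 'extract{}'.format(i)] for i in range(1000)]
        count, data = largest_fitting_prefix(compressor, PAGE_SIZE, items)
        print(count, None if data is None else len(data))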