From 8e197a09f9e654a43a46e34e233a21d93918bf04 Mon Sep 17 00:00:00 2001
From: Rishabh Singh Ahluwalia
Date: Sun, 26 Mar 2023 10:04:05 -0700
Subject: [PATCH 1/4] Fix trimming page size logic while adding to a page

---
 mwmbl/tinysearchengine/indexer.py |  42 ++++++++----
 test/test_indexer.py              | 105 +++++++++++++++++++++++++++++-
 2 files changed, 134 insertions(+), 13 deletions(-)

diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index 591817f..de7daf0 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -65,19 +65,39 @@ class TinyIndexMetadata:
         values = json.loads(data[constant_length:].decode('utf8'))
         return TinyIndexMetadata(**values)
 
+# Find the optimal amount of data that fits onto a page
+# We do this by leveraging binary search to quickly find the index where:
+# - index+1 cannot fit onto a page
+# - <=index can fit on a page
+def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, items:list[T], lo:int, hi:int):
+    # Base case: our binary search has gone too far
+    if lo > hi:
+        return -1, None
+    # Check the midpoint to see if it will fit onto a page
+    mid = (lo+hi)//2
+    compressed_data = compressor.compress(json.dumps(items[:mid]).encode('utf8'))
+    size = len(compressed_data)
+    if size > page_size:
+        # We cannot fit this much data into a page
+        # Reduce the hi boundary, and try again
+        return _binary_search_fitting_size(compressor,page_size,items,lo,mid-1)
+    else:
+        # We can fit this data into a page, but maybe we can fit more data
+        # Try to see if we have a better match
+        potential_target, potential_data = _binary_search_fitting_size(compressor,page_size,items,mid+1,hi)
+        if potential_target != -1:
+            # We found a larger index that can still fit onto a page, so use that
+            return potential_target, potential_data
+        else:
+            # No better match, use our index
+            return mid, compressed_data
+
+def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
+    # Find max number of items that fit on a page
+    return _binary_search_fitting_size(compressor,page_size,items,0,len(items))
 
 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
-    bytes_io = BytesIO()
-    stream_writer = compressor.stream_writer(bytes_io, write_size=128)
-
-    num_fitting = 0
-    for i, item in enumerate(items):
-        serialised_data = json.dumps(item) + '\n'
-        stream_writer.write(serialised_data.encode('utf8'))
-        stream_writer.flush()
-        if len(bytes_io.getvalue()) > page_size:
-            break
-        num_fitting = i + 1
+    num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
 
     compressed_data = compressor.compress(json.dumps(items[:num_fitting]).encode('utf8'))
     assert len(compressed_data) < page_size, "The data shouldn't get bigger"
diff --git a/test/test_indexer.py b/test/test_indexer.py
index 0ac1cb5..dd25b18 100644
--- a/test/test_indexer.py
+++ b/test/test_indexer.py
@@ -1,8 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-from mwmbl.tinysearchengine.indexer import Document, TinyIndex
-
+from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
+from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
+import json
 
 def test_create_index():
     num_pages = 10
@@ -14,3 +15,103 @@ def test_create_index():
     for i in range(num_pages):
         page = indexer.get_page(i)
         assert page == []
+
+def test_binary_search_fitting_size_all_fit():
+    items = [1,2,3,4,5,6,7,8,9]
+    compressor = ZstdCompressor()
+    page_size = 4096
+    count_fit, data = _binary_search_fitting_size(compressor,page_size,items,0,len(items))
+
+    # We should fit everything
+    assert count_fit == len(items)
+
+def test_binary_search_fitting_size_subset_fit():
+    items = [1,2,3,4,5,6,7,8,9]
+    compressor = ZstdCompressor()
+    page_size = 15
+    count_fit, data = _binary_search_fitting_size(compressor,page_size,items,0,len(items))
+
+    # We should not fit everything
+    assert count_fit < len(items)
+
+def test_binary_search_fitting_size_none_fit():
+    items = [1,2,3,4,5,6,7,8,9]
+    compressor = ZstdCompressor()
+    page_size = 5
+    count_fit, data = _binary_search_fitting_size(compressor,page_size,items,0,len(items))
+
+    # We should not fit anything
+    assert count_fit == -1
+    assert data is None
+
+def test_get_page_data_single_doc():
+    document1 = Document(title='title1',url='url1',extract='extract1',score=1.0)
+    documents = [document1]
+    items = [astuple(value) for value in documents]
+
+    compressor = ZstdCompressor()
+    page_size = 4096
+
+    # Trim data
+    num_fitting,trimmed_data = _trim_items_to_page(compressor,4096,items)
+
+    # We should be able to fit the 1 item into a page
+    assert num_fitting == 1
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+    serialized_data = _get_page_data(compressor,page_size,items)
+    assert serialized_data == padded_trimmed_data
+
+
+def test_get_page_data_many_docs_all_fit():
+    # Build a large list of documents
+    documents = []
+    documents_len = 500
+    page_size = 4096
+    for x in range(documents_len):
+        txt = 'text{}'.format(x)
+        document = Document(title=txt,url=txt,extract=txt,score=x)
+        documents.append(document)
+    items = [astuple(value) for value in documents]
+
+    # Trim the items
+    compressor = ZstdCompressor()
+    num_fitting,trimmed_data = _trim_items_to_page(compressor,page_size,items)
+
+    # We should be able to fit all items
+    assert num_fitting == documents_len
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    serialized_data = _get_page_data(compressor,page_size,items)
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+
+    assert serialized_data == padded_trimmed_data
+
+def test_get_page_data_many_docs_subset_fit():
+    # Build a large list of documents
+    documents = []
+    documents_len = 5000
+    page_size = 4096
+    for x in range(documents_len):
+        txt = 'text{}'.format(x)
+        document = Document(title=txt,url=txt,extract=txt,score=x)
+        documents.append(document)
+    items = [astuple(value) for value in documents]
+
+    # Trim the items
+    compressor = ZstdCompressor()
+    num_fitting,trimmed_data = _trim_items_to_page(compressor,page_size,items)
+
+    # We should be able to fit a subset of the items onto the page
+    assert num_fitting > 1
+    assert num_fitting < documents_len
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    serialized_data = _get_page_data(compressor,page_size,items)
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+
+    assert serialized_data == padded_trimmed_data
\ No newline at end of file

From f232badd675fd777bd47bc12f7d46994924a4e3b Mon Sep 17 00:00:00 2001
From: Rishabh Singh Ahluwalia
Date: Mon, 27 Mar 2023 22:18:10 -0700
Subject: [PATCH 2/4] fix comma formatting

---
 mwmbl/tinysearchengine/indexer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index de7daf0..e901441 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -80,11 +80,11 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
     if size > page_size:
         # We cannot fit this much data into a page
         # Reduce the hi boundary, and try again
-        return _binary_search_fitting_size(compressor,page_size,items,lo,mid-1)
+        return _binary_search_fitting_size(compressor, page_size, items, lo, mid-1)
     else:
         # We can fit this data into a page, but maybe we can fit more data
         # Try to see if we have a better match
-        potential_target, potential_data = _binary_search_fitting_size(compressor,page_size,items,mid+1,hi)
+        potential_target, potential_data = _binary_search_fitting_size(compressor, page_size, items, mid+1, hi)
         if potential_target != -1:
             # We found a larger index that can still fit onto a page, so use that
             return potential_target, potential_data
@@ -94,7 +94,7 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
 
 def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
     # Find max number of items that fit on a page
-    return _binary_search_fitting_size(compressor,page_size,items,0,len(items))
+    return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
 
 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)

From 91269d5100295b4a90bda342f855f6df9b618abd Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 1 Apr 2023 06:35:44 +0100
Subject: [PATCH 3/4] Handle a bad batch

---
 mwmbl/indexer/batch_cache.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mwmbl/indexer/batch_cache.py b/mwmbl/indexer/batch_cache.py
index 3001e81..e7af6db 100644
--- a/mwmbl/indexer/batch_cache.py
+++ b/mwmbl/indexer/batch_cache.py
@@ -39,7 +39,11 @@ class BatchCache:
             except FileNotFoundError:
                 logger.exception(f"Missing batch file: {path}")
                 continue
-            batch = HashedBatch.parse_raw(data)
+            try:
+                batch = HashedBatch.parse_raw(data)
+            except ValidationError:
+                logger.exception(f"Unable to parse batch, skipping: '{data}'")
+                continue
             batches[url] = batch
         return batches
 

From 3e1f5da28efa822957cd0bca10045b89cec15b95 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 1 Apr 2023 06:40:03 +0100
Subject: [PATCH 4/4] Off by one error with page size

---
 mwmbl/tinysearchengine/indexer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index caaadda..3e416b6 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -100,7 +100,7 @@ def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
 
     compressed_data = compressor.compress(json.dumps(items[:num_fitting]).encode('utf8'))
-    assert len(compressed_data) < page_size, "The data shouldn't get bigger"
+    assert len(compressed_data) <= page_size, "The data shouldn't get bigger"
 
     return _pad_to_page_size(compressed_data, page_size)
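
A quick way to sanity-check the trimming logic from PATCH 1/4 outside the test suite
is the small standalone sketch below. It is not part of the patches: the page size and
the sample items are made up for illustration, and largest_fitting_prefix is simply an
iterative rewrite of the recursive _binary_search_fitting_size helper. Like the patch,
it assumes the compressed size grows (roughly) monotonically with the length of the
prefix being compressed, and it returns (-1, None) when not even an empty prefix fits,
matching the sentinel used in the patch.

    # sketch.py - illustrative only, not part of the mwmbl patches
    import json
    from zstandard import ZstdCompressor

    PAGE_SIZE = 512  # assumed page size for the demo


    def largest_fitting_prefix(compressor: ZstdCompressor, page_size: int, items: list):
        # Binary search for the longest prefix of items whose compressed JSON
        # fits within page_size; returns (-1, None) if nothing fits.
        lo, hi = 0, len(items)
        best_count, best_data = -1, None
        while lo <= hi:
            mid = (lo + hi) // 2
            data = compressor.compress(json.dumps(items[:mid]).encode('utf8'))
            if len(data) > page_size:
                hi = mid - 1  # too big: try a shorter prefix
            else:
                best_count, best_data = mid, data  # fits: remember it, try a longer prefix
                lo = mid + 1
        return best_count, best_data


    if __name__ == '__main__':
        compressor = ZstdCompressor()
        items = [['url{}'.format(i), 'title{}'.format(i), 'extract{}'.format(i)] for i in range(1000)]
        count, data = largest_fitting_prefix(compressor, PAGE_SIZE, items)
        print(count, None if data is None else len(data))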