@@ -1,8 +1,10 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-from mwmbl.tinysearchengine.indexer import Document, TinyIndex
-
+from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
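+# The underscore-prefixed helpers are internal to the indexer module; they are imported directly so the page packing logic can be unit tested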
+from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
+import json
 
 def test_create_index():
     num_pages = 10
@@ -14,3 +15,107 @@ def test_create_index():
     for i in range(num_pages):
         page = indexer.get_page(i)
         assert page == []
+
+def test_binary_search_fitting_size_all_fit():
+    items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    compressor = ZstdCompressor()
+    page_size = 4096
+    count_fit, data = _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
+
+    # We should fit everything
+    assert count_fit == len(items)
+
+def test_binary_search_fitting_size_subset_fit():
+    items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    compressor = ZstdCompressor()
+    page_size = 15
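+    # A 15-byte page should hold some of the items but not all nine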
+    count_fit, data = _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
+
+    # We should not fit everything
+    assert count_fit < len(items)
+
+def test_binary_search_fitting_size_none_fit():
+    items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    compressor = ZstdCompressor()
+    page_size = 5
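+    # 5 bytes is smaller than even an empty zstd frame, so nothing can fit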
+    count_fit, data = _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
+
+    # We should not fit anything
+    assert count_fit == -1
+    assert data is None
+
+def test_get_page_data_single_doc():
+    document1 = Document(title='title1', url='url1', extract='extract1', score=1.0)
+    documents = [document1]
+    items = [astuple(value) for value in documents]
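+    # The page helpers operate on plain tuples rather than Document objects, hence the astuple conversion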
+
+    compressor = ZstdCompressor()
+    page_size = 4096
+
+    # Trim data
+    num_fitting, trimmed_data = _trim_items_to_page(compressor, page_size, items)
+
+    # We should be able to fit the single item into a page
+    assert num_fitting == 1
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+    serialized_data = _get_page_data(compressor, page_size, items)
+    assert serialized_data == padded_trimmed_data
+
+
+def test_get_page_data_many_docs_all_fit():
+    # Build a large list of documents
+    documents = []
+    documents_len = 500
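+    # 500 short, repetitive strings should compress to well under the 4096-byte page size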
+    page_size = 4096
+    for x in range(documents_len):
+        txt = f'text{x}'
+        document = Document(title=txt, url=txt, extract=txt, score=x)
+        documents.append(document)
+    items = [astuple(value) for value in documents]
+
+    # Trim the items
+    compressor = ZstdCompressor()
+    num_fitting, trimmed_data = _trim_items_to_page(compressor, page_size, items)
+
+    # We should be able to fit all items
+    assert num_fitting == documents_len
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    serialized_data = _get_page_data(compressor, page_size, items)
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+
+    assert serialized_data == padded_trimmed_data
+
+def test_get_page_data_many_docs_subset_fit():
+    # Build a large list of documents
+    documents = []
+    documents_len = 5000
+    page_size = 4096
+    for x in range(documents_len):
+        txt = f'text{x}'
+        document = Document(title=txt, url=txt, extract=txt, score=x)
+        documents.append(document)
+    items = [astuple(value) for value in documents]
+
+    # Trim the items
+    compressor = ZstdCompressor()
+    num_fitting, trimmed_data = _trim_items_to_page(compressor, page_size, items)
+
+    # We should be able to fit a subset of the items onto the page
+    assert num_fitting > 1
+    assert num_fitting < documents_len
+
+    # Compare the trimmed data to the actual data we're persisting
+    # We need to pad the trimmed data, then it should be equal to the data we persist
+    serialized_data = _get_page_data(compressor, page_size, items)
+    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
+
+    assert serialized_data == padded_trimmed_data