From e6273c7f7636e8afa7b59625705e5352c57e8417 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Fri, 18 Feb 2022 22:12:22 +0000 Subject: [PATCH 1/4] WIP: include metadata in index - using struct approach --- mwmbl/indexer/index.py | 2 +- mwmbl/indexer/index_glob.py | 2 +- mwmbl/tinysearchengine/indexer.py | 163 +++++++++++++++++++----------- poetry.lock | 114 ++++++++++++++++++++- pyproject.toml | 1 + test/test_indexer.py | 17 ++++ 6 files changed, 237 insertions(+), 62 deletions(-) create mode 100644 test/test_indexer.py diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py index 8bd0dc9..94dc1e3 100644 --- a/mwmbl/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -67,7 +67,7 @@ def grouper(n: int, iterator: Iterator): def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path): - indexer.create_if_not_exists() + indexer.create() terms = Counter() pages = get_pages(nlp, titles_urls_and_extracts) diff --git a/mwmbl/indexer/index_glob.py b/mwmbl/indexer/index_glob.py index e9102c2..9bd8b96 100644 --- a/mwmbl/indexer/index_glob.py +++ b/mwmbl/indexer/index_glob.py @@ -12,7 +12,7 @@ from .paths import INDEX_PATH, CRAWL_GLOB def run(): # TODO: item_factory argument is unfilled. indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) - indexer.create_if_not_exists() + indexer.create() nlp = English() for path in glob(CRAWL_GLOB): print("Path", path) diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py index 6055b7f..29ccfb0 100644 --- a/mwmbl/tinysearchengine/indexer.py +++ b/mwmbl/tinysearchengine/indexer.py @@ -1,14 +1,21 @@ import json import os from dataclasses import astuple, dataclass -from mmap import mmap, PROT_READ +from io import UnsupportedOperation +from mmap import mmap, PROT_READ, PROT_WRITE from pathlib import Path +from struct import pack, unpack, calcsize from typing import TypeVar, Generic, Callable, List import mmh3 from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError +VERSION = 1 +METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8') +METADATA_FORMAT = 'IIIs' +METADATA_SIZE = 4096 + NUM_PAGES = 76800 PAGE_SIZE = 4096 @@ -28,20 +35,84 @@ class TokenizedDocument(Document): T = TypeVar('T') -class TinyIndexBase(Generic[T]): - def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int): +@dataclass +class TinyIndexMetadata: + version: int + page_size: int + num_pages: int + item_factory: str + + def to_bytes(self) -> bytes: + result = METADATA_CONSTANT + pack( + METADATA_FORMAT, self.version, self.page_size, self.num_pages, self.item_factory.encode('utf8') + ) + assert len(result) <= METADATA_SIZE + return result + + @staticmethod + def from_bytes(data: bytes): + constant_length = len(METADATA_CONSTANT) + metadata_constant = data[:constant_length] + if metadata_constant != METADATA_CONSTANT: + raise ValueError("This doesn't seem to be an index file") + + actual_metadata_size = calcsize(METADATA_FORMAT) + values = unpack(METADATA_FORMAT, data[constant_length:constant_length+actual_metadata_size]) + return TinyIndexMetadata(values[0], values[1], values[2], values[3].decode('utf8')) + + +def _get_page_data(compressor, page_size, data): + serialised_data = json.dumps(data) + compressed_data = compressor.compress(serialised_data.encode('utf8')) + return _pad_to_page_size(compressed_data, page_size) + + +def _pad_to_page_size(data: bytes, page_size: int): + page_length = len(data) + if page_length > page_size: + raise ValueError(f"Data is too big ({page_length}) for page 
size ({page_size})") + padding = b'\x00' * (page_size - page_length) + page_data = data + padding + return page_data + + +class TinyIndex(Generic[T]): + def __init__(self, item_factory: Callable[..., T], index_path, mode='r'): + if mode not in {'r', 'w'}: + raise ValueError(f"Mode should be one of 'r' or 'w', got {mode}") + + with open(index_path, 'rb') as index_file: + metadata_page = index_file.read(METADATA_SIZE) + + metadata = TinyIndexMetadata.from_bytes(metadata_page) + if metadata.item_factory != item_factory.__name__: + raise ValueError(f"Metadata item factory '{metadata.item_factory}' in the index " + f"does not match the passed item factory: '{item_factory.__name__}'") + self.item_factory = item_factory - self.num_pages = num_pages - self.page_size = page_size + self.index_path = index_path + self.mode = mode + + self.num_pages = metadata.num_pages + self.page_size = metadata.page_size + self.compressor = ZstdCompressor() self.decompressor = ZstdDecompressor() + self.index_file = None self.mmap = None + def __enter__(self): + self.index_file = open(self.index_path, 'r+b') + prot = PROT_READ if self.mode == 'r' else PROT_READ | PROT_WRITE + self.mmap = mmap(self.index_file.fileno(), 0, offset=METADATA_SIZE, prot=prot) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.mmap.close() + self.index_file.close() + def retrieve(self, key: str) -> List[T]: index = self._get_key_page_index(key) - page = self.get_page(index) - if page is None: - return [] - return self.convert_items(page) + return self.get_page(index) def _get_key_page_index(self, key): key_hash = mmh3.hash(key, signed=False) @@ -52,45 +123,11 @@ class TinyIndexBase(Generic[T]): Get the page at index i, decompress and deserialise it using JSON """ page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size] - try: - decompressed_data = self.decompressor.decompress(page_data) - except ZstdError: - return None + decompressed_data = self.decompressor.decompress(page_data) results = json.loads(decompressed_data.decode('utf8')) - return results - - def convert_items(self, items) -> List[T]: - converted = [self.item_factory(*item) for item in items] + converted = [self.item_factory(*item) for item in results] return converted - -class TinyIndex(TinyIndexBase[T]): - def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size): - super().__init__(item_factory, num_pages, page_size) - self.index_path = index_path - self.index_file = open(self.index_path, 'rb') - self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ) - - -class TinyIndexer(TinyIndexBase[T]): - def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int): - super().__init__(item_factory, num_pages, page_size) - self.index_path = index_path - self.compressor = ZstdCompressor() - self.decompressor = ZstdDecompressor() - self.index_file = None - self.mmap = None - - def __enter__(self): - self.create_if_not_exists() - self.index_file = open(self.index_path, 'r+b') - self.mmap = mmap(self.index_file.fileno(), 0) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.mmap.close() - self.index_file.close() - def index(self, key: str, value: T): # print("Index", value) assert type(value) == self.item_factory, f"Can only index the specified type" \ @@ -113,16 +150,28 @@ class TinyIndexer(TinyIndexBase[T]): Serialise the data using JSON, compress it and store it at index i. 
If the data is too big, it will raise a ValueError and not store anything """ - serialised_data = json.dumps(data) - compressed_data = self.compressor.compress(serialised_data.encode('utf8')) - page_length = len(compressed_data) - if page_length > self.page_size: - raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})") - padding = b'\x00' * (self.page_size - page_length) - self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding + if self.mode != 'w': + raise UnsupportedOperation("The file is open in read mode, you cannot write") + + page_data = _get_page_data(self.compressor, self.page_size, data) + self.mmap[i * self.page_size:(i+1) * self.page_size] = page_data + + @staticmethod + def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int): + if os.path.isfile(index_path): + raise FileExistsError("Index file already exists") + + metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__) + metadata_bytes = metadata.to_bytes() + metadata_padded = _pad_to_page_size(metadata_bytes, METADATA_SIZE) + + compressor = ZstdCompressor() + page_bytes = _get_page_data(compressor, page_size, []) + + with open(index_path, 'wb') as index_file: + index_file.write(metadata_padded) + for i in range(num_pages): + index_file.write(page_bytes) + + return TinyIndex(item_factory, index_path=index_path) - def create_if_not_exists(self): - if not os.path.isfile(self.index_path): - file_length = self.num_pages * self.page_size - with open(self.index_path, 'wb') as index_file: - index_file.write(b'\x00' * file_length) diff --git a/poetry.lock b/poetry.lock index 2d146ef..cc3189d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -26,6 +26,28 @@ python-versions = ">=3.7" [package.extras] tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"] +[[package]] +name = "atomicwrites" +version = "1.4.0" +description = "Atomic file writes." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.4.0" +description = "Classes Without Boilerplate" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] + [[package]] name = "beautifulsoup4" version = "4.10.0" @@ -163,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0" [package.source] type = "url" url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz" + [[package]] name = "fastapi" version = "0.70.1" @@ -197,6 +220,14 @@ category = "main" optional = false python-versions = ">=3.5" +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "jinja2" version = "3.0.3" @@ -314,7 +345,7 @@ name = "packaging" version = "21.3" description = "Core utilities for Python packages" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] @@ -354,6 +385,18 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] s3 = ["boto3"] test = ["pytest", "pytest-coverage", "mock", "typer-cli"] +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "preshed" version = "3.0.6" @@ -366,6 +409,14 @@ python-versions = "*" cymem = ">=2.0.2,<2.1.0" murmurhash = ">=0.28.0,<1.1.0" +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "py4j" version = "0.10.9.2" @@ -413,7 +464,7 @@ name = "pyparsing" version = "3.0.7" description = "Python parsing module" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.extras] @@ -436,6 +487,27 @@ mllib = ["numpy (>=1.7)"] pandas_on_spark = ["numpy (>=1.14)", "pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] +[[package]] +name = "pytest" +version = "7.0.1" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +tomli = ">=1.0.0" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + [[package]] name = "python-dateutil" 
version = "2.8.2" @@ -680,6 +752,14 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] torch = ["torch (>=1.5.0)"] +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "tqdm" version = "4.62.3" @@ -797,7 +877,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "b5af8ce9887d0cf69297180fbb4040e1522e4a3135f8b651415afb35f86124ef" +content-hash = "edb2d4bc50cb09ac5f7ba311d5238eb2deeab1d12f479067cc7239e3232bf6c9" [metadata.files] anyio = [ @@ -808,6 +888,14 @@ asgiref = [ {file = "asgiref-3.5.0-py3-none-any.whl", hash = "sha256:88d59c13d634dcffe0510be048210188edd79aeccb6a6c9028cdad6f31d730a9"}, {file = "asgiref-3.5.0.tar.gz", hash = "sha256:2f8abc20f7248433085eda803936d98992f1343ddb022065779f37c5da0181d0"}, ] +atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] beautifulsoup4 = [ {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"}, {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"}, @@ -941,6 +1029,10 @@ idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] jinja2 = [ {file = "Jinja2-3.0.3-py3-none-any.whl", hash = "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8"}, {file = "Jinja2-3.0.3.tar.gz", hash = "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"}, @@ -1255,6 +1347,10 @@ pathy = [ {file = "pathy-0.6.1-py3-none-any.whl", hash = "sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a"}, {file = "pathy-0.6.1.tar.gz", hash = "sha256:838624441f799a06b446a657e4ecc9ebc3fdd05234397e044a7c87e8f6e76b1c"}, ] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] preshed = [ {file = "preshed-3.0.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:66a71ced487516cf81fd0431a3a843514262ae2f33e9a7688b87562258fa75d5"}, {file = "preshed-3.0.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c98f725d8478f3ade4ab1ea00f50a92d2d9406d37276bc46fd8bab1d47452c4"}, @@ -1273,6 +1369,10 @@ preshed = [ {file = "preshed-3.0.6-cp39-cp39-win_amd64.whl", hash = 
"sha256:92a8f49d17a63537a8beed48a049b62ef168ca07e0042a5b2bcdf178a1fb5d48"}, {file = "preshed-3.0.6.tar.gz", hash = "sha256:fb3b7588a3a0f2f2f1bf3fe403361b2b031212b73a37025aea1df7215af3772a"}, ] +py = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] py4j = [ {file = "py4j-0.10.9.2-py2.py3-none-any.whl", hash = "sha256:bf0485388e415ff26710d2dc719cb0ede16cf1164b1ee757e0ebb2e98c471521"}, {file = "py4j-0.10.9.2.tar.gz", hash = "sha256:624f97c363b8dd84822bc666b12fa7f7d97824632b2ff3d852cc491359ce7615"}, @@ -1350,6 +1450,10 @@ pyparsing = [ pyspark = [ {file = "pyspark-3.2.0.tar.gz", hash = "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab"}, ] +pytest = [ + {file = "pytest-7.0.1-py3-none-any.whl", hash = "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db"}, + {file = "pytest-7.0.1.tar.gz", hash = "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -1539,6 +1643,10 @@ thinc = [ {file = "thinc-8.0.13-cp39-cp39-win_amd64.whl", hash = "sha256:b3ae088f60d3dfe6a88c6be37548aae40023e46a718cffe3e43953b4f0ffc340"}, {file = "thinc-8.0.13.tar.gz", hash = "sha256:47662a3ae33d445a77b6ea7b772444805c7bba8991f122e350daf72dedc8171a"}, ] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] tqdm = [ {file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"}, {file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"}, diff --git a/pyproject.toml b/pyproject.toml index 02ae404..cd790c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ indexer = [ # langdetect = "^1.0.9" # spacy = "^3.2.1" # Levenshtein = "^0.16.0" +pytest = "^7.0.1" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test/test_indexer.py b/test/test_indexer.py new file mode 100644 index 0000000..34113da --- /dev/null +++ b/test/test_indexer.py @@ -0,0 +1,17 @@ +from pathlib import Path +from tempfile import TemporaryDirectory + +from mwmbl.tinysearchengine.indexer import Document, TinyIndex + + +def test_create_index(): + num_pages = 10 + page_size = 4096 + + with TemporaryDirectory() as temp_dir: + index_path = Path(temp_dir) / 'temp-index.tinysearch' + indexer = TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) + + for i in range(num_pages): + page = indexer.get_page(i) + assert page == [] From 326f7e3d7f412068d5f1194805f9c6c4c2e40352 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Fri, 18 Feb 2022 22:22:47 +0000 Subject: [PATCH 2/4] Use JSON instead of struct to store metadata --- mwmbl/tinysearchengine/indexer.py | 26 +++++++++----------------- test/test_indexer.py | 9 ++++----- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py index 
29ccfb0..5a33cb2 100644 --- a/mwmbl/tinysearchengine/indexer.py +++ b/mwmbl/tinysearchengine/indexer.py @@ -1,15 +1,12 @@ import json import os -from dataclasses import astuple, dataclass +from dataclasses import astuple, dataclass, asdict from io import UnsupportedOperation from mmap import mmap, PROT_READ, PROT_WRITE -from pathlib import Path -from struct import pack, unpack, calcsize from typing import TypeVar, Generic, Callable, List import mmh3 -from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError - +from zstandard import ZstdDecompressor, ZstdCompressor VERSION = 1 METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8') @@ -43,11 +40,9 @@ class TinyIndexMetadata: item_factory: str def to_bytes(self) -> bytes: - result = METADATA_CONSTANT + pack( - METADATA_FORMAT, self.version, self.page_size, self.num_pages, self.item_factory.encode('utf8') - ) - assert len(result) <= METADATA_SIZE - return result + metadata_bytes = METADATA_CONSTANT + json.dumps(asdict(self)).encode('utf8') + assert len(metadata_bytes) <= METADATA_SIZE + return metadata_bytes @staticmethod def from_bytes(data: bytes): @@ -56,9 +51,8 @@ class TinyIndexMetadata: if metadata_constant != METADATA_CONSTANT: raise ValueError("This doesn't seem to be an index file") - actual_metadata_size = calcsize(METADATA_FORMAT) - values = unpack(METADATA_FORMAT, data[constant_length:constant_length+actual_metadata_size]) - return TinyIndexMetadata(values[0], values[1], values[2], values[3].decode('utf8')) + values = json.loads(data[constant_length:].decode('utf8')) + return TinyIndexMetadata(**values) def _get_page_data(compressor, page_size, data): @@ -84,7 +78,8 @@ class TinyIndex(Generic[T]): with open(index_path, 'rb') as index_file: metadata_page = index_file.read(METADATA_SIZE) - metadata = TinyIndexMetadata.from_bytes(metadata_page) + metadata_bytes = metadata_page.rstrip(b'\x00') + metadata = TinyIndexMetadata.from_bytes(metadata_bytes) if metadata.item_factory != item_factory.__name__: raise ValueError(f"Metadata item factory '{metadata.item_factory}' in the index " f"does not match the passed item factory: '{item_factory.__name__}'") @@ -129,7 +124,6 @@ class TinyIndex(Generic[T]): return converted def index(self, key: str, value: T): - # print("Index", value) assert type(value) == self.item_factory, f"Can only index the specified type" \ f" ({self.item_factory.__name__})" page_index = self._get_key_page_index(key) @@ -137,10 +131,8 @@ class TinyIndex(Generic[T]): if current_page is None: current_page = [] value_tuple = astuple(value) - # print("Value tuple", value_tuple) current_page.append(value_tuple) try: - # print("Page", current_page) self._write_page(current_page, page_index) except ValueError: pass diff --git a/test/test_indexer.py b/test/test_indexer.py index 34113da..0ac1cb5 100644 --- a/test/test_indexer.py +++ b/test/test_indexer.py @@ -10,8 +10,7 @@ def test_create_index(): with TemporaryDirectory() as temp_dir: index_path = Path(temp_dir) / 'temp-index.tinysearch' - indexer = TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) - - for i in range(num_pages): - page = indexer.get_page(i) - assert page == [] + with TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) as indexer: + for i in range(num_pages): + page = indexer.get_page(i) + assert page == [] From ae3b334a7fce469f764afee161e1da334e53c81c Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Tue, 22 Feb 2022 22:12:39 +0000 Subject: [PATCH 3/4] Fixes for API changes --- 
mwmbl/indexer/index.py | 6 ++---- mwmbl/indexer/index_crawl.py | 10 ++++++---- mwmbl/tinysearchengine/indexer.py | 15 ++++++++------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py index 94dc1e3..9368433 100644 --- a/mwmbl/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -10,7 +10,7 @@ import pandas as pd # NUM_PAGES = 8192 # PAGE_SIZE = 512 -from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument +from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex HTTP_START = 'http://' HTTPS_START = 'https://' @@ -66,9 +66,7 @@ def grouper(n: int, iterator: Iterator): yield chunk -def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path): - indexer.create() - +def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path): terms = Counter() pages = get_pages(nlp, titles_urls_and_extracts) for page in pages: diff --git a/mwmbl/indexer/index_crawl.py b/mwmbl/indexer/index_crawl.py index 3c30482..44e557e 100644 --- a/mwmbl/indexer/index_crawl.py +++ b/mwmbl/indexer/index_crawl.py @@ -8,16 +8,18 @@ import spacy from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError from mwmbl.indexer.index import index_titles_urls_and_extracts from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR -from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE +from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE logger = getLogger(__name__) -def index_mwmbl_craw_data(): +def index_mwmbl_crawl_data(): nlp = spacy.load("en_core_web_sm") - with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: + TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) + + with TinyIndex(Document, INDEX_PATH, 'w') as indexer: titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts() index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH) @@ -43,4 +45,4 @@ def get_mwmbl_crawl_titles_urls_and_extracts(): if __name__ == '__main__': - index_mwmbl_craw_data() + index_mwmbl_crawl_data() diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py index 5a33cb2..1beb1b0 100644 --- a/mwmbl/tinysearchengine/indexer.py +++ b/mwmbl/tinysearchengine/indexer.py @@ -9,8 +9,7 @@ import mmh3 from zstandard import ZstdDecompressor, ZstdCompressor VERSION = 1 -METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8') -METADATA_FORMAT = 'IIIs' +METADATA_CONSTANT = b'mwmbl-tiny-search' METADATA_SIZE = 4096 NUM_PAGES = 76800 @@ -117,17 +116,19 @@ class TinyIndex(Generic[T]): """ Get the page at index i, decompress and deserialise it using JSON """ + results = self._get_page_tuples(i) + return [self.item_factory(*item) for item in results] + + def _get_page_tuples(self, i): page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size] decompressed_data = self.decompressor.decompress(page_data) - results = json.loads(decompressed_data.decode('utf8')) - converted = [self.item_factory(*item) for item in results] - return converted + return json.loads(decompressed_data.decode('utf8')) def index(self, key: str, value: T): assert type(value) == self.item_factory, f"Can only index the specified type" \ f" ({self.item_factory.__name__})" page_index = self._get_key_page_index(key) - current_page = self.get_page(page_index) + current_page = self._get_page_tuples(page_index) if 
current_page is None: current_page = [] value_tuple = astuple(value) @@ -151,7 +152,7 @@ class TinyIndex(Generic[T]): @staticmethod def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int): if os.path.isfile(index_path): - raise FileExistsError("Index file already exists") + raise FileExistsError(f"Index file '{index_path}' already exists") metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__) metadata_bytes = metadata.to_bytes() From 04a33a134bb06264142c13ab9187bbc0239a0415 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Tue, 22 Feb 2022 22:27:02 +0000 Subject: [PATCH 4/4] Fixes to mwmbl API for changes to the index --- config/tinysearchengine.yaml | 12 ---------- mwmbl/tinysearchengine/app.py | 34 ++++++++++----------------- mwmbl/tinysearchengine/config.py | 40 -------------------------------- 3 files changed, 12 insertions(+), 74 deletions(-) delete mode 100644 config/tinysearchengine.yaml delete mode 100644 mwmbl/tinysearchengine/config.py diff --git a/config/tinysearchengine.yaml b/config/tinysearchengine.yaml deleted file mode 100644 index 570acc3..0000000 --- a/config/tinysearchengine.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Config for bootstrapping tinysearchengine. -# Follows the schema/model defined by mwmbl.tinysearchengine.config.ConfigModel - -server_config: - host: "0.0.0.0" - port: 8080 - log_level: "info" - -index_config: - index_path: data/index.tinysearch - num_pages: 76800 - page_size: 4096 diff --git a/mwmbl/tinysearchengine/app.py b/mwmbl/tinysearchengine/app.py index dda7702..85078e3 100644 --- a/mwmbl/tinysearchengine/app.py +++ b/mwmbl/tinysearchengine/app.py @@ -1,13 +1,12 @@ -import logging import argparse +import logging import pandas as pd import uvicorn from mwmbl.tinysearchengine import create_app from mwmbl.tinysearchengine.completer import Completer -from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document -from mwmbl.tinysearchengine.config import parse_config_file +from mwmbl.tinysearchengine.indexer import TinyIndex, Document from mwmbl.tinysearchengine.rank import Ranker logging.basicConfig() @@ -16,7 +15,8 @@ logging.basicConfig() def setup_args(): """Read all the args.""" parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine") - parser.add_argument("--config", help="Path to tinysearchengine's yaml config.", required=True) + parser.add_argument("--index", help="Path to the tinysearchengine index file", required=True) + parser.add_argument("--terms", help="Path to the tinysearchengine terms CSV file", required=True) args = parser.parse_args() return args @@ -30,30 +30,20 @@ def main(): * Initialize a FastAPI app instance * Starts uvicorn server using app instance """ - config, tiny_index = get_config_and_index() + args = setup_args() # Load term data - terms = pd.read_csv(config.terms_path) + terms = pd.read_csv(args.terms) completer = Completer(terms) - ranker = Ranker(tiny_index, completer) + with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index: + ranker = Ranker(tiny_index, completer) - # Initialize FastApi instance - app = create_app.create(ranker) + # Initialize FastApi instance + app = create_app.create(ranker) - # Initialize uvicorn server using global app instance and server config params - uvicorn.run(app, **config.server_config.dict()) - - -def get_config_and_index(): - args = setup_args() - config = parse_config_file(config_filename=args.config) - # Initialize TinyIndex using index config params - tiny_index = 
TinyIndex( - item_factory=Document, - **config.index_config.dict() - ) - return config, tiny_index + # Initialize uvicorn server using global app instance and server config params + uvicorn.run(app, host="0.0.0.0", port=8080) if __name__ == "__main__": diff --git a/mwmbl/tinysearchengine/config.py b/mwmbl/tinysearchengine/config.py deleted file mode 100644 index 2fd6f54..0000000 --- a/mwmbl/tinysearchengine/config.py +++ /dev/null @@ -1,40 +0,0 @@ -import pathlib -import yaml -from pydantic import BaseModel, StrictInt, StrictStr, Field - - -class ServerConfigModel(BaseModel): - host: StrictStr = "0.0.0.0" - port: StrictInt = 8080 - log_level: StrictStr = "info" - - -class IndexConfigModel(BaseModel): - index_path: StrictStr = "data/index.tinysearch" - num_pages: StrictInt = 25600 - page_size: StrictInt = 4096 - - -class ConfigModel(BaseModel): - server_config: ServerConfigModel = Field(default_factory=ServerConfigModel) - index_config: IndexConfigModel = Field(default_factory=IndexConfigModel) - terms_path: StrictStr = "data/mwmbl-crawl-terms.csv" - - -def parse_config_file(config_filename: str) -> ConfigModel: - """Parse config dictionary and return ConfigModel.""" - if not pathlib.Path(config_filename).is_file(): - raise ValueError( - f"config_filename: {config_filename} is not a file. Please check if it exists." - ) - - with open(config_filename) as f: - config = yaml.load(f, yaml.Loader) - - return ConfigModel(**config) - - -if __name__ == "__main__": - # Call this from the root of the repo using "python -m mwmbl.tinysearchengine.config" - config_model = parse_config_file(config_filename="config/tinysearchengine.yaml") - print(config_model.dict())
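
Usage notes (illustrative sketches against the patched code above; not part of the
patch series itself):

After patch 2/4, an index file starts with a 4096-byte metadata header -- the
constant b'mwmbl-tiny-search' followed by the metadata as zero-padded JSON --
and is followed by num_pages fixed-size pages, each holding zstd-compressed
JSON padded with zeros to exactly page_size bytes. A minimal sketch of reading
such a file by hand, mirroring TinyIndexMetadata.from_bytes() and get_page()
above (the path 'index.tinysearch' is hypothetical):

    import json
    from zstandard import ZstdDecompressor

    METADATA_CONSTANT = b'mwmbl-tiny-search'
    METADATA_SIZE = 4096

    with open('index.tinysearch', 'rb') as f:
        # The header is the magic constant plus JSON, zero-padded to 4096 bytes.
        header = f.read(METADATA_SIZE)
        assert header.startswith(METADATA_CONSTANT), "not an index file"
        metadata = json.loads(header[len(METADATA_CONSTANT):].rstrip(b'\x00'))
        # e.g. {'version': 1, 'page_size': 4096, 'num_pages': 76800,
        #       'item_factory': 'Document'}

        # Page i starts at METADATA_SIZE + i * page_size; the zstd frame
        # records its own content size, so the trailing zero padding is
        # ignored on decompression (test_create_index above relies on this).
        f.seek(METADATA_SIZE)
        page = f.read(metadata['page_size'])
        print(json.loads(ZstdDecompressor().decompress(page)))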
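After patch 3/4, index creation is separated from reading and writing. A rough
end-to-end sketch of the resulting API (the path and the Document field values
are assumptions for illustration; in mwmbl a Document holds a title, a URL and
a text extract):

    from mwmbl.tinysearchengine.indexer import Document, TinyIndex

    index_path = 'data/index.tinysearch'  # hypothetical path

    # Create the index file up front; raises FileExistsError if it exists.
    TinyIndex.create(Document, index_path, num_pages=76800, page_size=4096)

    # Writing needs mode='w'; index() hashes the key to a page with mmh3,
    # appends the document tuple and rewrites the page, silently dropping the
    # update if the compressed page would exceed page_size.
    with TinyIndex(Document, index_path, 'w') as indexer:
        indexer.index('banana', Document('Banana', 'https://example.com/banana',
                                         'A banana is an elongated, edible fruit.'))

    # The default mode 'r' maps the file read-only; attempting to write then
    # raises UnsupportedOperation. retrieve() returns every Document stored
    # on the page that the key hashes to.
    with TinyIndex(Document, index_path) as tiny_index:
        print(tiny_index.retrieve('banana'))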