Merge pull request #47 from mwmbl/include-metadata-in-index

Include metadata in index
Daoud Clarke, 3 years ago
commit d19e0e51f7
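This PR moves the index configuration into the index file itself: the first 4096 bytes now hold a metadata header (a magic constant plus JSON-encoded version, page size, number of pages and item factory name). The old TinyIndexBase/TinyIndex/TinyIndexer hierarchy collapses into a single TinyIndex class with an explicit static create() method and 'r'/'w' open modes, the YAML config and the mwmbl.tinysearchengine.config module are dropped in favour of command-line arguments, and pytest is added along with a first test of index creation.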

+ 0 - 12
config/tinysearchengine.yaml

@@ -1,12 +0,0 @@
-# Config for bootstrapping tinysearchengine.
-# Follows the schema/model defined by mwmbl.tinysearchengine.config.ConfigModel
-
-server_config:
-  host: "0.0.0.0"
-  port: 8080
-  log_level: "info"
-
-index_config:
-  index_path: data/index.tinysearch
-  num_pages: 76800
-  page_size: 4096

+ 2 - 4
mwmbl/indexer/index.py

@@ -10,7 +10,7 @@ import pandas as pd

 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
+from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex

 HTTP_START = 'http://'
 HTTPS_START = 'https://'
@@ -66,9 +66,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk


-def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
-    indexer.create_if_not_exists()
-
+def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path):
     terms = Counter()
     pages = get_pages(nlp, titles_urls_and_extracts)
     for page in pages:

+ 6 - 4
mwmbl/indexer/index_crawl.py

@@ -8,16 +8,18 @@ import spacy
 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
 from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
-from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE


 logger = getLogger(__name__)


-def index_mwmbl_craw_data():
+def index_mwmbl_crawl_data():
     nlp = spacy.load("en_core_web_sm")

-    with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+
+    with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
         titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
         index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)

@@ -43,4 +45,4 @@ def get_mwmbl_crawl_titles_urls_and_extracts():


 if __name__ == '__main__':
-    index_mwmbl_craw_data()
+    index_mwmbl_crawl_data()

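The create-then-write pattern above is the core API change; a minimal sketch of a full round trip (the path and document values are illustrative, and it assumes Document's fields are title, url and extract):

from mwmbl.tinysearchengine.indexer import Document, TinyIndex

# create() writes the metadata header plus num_pages empty compressed pages,
# and raises FileExistsError if the file is already there.
TinyIndex.create(Document, '/tmp/demo.tinysearch', num_pages=128, page_size=4096)

# Open in 'w' mode to index; pages are mmapped read-write past the header.
with TinyIndex(Document, '/tmp/demo.tinysearch', 'w') as indexer:
    doc = Document('Banana', 'https://en.wikipedia.org/wiki/Banana',
                   'A banana is an elongated, edible fruit.')
    indexer.index('banana', doc)

# The default mode is 'r'; page geometry now comes from the metadata header.
with TinyIndex(Document, '/tmp/demo.tinysearch') as indexer:
    print(indexer.retrieve('banana'))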
+ 1 - 1
mwmbl/indexer/index_glob.py

@@ -12,7 +12,7 @@ from .paths import INDEX_PATH, CRAWL_GLOB
 def run():
     # TODO: item_factory argument is unfilled.
     indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-    indexer.create_if_not_exists()
+    indexer.create()
     nlp = English()
     for path in glob(CRAWL_GLOB):
         print("Path", path)

+ 12 - 22
mwmbl/tinysearchengine/app.py

@@ -1,13 +1,12 @@
-import logging
 import argparse
+import logging

 import pandas as pd
 import uvicorn

 from mwmbl.tinysearchengine import create_app
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-from mwmbl.tinysearchengine.config import parse_config_file
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tinysearchengine.rank import Ranker

 logging.basicConfig()
@@ -16,7 +15,8 @@ logging.basicConfig()
 def setup_args():
     """Read all the args."""
     parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
-    parser.add_argument("--config", help="Path to tinysearchengine's yaml config.", required=True)
+    parser.add_argument("--index", help="Path to the tinysearchengine index file", required=True)
+    parser.add_argument("--terms", help="Path to the tinysearchengine terms CSV file", required=True)
     args = parser.parse_args()
     return args

@@ -30,30 +30,20 @@ def main():
     * Initialize a FastAPI app instance
     * Starts uvicorn server using app instance
     """
-    config, tiny_index = get_config_and_index()
+    args = setup_args()

     # Load term data
-    terms = pd.read_csv(config.terms_path)
+    terms = pd.read_csv(args.terms)
     completer = Completer(terms)

-    ranker = Ranker(tiny_index, completer)
+    with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index:
+        ranker = Ranker(tiny_index, completer)

-    # Initialize FastApi instance
-    app = create_app.create(ranker)
+        # Initialize FastApi instance
+        app = create_app.create(ranker)

-    # Initialize uvicorn server using global app instance and server config params
-    uvicorn.run(app, **config.server_config.dict())
-
-
-def get_config_and_index():
-    args = setup_args()
-    config = parse_config_file(config_filename=args.config)
-    # Initialize TinyIndex using index config params
-    tiny_index = TinyIndex(
-        item_factory=Document,
-        **config.index_config.dict()
-    )
-    return config, tiny_index
+        # Initialize uvicorn server using global app instance and server config params
+        uvicorn.run(app, host="0.0.0.0", port=8080)


 if __name__ == "__main__":

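With the YAML config gone, the server is configured entirely from the command line, e.g. (paths illustrative, echoing the old config defaults):

    python -m mwmbl.tinysearchengine.app --index data/index.tinysearch --terms data/mwmbl-crawl-terms.csv

Host and port are now hard-coded to 0.0.0.0:8080 in main(), and the index geometry no longer needs to be configured at all: TinyIndex reads num_pages and page_size from the file's metadata header.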
+ 0 - 40
mwmbl/tinysearchengine/config.py

@@ -1,40 +0,0 @@
-import pathlib
-import yaml
-from pydantic import BaseModel, StrictInt, StrictStr, Field
-
-
-class ServerConfigModel(BaseModel):
-    host: StrictStr = "0.0.0.0"
-    port: StrictInt = 8080
-    log_level: StrictStr = "info"
-
-
-class IndexConfigModel(BaseModel):
-    index_path: StrictStr = "data/index.tinysearch"
-    num_pages: StrictInt = 25600
-    page_size: StrictInt = 4096
-
-
-class ConfigModel(BaseModel):
-    server_config: ServerConfigModel = Field(default_factory=ServerConfigModel)
-    index_config: IndexConfigModel = Field(default_factory=IndexConfigModel)
-    terms_path: StrictStr = "data/mwmbl-crawl-terms.csv"
-
-
-def parse_config_file(config_filename: str) -> ConfigModel:
-    """Parse config dictionary and return ConfigModel."""
-    if not pathlib.Path(config_filename).is_file():
-        raise ValueError(
-            f"config_filename: {config_filename} is not a file. Please check if it exists."
-        )
-
-    with open(config_filename) as f:
-        config = yaml.load(f, yaml.Loader)
-
-    return ConfigModel(**config)
-
-
-if __name__ == "__main__":
-    # Call this from the root of the repo using "python -m mwmbl.tinysearchengine.config"
-    config_model = parse_config_file(config_filename="config/tinysearchengine.yaml")
-    print(config_model.dict())

+ 104 - 62
mwmbl/tinysearchengine/indexer.py

@@ -1,13 +1,16 @@
 import json
 import os
-from dataclasses import astuple, dataclass
-from mmap import mmap, PROT_READ
-from pathlib import Path
+from dataclasses import astuple, dataclass, asdict
+from io import UnsupportedOperation
+from mmap import mmap, PROT_READ, PROT_WRITE
 from typing import TypeVar, Generic, Callable, List

 import mmh3
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
+from zstandard import ZstdDecompressor, ZstdCompressor

+VERSION = 1
+METADATA_CONSTANT = b'mwmbl-tiny-search'
+METADATA_SIZE = 4096

 NUM_PAGES = 76800
 PAGE_SIZE = 4096
@@ -28,82 +31,109 @@ class TokenizedDocument(Document):
 T = TypeVar('T')


-class TinyIndexBase(Generic[T]):
-    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
-        self.item_factory = item_factory
-        self.num_pages = num_pages
-        self.page_size = page_size
-        self.decompressor = ZstdDecompressor()
-        self.mmap = None
+@dataclass
+class TinyIndexMetadata:
+    version: int
+    page_size: int
+    num_pages: int
+    item_factory: str

-    def retrieve(self, key: str) -> List[T]:
-        index = self._get_key_page_index(key)
-        page = self.get_page(index)
-        if page is None:
-            return []
-        return self.convert_items(page)
+    def to_bytes(self) -> bytes:
+        metadata_bytes = METADATA_CONSTANT + json.dumps(asdict(self)).encode('utf8')
+        assert len(metadata_bytes) <= METADATA_SIZE
+        return metadata_bytes

-    def _get_key_page_index(self, key):
-        key_hash = mmh3.hash(key, signed=False)
-        return key_hash % self.num_pages
+    @staticmethod
+    def from_bytes(data: bytes):
+        constant_length = len(METADATA_CONSTANT)
+        metadata_constant = data[:constant_length]
+        if metadata_constant != METADATA_CONSTANT:
+            raise ValueError("This doesn't seem to be an index file")

-    def get_page(self, i):
-        """
-        Get the page at index i, decompress and deserialise it using JSON
-        """
-        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
-        try:
-            decompressed_data = self.decompressor.decompress(page_data)
-        except ZstdError:
-            return None
-        results = json.loads(decompressed_data.decode('utf8'))
-        return results
+        values = json.loads(data[constant_length:].decode('utf8'))
+        return TinyIndexMetadata(**values)

-    def convert_items(self, items) -> List[T]:
-        converted = [self.item_factory(*item) for item in items]
-        return converted

+def _get_page_data(compressor, page_size, data):
+    serialised_data = json.dumps(data)
+    compressed_data = compressor.compress(serialised_data.encode('utf8'))
+    return _pad_to_page_size(compressed_data, page_size)

-class TinyIndex(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.index_file = open(self.index_path, 'rb')
-        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
+
+def _pad_to_page_size(data: bytes, page_size: int):
+    page_length = len(data)
+    if page_length > page_size:
+        raise ValueError(f"Data is too big ({page_length}) for page size ({page_size})")
+    padding = b'\x00' * (page_size - page_length)
+    page_data = data + padding
+    return page_data


-class TinyIndexer(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
-        super().__init__(item_factory, num_pages, page_size)
+class TinyIndex(Generic[T]):
+    def __init__(self, item_factory: Callable[..., T], index_path, mode='r'):
+        if mode not in {'r', 'w'}:
+            raise ValueError(f"Mode should be one of 'r' or 'w', got {mode}")
+
+        with open(index_path, 'rb') as index_file:
+            metadata_page = index_file.read(METADATA_SIZE)
+
+        metadata_bytes = metadata_page.rstrip(b'\x00')
+        metadata = TinyIndexMetadata.from_bytes(metadata_bytes)
+        if metadata.item_factory != item_factory.__name__:
+            raise ValueError(f"Metadata item factory '{metadata.item_factory}' in the index "
+                             f"does not match the passed item factory: '{item_factory.__name__}'")
+
+        self.item_factory = item_factory
         self.index_path = index_path
+        self.mode = mode
+
+        self.num_pages = metadata.num_pages
+        self.page_size = metadata.page_size
         self.compressor = ZstdCompressor()
         self.decompressor = ZstdDecompressor()
         self.index_file = None
         self.mmap = None

     def __enter__(self):
-        self.create_if_not_exists()
         self.index_file = open(self.index_path, 'r+b')
-        self.mmap = mmap(self.index_file.fileno(), 0)
+        prot = PROT_READ if self.mode == 'r' else PROT_READ | PROT_WRITE
+        self.mmap = mmap(self.index_file.fileno(), 0, offset=METADATA_SIZE, prot=prot)
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
         self.mmap.close()
         self.index_file.close()

+    def retrieve(self, key: str) -> List[T]:
+        index = self._get_key_page_index(key)
+        return self.get_page(index)
+
+    def _get_key_page_index(self, key):
+        key_hash = mmh3.hash(key, signed=False)
+        return key_hash % self.num_pages
+
+    def get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        results = self._get_page_tuples(i)
+        return [self.item_factory(*item) for item in results]
+
+    def _get_page_tuples(self, i):
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        decompressed_data = self.decompressor.decompress(page_data)
+        return json.loads(decompressed_data.decode('utf8'))
+
     def index(self, key: str, value: T):
-        # print("Index", value)
         assert type(value) == self.item_factory, f"Can only index the specified type" \
                                               f" ({self.item_factory.__name__})"
         page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
+        current_page = self._get_page_tuples(page_index)
         if current_page is None:
             current_page = []
         value_tuple = astuple(value)
-        # print("Value tuple", value_tuple)
         current_page.append(value_tuple)
         try:
-            # print("Page", current_page)
             self._write_page(current_page, page_index)
         except ValueError:
             pass
@@ -113,16 +143,28 @@ class TinyIndexer(TinyIndexBase[T]):
         Serialise the data using JSON, compress it and store it at index i.
         If the data is too big, it will raise a ValueError and not store anything
         """
-        serialised_data = json.dumps(data)
-        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
-        page_length = len(compressed_data)
-        if page_length > self.page_size:
-            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
-        padding = b'\x00' * (self.page_size - page_length)
-        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
-
-    def create_if_not_exists(self):
-        if not os.path.isfile(self.index_path):
-            file_length = self.num_pages * self.page_size
-            with open(self.index_path, 'wb') as index_file:
-                index_file.write(b'\x00' * file_length)
+        if self.mode != 'w':
+            raise UnsupportedOperation("The file is open in read mode, you cannot write")
+
+        page_data = _get_page_data(self.compressor, self.page_size, data)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = page_data
+
+    @staticmethod
+    def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
+        if os.path.isfile(index_path):
+            raise FileExistsError(f"Index file '{index_path}' already exists")
+
+        metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__)
+        metadata_bytes = metadata.to_bytes()
+        metadata_padded = _pad_to_page_size(metadata_bytes, METADATA_SIZE)
+
+        compressor = ZstdCompressor()
+        page_bytes = _get_page_data(compressor, page_size, [])
+
+        with open(index_path, 'wb') as index_file:
+            index_file.write(metadata_padded)
+            for i in range(num_pages):
+                index_file.write(page_bytes)
+
+        return TinyIndex(item_factory, index_path=index_path)
+

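The resulting on-disk layout is a 4096-byte header (METADATA_CONSTANT followed by JSON metadata, zero-padded) and then num_pages pages, each page_size bytes of zstd-compressed JSON. A small sketch of inspecting the header using only the names defined above (the path is illustrative):

from mwmbl.tinysearchengine.indexer import (
    Document, TinyIndex, TinyIndexMetadata, METADATA_SIZE)

TinyIndex.create(Document, '/tmp/inspect.tinysearch', num_pages=16, page_size=4096)

# Read back and strip the zero padding, as TinyIndex.__init__ does.
with open('/tmp/inspect.tinysearch', 'rb') as f:
    header = f.read(METADATA_SIZE).rstrip(b'\x00')

metadata = TinyIndexMetadata.from_bytes(header)
print(metadata)
# TinyIndexMetadata(version=1, page_size=4096, num_pages=16, item_factory='Document')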
+ 111 - 3
poetry.lock

@@ -26,6 +26,28 @@ python-versions = ">=3.7"
 [package.extras]
 tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]

+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+description = "Atomic file writes."
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "attrs"
+version = "21.4.0"
+description = "Classes Without Boilerplate"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.extras]
+dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"]
+docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
+tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"]
+tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.10.0"
@@ -163,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
 [package.source]
 type = "url"
 url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
+
 [[package]]
 name = "fastapi"
 version = "0.70.1"
@@ -197,6 +220,14 @@ category = "main"
 optional = false
 python-versions = ">=3.5"

+[[package]]
+name = "iniconfig"
+version = "1.1.1"
+description = "iniconfig: brain-dead simple config-ini parsing"
+category = "dev"
+optional = false
+python-versions = "*"
+
 [[package]]
 name = "jinja2"
 version = "3.0.3"
@@ -314,7 +345,7 @@ name = "packaging"
 version = "21.3"
 description = "Core utilities for Python packages"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.6"

 [package.dependencies]
@@ -354,6 +385,18 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"]
 s3 = ["boto3"]
 test = ["pytest", "pytest-coverage", "mock", "typer-cli"]

+[[package]]
+name = "pluggy"
+version = "1.0.0"
+description = "plugin and hook calling mechanisms for python"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+testing = ["pytest", "pytest-benchmark"]
+
 [[package]]
 name = "preshed"
 version = "3.0.6"
@@ -366,6 +409,14 @@ python-versions = "*"
 cymem = ">=2.0.2,<2.1.0"
 murmurhash = ">=0.28.0,<1.1.0"

+[[package]]
+name = "py"
+version = "1.11.0"
+description = "library with cross-python path, ini-parsing, io, code, log facilities"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
 [[package]]
 name = "py4j"
 version = "0.10.9.2"
@@ -413,7 +464,7 @@ name = "pyparsing"
 version = "3.0.7"
 description = "Python parsing module"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.6"

 [package.extras]
@@ -436,6 +487,27 @@ mllib = ["numpy (>=1.7)"]
 pandas_on_spark = ["numpy (>=1.14)", "pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]
 sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]

+[[package]]
+name = "pytest"
+version = "7.0.1"
+description = "pytest: simple powerful testing with Python"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
+attrs = ">=19.2.0"
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=0.12,<2.0"
+py = ">=1.8.2"
+tomli = ">=1.0.0"
+
+[package.extras]
+testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
+
 [[package]]
 name = "python-dateutil"
 version = "2.8.2"
@@ -680,6 +752,14 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"]
 tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"]
 torch = ["torch (>=1.5.0)"]

+[[package]]
+name = "tomli"
+version = "2.0.1"
+description = "A lil' TOML parser"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+
 [[package]]
 name = "tqdm"
 version = "4.62.3"
@@ -797,7 +877,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.10"
-content-hash = "b5af8ce9887d0cf69297180fbb4040e1522e4a3135f8b651415afb35f86124ef"
+content-hash = "edb2d4bc50cb09ac5f7ba311d5238eb2deeab1d12f479067cc7239e3232bf6c9"

 [metadata.files]
 anyio = [
@@ -808,6 +888,14 @@ asgiref = [
     {file = "asgiref-3.5.0-py3-none-any.whl", hash = "sha256:88d59c13d634dcffe0510be048210188edd79aeccb6a6c9028cdad6f31d730a9"},
     {file = "asgiref-3.5.0.tar.gz", hash = "sha256:2f8abc20f7248433085eda803936d98992f1343ddb022065779f37c5da0181d0"},
 ]
+atomicwrites = [
+    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+attrs = [
+    {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"},
+    {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"},
+]
 beautifulsoup4 = [
     {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"},
     {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"},
@@ -941,6 +1029,10 @@ idna = [
     {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
     {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
 ]
+iniconfig = [
+    {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
+    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
+]
 jinja2 = [
     {file = "Jinja2-3.0.3-py3-none-any.whl", hash = "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8"},
     {file = "Jinja2-3.0.3.tar.gz", hash = "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"},
@@ -1255,6 +1347,10 @@ pathy = [
     {file = "pathy-0.6.1-py3-none-any.whl", hash = "sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a"},
     {file = "pathy-0.6.1.tar.gz", hash = "sha256:838624441f799a06b446a657e4ecc9ebc3fdd05234397e044a7c87e8f6e76b1c"},
 ]
+pluggy = [
+    {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
+    {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
+]
 preshed = [
     {file = "preshed-3.0.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:66a71ced487516cf81fd0431a3a843514262ae2f33e9a7688b87562258fa75d5"},
     {file = "preshed-3.0.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c98f725d8478f3ade4ab1ea00f50a92d2d9406d37276bc46fd8bab1d47452c4"},
@@ -1273,6 +1369,10 @@ preshed = [
     {file = "preshed-3.0.6-cp39-cp39-win_amd64.whl", hash = "sha256:92a8f49d17a63537a8beed48a049b62ef168ca07e0042a5b2bcdf178a1fb5d48"},
     {file = "preshed-3.0.6.tar.gz", hash = "sha256:fb3b7588a3a0f2f2f1bf3fe403361b2b031212b73a37025aea1df7215af3772a"},
 ]
+py = [
+    {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"},
+    {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
+]
 py4j = [
     {file = "py4j-0.10.9.2-py2.py3-none-any.whl", hash = "sha256:bf0485388e415ff26710d2dc719cb0ede16cf1164b1ee757e0ebb2e98c471521"},
     {file = "py4j-0.10.9.2.tar.gz", hash = "sha256:624f97c363b8dd84822bc666b12fa7f7d97824632b2ff3d852cc491359ce7615"},
@@ -1350,6 +1450,10 @@ pyparsing = [
 pyspark = [
     {file = "pyspark-3.2.0.tar.gz", hash = "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab"},
 ]
+pytest = [
+    {file = "pytest-7.0.1-py3-none-any.whl", hash = "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db"},
+    {file = "pytest-7.0.1.tar.gz", hash = "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"},
+]
 python-dateutil = [
     {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
     {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
@@ -1539,6 +1643,10 @@ thinc = [
     {file = "thinc-8.0.13-cp39-cp39-win_amd64.whl", hash = "sha256:b3ae088f60d3dfe6a88c6be37548aae40023e46a718cffe3e43953b4f0ffc340"},
     {file = "thinc-8.0.13.tar.gz", hash = "sha256:47662a3ae33d445a77b6ea7b772444805c7bba8991f122e350daf72dedc8171a"},
 ]
+tomli = [
+    {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
+    {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
+]
 tqdm = [
     {file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"},
     {file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"},

+ 1 - 0
pyproject.toml

@@ -64,6 +64,7 @@ indexer = [
 # langdetect = "^1.0.9"
 # spacy = "^3.2.1"
 # Levenshtein = "^0.16.0"
+pytest = "^7.0.1"

 [build-system]
 requires = ["poetry-core>=1.0.0"]

+ 16 - 0
test/test_indexer.py

@@ -0,0 +1,16 @@
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from mwmbl.tinysearchengine.indexer import Document, TinyIndex
+
+
+def test_create_index():
+    num_pages = 10
+    page_size = 4096
+
+    with TemporaryDirectory() as temp_dir:
+        index_path = Path(temp_dir) / 'temp-index.tinysearch'
+        with TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) as indexer:
+            for i in range(num_pages):
+                page = indexer.get_page(i)
+                assert page == []
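
A natural companion test, not part of this PR, would exercise the item-factory check in TinyIndex.__init__; a hedged sketch reusing the imports above:

import pytest


def test_wrong_item_factory_raises():
    with TemporaryDirectory() as temp_dir:
        index_path = str(Path(temp_dir) / 'temp-index.tinysearch')
        TinyIndex.create(Document, index_path, num_pages=2, page_size=4096)

        class NotADocument:
            pass

        # The stored item_factory name is 'Document', so this must not match.
        with pytest.raises(ValueError):
            TinyIndex(NotADocument, index_path)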