From e6273c7f7636e8afa7b59625705e5352c57e8417 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Fri, 18 Feb 2022 22:12:22 +0000 Subject: [PATCH 1/4] WIP: include metadata in index - using struct approach --- mwmbl/indexer/index.py | 2 +- mwmbl/indexer/index_glob.py | 2 +- mwmbl/tinysearchengine/indexer.py | 163 +++++++++++++++++++----------- poetry.lock | 114 ++++++++++++++++++++- pyproject.toml | 1 + test/test_indexer.py | 17 ++++ 6 files changed, 237 insertions(+), 62 deletions(-) create mode 100644 test/test_indexer.py diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py index 8bd0dc9..94dc1e3 100644 --- a/mwmbl/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -67,7 +67,7 @@ def grouper(n: int, iterator: Iterator): def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path): - indexer.create_if_not_exists() + indexer.create() terms = Counter() pages = get_pages(nlp, titles_urls_and_extracts) diff --git a/mwmbl/indexer/index_glob.py b/mwmbl/indexer/index_glob.py index e9102c2..9bd8b96 100644 --- a/mwmbl/indexer/index_glob.py +++ b/mwmbl/indexer/index_glob.py @@ -12,7 +12,7 @@ from .paths import INDEX_PATH, CRAWL_GLOB def run(): # TODO: item_factory argument is unfilled. indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) - indexer.create_if_not_exists() + indexer.create() nlp = English() for path in glob(CRAWL_GLOB): print("Path", path) diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py index 6055b7f..29ccfb0 100644 --- a/mwmbl/tinysearchengine/indexer.py +++ b/mwmbl/tinysearchengine/indexer.py @@ -1,14 +1,21 @@ import json import os from dataclasses import astuple, dataclass -from mmap import mmap, PROT_READ +from io import UnsupportedOperation +from mmap import mmap, PROT_READ, PROT_WRITE from pathlib import Path +from struct import pack, unpack, calcsize from typing import TypeVar, Generic, Callable, List import mmh3 from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError +VERSION = 1 +METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8') +METADATA_FORMAT = 'IIIs' +METADATA_SIZE = 4096 + NUM_PAGES = 76800 PAGE_SIZE = 4096 @@ -28,20 +35,84 @@ class TokenizedDocument(Document): T = TypeVar('T') -class TinyIndexBase(Generic[T]): - def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int): +@dataclass +class TinyIndexMetadata: + version: int + page_size: int + num_pages: int + item_factory: str + + def to_bytes(self) -> bytes: + result = METADATA_CONSTANT + pack( + METADATA_FORMAT, self.version, self.page_size, self.num_pages, self.item_factory.encode('utf8') + ) + assert len(result) <= METADATA_SIZE + return result + + @staticmethod + def from_bytes(data: bytes): + constant_length = len(METADATA_CONSTANT) + metadata_constant = data[:constant_length] + if metadata_constant != METADATA_CONSTANT: + raise ValueError("This doesn't seem to be an index file") + + actual_metadata_size = calcsize(METADATA_FORMAT) + values = unpack(METADATA_FORMAT, data[constant_length:constant_length+actual_metadata_size]) + return TinyIndexMetadata(values[0], values[1], values[2], values[3].decode('utf8')) + + +def _get_page_data(compressor, page_size, data): + serialised_data = json.dumps(data) + compressed_data = compressor.compress(serialised_data.encode('utf8')) + return _pad_to_page_size(compressed_data, page_size) + + +def _pad_to_page_size(data: bytes, page_size: int): + page_length = len(data) + if page_length > page_size: + raise ValueError(f"Data is too big ({page_length}) for page 
size ({page_size})") + padding = b'\x00' * (page_size - page_length) + page_data = data + padding + return page_data + + +class TinyIndex(Generic[T]): + def __init__(self, item_factory: Callable[..., T], index_path, mode='r'): + if mode not in {'r', 'w'}: + raise ValueError(f"Mode should be one of 'r' or 'w', got {mode}") + + with open(index_path, 'rb') as index_file: + metadata_page = index_file.read(METADATA_SIZE) + + metadata = TinyIndexMetadata.from_bytes(metadata_page) + if metadata.item_factory != item_factory.__name__: + raise ValueError(f"Metadata item factory '{metadata.item_factory}' in the index " + f"does not match the passed item factory: '{item_factory.__name__}'") + self.item_factory = item_factory - self.num_pages = num_pages - self.page_size = page_size + self.index_path = index_path + self.mode = mode + + self.num_pages = metadata.num_pages + self.page_size = metadata.page_size + self.compressor = ZstdCompressor() self.decompressor = ZstdDecompressor() + self.index_file = None self.mmap = None + def __enter__(self): + self.index_file = open(self.index_path, 'r+b') + prot = PROT_READ if self.mode == 'r' else PROT_READ | PROT_WRITE + self.mmap = mmap(self.index_file.fileno(), 0, offset=METADATA_SIZE, prot=prot) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.mmap.close() + self.index_file.close() + def retrieve(self, key: str) -> List[T]: index = self._get_key_page_index(key) - page = self.get_page(index) - if page is None: - return [] - return self.convert_items(page) + return self.get_page(index) def _get_key_page_index(self, key): key_hash = mmh3.hash(key, signed=False) @@ -52,45 +123,11 @@ class TinyIndexBase(Generic[T]): Get the page at index i, decompress and deserialise it using JSON """ page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size] - try: - decompressed_data = self.decompressor.decompress(page_data) - except ZstdError: - return None + decompressed_data = self.decompressor.decompress(page_data) results = json.loads(decompressed_data.decode('utf8')) - return results - - def convert_items(self, items) -> List[T]: - converted = [self.item_factory(*item) for item in items] + converted = [self.item_factory(*item) for item in results] return converted - -class TinyIndex(TinyIndexBase[T]): - def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size): - super().__init__(item_factory, num_pages, page_size) - self.index_path = index_path - self.index_file = open(self.index_path, 'rb') - self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ) - - -class TinyIndexer(TinyIndexBase[T]): - def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int): - super().__init__(item_factory, num_pages, page_size) - self.index_path = index_path - self.compressor = ZstdCompressor() - self.decompressor = ZstdDecompressor() - self.index_file = None - self.mmap = None - - def __enter__(self): - self.create_if_not_exists() - self.index_file = open(self.index_path, 'r+b') - self.mmap = mmap(self.index_file.fileno(), 0) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.mmap.close() - self.index_file.close() - def index(self, key: str, value: T): # print("Index", value) assert type(value) == self.item_factory, f"Can only index the specified type" \ @@ -113,16 +150,28 @@ class TinyIndexer(TinyIndexBase[T]): Serialise the data using JSON, compress it and store it at index i. 
If the data is too big, it will raise a ValueError and not store anything """ - serialised_data = json.dumps(data) - compressed_data = self.compressor.compress(serialised_data.encode('utf8')) - page_length = len(compressed_data) - if page_length > self.page_size: - raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})") - padding = b'\x00' * (self.page_size - page_length) - self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding + if self.mode != 'w': + raise UnsupportedOperation("The file is open in read mode, you cannot write") + + page_data = _get_page_data(self.compressor, self.page_size, data) + self.mmap[i * self.page_size:(i+1) * self.page_size] = page_data + + @staticmethod + def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int): + if os.path.isfile(index_path): + raise FileExistsError("Index file already exists") + + metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__) + metadata_bytes = metadata.to_bytes() + metadata_padded = _pad_to_page_size(metadata_bytes, METADATA_SIZE) + + compressor = ZstdCompressor() + page_bytes = _get_page_data(compressor, page_size, []) + + with open(index_path, 'wb') as index_file: + index_file.write(metadata_padded) + for i in range(num_pages): + index_file.write(page_bytes) + + return TinyIndex(item_factory, index_path=index_path) - def create_if_not_exists(self): - if not os.path.isfile(self.index_path): - file_length = self.num_pages * self.page_size - with open(self.index_path, 'wb') as index_file: - index_file.write(b'\x00' * file_length) diff --git a/poetry.lock b/poetry.lock index 2d146ef..cc3189d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -26,6 +26,28 @@ python-versions = ">=3.7" [package.extras] tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"] +[[package]] +name = "atomicwrites" +version = "1.4.0" +description = "Atomic file writes." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.4.0" +description = "Classes Without Boilerplate" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] + [[package]] name = "beautifulsoup4" version = "4.10.0" @@ -163,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0" [package.source] type = "url" url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz" + [[package]] name = "fastapi" version = "0.70.1" @@ -197,6 +220,14 @@ category = "main" optional = false python-versions = ">=3.5" +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "jinja2" version = "3.0.3" @@ -314,7 +345,7 @@ name = "packaging" version = "21.3" description = "Core utilities for Python packages" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] @@ -354,6 +385,18 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] s3 = ["boto3"] test = ["pytest", "pytest-coverage", "mock", "typer-cli"] +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "preshed" version = "3.0.6" @@ -366,6 +409,14 @@ python-versions = "*" cymem = ">=2.0.2,<2.1.0" murmurhash = ">=0.28.0,<1.1.0" +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "py4j" version = "0.10.9.2" @@ -413,7 +464,7 @@ name = "pyparsing" version = "3.0.7" description = "Python parsing module" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.extras] @@ -436,6 +487,27 @@ mllib = ["numpy (>=1.7)"] pandas_on_spark = ["numpy (>=1.14)", "pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] +[[package]] +name = "pytest" +version = "7.0.1" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +tomli = ">=1.0.0" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + [[package]] name = "python-dateutil" 
version = "2.8.2" @@ -680,6 +752,14 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] torch = ["torch (>=1.5.0)"] +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "tqdm" version = "4.62.3" @@ -797,7 +877,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "b5af8ce9887d0cf69297180fbb4040e1522e4a3135f8b651415afb35f86124ef" +content-hash = "edb2d4bc50cb09ac5f7ba311d5238eb2deeab1d12f479067cc7239e3232bf6c9" [metadata.files] anyio = [ @@ -808,6 +888,14 @@ asgiref = [ {file = "asgiref-3.5.0-py3-none-any.whl", hash = "sha256:88d59c13d634dcffe0510be048210188edd79aeccb6a6c9028cdad6f31d730a9"}, {file = "asgiref-3.5.0.tar.gz", hash = "sha256:2f8abc20f7248433085eda803936d98992f1343ddb022065779f37c5da0181d0"}, ] +atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] beautifulsoup4 = [ {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"}, {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"}, @@ -941,6 +1029,10 @@ idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] jinja2 = [ {file = "Jinja2-3.0.3-py3-none-any.whl", hash = "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8"}, {file = "Jinja2-3.0.3.tar.gz", hash = "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"}, @@ -1255,6 +1347,10 @@ pathy = [ {file = "pathy-0.6.1-py3-none-any.whl", hash = "sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a"}, {file = "pathy-0.6.1.tar.gz", hash = "sha256:838624441f799a06b446a657e4ecc9ebc3fdd05234397e044a7c87e8f6e76b1c"}, ] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] preshed = [ {file = "preshed-3.0.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:66a71ced487516cf81fd0431a3a843514262ae2f33e9a7688b87562258fa75d5"}, {file = "preshed-3.0.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c98f725d8478f3ade4ab1ea00f50a92d2d9406d37276bc46fd8bab1d47452c4"}, @@ -1273,6 +1369,10 @@ preshed = [ {file = "preshed-3.0.6-cp39-cp39-win_amd64.whl", hash = 
"sha256:92a8f49d17a63537a8beed48a049b62ef168ca07e0042a5b2bcdf178a1fb5d48"}, {file = "preshed-3.0.6.tar.gz", hash = "sha256:fb3b7588a3a0f2f2f1bf3fe403361b2b031212b73a37025aea1df7215af3772a"}, ] +py = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] py4j = [ {file = "py4j-0.10.9.2-py2.py3-none-any.whl", hash = "sha256:bf0485388e415ff26710d2dc719cb0ede16cf1164b1ee757e0ebb2e98c471521"}, {file = "py4j-0.10.9.2.tar.gz", hash = "sha256:624f97c363b8dd84822bc666b12fa7f7d97824632b2ff3d852cc491359ce7615"}, @@ -1350,6 +1450,10 @@ pyparsing = [ pyspark = [ {file = "pyspark-3.2.0.tar.gz", hash = "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab"}, ] +pytest = [ + {file = "pytest-7.0.1-py3-none-any.whl", hash = "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db"}, + {file = "pytest-7.0.1.tar.gz", hash = "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -1539,6 +1643,10 @@ thinc = [ {file = "thinc-8.0.13-cp39-cp39-win_amd64.whl", hash = "sha256:b3ae088f60d3dfe6a88c6be37548aae40023e46a718cffe3e43953b4f0ffc340"}, {file = "thinc-8.0.13.tar.gz", hash = "sha256:47662a3ae33d445a77b6ea7b772444805c7bba8991f122e350daf72dedc8171a"}, ] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] tqdm = [ {file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"}, {file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"}, diff --git a/pyproject.toml b/pyproject.toml index 02ae404..cd790c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ indexer = [ # langdetect = "^1.0.9" # spacy = "^3.2.1" # Levenshtein = "^0.16.0" +pytest = "^7.0.1" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test/test_indexer.py b/test/test_indexer.py new file mode 100644 index 0000000..34113da --- /dev/null +++ b/test/test_indexer.py @@ -0,0 +1,17 @@ +from pathlib import Path +from tempfile import TemporaryDirectory + +from mwmbl.tinysearchengine.indexer import Document, TinyIndex + + +def test_create_index(): + num_pages = 10 + page_size = 4096 + + with TemporaryDirectory() as temp_dir: + index_path = Path(temp_dir) / 'temp-index.tinysearch' + indexer = TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) + + for i in range(num_pages): + page = indexer.get_page(i) + assert page == [] From 326f7e3d7f412068d5f1194805f9c6c4c2e40352 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Fri, 18 Feb 2022 22:22:47 +0000 Subject: [PATCH 2/4] Use JSON instead of struct to store metadata --- mwmbl/tinysearchengine/indexer.py | 26 +++++++++----------------- test/test_indexer.py | 9 ++++----- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py index 
29ccfb0..5a33cb2 100644 --- a/mwmbl/tinysearchengine/indexer.py +++ b/mwmbl/tinysearchengine/indexer.py @@ -1,15 +1,12 @@ import json import os -from dataclasses import astuple, dataclass +from dataclasses import astuple, dataclass, asdict from io import UnsupportedOperation from mmap import mmap, PROT_READ, PROT_WRITE -from pathlib import Path -from struct import pack, unpack, calcsize from typing import TypeVar, Generic, Callable, List import mmh3 -from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError - +from zstandard import ZstdDecompressor, ZstdCompressor VERSION = 1 METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8') @@ -43,11 +40,9 @@ class TinyIndexMetadata: item_factory: str def to_bytes(self) -> bytes: - result = METADATA_CONSTANT + pack( - METADATA_FORMAT, self.version, self.page_size, self.num_pages, self.item_factory.encode('utf8') - ) - assert len(result) <= METADATA_SIZE - return result + metadata_bytes = METADATA_CONSTANT + json.dumps(asdict(self)).encode('utf8') + assert len(metadata_bytes) <= METADATA_SIZE + return metadata_bytes @staticmethod def from_bytes(data: bytes): @@ -56,9 +51,8 @@ class TinyIndexMetadata: if metadata_constant != METADATA_CONSTANT: raise ValueError("This doesn't seem to be an index file") - actual_metadata_size = calcsize(METADATA_FORMAT) - values = unpack(METADATA_FORMAT, data[constant_length:constant_length+actual_metadata_size]) - return TinyIndexMetadata(values[0], values[1], values[2], values[3].decode('utf8')) + values = json.loads(data[constant_length:].decode('utf8')) + return TinyIndexMetadata(**values) def _get_page_data(compressor, page_size, data): @@ -84,7 +78,8 @@ class TinyIndex(Generic[T]): with open(index_path, 'rb') as index_file: metadata_page = index_file.read(METADATA_SIZE) - metadata = TinyIndexMetadata.from_bytes(metadata_page) + metadata_bytes = metadata_page.rstrip(b'\x00') + metadata = TinyIndexMetadata.from_bytes(metadata_bytes) if metadata.item_factory != item_factory.__name__: raise ValueError(f"Metadata item factory '{metadata.item_factory}' in the index " f"does not match the passed item factory: '{item_factory.__name__}'") @@ -129,7 +124,6 @@ class TinyIndex(Generic[T]): return converted def index(self, key: str, value: T): - # print("Index", value) assert type(value) == self.item_factory, f"Can only index the specified type" \ f" ({self.item_factory.__name__})" page_index = self._get_key_page_index(key) @@ -137,10 +131,8 @@ class TinyIndex(Generic[T]): if current_page is None: current_page = [] value_tuple = astuple(value) - # print("Value tuple", value_tuple) current_page.append(value_tuple) try: - # print("Page", current_page) self._write_page(current_page, page_index) except ValueError: pass diff --git a/test/test_indexer.py b/test/test_indexer.py index 34113da..0ac1cb5 100644 --- a/test/test_indexer.py +++ b/test/test_indexer.py @@ -10,8 +10,7 @@ def test_create_index(): with TemporaryDirectory() as temp_dir: index_path = Path(temp_dir) / 'temp-index.tinysearch' - indexer = TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) - - for i in range(num_pages): - page = indexer.get_page(i) - assert page == [] + with TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) as indexer: + for i in range(num_pages): + page = indexer.get_page(i) + assert page == [] From ae3b334a7fce469f764afee161e1da334e53c81c Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Tue, 22 Feb 2022 22:12:39 +0000 Subject: [PATCH 3/4] Fixes for API changes --- 
mwmbl/indexer/index.py | 6 ++---- mwmbl/indexer/index_crawl.py | 10 ++++++---- mwmbl/tinysearchengine/indexer.py | 15 ++++++++------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py index 94dc1e3..9368433 100644 --- a/mwmbl/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -10,7 +10,7 @@ import pandas as pd # NUM_PAGES = 8192 # PAGE_SIZE = 512 -from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument +from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex HTTP_START = 'http://' HTTPS_START = 'https://' @@ -66,9 +66,7 @@ def grouper(n: int, iterator: Iterator): yield chunk -def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path): - indexer.create() - +def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path): terms = Counter() pages = get_pages(nlp, titles_urls_and_extracts) for page in pages: diff --git a/mwmbl/indexer/index_crawl.py b/mwmbl/indexer/index_crawl.py index 3c30482..44e557e 100644 --- a/mwmbl/indexer/index_crawl.py +++ b/mwmbl/indexer/index_crawl.py @@ -8,16 +8,18 @@ import spacy from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError from mwmbl.indexer.index import index_titles_urls_and_extracts from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR -from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE +from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE logger = getLogger(__name__) -def index_mwmbl_craw_data(): +def index_mwmbl_crawl_data(): nlp = spacy.load("en_core_web_sm") - with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: + TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) + + with TinyIndex(Document, INDEX_PATH, 'w') as indexer: titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts() index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH) @@ -43,4 +45,4 @@ def get_mwmbl_crawl_titles_urls_and_extracts(): if __name__ == '__main__': - index_mwmbl_craw_data() + index_mwmbl_crawl_data() diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py index 5a33cb2..1beb1b0 100644 --- a/mwmbl/tinysearchengine/indexer.py +++ b/mwmbl/tinysearchengine/indexer.py @@ -9,8 +9,7 @@ import mmh3 from zstandard import ZstdDecompressor, ZstdCompressor VERSION = 1 -METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8') -METADATA_FORMAT = 'IIIs' +METADATA_CONSTANT = b'mwmbl-tiny-search' METADATA_SIZE = 4096 NUM_PAGES = 76800 @@ -117,17 +116,19 @@ class TinyIndex(Generic[T]): """ Get the page at index i, decompress and deserialise it using JSON """ + results = self._get_page_tuples(i) + return [self.item_factory(*item) for item in results] + + def _get_page_tuples(self, i): page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size] decompressed_data = self.decompressor.decompress(page_data) - results = json.loads(decompressed_data.decode('utf8')) - converted = [self.item_factory(*item) for item in results] - return converted + return json.loads(decompressed_data.decode('utf8')) def index(self, key: str, value: T): assert type(value) == self.item_factory, f"Can only index the specified type" \ f" ({self.item_factory.__name__})" page_index = self._get_key_page_index(key) - current_page = self.get_page(page_index) + current_page = self._get_page_tuples(page_index) if 
current_page is None: current_page = [] value_tuple = astuple(value) @@ -151,7 +152,7 @@ class TinyIndex(Generic[T]): @staticmethod def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int): if os.path.isfile(index_path): - raise FileExistsError("Index file already exists") + raise FileExistsError(f"Index file '{index_path}' already exists") metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__) metadata_bytes = metadata.to_bytes() From 04a33a134bb06264142c13ab9187bbc0239a0415 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Tue, 22 Feb 2022 22:27:02 +0000 Subject: [PATCH 4/4] Fixes to mwmbl API for changes to the index --- config/tinysearchengine.yaml | 12 ---------- mwmbl/tinysearchengine/app.py | 34 ++++++++++----------------- mwmbl/tinysearchengine/config.py | 40 -------------------------------- 3 files changed, 12 insertions(+), 74 deletions(-) delete mode 100644 config/tinysearchengine.yaml delete mode 100644 mwmbl/tinysearchengine/config.py diff --git a/config/tinysearchengine.yaml b/config/tinysearchengine.yaml deleted file mode 100644 index 570acc3..0000000 --- a/config/tinysearchengine.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Config for bootstrapping tinysearchengine. -# Follows the schema/model defined by mwmbl.tinysearchengine.config.ConfigModel - -server_config: - host: "0.0.0.0" - port: 8080 - log_level: "info" - -index_config: - index_path: data/index.tinysearch - num_pages: 76800 - page_size: 4096 diff --git a/mwmbl/tinysearchengine/app.py b/mwmbl/tinysearchengine/app.py index dda7702..85078e3 100644 --- a/mwmbl/tinysearchengine/app.py +++ b/mwmbl/tinysearchengine/app.py @@ -1,13 +1,12 @@ -import logging import argparse +import logging import pandas as pd import uvicorn from mwmbl.tinysearchengine import create_app from mwmbl.tinysearchengine.completer import Completer -from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document -from mwmbl.tinysearchengine.config import parse_config_file +from mwmbl.tinysearchengine.indexer import TinyIndex, Document from mwmbl.tinysearchengine.rank import Ranker logging.basicConfig() @@ -16,7 +15,8 @@ logging.basicConfig() def setup_args(): """Read all the args.""" parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine") - parser.add_argument("--config", help="Path to tinysearchengine's yaml config.", required=True) + parser.add_argument("--index", help="Path to the tinysearchengine index file", required=True) + parser.add_argument("--terms", help="Path to the tinysearchengine terms CSV file", required=True) args = parser.parse_args() return args @@ -30,30 +30,20 @@ def main(): * Initialize a FastAPI app instance * Starts uvicorn server using app instance """ - config, tiny_index = get_config_and_index() + args = setup_args() # Load term data - terms = pd.read_csv(config.terms_path) + terms = pd.read_csv(args.terms) completer = Completer(terms) - ranker = Ranker(tiny_index, completer) + with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index: + ranker = Ranker(tiny_index, completer) - # Initialize FastApi instance - app = create_app.create(ranker) + # Initialize FastApi instance + app = create_app.create(ranker) - # Initialize uvicorn server using global app instance and server config params - uvicorn.run(app, **config.server_config.dict()) - - -def get_config_and_index(): - args = setup_args() - config = parse_config_file(config_filename=args.config) - # Initialize TinyIndex using index config params - tiny_index = 
TinyIndex( - item_factory=Document, - **config.index_config.dict() - ) - return config, tiny_index + # Initialize uvicorn server using global app instance and server config params + uvicorn.run(app, host="0.0.0.0", port=8080) if __name__ == "__main__": diff --git a/mwmbl/tinysearchengine/config.py b/mwmbl/tinysearchengine/config.py deleted file mode 100644 index 2fd6f54..0000000 --- a/mwmbl/tinysearchengine/config.py +++ /dev/null @@ -1,40 +0,0 @@ -import pathlib -import yaml -from pydantic import BaseModel, StrictInt, StrictStr, Field - - -class ServerConfigModel(BaseModel): - host: StrictStr = "0.0.0.0" - port: StrictInt = 8080 - log_level: StrictStr = "info" - - -class IndexConfigModel(BaseModel): - index_path: StrictStr = "data/index.tinysearch" - num_pages: StrictInt = 25600 - page_size: StrictInt = 4096 - - -class ConfigModel(BaseModel): - server_config: ServerConfigModel = Field(default_factory=ServerConfigModel) - index_config: IndexConfigModel = Field(default_factory=IndexConfigModel) - terms_path: StrictStr = "data/mwmbl-crawl-terms.csv" - - -def parse_config_file(config_filename: str) -> ConfigModel: - """Parse config dictionary and return ConfigModel.""" - if not pathlib.Path(config_filename).is_file(): - raise ValueError( - f"config_filename: {config_filename} is not a file. Please check if it exists." - ) - - with open(config_filename) as f: - config = yaml.load(f, yaml.Loader) - - return ConfigModel(**config) - - -if __name__ == "__main__": - # Call this from the root of the repo using "python -m mwmbl.tinysearchengine.config" - config_model = parse_config_file(config_filename="config/tinysearchengine.yaml") - print(config_model.dict())
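
Usage notes (illustrative sketches against the patched code above; not part of the
patch series itself):

After patch 2/4, an index file starts with a 4096-byte metadata header -- the
constant b'mwmbl-tiny-search' followed by the metadata as zero-padded JSON --
and is followed by num_pages fixed-size pages, each holding zstd-compressed
JSON padded with zeros to exactly page_size bytes. A minimal sketch of reading
such a file by hand, mirroring TinyIndexMetadata.from_bytes() and get_page()
above (the path 'index.tinysearch' is hypothetical):

    import json
    from zstandard import ZstdDecompressor

    METADATA_CONSTANT = b'mwmbl-tiny-search'
    METADATA_SIZE = 4096

    with open('index.tinysearch', 'rb') as f:
        # The header is the magic constant plus JSON, zero-padded to 4096 bytes.
        header = f.read(METADATA_SIZE)
        assert header.startswith(METADATA_CONSTANT), "not an index file"
        metadata = json.loads(header[len(METADATA_CONSTANT):].rstrip(b'\x00'))
        # e.g. {'version': 1, 'page_size': 4096, 'num_pages': 76800,
        #       'item_factory': 'Document'}

        # Page i starts at METADATA_SIZE + i * page_size; the zstd frame
        # records its own content size, so the trailing zero padding is
        # ignored on decompression (test_create_index above relies on this).
        f.seek(METADATA_SIZE)
        page = f.read(metadata['page_size'])
        print(json.loads(ZstdDecompressor().decompress(page)))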
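After patch 3/4, index creation is separated from reading and writing. A rough
end-to-end sketch of the resulting API (the path and the Document field values
are assumptions for illustration; in mwmbl a Document holds a title, a URL and
a text extract):

    from mwmbl.tinysearchengine.indexer import Document, TinyIndex

    index_path = 'data/index.tinysearch'  # hypothetical path

    # Create the index file up front; raises FileExistsError if it exists.
    TinyIndex.create(Document, index_path, num_pages=76800, page_size=4096)

    # Writing needs mode='w'; index() hashes the key to a page with mmh3,
    # appends the document tuple and rewrites the page, silently dropping the
    # update if the compressed page would exceed page_size.
    with TinyIndex(Document, index_path, 'w') as indexer:
        indexer.index('banana', Document('Banana', 'https://example.com/banana',
                                         'A banana is an elongated, edible fruit.'))

    # The default mode 'r' maps the file read-only; attempting to write then
    # raises UnsupportedOperation. retrieve() returns every Document stored
    # on the page that the key hashes to.
    with TinyIndex(Document, index_path) as tiny_index:
        print(tiny_index.retrieve('banana'))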