WIP: include metadata in index - using struct approach

parent 82c46b50bc · commit e6273c7f76
6 changed files with 237 additions and 62 deletions
@@ -67,7 +67,7 @@ def grouper(n: int, iterator: Iterator):

 def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
-    indexer.create_if_not_exists()
+    indexer.create()

     terms = Counter()
     pages = get_pages(nlp, titles_urls_and_extracts)
@@ -12,7 +12,7 @@ from .paths import INDEX_PATH, CRAWL_GLOB

 def run():
     # TODO: item_factory argument is unfilled.
     indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-    indexer.create_if_not_exists()
+    indexer.create()
     nlp = English()
     for path in glob(CRAWL_GLOB):
         print("Path", path)
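A note on the TODO above: this commit folds the indexer into a single TinyIndex class that takes an item factory as its first argument and reads num_pages and page_size back from the index metadata, so the call would presumably end up shaped like the sketch below. Passing Document as the factory is an assumption, not something this commit fills in.

    # Hypothetical shape of the call once the TODO is resolved: the new class
    # reads num_pages and page_size from the metadata page, and 'w' enables writes.
    indexer = TinyIndex(Document, INDEX_PATH, mode='w')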
@@ -1,14 +1,21 @@
 import json
 import os
 from dataclasses import astuple, dataclass
-from mmap import mmap, PROT_READ
+from io import UnsupportedOperation
+from mmap import mmap, PROT_READ, PROT_WRITE
 from pathlib import Path
+from struct import pack, unpack, calcsize
 from typing import TypeVar, Generic, Callable, List

 import mmh3
 from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError


+VERSION = 1
+METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8')
+METADATA_FORMAT = 'IIIs'
+METADATA_SIZE = 4096
+
 NUM_PAGES = 76800
 PAGE_SIZE = 4096
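One detail of METADATA_FORMAT worth noting: in Python's struct module a bare 's' packs exactly one byte, so 'IIIs' silently truncates the item factory name to its first character when it is packed below. A fixed-width field is the usual way to round-trip a short name; a minimal sketch, where the 256-byte width is an arbitrary assumption:

    from struct import calcsize, pack, unpack

    FIXED_FORMAT = 'III256s'  # version, page_size, num_pages, name padded to 256 bytes

    packed = pack(FIXED_FORMAT, 1, 4096, 76800, b'Document')
    assert len(packed) == calcsize(FIXED_FORMAT)

    version, page_size, num_pages, raw_name = unpack(FIXED_FORMAT, packed)
    # 's' fields are null-padded on pack, so strip the padding on the way out.
    assert raw_name.rstrip(b'\x00').decode('utf8') == 'Document'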
@@ -28,20 +35,84 @@ class TokenizedDocument(Document):
 T = TypeVar('T')


-class TinyIndexBase(Generic[T]):
-    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
+@dataclass
+class TinyIndexMetadata:
+    version: int
+    page_size: int
+    num_pages: int
+    item_factory: str
+
+    def to_bytes(self) -> bytes:
+        result = METADATA_CONSTANT + pack(
+            METADATA_FORMAT, self.version, self.page_size, self.num_pages, self.item_factory.encode('utf8')
+        )
+        assert len(result) <= METADATA_SIZE
+        return result
+
+    @staticmethod
+    def from_bytes(data: bytes):
+        constant_length = len(METADATA_CONSTANT)
+        metadata_constant = data[:constant_length]
+        if metadata_constant != METADATA_CONSTANT:
+            raise ValueError("This doesn't seem to be an index file")
+
+        actual_metadata_size = calcsize(METADATA_FORMAT)
+        values = unpack(METADATA_FORMAT, data[constant_length:constant_length+actual_metadata_size])
+        return TinyIndexMetadata(values[0], values[1], values[2], values[3].decode('utf8'))
+
+
+def _get_page_data(compressor, page_size, data):
+    serialised_data = json.dumps(data)
+    compressed_data = compressor.compress(serialised_data.encode('utf8'))
+    return _pad_to_page_size(compressed_data, page_size)
+
+
+def _pad_to_page_size(data: bytes, page_size: int):
+    page_length = len(data)
+    if page_length > page_size:
+        raise ValueError(f"Data is too big ({page_length}) for page size ({page_size})")
+    padding = b'\x00' * (page_size - page_length)
+    page_data = data + padding
+    return page_data
+
+
+class TinyIndex(Generic[T]):
+    def __init__(self, item_factory: Callable[..., T], index_path, mode='r'):
+        if mode not in {'r', 'w'}:
+            raise ValueError(f"Mode should be one of 'r' or 'w', got {mode}")
+
+        with open(index_path, 'rb') as index_file:
+            metadata_page = index_file.read(METADATA_SIZE)
+
+        metadata = TinyIndexMetadata.from_bytes(metadata_page)
+        if metadata.item_factory != item_factory.__name__:
+            raise ValueError(f"Metadata item factory '{metadata.item_factory}' in the index "
+                             f"does not match the passed item factory: '{item_factory.__name__}'")
+
         self.item_factory = item_factory
-        self.num_pages = num_pages
-        self.page_size = page_size
+        self.index_path = index_path
+        self.mode = mode
+
+        self.num_pages = metadata.num_pages
+        self.page_size = metadata.page_size
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
+        self.mmap = None
+
+    def __enter__(self):
+        self.index_file = open(self.index_path, 'r+b')
+        prot = PROT_READ if self.mode == 'r' else PROT_READ | PROT_WRITE
+        self.mmap = mmap(self.index_file.fileno(), 0, offset=METADATA_SIZE, prot=prot)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()

     def retrieve(self, key: str) -> List[T]:
         index = self._get_key_page_index(key)
-        return self.get_page(index)
+        page = self.get_page(index)
+        if page is None:
+            return []
+        return self.convert_items(page)

     def _get_key_page_index(self, key):
         key_hash = mmh3.hash(key, signed=False)
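The __enter__ above maps the file starting at METADATA_SIZE, which keeps page arithmetic zero-based while leaving the metadata page out of the mapping. Two portability notes: CPython requires the offset to be a multiple of mmap.ALLOCATIONGRANULARITY (usually the 4096-byte OS page size, which the header size matches), and the prot argument exists only on Unix. A minimal read-only sketch, assuming an index file that already has its 4096-byte header ('index.tinysearch' is a placeholder path):

    import mmap as mmap_module  # keep the module name; `mmap` below is the class
    from mmap import mmap, PROT_READ

    METADATA_SIZE = 4096
    assert METADATA_SIZE % mmap_module.ALLOCATIONGRANULARITY == 0  # required for offset

    with open('index.tinysearch', 'rb') as index_file:
        # Length 0 maps from the offset to the end of the file, so byte 0 of
        # the mapping is byte 0 of page 0.
        index_map = mmap(index_file.fileno(), 0, offset=METADATA_SIZE, prot=PROT_READ)
        first_page = index_map[0:4096]
        index_map.close()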
@@ -52,45 +123,11 @@ class TinyIndexBase(Generic[T]):
         Get the page at index i, decompress and deserialise it using JSON
         """
         page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
-        decompressed_data = self.decompressor.decompress(page_data)
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
+            return None
         results = json.loads(decompressed_data.decode('utf8'))
         return results

     def convert_items(self, items) -> List[T]:
-        converted = [self.item_factory(*item) for item in results]
+        converted = [self.item_factory(*item) for item in items]
         return converted


-class TinyIndex(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.index_file = open(self.index_path, 'rb')
-        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
-
-
-class TinyIndexer(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.compressor = ZstdCompressor()
-        self.decompressor = ZstdDecompressor()
-        self.index_file = None
-        self.mmap = None
-
-    def __enter__(self):
-        self.create_if_not_exists()
-        self.index_file = open(self.index_path, 'r+b')
-        self.mmap = mmap(self.index_file.fileno(), 0)
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.mmap.close()
-        self.index_file.close()
-
     def index(self, key: str, value: T):
         # print("Index", value)
         assert type(value) == self.item_factory, f"Can only index the specified type" \
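convert_items (fixed above to iterate items rather than results) rebuilds typed values from what a page actually stores: JSON arrays, one per item, splatted back through the item factory. A minimal round-trip sketch; the Document fields (title, url, extract) are an assumption based on the surrounding code, not part of this diff:

    import json
    from dataclasses import astuple, dataclass

    @dataclass
    class Document:
        title: str
        url: str
        extract: str

    doc = Document('Example', 'https://example.com', 'An example page')

    # Store: dataclass -> tuple -> JSON array (this is what a page holds).
    serialised = json.dumps([astuple(doc)])

    # Retrieve: JSON -> list of lists -> dataclass via the item factory.
    items = json.loads(serialised)
    assert [Document(*item) for item in items] == [doc]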
@@ -113,16 +150,28 @@ class TinyIndexer(TinyIndexBase[T]):
         Serialise the data using JSON, compress it and store it at index i.
         If the data is too big, it will raise a ValueError and not store anything
         """
-        serialised_data = json.dumps(data)
-        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
-        page_length = len(compressed_data)
-        if page_length > self.page_size:
-            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
-        padding = b'\x00' * (self.page_size - page_length)
-        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
+        if self.mode != 'w':
+            raise UnsupportedOperation("The file is open in read mode, you cannot write")
+
+        page_data = _get_page_data(self.compressor, self.page_size, data)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = page_data
+
+    @staticmethod
+    def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
+        if os.path.isfile(index_path):
+            raise FileExistsError("Index file already exists")
+
+        metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__)
+        metadata_bytes = metadata.to_bytes()
+        metadata_padded = _pad_to_page_size(metadata_bytes, METADATA_SIZE)
+
+        compressor = ZstdCompressor()
+        page_bytes = _get_page_data(compressor, page_size, [])
+
+        with open(index_path, 'wb') as index_file:
+            index_file.write(metadata_padded)
+            for i in range(num_pages):
+                index_file.write(page_bytes)
+
+        return TinyIndex(item_factory, index_path=index_path)
-
-    def create_if_not_exists(self):
-        if not os.path.isfile(self.index_path):
-            file_length = self.num_pages * self.page_size
-            with open(self.index_path, 'wb') as index_file:
-                index_file.write(b'\x00' * file_length)
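Taken together, create and the new constructor suggest a usage pattern like the sketch below: create writes the padded metadata page followed by num_pages compressed empty pages, and the constructor validates the stored factory name before anything is mapped. This describes the intended flow; as noted earlier, the bare 's' in METADATA_FORMAT truncates the stored name in this WIP state, and the Document fields here are assumptions.

    from dataclasses import dataclass

    @dataclass
    class Document:
        title: str
        url: str
        extract: str

    # One 4096-byte metadata page followed by 16 compressed empty pages.
    TinyIndex.create(Document, 'index.tinysearch', num_pages=16, page_size=4096)

    # Open read-only; retrieving from a fresh index yields [] for any key.
    with TinyIndex(Document, 'index.tinysearch', mode='r') as tiny_index:
        results = tiny_index.retrieve('some term')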
poetry.lock · 114 changes (generated)
@@ -26,6 +26,28 @@ python-versions = ">=3.7"
 [package.extras]
 tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]

+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+description = "Atomic file writes."
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "attrs"
+version = "21.4.0"
+description = "Classes Without Boilerplate"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[package.extras]
+dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"]
+docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
+tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"]
+tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.10.0"
@@ -163,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
 [package.source]
 type = "url"
 url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"

 [[package]]
 name = "fastapi"
 version = "0.70.1"
@@ -197,6 +220,14 @@ category = "main"
 optional = false
 python-versions = ">=3.5"

+[[package]]
+name = "iniconfig"
+version = "1.1.1"
+description = "iniconfig: brain-dead simple config-ini parsing"
+category = "dev"
+optional = false
+python-versions = "*"
+
 [[package]]
 name = "jinja2"
 version = "3.0.3"
@@ -314,7 +345,7 @@ name = "packaging"
 version = "21.3"
 description = "Core utilities for Python packages"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.6"

 [package.dependencies]
@@ -354,6 +385,18 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"]
 s3 = ["boto3"]
 test = ["pytest", "pytest-coverage", "mock", "typer-cli"]

+[[package]]
+name = "pluggy"
+version = "1.0.0"
+description = "plugin and hook calling mechanisms for python"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+testing = ["pytest", "pytest-benchmark"]
+
 [[package]]
 name = "preshed"
 version = "3.0.6"
@@ -366,6 +409,14 @@ python-versions = "*"
 cymem = ">=2.0.2,<2.1.0"
 murmurhash = ">=0.28.0,<1.1.0"

+[[package]]
+name = "py"
+version = "1.11.0"
+description = "library with cross-python path, ini-parsing, io, code, log facilities"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
 [[package]]
 name = "py4j"
 version = "0.10.9.2"
@@ -413,7 +464,7 @@ name = "pyparsing"
 version = "3.0.7"
 description = "Python parsing module"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.6"

 [package.extras]
@@ -436,6 +487,27 @@ mllib = ["numpy (>=1.7)"]
 pandas_on_spark = ["numpy (>=1.14)", "pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]
 sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]

+[[package]]
+name = "pytest"
+version = "7.0.1"
+description = "pytest: simple powerful testing with Python"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
+attrs = ">=19.2.0"
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=0.12,<2.0"
+py = ">=1.8.2"
+tomli = ">=1.0.0"
+
+[package.extras]
+testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
+
 [[package]]
 name = "python-dateutil"
 version = "2.8.2"
@@ -680,6 +752,14 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"]
 tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"]
 torch = ["torch (>=1.5.0)"]

+[[package]]
+name = "tomli"
+version = "2.0.1"
+description = "A lil' TOML parser"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+
 [[package]]
 name = "tqdm"
 version = "4.62.3"
@@ -797,7 +877,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.10"
-content-hash = "b5af8ce9887d0cf69297180fbb4040e1522e4a3135f8b651415afb35f86124ef"
+content-hash = "edb2d4bc50cb09ac5f7ba311d5238eb2deeab1d12f479067cc7239e3232bf6c9"

 [metadata.files]
 anyio = [
@@ -808,6 +888,14 @@ asgiref = [
     {file = "asgiref-3.5.0-py3-none-any.whl", hash = "sha256:88d59c13d634dcffe0510be048210188edd79aeccb6a6c9028cdad6f31d730a9"},
     {file = "asgiref-3.5.0.tar.gz", hash = "sha256:2f8abc20f7248433085eda803936d98992f1343ddb022065779f37c5da0181d0"},
 ]
+atomicwrites = [
+    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+attrs = [
+    {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"},
+    {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"},
+]
 beautifulsoup4 = [
     {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"},
     {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"},
@@ -941,6 +1029,10 @@ idna = [
     {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
     {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
 ]
+iniconfig = [
+    {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
+    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
+]
 jinja2 = [
     {file = "Jinja2-3.0.3-py3-none-any.whl", hash = "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8"},
     {file = "Jinja2-3.0.3.tar.gz", hash = "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"},
@@ -1255,6 +1347,10 @@ pathy = [
     {file = "pathy-0.6.1-py3-none-any.whl", hash = "sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a"},
     {file = "pathy-0.6.1.tar.gz", hash = "sha256:838624441f799a06b446a657e4ecc9ebc3fdd05234397e044a7c87e8f6e76b1c"},
 ]
+pluggy = [
+    {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
+    {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
+]
 preshed = [
     {file = "preshed-3.0.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:66a71ced487516cf81fd0431a3a843514262ae2f33e9a7688b87562258fa75d5"},
     {file = "preshed-3.0.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c98f725d8478f3ade4ab1ea00f50a92d2d9406d37276bc46fd8bab1d47452c4"},
@@ -1273,6 +1369,10 @@ preshed = [
     {file = "preshed-3.0.6-cp39-cp39-win_amd64.whl", hash = "sha256:92a8f49d17a63537a8beed48a049b62ef168ca07e0042a5b2bcdf178a1fb5d48"},
     {file = "preshed-3.0.6.tar.gz", hash = "sha256:fb3b7588a3a0f2f2f1bf3fe403361b2b031212b73a37025aea1df7215af3772a"},
 ]
+py = [
+    {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"},
+    {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
+]
 py4j = [
     {file = "py4j-0.10.9.2-py2.py3-none-any.whl", hash = "sha256:bf0485388e415ff26710d2dc719cb0ede16cf1164b1ee757e0ebb2e98c471521"},
     {file = "py4j-0.10.9.2.tar.gz", hash = "sha256:624f97c363b8dd84822bc666b12fa7f7d97824632b2ff3d852cc491359ce7615"},
@@ -1350,6 +1450,10 @@ pyparsing = [
 pyspark = [
     {file = "pyspark-3.2.0.tar.gz", hash = "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab"},
 ]
+pytest = [
+    {file = "pytest-7.0.1-py3-none-any.whl", hash = "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db"},
+    {file = "pytest-7.0.1.tar.gz", hash = "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"},
+]
 python-dateutil = [
     {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
     {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
@@ -1539,6 +1643,10 @@ thinc = [
     {file = "thinc-8.0.13-cp39-cp39-win_amd64.whl", hash = "sha256:b3ae088f60d3dfe6a88c6be37548aae40023e46a718cffe3e43953b4f0ffc340"},
     {file = "thinc-8.0.13.tar.gz", hash = "sha256:47662a3ae33d445a77b6ea7b772444805c7bba8991f122e350daf72dedc8171a"},
 ]
+tomli = [
+    {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
+    {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
+]
 tqdm = [
     {file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"},
     {file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"},
@@ -64,6 +64,7 @@ indexer = [
 # langdetect = "^1.0.9"
 # spacy = "^3.2.1"
 # Levenshtein = "^0.16.0"
+pytest = "^7.0.1"

 [build-system]
 requires = ["poetry-core>=1.0.0"]
test/test_indexer.py · 17 changes (new file)
@@ -0,0 +1,17 @@
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from mwmbl.tinysearchengine.indexer import Document, TinyIndex
+
+
+def test_create_index():
+    num_pages = 10
+    page_size = 4096
+
+    with TemporaryDirectory() as temp_dir:
+        index_path = Path(temp_dir) / 'temp-index.tinysearch'
+        indexer = TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size)
+
+        for i in range(num_pages):
+            page = indexer.get_page(i)
+            assert page == []
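The factory check in the new constructor invites a companion test: opening an index with a different item factory should raise. A hedged sketch under the same WIP caveat about name truncation noted earlier; OtherItem is a hypothetical throwaway dataclass, not part of the package:

    from dataclasses import dataclass
    from pathlib import Path
    from tempfile import TemporaryDirectory

    import pytest

    from mwmbl.tinysearchengine.indexer import Document, TinyIndex


    @dataclass
    class OtherItem:  # hypothetical factory, not part of the package
        value: str


    def test_wrong_item_factory_is_rejected():
        with TemporaryDirectory() as temp_dir:
            index_path = str(Path(temp_dir) / 'temp-index.tinysearch')
            TinyIndex.create(Document, index_path, num_pages=10, page_size=4096)

            # The index stores 'Document' as its factory name, so 'OtherItem' must not match.
            with pytest.raises(ValueError):
                TinyIndex(OtherItem, index_path)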