Fixes for API changes

Daoud Clarke 2022-02-22 22:12:39 +00:00
parent 326f7e3d7f
commit ae3b334a7f
3 changed files with 16 additions and 15 deletions

File 1 of 3

@@ -10,7 +10,7 @@ import pandas as pd
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
+from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex

 HTTP_START = 'http://'
 HTTPS_START = 'https://'
@@ -66,9 +66,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk


-def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
-    indexer.create()
+def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path):
     terms = Counter()
     pages = get_pages(nlp, titles_urls_and_extracts)
     for page in pages:

File 2 of 3

@@ -8,16 +8,18 @@ import spacy
 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
 from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
-from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE

 logger = getLogger(__name__)


-def index_mwmbl_craw_data():
+def index_mwmbl_crawl_data():
     nlp = spacy.load("en_core_web_sm")
-    with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+    with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
         titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
         index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)
@@ -43,4 +45,4 @@ def get_mwmbl_crawl_titles_urls_and_extracts():

 if __name__ == '__main__':
-    index_mwmbl_craw_data()
+    index_mwmbl_crawl_data()
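
Taken together, the changes in this file show the core of the API migration: TinyIndexer is gone, and index creation is now an explicit TinyIndex.create(...) step, separate from opening the index for writing. A minimal sketch of the new lifecycle, assuming a Document(title, url, extract) field layout and an illustrative path (neither is confirmed by this diff):

    from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE

    index_path = 'index.tinysearch'  # illustrative path, not the real INDEX_PATH constant

    # Step 1: create the index file once; create() raises FileExistsError if it exists.
    TinyIndex.create(Document, index_path, NUM_PAGES, PAGE_SIZE)

    # Step 2: open the existing index in write mode ('w') and add documents.
    # Document(title, url, extract) is an assumed field layout for illustration.
    with TinyIndex(Document, index_path, 'w') as indexer:
        indexer.index('banana', Document('Banana', 'https://example.com/banana', 'A curved yellow fruit'))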

File 3 of 3

@@ -9,8 +9,7 @@ import mmh3
 from zstandard import ZstdDecompressor, ZstdCompressor

 VERSION = 1
-METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8')
-METADATA_FORMAT = 'IIIs'
+METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096

 NUM_PAGES = 76800
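
METADATA_FORMAT was a struct format string, and dropping it alongside the switch to a bytes literal suggests the metadata page is no longer struct-packed; the metadata.to_bytes() call later in this diff points to a serialised blob prefixed by METADATA_CONSTANT. A hedged sketch of reading such a page, assuming a JSON payload zero-padded to METADATA_SIZE (this layout is an assumption, not shown in the diff):

    import json

    def parse_metadata_page(page: bytes) -> dict:
        # Assumed layout: METADATA_CONSTANT, then JSON, then zero padding up to METADATA_SIZE.
        if not page.startswith(METADATA_CONSTANT):
            raise ValueError("Not a mwmbl tiny search index file")
        payload = page[len(METADATA_CONSTANT):].rstrip(b'\x00')
        return json.loads(payload.decode('utf8'))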
@@ -117,17 +116,19 @@ class TinyIndex(Generic[T]):
         """
         Get the page at index i, decompress and deserialise it using JSON
         """
+        results = self._get_page_tuples(i)
+        return [self.item_factory(*item) for item in results]
+
+    def _get_page_tuples(self, i):
         page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
         decompressed_data = self.decompressor.decompress(page_data)
-        results = json.loads(decompressed_data.decode('utf8'))
-        converted = [self.item_factory(*item) for item in results]
-        return converted
+        return json.loads(decompressed_data.decode('utf8'))

     def index(self, key: str, value: T):
         assert type(value) == self.item_factory, f"Can only index the specified type" \
                                                  f" ({self.item_factory.__name__})"
         page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
+        current_page = self._get_page_tuples(page_index)
         if current_page is None:
             current_page = []
         value_tuple = astuple(value)
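
The get_page/_get_page_tuples split lets index() work on the raw JSON tuples instead of reconstructed item_factory instances: the new value is appended with astuple() and the page is re-serialised with no dataclass round trip. A condensed sketch of the resulting write path; _write_page is an assumed helper, not shown in this diff:

    from dataclasses import astuple

    def index(self, key: str, value):
        page_index = self._get_key_page_index(key)        # hash the key to a page (shown above)
        current_page = self._get_page_tuples(page_index)  # raw tuples, no item_factory calls
        if current_page is None:
            current_page = []
        current_page.append(astuple(value))               # store the new value as a plain tuple
        self._write_page(current_page, page_index)        # assumed helper: compress and write back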
@@ -151,7 +152,7 @@ class TinyIndex(Generic[T]):
     @staticmethod
     def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
         if os.path.isfile(index_path):
-            raise FileExistsError("Index file already exists")
+            raise FileExistsError(f"Index file '{index_path}' already exists")
         metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__)
         metadata_bytes = metadata.to_bytes()
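
With the offending path now included in the error message, callers that may run repeatedly can treat an existing index as non-fatal. A small usage sketch, assuming the same illustrative constants as above:

    # Create the index if missing, otherwise reuse the existing file.
    try:
        TinyIndex.create(Document, index_path, NUM_PAGES, PAGE_SIZE)
    except FileExistsError as e:
        print(e)  # e.g. "Index file 'index.tinysearch' already exists"

    with TinyIndex(Document, index_path, 'w') as indexer:
        ...  # index documents as usual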