Fixes for API changes

parent 326f7e3d7f
commit ae3b334a7f
3 changed files with 16 additions and 15 deletions
mwmbl/indexer/index.py

@@ -10,7 +10,7 @@ import pandas as pd
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
+from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
 
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
 
@@ -66,9 +66,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk
 
 
-def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
-    indexer.create()
-
+def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path):
     terms = Counter()
     pages = get_pages(nlp, titles_urls_and_extracts)
     for page in pages:
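With TinyIndexer gone, index_titles_urls_and_extracts no longer creates the index as a side effect: it now takes a TinyIndex that the caller has already created and opened for writing. A minimal sketch of a caller under the new contract (the index path, terms path, and sample data below are placeholders for illustration):

import spacy

from mwmbl.indexer.index import index_titles_urls_and_extracts
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE

# Placeholder inputs, purely for illustration.
INDEX_PATH = 'example-index.tinysearch'
TERMS_PATH = 'example-terms.csv'
titles_urls_and_extracts = [
    ('Example title', 'https://example.com', 'An example page extract.'),
]

nlp = spacy.load('en_core_web_sm')

# Index creation is now an explicit, separate step...
TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)

# ...and the indexing function receives an already-open writable index.
with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
    index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, TERMS_PATH)

The second changed file, shown next, makes exactly this change at its call site.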
@@ -8,16 +8,18 @@ import spacy
 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
 from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
-from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 
 
 logger = getLogger(__name__)
 
 
-def index_mwmbl_craw_data():
+def index_mwmbl_crawl_data():
     nlp = spacy.load("en_core_web_sm")
 
-    with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+
+    with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
         titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
         index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)
 
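TinyIndex now takes an explicit mode argument when opened, and this commit only exercises 'w'. For completeness, a hedged sketch of the read side, assuming a corresponding 'r' mode exists (it does not appear in this diff) and using the get_page method visible in the indexer hunks further down:

from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document

# The 'r' mode is an assumption; only 'w' appears in this commit.
with TinyIndex(Document, INDEX_PATH, 'r') as tiny_index:
    documents = tiny_index.get_page(0)  # items come back as Document instances
    for document in documents:
        # Field names assumed from the (title, url, extract) tuples being indexed.
        print(document.title, document.url)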
@@ -43,4 +45,4 @@ def get_mwmbl_crawl_titles_urls_and_extracts():
 
 
 if __name__ == '__main__':
-    index_mwmbl_craw_data()
+    index_mwmbl_crawl_data()
mwmbl/tinysearchengine/indexer.py

@@ -9,8 +9,7 @@ import mmh3
 from zstandard import ZstdDecompressor, ZstdCompressor
 
 VERSION = 1
-METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8')
-METADATA_FORMAT = 'IIIs'
+METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096
 
 NUM_PAGES = 76800
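Dropping METADATA_FORMAT (a struct format string) alongside the switch to a bytes literal suggests the fixed-size header is no longer struct-packed; the create() hunk further down builds the header with metadata.to_bytes(). A minimal sketch of what such a header round-trip could look like, assuming JSON encoding zero-padded out to METADATA_SIZE; the field names and method bodies here are guesses for illustration, not the project's actual code:

import json
from dataclasses import dataclass, asdict

METADATA_CONSTANT = b'mwmbl-tiny-search'
METADATA_SIZE = 4096


@dataclass
class TinyIndexMetadata:
    version: int
    page_size: int
    num_pages: int
    item_factory: str

    def to_bytes(self) -> bytes:
        # Magic constant first, then the JSON-encoded fields,
        # zero-padded up to the fixed header size.
        data = METADATA_CONSTANT + json.dumps(asdict(self)).encode('utf8')
        assert len(data) <= METADATA_SIZE, "Metadata overflows the fixed-size header"
        return data + b'\x00' * (METADATA_SIZE - len(data))

    @staticmethod
    def from_bytes(data: bytes) -> 'TinyIndexMetadata':
        if not data.startswith(METADATA_CONSTANT):
            raise ValueError("This doesn't look like a tiny-search index file")
        values = json.loads(data[len(METADATA_CONSTANT):].rstrip(b'\x00').decode('utf8'))
        return TinyIndexMetadata(**values)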
@@ -117,17 +116,19 @@ class TinyIndex(Generic[T]):
         """
         Get the page at index i, decompress and deserialise it using JSON
         """
+        results = self._get_page_tuples(i)
+        return [self.item_factory(*item) for item in results]
+
+    def _get_page_tuples(self, i):
         page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
         decompressed_data = self.decompressor.decompress(page_data)
-        results = json.loads(decompressed_data.decode('utf8'))
-        converted = [self.item_factory(*item) for item in results]
-        return converted
+        return json.loads(decompressed_data.decode('utf8'))
 
     def index(self, key: str, value: T):
         assert type(value) == self.item_factory, f"Can only index the specified type" \
                                                  f" ({self.item_factory.__name__})"
         page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
+        current_page = self._get_page_tuples(page_index)
         if current_page is None:
             current_page = []
         value_tuple = astuple(value)
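The refactor separates what index() needs (raw JSON tuples it can append to) from what callers of get_page() receive (constructed item_factory instances). A standalone sketch of the read path that _get_page_tuples isolates, with mmap_buf and page_size standing in for the instance attributes:

import json
from zstandard import ZstdDecompressor

def read_page_tuples(mmap_buf: bytes, i: int, page_size: int) -> list:
    # Each page is a fixed-size slot holding a zstd-compressed JSON array;
    # slice out page i and decompress it back into plain lists.
    page_data = mmap_buf[i * page_size:(i + 1) * page_size]
    decompressed_data = ZstdDecompressor().decompress(page_data)
    return json.loads(decompressed_data.decode('utf8'))

With tuples in hand, index() can presumably append astuple(value) and re-serialise the page with json.dumps directly, without round-tripping through dataclass instances.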
@@ -151,7 +152,7 @@ class TinyIndex(Generic[T]):
     @staticmethod
     def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
         if os.path.isfile(index_path):
-            raise FileExistsError("Index file already exists")
+            raise FileExistsError(f"Index file '{index_path}' already exists")
 
         metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__)
         metadata_bytes = metadata.to_bytes()
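Because create() refuses to overwrite an existing index file, callers that want idempotent setup have to handle the error themselves. A small usage sketch (the path is a placeholder):

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE

INDEX_PATH = 'index.tinysearch'  # placeholder path for illustration

try:
    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
except FileExistsError:
    pass  # an index already exists at this path; reuse it rather than overwriting

with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
    ...  # index documents as usual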