浏览代码

Fixes for API changes

Daoud Clarke 3 年之前
父节点
当前提交
ae3b334a7f
共有 3 个文件被更改,包括 16 次插入和 15 次删除
  1. 2 4
      mwmbl/indexer/index.py
  2. 6 4
      mwmbl/indexer/index_crawl.py
  3. 8 7
      mwmbl/tinysearchengine/indexer.py

+ 2 - 4
mwmbl/indexer/index.py

@@ -10,7 +10,7 @@ import pandas as pd
 
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
+from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
 
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
@@ -66,9 +66,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk
 
 
-def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
-    indexer.create()
-
+def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path):
     terms = Counter()
     pages = get_pages(nlp, titles_urls_and_extracts)
     for page in pages:

+ 6 - 4
mwmbl/indexer/index_crawl.py

@@ -8,16 +8,18 @@ import spacy
 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
 from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
-from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 
 
 logger = getLogger(__name__)
 
 
-def index_mwmbl_craw_data():
+def index_mwmbl_crawl_data():
     nlp = spacy.load("en_core_web_sm")
 
-    with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+
+    with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
         titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
         index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)
 
@@ -43,4 +45,4 @@ def get_mwmbl_crawl_titles_urls_and_extracts():
 
 
 if __name__ == '__main__':
-    index_mwmbl_craw_data()
+    index_mwmbl_crawl_data()

+ 8 - 7
mwmbl/tinysearchengine/indexer.py

@@ -9,8 +9,7 @@ import mmh3
 from zstandard import ZstdDecompressor, ZstdCompressor
 
 VERSION = 1
-METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8')
-METADATA_FORMAT = 'IIIs'
+METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096
 
 NUM_PAGES = 76800
@@ -117,17 +116,19 @@ class TinyIndex(Generic[T]):
         """
         Get the page at index i, decompress and deserialise it using JSON
         """
+        results = self._get_page_tuples(i)
+        return [self.item_factory(*item) for item in results]
+
+    def _get_page_tuples(self, i):
         page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
         decompressed_data = self.decompressor.decompress(page_data)
-        results = json.loads(decompressed_data.decode('utf8'))
-        converted = [self.item_factory(*item) for item in results]
-        return converted
+        return json.loads(decompressed_data.decode('utf8'))
 
     def index(self, key: str, value: T):
         assert type(value) == self.item_factory, f"Can only index the specified type" \
                                               f" ({self.item_factory.__name__})"
         page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
+        current_page = self._get_page_tuples(page_index)
         if current_page is None:
             current_page = []
         value_tuple = astuple(value)
@@ -151,7 +152,7 @@ class TinyIndex(Generic[T]):
     @staticmethod
     def create(item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
         if os.path.isfile(index_path):
-            raise FileExistsError("Index file already exists")
+            raise FileExistsError(f"Index file '{index_path}' already exists")
 
         metadata = TinyIndexMetadata(VERSION, page_size, num_pages, item_factory.__name__)
         metadata_bytes = metadata.to_bytes()