Browse Source

Analysis to confirm that 'leek and potato soup' page was really missing

Daoud Clarke 3 years ago
parent
commit
9ee6f37a60
3 changed files with 15 additions and 18 deletions
  1. 9 5
      analyse/inspect_index.py
  2. 2 7
      create_app.py
  3. 4 6
      index.py

+ 9 - 5
analyse/inspect_index.py

@@ -2,14 +2,18 @@ from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 from paths import INDEX_PATH
 
 
+def get_items():
+    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+    items = tiny_index.retrieve('soup')
+    if items:
+        for item in items:
+            print("Items", item)
+
+
 def run():
     tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     for i in range(100):
-        items = tiny_index.retrieve('eggless')
-        # items = tiny_index.convert_items(page)
-        if items:
-            print("Items", items)
-            break
+        tiny_index.get_page(i)
 
 
 if __name__ == '__main__':

+ 2 - 7
create_app.py

@@ -1,16 +1,13 @@
 import re
 from logging import getLogger
 from operator import itemgetter
-from typing import List
 
-import Levenshtein
 from fastapi import FastAPI
-from starlette.responses import RedirectResponse, FileResponse, HTMLResponse
+from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
 from index import TinyIndex, Document
 
-
 logger = getLogger(__name__)
 
 
@@ -62,10 +59,8 @@ def create(tiny_index: TinyIndex):
     def order_results(terms: list[str], results: list[Document]):
         results_and_scores = [(score_result(terms, result), result) for result in results]
         ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
-        print("Ordered results", ordered_results)
+        # print("Ordered results", ordered_results)
         filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
-        # ordered_results = sorted(results, key=lambda result: score_result(terms, result.title), reverse=True)
-        # print("Order results", query, ordered_results, sep='\n')
         return filtered_results
 
     @app.get("/complete")

+ 4 - 6
index.py

@@ -89,11 +89,14 @@ class TinyIndexBase(Generic[T]):
         Get the page at index i, decompress and deserialise it using JSON
         """
         page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
         try:
             decompressed_data = self.decompressor.decompress(page_data)
         except ZstdError:
             return None
-        return json.loads(decompressed_data.decode('utf8'))
+        results = json.loads(decompressed_data.decode('utf8'))
+        # print(f"Num results: {len(results)}, num zeros: {zeros}")
+        return results
 
     def convert_items(self, items) -> List[T]:
         converted = [self.item_factory(*item) for item in items]
@@ -129,11 +132,6 @@ class TinyIndexer(TinyIndexBase[T]):
         self.mmap.close()
         self.index_file.close()
 
-    # def index(self, documents: List[TokenizedDocument]):
-    #     for document in documents:
-    #         for token in document.tokens:
-    #             self._index_document(document, token)
-
     def index(self, key: str, value: T):
         # print("Index", value)
         assert type(value) == self.item_factory, f"Can only index the specified type" \