From 9ee6f37a608c9d3bfd042657d056334e08004afa Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sun, 19 Dec 2021 21:09:00 +0000 Subject: [PATCH] Analysis to confirm that 'leek and potato soup' page was really missing --- analyse/inspect_index.py | 14 +++++++++----- create_app.py | 9 ++------- index.py | 10 ++++------ 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/analyse/inspect_index.py b/analyse/inspect_index.py index 4286a86..56fe5bb 100644 --- a/analyse/inspect_index.py +++ b/analyse/inspect_index.py @@ -2,14 +2,18 @@ from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE from paths import INDEX_PATH +def get_items(): + tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) + items = tiny_index.retrieve('soup') + if items: + for item in items: + print("Items", item) + + def run(): tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) for i in range(100): - items = tiny_index.retrieve('eggless') - # items = tiny_index.convert_items(page) - if items: - print("Items", items) - break + tiny_index.get_page(i) if __name__ == '__main__': diff --git a/create_app.py b/create_app.py index 37165ab..7b6adec 100644 --- a/create_app.py +++ b/create_app.py @@ -1,16 +1,13 @@ import re from logging import getLogger from operator import itemgetter -from typing import List -import Levenshtein from fastapi import FastAPI -from starlette.responses import RedirectResponse, FileResponse, HTMLResponse +from starlette.responses import FileResponse from starlette.staticfiles import StaticFiles from index import TinyIndex, Document - logger = getLogger(__name__) @@ -62,10 +59,8 @@ def create(tiny_index: TinyIndex): def order_results(terms: list[str], results: list[Document]): results_and_scores = [(score_result(terms, result), result) for result in results] ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True) - print("Ordered results", ordered_results) + # print("Ordered results", ordered_results) filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD] - # ordered_results = sorted(results, key=lambda result: score_result(terms, result.title), reverse=True) - # print("Order results", query, ordered_results, sep='\n') return filtered_results @app.get("/complete") diff --git a/index.py b/index.py index b38317a..ae2fd10 100644 --- a/index.py +++ b/index.py @@ -89,11 +89,14 @@ class TinyIndexBase(Generic[T]): Get the page at index i, decompress and deserialise it using JSON """ page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size] + zeros = page_data.count(b'\x00\x00\x00\x00') * 4 try: decompressed_data = self.decompressor.decompress(page_data) except ZstdError: return None - return json.loads(decompressed_data.decode('utf8')) + results = json.loads(decompressed_data.decode('utf8')) + # print(f"Num results: {len(results)}, num zeros: {zeros}") + return results def convert_items(self, items) -> List[T]: converted = [self.item_factory(*item) for item in items] @@ -129,11 +132,6 @@ class TinyIndexer(TinyIndexBase[T]): self.mmap.close() self.index_file.close() - # def index(self, documents: List[TokenizedDocument]): - # for document in documents: - # for token in document.tokens: - # self._index_document(document, token) - def index(self, key: str, value: T): # print("Index", value) assert type(value) == self.item_factory, f"Can only index the specified type" \