Analysis to confirm that 'leek and potato soup' page was really missing

Daoud Clarke 2021-12-19 21:09:00 +00:00
parent 4cbed29c08
commit 9ee6f37a60
3 changed files with 15 additions and 18 deletions
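
For context, a minimal sketch of the kind of check this commit is doing, assuming the TinyIndex API visible in the hunks below (the terms queried here are illustrative, not necessarily the exact ones the analysis script uses):

    from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
    from paths import INDEX_PATH

    # Ask the index for each term of the supposedly missing page title and
    # report how many documents come back; zero everywhere suggests the page
    # really never made it into the index.
    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    for term in ['leek', 'potato', 'soup']:
        items = tiny_index.retrieve(term)
        print(term, '->', len(items) if items else 0, 'results')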

View file

@@ -2,14 +2,18 @@ from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
from paths import INDEX_PATH

def get_items():
    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    items = tiny_index.retrieve('soup')
    if items:
        for item in items:
            print("Items", item)

def run():
    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    for i in range(100):
        items = tiny_index.retrieve('eggless')
        # items = tiny_index.convert_items(page)
        if items:
            print("Items", items)
            break
        tiny_index.get_page(i)

if __name__ == '__main__':

View file

@@ -1,16 +1,13 @@
import re
from logging import getLogger
from operator import itemgetter
from typing import List
import Levenshtein
from fastapi import FastAPI
from starlette.responses import RedirectResponse, FileResponse, HTMLResponse
from starlette.responses import FileResponse
from starlette.staticfiles import StaticFiles
from index import TinyIndex, Document
logger = getLogger(__name__)
@@ -62,10 +59,8 @@ def create(tiny_index: TinyIndex):
    def order_results(terms: list[str], results: list[Document]):
        results_and_scores = [(score_result(terms, result), result) for result in results]
        ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
        print("Ordered results", ordered_results)
        # print("Ordered results", ordered_results)
        filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
        # ordered_results = sorted(results, key=lambda result: score_result(terms, result.title), reverse=True)
        # print("Order results", query, ordered_results, sep='\n')
        return filtered_results

    @app.get("/complete")

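Reading the order_results hunk above: each result is paired with a relevance score from score_result (not shown in this hunk), the pairs are sorted best-first, and anything scoring at or below SCORE_THRESHOLD is dropped. A standalone sketch of that sort-then-filter pattern, with a made-up threshold and inputs for illustration:

    from operator import itemgetter

    SCORE_THRESHOLD = 0.5  # illustrative value, not the app's actual setting

    def keep_best(results_and_scores):
        # results_and_scores: list of (score, result) tuples
        ordered = sorted(results_and_scores, key=itemgetter(0), reverse=True)
        return [result for score, result in ordered if score > SCORE_THRESHOLD]

    print(keep_best([(0.9, 'leek and potato soup'), (0.2, 'egg fried rice'), (0.7, 'potato salad')]))
    # ['leek and potato soup', 'potato salad']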
View file

@@ -89,11 +89,14 @@ class TinyIndexBase(Generic[T]):
        Get the page at index i, decompress and deserialise it using JSON
        """
        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
        try:
            decompressed_data = self.decompressor.decompress(page_data)
        except ZstdError:
            return None
        return json.loads(decompressed_data.decode('utf8'))
        results = json.loads(decompressed_data.decode('utf8'))
        # print(f"Num results: {len(results)}, num zeros: {zeros}")
        return results

    def convert_items(self, items) -> List[T]:
        converted = [self.item_factory(*item) for item in items]
@@ -129,11 +132,6 @@ class TinyIndexer(TinyIndexBase[T]):
        self.mmap.close()
        self.index_file.close()

    # def index(self, documents: List[TokenizedDocument]):
    #     for document in documents:
    #         for token in document.tokens:
    #             self._index_document(document, token)

    def index(self, key: str, value: T):
        # print("Index", value)
        assert type(value) == self.item_factory, f"Can only index the specified type" \
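
The get_page hunk above reflects how the index stores data: each page is a fixed-size slice of the mmap holding zstd-compressed JSON, padded out with zero bytes (which is what the new zeros counter estimates). A rough standalone sketch of that round trip, using the zstandard package; the helper names and page size here are illustrative, not the project's:

    import json
    import zstandard

    PAGE_SIZE = 4096  # illustrative; the real value comes from index.PAGE_SIZE

    def pack_page(items, page_size=PAGE_SIZE):
        # JSON-encode and compress the items, then pad the page with zeros.
        data = zstandard.ZstdCompressor().compress(json.dumps(items).encode('utf8'))
        assert len(data) <= page_size, "items do not fit in one page"
        return data + b'\x00' * (page_size - len(data))

    def unpack_page(page_data):
        # Mirrors get_page: decompress the page, then JSON-decode it;
        # an unwritten (all-zero) page raises ZstdError and yields None.
        try:
            decompressed = zstandard.ZstdDecompressor().decompress(page_data)
        except zstandard.ZstdError:
            return None
        return json.loads(decompressed.decode('utf8'))

    page = pack_page([["https://example.com/", "Leek and potato soup", "An example document"]])
    print(unpack_page(page))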