Analysis to confirm that 'leek and potato soup' page was really missing
parent 4cbed29c08
commit 9ee6f37a60

3 changed files with 15 additions and 18 deletions
@@ -2,14 +2,18 @@ from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 from paths import INDEX_PATH
 
 
+def get_items():
+    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+    items = tiny_index.retrieve('soup')
+    if items:
+        for item in items:
+            print("Items", item)
 
 
 def run():
     tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     for i in range(100):
         items = tiny_index.retrieve('eggless')
         # items = tiny_index.convert_items(page)
         if items:
             print("Items", items)
             break
         tiny_index.get_page(i)
 
 
 if __name__ == '__main__':
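The new get_items() makes the check in the commit title reproducible: retrieve the postings for 'soup' and look for the missing page. A minimal sketch of that check, assuming Document is a record with a title attribute (the field names are not shown in this diff):

from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
from paths import INDEX_PATH

def check_soup_page():
    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    items = tiny_index.retrieve('soup') or []
    # Assumes each item has a 'title' attribute; adjust to the real Document fields.
    matches = [item for item in items if 'leek and potato' in item.title.lower()]
    print("Found" if matches else "Missing", "- leek and potato soup page")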
@@ -1,16 +1,13 @@
 import re
 from logging import getLogger
 from operator import itemgetter
 from typing import List
 
 import Levenshtein
 from fastapi import FastAPI
-from starlette.responses import RedirectResponse, FileResponse, HTMLResponse
+from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
 from index import TinyIndex, Document
 
 logger = getLogger(__name__)
@@ -62,10 +59,8 @@ def create(tiny_index: TinyIndex):
 def order_results(terms: list[str], results: list[Document]):
     results_and_scores = [(score_result(terms, result), result) for result in results]
     ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
-    print("Ordered results", ordered_results)
+    # print("Ordered results", ordered_results)
     filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
-    # ordered_results = sorted(results, key=lambda result: score_result(terms, result.title), reverse=True)
-    # print("Order results", query, ordered_results, sep='\n')
     return filtered_results
 
 @app.get("/complete")
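order_results leans on a score_result(terms, result) helper and a SCORE_THRESHOLD constant defined outside this hunk. As a hedged sketch only (not the repository's actual scorer), the Levenshtein import above suggests fuzzy term-vs-title matching along these lines:

SCORE_THRESHOLD = 0.0  # Placeholder; the real constant lives elsewhere in this file.

def score_result(terms: list[str], result: Document) -> float:
    # Hypothetical scorer: take each query term's best fuzzy match against the
    # title tokens and average them. Levenshtein.ratio returns 1.0 for
    # identical strings. Assumes result has a 'title' attribute.
    title_tokens = re.findall(r'\w+', result.title.lower())
    if not title_tokens:
        return 0.0
    scores = [max(Levenshtein.ratio(term, token) for token in title_tokens)
              for term in terms]
    return sum(scores) / len(scores)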
index.py (10 changes)
@@ -89,11 +89,14 @@ class TinyIndexBase(Generic[T]):
         Get the page at index i, decompress and deserialise it using JSON
         """
         page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
         try:
             decompressed_data = self.decompressor.decompress(page_data)
         except ZstdError:
             return None
-        return json.loads(decompressed_data.decode('utf8'))
+        results = json.loads(decompressed_data.decode('utf8'))
+        # print(f"Num results: {len(results)}, num zeros: {zeros}")
+        return results
 
     def convert_items(self, items) -> List[T]:
         converted = [self.item_factory(*item) for item in items]
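The new zeros count feeds the commented-out diagnostic print: each page is a fixed-size slice of the mmap, so the compressed frame is presumably followed by zero padding. bytes.count() finds non-overlapping matches, so multiplying by 4 approximates the number of padding bytes:

# Standalone illustration of the padding estimate (not repository code).
page = b'compressed-frame-bytes' + b'\x00' * 100
approx_padding = page.count(b'\x00\x00\x00\x00') * 4
print(approx_padding)  # 100: 25 non-overlapping 4-byte zero runs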
@@ -129,11 +132,6 @@ class TinyIndexer(TinyIndexBase[T]):
         self.mmap.close()
         self.index_file.close()
 
-    # def index(self, documents: List[TokenizedDocument]):
-    #     for document in documents:
-    #         for token in document.tokens:
-    #             self._index_document(document, token)
-
     def index(self, key: str, value: T):
         # print("Index", value)
         assert type(value) == self.item_factory, f"Can only index the specified type" \
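The assert restricts index() to the exact item type the indexer was built around. A hedged usage sketch, assuming TinyIndexer shares TinyIndex's constructor signature and that Document takes a title and a URL (neither is confirmed by this hunk; the close() calls above suggest the indexer also needs explicit clean-up):

from index import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE
from paths import INDEX_PATH

# Hypothetical constructor arguments, mirroring TinyIndex above.
indexer = TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
# The assert rejects any value that is not exactly the registered item type.
indexer.index('soup', Document('Leek and potato soup', 'https://example.com/soup'))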