From 4fa1c4a39a96b9514d80225739e5277c75ba1fab Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 18 Dec 2021 22:35:59 +0000
Subject: [PATCH] Filter results with low scores

---
 create_app.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/create_app.py b/create_app.py
index 1ba01df..c15a8e9 100644
--- a/create_app.py
+++ b/create_app.py
@@ -1,5 +1,6 @@
 import re
 from logging import getLogger
+from operator import itemgetter
 from typing import List
 
 import Levenshtein
@@ -13,6 +14,9 @@ from index import TinyIndex, Document
 logger = getLogger(__name__)
 
 
+SCORE_THRESHOLD = 0.25
+
+
 def create(tiny_index: TinyIndex):
     app = FastAPI()
 
@@ -49,12 +53,16 @@ def create(tiny_index: TinyIndex):
         match_length = sum(len(x) for x in match_strings)
 
         num_words = len(re.findall(r'\b\w+\b', r))
-        return match_length + 1./num_words
+        total_possible_match_length = sum(len(x) for x in terms)
+        return (match_length + 1./num_words) / (total_possible_match_length + 1)
 
     def order_results(terms: list[str], results: list[Document]):
-        ordered_results = sorted(results, key=lambda result: score_result(terms, result.title), reverse=True)
+        results_and_scores = [(score_result(terms, result.title), result) for result in results]
+        ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
+        filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
+        # ordered_results = sorted(results, key=lambda result: score_result(terms, result.title), reverse=True)
         # print("Order results", query, ordered_results, sep='\n')
-        return ordered_results
+        return filtered_results
 
     @app.get("/complete")
     def complete(q: str):