|
@@ -1,3 +1,4 @@
|
|
|
|
+import math
|
|
import re
|
|
import re
|
|
from abc import abstractmethod
|
|
from abc import abstractmethod
|
|
from logging import getLogger
|
|
from logging import getLogger
|
|
@@ -12,6 +13,7 @@ logger = getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
# Results whose final score is not strictly greater than this value are
# filtered out of the ordered results.
SCORE_THRESHOLD = 0.0
# Decay rate of the exponential URL-length penalty: a result's score is
# multiplied by exp(-LENGTH_PENALTY * len(url)), so longer URLs rank lower.
LENGTH_PENALTY = 0.01
|
|
|
|
|
|
|
|
|
|
def _get_query_regex(terms, is_complete, is_url):
|
|
def _get_query_regex(terms, is_complete, is_url):
|
|
@@ -28,17 +30,27 @@ def _get_query_regex(terms, is_complete, is_url):
|
|
return pattern
|
|
return pattern
|
|
|
|
|
|
|
|
|
|
def _score_result(terms, result: Document, is_complete: bool):
    """Compute the ranking score of a single search result for the query terms.

    The score combines three ingredients:

    * the domain score returned by ``get_domain_score(result.url)``;
    * a weighted sum of text-match scores, computed separately for the title,
      the extract, the URL's domain (matched twice, once with the URL flag set
      and once without) and the URL's path;
    * an exponential penalty on the length of the full URL.

    Args:
        terms: The query terms, forwarded unchanged to ``get_match_features``.
        result: The document being scored; its ``url``, ``title`` and
            ``extract`` attributes are read.
        is_complete: Forwarded unchanged to ``get_match_features``.

    Returns:
        The combined score as a float.
    """
    domain_score = get_domain_score(result.url)

    parsed_url = urlparse(result.url)
    domain = parsed_url.netloc
    path = parsed_url.path

    # (text to match against, is_url flag for get_match_features, weight).
    # NOTE(review): the is_url flag presumably switches to URL-style
    # tokenisation inside get_match_features -- confirm against that helper.
    weighted_fields = [
        (result.title, False, 4),
        (result.extract, False, 1),
        (domain, True, 2),
        (domain, False, 2),
        (path, True, 1),
    ]

    # Accumulate into a separate name so the domain score computed above is
    # not overwritten by the domain *match* score.
    match_score = 0.0
    for result_string, is_url, weight in weighted_fields:
        last_match_char, match_length, total_possible_match_length = get_match_features(
            terms, result_string, is_complete, is_url)
        match_score += weight * score_match(
            last_match_char, match_length, total_possible_match_length)

    # The weights sum to 10, so the 0.1 factor rescales the weighted sum to
    # the range of a single match score.
    length_penalty = math.exp(-LENGTH_PENALTY * len(result.url))
    return (0.01 * domain_score + 0.99 * match_score * 0.1) * length_penalty
|
|
|
|
|
|
|
|
|
|
@@ -75,7 +87,7 @@ def order_results(terms: list[str], results: list[Document], is_complete: bool)
|
|
return []
|
|
return []
|
|
|
|
|
|
max_score = max(result.score for result in results)
|
|
max_score = max(result.score for result in results)
|
|
- results_and_scores = [(_score_result(terms, result, is_complete, max_score), result) for result in results]
|
|
|
|
|
|
+ results_and_scores = [(_score_result(terms, result, is_complete), result) for result in results]
|
|
ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
|
|
ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
|
|
filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
|
|
filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
|
|
return filtered_results
|
|
return filtered_results
|