diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py index 04e1bd3..bb8b1d9 100644 --- a/mwmbl/tinysearchengine/rank.py +++ b/mwmbl/tinysearchengine/rank.py @@ -13,8 +13,9 @@ logger = getLogger(__name__) SCORE_THRESHOLD = 0.0 -LENGTH_PENALTY = 0.01 +LENGTH_PENALTY = 0.04 MATCH_EXPONENT = 2 +DOMAIN_SCORE_SMOOTHING = 50 def _get_query_regex(terms, is_complete, is_url): @@ -33,21 +34,6 @@ def _get_query_regex(terms, is_complete, is_url): def _score_result(terms: list[str], result: Document, is_complete: bool): features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete) - # domain_score = get_domain_score(result.url) - # - # parsed_url = urlparse(result.url) - # domain = parsed_url.netloc - # path = parsed_url.path - # string_scores = [] - # logger.debug(f"Item: {result}") - # for result_string, is_url in [(result.title, False), (result.extract, False), (domain, True), (domain, False), (path, True)]: - # last_match_char, match_length, total_possible_match_length = get_match_features( - # terms, result_string, is_complete, is_url) - # - # new_score = score_match(last_match_char, match_length, total_possible_match_length) - # logger.debug(f"Item score: {new_score}, result {result_string}") - # string_scores.append(new_score) - # title_score, extract_score, domain_score, domain_split_score, path_score = string_scores length_penalty = math.e ** (-LENGTH_PENALTY * len(result.url)) score = ( @@ -55,10 +41,9 @@ def _score_result(terms: list[str], result: Document, is_complete: bool): + features['match_score_extract'] + 2 * features['match_score_domain'] + 2 * features['match_score_domain_tokenized'] - + features['match_score_path']) * length_penalty / 10 - # score = (0.1 + 0.9*match_score) * (0.1 + 0.9*(result.score / max_score)) - # score = 0.01 * match_score + 0.99 * (result.score / max_score) - # print("Result", result, string_scores, score) + + features['match_score_path']) * length_penalty * (features['domain_score'] + DOMAIN_SCORE_SMOOTHING) / 10 + # best_match_score = max(features[f'match_score_{name}'] for name in ['title', 'extract', 'domain', 'domain_tokenized']) + # score = best_match_score * length_penalty * (features['domain_score'] + DOMAIN_SCORE_SMOOTHING) return score @@ -77,11 +62,13 @@ def get_features(terms, title, url, extract, score, is_complete): (domain, 'domain', True), (domain, 'domain_tokenized', False), (path, 'path', True)]: - last_match_char, match_length, total_possible_match_length = get_match_features(terms, part, is_complete, is_url) + last_match_char, match_length, total_possible_match_length, match_terms = \ + get_match_features(terms, part, is_complete, is_url) features[f'last_match_char_{name}'] = last_match_char features[f'match_length_{name}'] = match_length features[f'total_possible_match_length_{name}'] = total_possible_match_length features[f'match_score_{name}'] = score_match(last_match_char, match_length, total_possible_match_length) + features[f'match_terms_{name}'] = match_terms features['num_terms'] = len(terms) features['num_chars'] = len(' '.join(terms)) features['domain_score'] = get_domain_score(url) @@ -100,19 +87,21 @@ def get_domain_score(url): def get_match_features(terms, result_string, is_complete, is_url): query_regex = _get_query_regex(terms, is_complete, is_url) matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE)) - match_strings = {x.group(0).lower() for x in matches} - match_length = sum(len(x) for x in match_strings) + # match_strings = {x.group(0).lower() for x in matches} + # match_length = sum(len(x) for x in match_strings) last_match_char = 1 seen_matches = set() + match_length = 0 for match in matches: value = match.group(0).lower() if value not in seen_matches: last_match_char = match.span()[1] seen_matches.add(value) + match_length += len(value) total_possible_match_length = sum(len(x) for x in terms) - return last_match_char, match_length, total_possible_match_length + return last_match_char, match_length, total_possible_match_length, len(seen_matches) def order_results(terms: list[str], results: list[Document], is_complete: bool) -> list[Document]: