Add in match score feature (although it hurts the results)

This commit is contained in:
Daoud Clarke 2022-08-09 00:08:55 +01:00
parent c60b73a403
commit 8b50643303

View file

@ -76,7 +76,7 @@ def get_features(terms, title, url, extract, score, is_complete):
features[f'last_match_char_{name}'] = last_match_char
features[f'match_length_{name}'] = match_length
features[f'total_possible_match_length_{name}'] = total_possible_match_length
# features[f'score_{part}'] = score_match(last_match_char, match_length, total_possible_match_length)
features[f'score_{name}'] = score_match(last_match_char, match_length, total_possible_match_length)
features['num_terms'] = len(terms)
features['num_chars'] = len(' '.join(terms))
features['domain_score'] = get_domain_score(url)
@ -94,7 +94,6 @@ def get_domain_score(url):
def get_match_features(terms, result_string, is_complete, is_url):
query_regex = _get_query_regex(terms, is_complete, is_url)
print("Result string", result_string)
matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
match_strings = {x.group(0).lower() for x in matches}
match_length = sum(len(x) for x in match_strings)