Get features for each string separately
parent 4740d89c6a
commit 94287cec01
2 changed files with 24 additions and 13 deletions
@@ -26,13 +26,23 @@ class ThresholdPredictor(BaseEstimator, RegressorMixin):

 def get_match_features_as_series(item: Series):
     terms = item['query'].lower().split()
-    last_match_char, match_length, total_possible_match_length = get_match_features(
-        terms, item['title'], item['extract'], True)
+    last_match_char_title, match_length_title, total_possible_match_length_title = get_match_features(
+        terms, item['title'], True, False)
+    last_match_char_extract, match_length_extract, total_possible_match_length_extract = get_match_features(
+        terms, item['extract'], True, False)
+    last_match_char_url, match_length_url, total_possible_match_length_url = get_match_features(
+        terms, item['title'], True, False)
     domain_score = get_domain_score(item['url'])
     return Series({
-        'last_match_char': last_match_char,
-        'match_length': match_length,
-        'total_possible_match_length': total_possible_match_length,
+        'last_match_char_title': last_match_char_title,
+        'match_length_title': match_length_title,
+        'total_possible_match_length_title': total_possible_match_length_title,
+        'last_match_char_extract': last_match_char_extract,
+        'match_length_extract': match_length_extract,
+        'total_possible_match_length_extract': total_possible_match_length_extract,
+        'last_match_char_url': last_match_char_url,
+        'match_length_url': match_length_url,
+        'total_possible_match_length_url': total_possible_match_length_url,
         'num_terms': len(terms),
         'domain_score': domain_score,
         'item_score': item['score'],
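One detail in the hunk above is worth flagging: the URL feature block passes item['title'] into get_match_features, so the *_url columns end up duplicating the title features. This reads like a copy-and-paste slip; a minimal corrected sketch, assuming the intent is to score the URL string held in item['url'], would be:

# Hypothetical correction, not part of the commit: compute the URL features from the URL field.
last_match_char_url, match_length_url, total_possible_match_length_url = get_match_features(
    terms, item['url'], True, False)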
@@ -17,14 +17,16 @@ logger = getLogger(__name__)

 SCORE_THRESHOLD = 0.0


-def _get_query_regex(terms, is_complete):
+def _get_query_regex(terms, is_complete, is_url):
     if not terms:
         return ''

+    word_sep = r'\b' if is_url else ''
     if is_complete:
-        term_patterns = [rf'\b{re.escape(term)}\b' for term in terms]
+        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
     else:
-        term_patterns = [rf'\b{re.escape(term)}\b' for term in terms[:-1]] + [rf'\b{re.escape(terms[-1])}']
+        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
+            rf'{word_sep}{re.escape(terms[-1])}']
     pattern = '|'.join(term_patterns)
     return pattern
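For reference, here is a small standalone sketch of what the new _get_query_regex builds; the function body mirrors the hunk above, and the example queries are illustrative. Note that with is_url=False the old \b word boundaries disappear entirely, so terms also match inside longer words; whether that is intended is not clear from the diff alone.

import re

def _get_query_regex(terms, is_complete, is_url):
    # Mirror of the new version above: word boundaries are only added when is_url is True.
    if not terms:
        return ''
    word_sep = r'\b' if is_url else ''
    if is_complete:
        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
    else:
        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
            rf'{word_sep}{re.escape(terms[-1])}']
    return '|'.join(term_patterns)

print(_get_query_regex(['open', 'source'], True, True))    # \bopen\b|\bsource\b
print(_get_query_regex(['open', 'source'], True, False))   # open|source  (also matches inside "reopened")
print(_get_query_regex(['open', 'sou'], False, True))      # \bopen\b|\bsou  (last term treated as a prefix)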
@@ -32,8 +34,9 @@ def _get_query_regex(terms, is_complete):

 def _score_result(terms, result: Document, is_complete: bool, max_score: float):
     domain_score = get_domain_score(result.url)

+    result_string = f"{result.title.strip()} {result.extract.strip()}"
     last_match_char, match_length, total_possible_match_length = get_match_features(
-        terms, result.title, result.extract, is_complete)
+        terms, result_string, is_complete, False)

     match_score = (match_length + 1. / last_match_char) / (total_possible_match_length + 1)
     score = 0.01 * domain_score + 0.99 * match_score
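The scoring arithmetic itself is unchanged; only the string handed to get_match_features differs. As a worked example with illustrative numbers (not taken from the commit):

match_length, last_match_char, total_possible_match_length = 10, 20, 12
domain_score = 0.5
match_score = (match_length + 1. / last_match_char) / (total_possible_match_length + 1)  # 10.05 / 13 ≈ 0.773
score = 0.01 * domain_score + 0.99 * match_score  # 0.005 + 0.765 ≈ 0.770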
@@ -48,10 +51,8 @@ def get_domain_score(url):
     return domain_score


-def get_match_features(terms, title, extract, is_complete):
-    result_string = f"{title.strip()} {extract.strip()}"
-
-    query_regex = _get_query_regex(terms, is_complete)
+def get_match_features(terms, result_string, is_complete, is_url):
+    query_regex = _get_query_regex(terms, is_complete, is_url)
     print("Query regex", query_regex)
     matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
     match_strings = {x.group(0).lower() for x in matches}
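With the new signature, callers build the string to be scored themselves: _score_result assembles the title-plus-extract string, and get_match_features_as_series calls the function once per field. A hedged usage sketch of the new call shape (the query and result text are made up):

terms = 'open source search'.lower().split()
# Score a title string on its own, as the per-field calls in the first hunk do.
last_match_char, match_length, total_possible_match_length = get_match_features(
    terms, 'An open source, non-profit search engine', True, False)

The print("Query regex", ...) call reads like leftover debug output; the module's existing logger would presumably carry it if it is meant to stay.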