Add domain score feature

This commit is contained in:
Daoud Clarke 2022-03-21 21:13:20 +00:00
parent af6a28fac3
commit 4740d89c6a
2 changed files with 14 additions and 4 deletions

View file

@ -4,7 +4,7 @@ Learning to rank predictor
from pandas import DataFrame, Series
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from mwmbl.tinysearchengine.rank import get_match_features
from mwmbl.tinysearchengine.rank import get_match_features, get_domain_score
class ThresholdPredictor(BaseEstimator, RegressorMixin):
@ -28,11 +28,14 @@ def get_match_features_as_series(item: Series):
terms = item['query'].lower().split()
last_match_char, match_length, total_possible_match_length = get_match_features(
terms, item['title'], item['extract'], True)
domain_score = get_domain_score(item['url'])
return Series({
'last_match_char': last_match_char,
'match_length': match_length,
'total_possible_match_length': total_possible_match_length,
'num_terms': len(terms),
'domain_score': domain_score,
'item_score': item['score'],
})
@ -41,6 +44,8 @@ class FeatureExtractor(BaseEstimator, TransformerMixin):
return self
def transform(self, X: DataFrame, y=None):
return X.apply(get_match_features_as_series, axis=1)
features = X.apply(get_match_features_as_series, axis=1)
print("Features", features.columns)
return features

View file

@ -30,8 +30,7 @@ def _get_query_regex(terms, is_complete):
def _score_result(terms, result: Document, is_complete: bool, max_score: float):
domain = urlparse(result.url).netloc
domain_score = DOMAINS.get(domain, 0.0)
domain_score = get_domain_score(result.url)
last_match_char, match_length, total_possible_match_length = get_match_features(
terms, result.title, result.extract, is_complete)
@ -43,6 +42,12 @@ def _score_result(terms, result: Document, is_complete: bool, max_score: float):
return score
def get_domain_score(url):
    """Return the score registered in DOMAINS for the host part of *url*.

    The lookup key is the ``netloc`` component produced by ``urlparse``.
    A URL whose netloc has no entry in DOMAINS scores 0.0.
    """
    netloc = urlparse(url).netloc
    return DOMAINS.get(netloc, 0.0)
def get_match_features(terms, title, extract, is_complete):
result_string = f"{title.strip()} {extract.strip()}"