Add domain score feature

2022-03-21 21:13:20 +00:00 · 2022-03-21 21:13:20 +00:00 · 4740d89c6a
commit 4740d89c6a
parent af6a28fac3
2 changed files with 14 additions and 4 deletions
--- a/mwmbl/tinysearchengine/ltr.py
+++ b/mwmbl/tinysearchengine/ltr.py
@ -4,7 +4,7 @@ Learning to rank predictor
 from pandas import DataFrame, Series
 from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin

-from mwmbl.tinysearchengine.rank import get_match_features
+from mwmbl.tinysearchengine.rank import get_match_features, get_domain_score


 class ThresholdPredictor(BaseEstimator, RegressorMixin):
@ -28,11 +28,14 @@ def get_match_features_as_series(item: Series):
    terms = item['query'].lower().split()
    last_match_char, match_length, total_possible_match_length = get_match_features(
        terms, item['title'], item['extract'], True)
+    domain_score = get_domain_score(item['url'])
    return Series({
        'last_match_char': last_match_char,
        'match_length': match_length,
        'total_possible_match_length': total_possible_match_length,
        'num_terms': len(terms),
+        'domain_score': domain_score,
+        'item_score': item['score'],
    })


@ -41,6 +44,8 @@ class FeatureExtractor(BaseEstimator, TransformerMixin):
        return self

    def transform(self, X: DataFrame, y=None):
-        return X.apply(get_match_features_as_series, axis=1)
+        # Row-wise apply: each input row yields one row of feature columns.
+        features = X.apply(get_match_features_as_series, axis=1)
+        return features


--- a/mwmbl/tinysearchengine/rank.py
+++ b/mwmbl/tinysearchengine/rank.py
@ -30,8 +30,7 @@ def _get_query_regex(terms, is_complete):


 def _score_result(terms, result: Document, is_complete: bool, max_score: float):
-    domain = urlparse(result.url).netloc
-    domain_score = DOMAINS.get(domain, 0.0)
+    domain_score = get_domain_score(result.url)

    last_match_char, match_length, total_possible_match_length = get_match_features(
        terms, result.title, result.extract, is_complete)
@ -43,6 +42,12 @@ def _score_result(terms, result: Document, is_complete: bool, max_score: float):
    return score


+def get_domain_score(url):
+    """Return the DOMAINS score for the URL's netloc, or 0.0 if it is not listed."""
+    netloc = urlparse(url).netloc
+    return DOMAINS.get(netloc, 0.0)
+
+
 def get_match_features(terms, title, extract, is_complete):
    result_string = f"{title.strip()} {extract.strip()}"