From 4740d89c6a1809156b4ae2095c336f81320e1972 Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Mon, 21 Mar 2022 21:13:20 +0000
Subject: [PATCH] Add domain score feature

---
 mwmbl/tinysearchengine/ltr.py  | 9 +++++++--
 mwmbl/tinysearchengine/rank.py | 9 +++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/mwmbl/tinysearchengine/ltr.py b/mwmbl/tinysearchengine/ltr.py
index e30a10436bdadbc88baaed0d6ecac9414081f895..33b3ad433ecd95be7b1df022c535658ce427c3c2 100644
--- a/mwmbl/tinysearchengine/ltr.py
+++ b/mwmbl/tinysearchengine/ltr.py
@@ -4,7 +4,7 @@ Learning to rank predictor
 from pandas import DataFrame, Series
 from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
 
-from mwmbl.tinysearchengine.rank import get_match_features
+from mwmbl.tinysearchengine.rank import get_match_features, get_domain_score
 
 
 class ThresholdPredictor(BaseEstimator, RegressorMixin):
@@ -28,11 +28,14 @@ def get_match_features_as_series(item: Series):
     terms = item['query'].lower().split()
     last_match_char, match_length, total_possible_match_length = get_match_features(
         terms, item['title'], item['extract'], True)
+    domain_score = get_domain_score(item['url'])
     return Series({
         'last_match_char': last_match_char,
         'match_length': match_length,
         'total_possible_match_length': total_possible_match_length,
         'num_terms': len(terms),
+        'domain_score': domain_score,
+        'item_score': item['score'],
     })
 
 
@@ -41,6 +44,8 @@ class FeatureExtractor(BaseEstimator, TransformerMixin):
         return self
 
     def transform(self, X: DataFrame, y=None):
-        return X.apply(get_match_features_as_series, axis=1)
+        """Return the ranking features for X, computed row by row (y is unused)."""
+        features = X.apply(get_match_features_as_series, axis=1)
+        return features
 
 
diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py
index c9360b7239d925bb6be771af78e432bcf09b0ce0..fade4b073dbc1b5021ff66dc37b515ec47312494 100644
--- a/mwmbl/tinysearchengine/rank.py
+++ b/mwmbl/tinysearchengine/rank.py
@@ -30,8 +30,7 @@ def _get_query_regex(terms, is_complete):
 
 
 def _score_result(terms, result: Document, is_complete: bool, max_score: float):
-    domain = urlparse(result.url).netloc
-    domain_score = DOMAINS.get(domain, 0.0)
+    domain_score = get_domain_score(result.url)
 
     last_match_char, match_length, total_possible_match_length = get_match_features(
         terms, result.title, result.extract, is_complete)
@@ -43,6 +42,12 @@ def _score_result(terms, result: Document, is_complete: bool, max_score: float):
     return score
 
 
+def get_domain_score(url):
+    """Return the DOMAINS score for the URL's netloc, or 0.0 if it is not listed."""
+    netloc = urlparse(url).netloc
+    return DOMAINS.get(netloc, 0.0)
+
+
 def get_match_features(terms, title, extract, is_complete):
     result_string = f"{title.strip()} {extract.strip()}"