From 4740d89c6a1809156b4ae2095c336f81320e1972 Mon Sep 17 00:00:00 2001 From: Daoud Clarke <daoud.clarke@gmail.com> Date: Mon, 21 Mar 2022 21:13:20 +0000 Subject: [PATCH] Add domain score feature --- mwmbl/tinysearchengine/ltr.py | 9 +++++++-- mwmbl/tinysearchengine/rank.py | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/mwmbl/tinysearchengine/ltr.py b/mwmbl/tinysearchengine/ltr.py index e30a104..33b3ad4 100644 --- a/mwmbl/tinysearchengine/ltr.py +++ b/mwmbl/tinysearchengine/ltr.py @@ -4,7 +4,7 @@ Learning to rank predictor from pandas import DataFrame, Series from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin -from mwmbl.tinysearchengine.rank import get_match_features +from mwmbl.tinysearchengine.rank import get_match_features, get_domain_score class ThresholdPredictor(BaseEstimator, RegressorMixin): @@ -28,11 +28,14 @@ def get_match_features_as_series(item: Series): terms = item['query'].lower().split() last_match_char, match_length, total_possible_match_length = get_match_features( terms, item['title'], item['extract'], True) + domain_score = get_domain_score(item['url']) return Series({ 'last_match_char': last_match_char, 'match_length': match_length, 'total_possible_match_length': total_possible_match_length, 'num_terms': len(terms), + 'domain_score': domain_score, + 'item_score': item['score'], }) @@ -41,6 +44,8 @@ class FeatureExtractor(BaseEstimator, TransformerMixin): return self def transform(self, X: DataFrame, y=None): - return X.apply(get_match_features_as_series, axis=1) + features = X.apply(get_match_features_as_series, axis=1) + print("Features", features.columns) + return features diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py index c9360b7..fade4b0 100644 --- a/mwmbl/tinysearchengine/rank.py +++ b/mwmbl/tinysearchengine/rank.py @@ -30,8 +30,7 @@ def _get_query_regex(terms, is_complete): def _score_result(terms, result: Document, is_complete: bool, max_score: float): - domain = urlparse(result.url).netloc - domain_score = DOMAINS.get(domain, 0.0) + domain_score = get_domain_score(result.url) last_match_char, match_length, total_possible_match_length = get_match_features( terms, result.title, result.extract, is_complete) @@ -43,6 +42,12 @@ def _score_result(terms, result: Document, is_complete: bool, max_score: float): return score +def get_domain_score(url): + domain = urlparse(url).netloc + domain_score = DOMAINS.get(domain, 0.0) + return domain_score + + def get_match_features(terms, title, extract, is_complete): result_string = f"{title.strip()} {extract.strip()}"