def get_features_as_series(item: Series):
    """Build the ranking features for a single row and return them as a Series.

    The row's ``query`` value is lower-cased and split on whitespace to get
    the query terms. Those terms, together with the row's ``title``, ``url``,
    ``extract`` and ``score`` values, are passed to ``get_features``; the
    resulting mapping is wrapped in a ``Series`` so the function can be used
    with ``DataFrame.apply(..., axis=1)``.
    """
    query_terms = item['query'].lower().split()
    feature_map = get_features(
        query_terms,
        item['title'],
        item['url'],
        item['extract'],
        item['score'],
        True,  # is_complete: the query is always treated as complete here
    )
    return Series(feature_map)
def get_features(terms, title, url, extract, score, is_complete):
    """Collect the ranking features for one search result into a dict.

    ``url`` is split with ``urlparse`` into its domain (``netloc``) and path.
    ``get_match_features`` is then run against five pieces of text, and its
    three return values are stored under the keys
    ``last_match_char_<name>``, ``match_length_<name>`` and
    ``total_possible_match_length_<name>``:

    * ``title`` and ``extract`` with ``is_url=False``
    * ``domain`` with ``is_url=True``
    * ``domain_tokenized``: the same domain again, with ``is_url=False``
    * ``path`` with ``is_url=True``

    Added after those, in this order: ``num_terms``, ``num_chars`` (length of
    the terms joined by single spaces), ``domain_score``, ``path_length``,
    ``domain_length`` and ``item_score`` (the ``score`` argument, unchanged).

    Keys are inserted in the order listed above, so the feature column order
    seen by callers is stable.
    """
    url_parts = urlparse(url)
    domain, path = url_parts.netloc, url_parts.path

    # Each entry is (text to match against, feature-name suffix, is_url flag).
    match_targets = (
        (title, 'title', False),
        (extract, 'extract', False),
        (domain, 'domain', True),
        (domain, 'domain_tokenized', False),
        (path, 'path', True),
    )

    features = {}
    for text, name, is_url in match_targets:
        match_stats = get_match_features(terms, text, is_complete, is_url)
        last_match_char, match_length, total_possible_match_length = match_stats
        features.update({
            f'last_match_char_{name}': last_match_char,
            f'match_length_{name}': match_length,
            f'total_possible_match_length_{name}': total_possible_match_length,
        })

    # Query-level and URL-level features that do not depend on text matching.
    features.update({
        'num_terms': len(terms),
        'num_chars': len(' '.join(terms)),
        'domain_score': get_domain_score(url),
        'path_length': len(path),
        'domain_length': len(domain),
        'item_score': score,
    })
    return features