2 سال پیش · c60b73a403
--- a/mwmbl/tinysearchengine/ltr.py
+++ b/mwmbl/tinysearchengine/ltr.py
@@ -4,7 +4,7 @@ Learning to rank predictor
 
				 from pandas import DataFrame, Series
			
 
				 from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
			
 
				 
			
 
				-from mwmbl.tinysearchengine.rank import get_match_features, get_domain_score, score_match
			
 
				+from mwmbl.tinysearchengine.rank import get_features
			
 
				 
			
 
				 
			
 
				 class ThresholdPredictor(BaseEstimator, RegressorMixin):
			
@@ -24,21 +24,9 @@ class ThresholdPredictor(BaseEstimator, RegressorMixin):
 
				         return predictions
			
 
				 
			
 
				 
			
 
				-def get_match_features_as_series(item: Series):
			
 
				+def get_features_as_series(item: Series):
			
 
				     terms = item['query'].lower().split()
			
 
				-    features = {}
			
 
				-    for part in ['title', 'extract', 'url']:
			
 
				-        last_match_char, match_length, total_possible_match_length = get_match_features(terms, item[part], True, False)
			
 
				-        features[f'last_match_char_{part}'] = last_match_char
			
 
				-        features[f'match_length_{part}'] = match_length
			
 
				-        features[f'total_possible_match_length_{part}'] = total_possible_match_length
			
 
				-        # features[f'score_{part}'] = score_match(last_match_char, match_length, total_possible_match_length)
			
 
				-
			
 
				-    features['num_terms'] = len(terms)
			
 
				-    features['num_chars'] = len(' '.join(terms))
			
 
				-    features['domain_score'] = get_domain_score(item['url'])
			
 
				-    features['url_length'] = len(item['url'])
			
 
				-    features['item_score'] = item['score']
			
 
				+    features = get_features(terms, item['title'], item['url'], item['extract'], item['score'], True)
			
 
				     return Series(features)
			
 
				 
			
 
				 
			
@@ -47,7 +35,7 @@ class FeatureExtractor(BaseEstimator, TransformerMixin):
 
				         return self
			
 
				 
			
 
				     def transform(self, X: DataFrame, y=None):
			
 
				-        features = X.apply(get_match_features_as_series, axis=1)
			
 
				+        features = X.apply(get_features_as_series, axis=1)
			
 
				         print("Features", features.columns)
			
 
				         return features
			
 
				 
			
--- a/mwmbl/tinysearchengine/rank.py
+++ b/mwmbl/tinysearchengine/rank.py
@@ -31,7 +31,7 @@ def _get_query_regex(terms, is_complete, is_url):
 
				     return pattern
			
 
				 
			
 
				 
			
 
				-def _score_result(terms, result: Document, is_complete: bool):
			
 
				+def _score_result(terms: list[str], result: Document, is_complete: bool):
			
 
				     domain_score = get_domain_score(result.url)
			
 
				 
			
 
				     parsed_url = urlparse(result.url)
			
@@ -62,6 +62,30 @@ def score_match(last_match_char, match_length, total_possible_match_length):
 
				     return MATCH_EXPONENT ** (match_length - total_possible_match_length) / last_match_char
			
 
				 
			
 
				 
			
 
				+def get_features(terms, title, url, extract, score, is_complete):
			
 
				+    features = {}
			
 
				+    parsed_url = urlparse(url)
			
 
				+    domain = parsed_url.netloc
			
 
				+    path = parsed_url.path
			
 
				+    for part, name, is_url in [(title, 'title', False),
			
 
				+                               (extract, 'extract', False),
			
 
				+                               (domain, 'domain', True),
			
 
				+                               (domain, 'domain_tokenized', False),
			
 
				+                               (path, 'path', True)]:
			
 
				+        last_match_char, match_length, total_possible_match_length = get_match_features(terms, part, is_complete, is_url)
			
 
				+        features[f'last_match_char_{name}'] = last_match_char
			
 
				+        features[f'match_length_{name}'] = match_length
			
 
				+        features[f'total_possible_match_length_{name}'] = total_possible_match_length
			
 
				+        # features[f'score_{name}'] = score_match(last_match_char, match_length, total_possible_match_length)
			
 
				+    features['num_terms'] = len(terms)
			
 
				+    features['num_chars'] = len(' '.join(terms))
			
 
				+    features['domain_score'] = get_domain_score(url)
			
 
				+    features['path_length'] = len(path)
			
 
				+    features['domain_length'] = len(domain)
			
 
				+    features['item_score'] = score
			
 
				+    return features
			
 
				+
			
 
				+
			
 
				 def get_domain_score(url):
			
 
				     domain = urlparse(url).netloc
			
 
				     domain_score = DOMAINS.get(domain, 0.0)
			
@@ -165,4 +189,3 @@ class Ranker:
 
				 class HeuristicRanker(Ranker):
			
 
				     def order_results(self, terms, pages, is_complete):
			
 
				         return order_results(terms, pages, is_complete)
			
 
				-