Browse source code

Merge pull request #72 from mwmbl/improve-ranking-with-multi-term-search

Improve ranking with multi-term search
Daoud Clarke 2 years ago
parent
commit
c6773b46c4
5 changed files with 95 additions and 43 deletions
  1. analyse/search.py (+32 -0)
  2. mwmbl/main.py (+10 -1)
  3. mwmbl/resources/model.pickle (BIN)
  4. mwmbl/tinysearchengine/ltr.py (+5 -15)
  5. mwmbl/tinysearchengine/rank.py (+48 -27)

analyse/search.py (+32 -0)

@@ -0,0 +1,32 @@
+import logging
+import sys
+from itertools import islice
+
+from mwmbl.indexer.paths import INDEX_PATH
+from mwmbl.tinysearchengine.completer import Completer
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.rank import HeuristicRanker
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+
+
+def clean(sequence):
+    return ''.join(x['value'] for x in sequence)
+
+
+def run():
+    with TinyIndex(Document, INDEX_PATH) as tiny_index:
+        completer = Completer()
+        ranker = HeuristicRanker(tiny_index, completer)
+        items = ranker.search('google')
+        print()
+        if items:
+            for i, item in enumerate(islice(items, 10)):
+                print(f"{i + 1}. {item['url']}")
+                print(clean(item['title']))
+                print(clean(item['extract']))
+                print()
+
+
+if __name__ == '__main__':
+    run()

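For reference, each item yielded by HeuristicRanker.search is a dict whose 'title' and 'extract' values are sequences of highlight spans, which is why clean() joins their 'value' fields. A minimal sketch of the shape the script assumes (the 'is_bold' key is illustrative; only 'value' is read here):

    item = {
        'url': 'https://www.google.com/',
        'title': [{'value': 'Google', 'is_bold': True}],
        'extract': [{'value': 'Search the web', 'is_bold': False}],
    }
    assert ''.join(x['value'] for x in item['title']) == 'Google'
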
mwmbl/main.py (+10 -1)

@@ -20,9 +20,14 @@ from mwmbl.tinysearchengine.rank import HeuristicRanker
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
 
+MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
+
+
 def setup_args():
     parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
     parser.add_argument("--data", help="Path to the tinysearchengine index file", default="/app/storage/")
+    parser.add_argument("--no-background", help="Disable running the background script to collect data",
+                        action='store_true')
     args = parser.parse_args()
     return args
 
@@ -46,12 +51,16 @@ def run():
         TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
 
     url_queue = Queue()
-    Process(target=background.run, args=(args.data, url_queue)).start()
+
+    if not args.no_background:
+        Process(target=background.run, args=(args.data, url_queue)).start()
 
     completer = Completer()
 
     with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
         ranker = HeuristicRanker(tiny_index, completer)
+        # model = pickle.load(open(MODEL_PATH, 'rb'))
+        # ranker = LTRRanker(model, tiny_index, completer)
 
         # Initialize FastApi instance
         app = FastAPI()

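The commented-out lines show how the pickled model would replace the heuristic ranker. A hedged sketch of enabling that path, assuming pickle is imported at the top of main.py and LTRRanker is importable alongside HeuristicRanker:

    with open(MODEL_PATH, 'rb') as f:
        model = pickle.load(f)
    ranker = LTRRanker(model, tiny_index, completer)  # instead of HeuristicRanker
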
mwmbl/resources/model.pickle (BIN)


mwmbl/tinysearchengine/ltr.py (+5 -15)

@@ -4,7 +4,7 @@ Learning to rank predictor
 from pandas import DataFrame, Series
 from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
 
-from mwmbl.tinysearchengine.rank import get_match_features, get_domain_score, score_match
+from mwmbl.tinysearchengine.rank import get_features
 
 
 class ThresholdPredictor(BaseEstimator, RegressorMixin):
@@ -24,20 +24,10 @@ class ThresholdPredictor(BaseEstimator, RegressorMixin):
         return predictions
 
 
-def get_match_features_as_series(item: Series):
+def get_features_as_series(item: Series):
     terms = item['query'].lower().split()
-    features = {}
-    for part in ['title', 'extract', 'url']:
-        last_match_char, match_length, total_possible_match_length = get_match_features(terms, item[part], True, False)
-        features[f'last_match_char_{part}'] = last_match_char
-        features[f'match_length_{part}'] = match_length
-        features[f'total_possible_match_length_{part}'] = total_possible_match_length
-        # features[f'score_{part}'] = score_match(last_match_char, match_length, total_possible_match_length)
-
-    features['num_terms'] = len(terms)
-    features['num_chars'] = len(' '.join(terms))
-    features['domain_score'] = get_domain_score(item['url'])
-    features['item_score'] = item['score']
+    features = get_features(terms, item['title'], item['url'], item['extract'], item['score'], True)
+    # features_filtered = {k: v for k, v in features.items() if 'match_score' not in k}
     return Series(features)
 
 
@@ -46,7 +36,7 @@ class FeatureExtractor(BaseEstimator, TransformerMixin):
         return self
 
     def transform(self, X: DataFrame, y=None):
-        features = X.apply(get_match_features_as_series, axis=1)
+        features = X.apply(get_features_as_series, axis=1)
         print("Features", features.columns)
         return features
 

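With this change FeatureExtractor delegates row-wise to get_features_as_series, so training data only needs the raw columns that function reads: query, title, url, extract and score. A minimal usage sketch (the row values are illustrative):

    from pandas import DataFrame
    from mwmbl.tinysearchengine.ltr import FeatureExtractor

    X = DataFrame([{
        'query': 'google search',
        'title': 'Google',
        'url': 'https://www.google.com/',
        'extract': 'Search the web',
        'score': 1.0,
    }])
    features = FeatureExtractor().transform(X)  # one feature row per result
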
mwmbl/tinysearchengine/rank.py (+48 -27)

@@ -13,7 +13,9 @@ logger = getLogger(__name__)
 
 
 SCORE_THRESHOLD = 0.0
-LENGTH_PENALTY=0.01
+LENGTH_PENALTY = 0.04
+MATCH_EXPONENT = 2
+DOMAIN_SCORE_SMOOTHING = 50
 
 
 def _get_query_regex(terms, is_complete, is_url):
@@ -30,32 +32,50 @@ def _get_query_regex(terms, is_complete, is_url):
     return pattern
 
 
-def _score_result(terms, result: Document, is_complete: bool):
-    domain_score = get_domain_score(result.url)
-
-    parsed_url = urlparse(result.url)
-    domain = parsed_url.netloc
-    path = parsed_url.path
-    string_scores = []
-    for result_string, is_url in [(result.title, False), (result.extract, False), (domain, True), (domain, False), (path, True)]:
-        last_match_char, match_length, total_possible_match_length = get_match_features(
-            terms, result_string, is_complete, is_url)
-
-        new_score = score_match(last_match_char, match_length, total_possible_match_length)
-        string_scores.append(new_score)
-    title_score, extract_score, domain_score, domain_split_score, path_score = string_scores
+def _score_result(terms: list[str], result: Document, is_complete: bool):
+    features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete)
 
     length_penalty = math.e ** (-LENGTH_PENALTY * len(result.url))
-    score = (0.01 * domain_score + 0.99 * (
-        4 * title_score + extract_score + 2 * domain_score + 2 * domain_split_score + path_score) * 0.1) * length_penalty
-    # score = (0.1 + 0.9*match_score) * (0.1 + 0.9*(result.score / max_score))
-    # score = 0.01 * match_score + 0.99 * (result.score / max_score)
-    # print("Result", result, string_scores, score)
+    score = (
+        4 * features['match_score_title']
+        + features['match_score_extract']
+        + 2 * features['match_score_domain']
+        + 2 * features['match_score_domain_tokenized']
+        + features['match_score_path']) * length_penalty * (features['domain_score'] + DOMAIN_SCORE_SMOOTHING) / 10
+    # best_match_score = max(features[f'match_score_{name}'] for name in ['title', 'extract', 'domain', 'domain_tokenized'])
+    # score = best_match_score * length_penalty * (features['domain_score'] + DOMAIN_SCORE_SMOOTHING)
     return score
 
 
 def score_match(last_match_char, match_length, total_possible_match_length):
-    return (match_length + 1. / last_match_char) / (total_possible_match_length + 1)
+    # return (match_length + 1. / last_match_char) / (total_possible_match_length + 1)
+    return MATCH_EXPONENT ** (match_length - total_possible_match_length) / last_match_char
+
+
+def get_features(terms, title, url, extract, score, is_complete):
+    features = {}
+    parsed_url = urlparse(url)
+    domain = parsed_url.netloc
+    path = parsed_url.path
+    for part, name, is_url in [(title, 'title', False),
+                               (extract, 'extract', False),
+                               (domain, 'domain', True),
+                               (domain, 'domain_tokenized', False),
+                               (path, 'path', True)]:
+        last_match_char, match_length, total_possible_match_length, match_terms = \
+            get_match_features(terms, part, is_complete, is_url)
+        features[f'last_match_char_{name}'] = last_match_char
+        features[f'match_length_{name}'] = match_length
+        features[f'total_possible_match_length_{name}'] = total_possible_match_length
+        features[f'match_score_{name}'] = score_match(last_match_char, match_length, total_possible_match_length)
+        features[f'match_terms_{name}'] = match_terms
+    features['num_terms'] = len(terms)
+    features['num_chars'] = len(' '.join(terms))
+    features['domain_score'] = get_domain_score(url)
+    features['path_length'] = len(path)
+    features['domain_length'] = len(domain)
+    features['item_score'] = score
+    return features
 
 
 def get_domain_score(url):
@@ -67,19 +87,21 @@ def get_domain_score(url):
 def get_match_features(terms, result_string, is_complete, is_url):
     query_regex = _get_query_regex(terms, is_complete, is_url)
     matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
-    match_strings = {x.group(0).lower() for x in matches}
-    match_length = sum(len(x) for x in match_strings)
+    # match_strings = {x.group(0).lower() for x in matches}
+    # match_length = sum(len(x) for x in match_strings)
 
     last_match_char = 1
     seen_matches = set()
+    match_length = 0
     for match in matches:
         value = match.group(0).lower()
         if value not in seen_matches:
             last_match_char = match.span()[1]
             seen_matches.add(value)
+            match_length += len(value)
 
     total_possible_match_length = sum(len(x) for x in terms)
-    return last_match_char, match_length, total_possible_match_length
+    return last_match_char, match_length, total_possible_match_length, len(seen_matches)
 
 
 def order_results(terms: list[str], results: list[Document], is_complete: bool) -> list[Document]:
@@ -138,9 +160,9 @@ class Ranker:
         terms = [x.lower() for x in q.replace('.', ' ').split()]
         is_complete = q.endswith(' ')
         if len(terms) > 0 and not is_complete:
-            retrieval_terms = terms[:-1] + self.completer.complete(terms[-1])
+            retrieval_terms = set(terms + self.completer.complete(terms[-1]))
         else:
-            retrieval_terms = terms
+            retrieval_terms = set(terms)
 
         pages = []
         seen_items = set()
@@ -160,4 +182,3 @@ class Ranker:
 class HeuristicRanker(Ranker):
     def order_results(self, terms, pages, is_complete):
         return order_results(terms, pages, is_complete)
-
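
The reworked score_match decays exponentially in the number of unmatched query characters (with MATCH_EXPONENT = 2, each unmatched character halves the score) and divides by the position of the last match, favouring matches near the start of a string. A worked illustration for a six-character query:

    from mwmbl.tinysearchengine.rank import score_match

    # Full match ending at character 6: 2 ** (6 - 6) / 6 ~= 0.167
    print(score_match(6, 6, 6))
    # Half the query matched, ending at character 3: 2 ** (3 - 6) / 3 ~= 0.042
    print(score_match(3, 3, 6))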