Преглед на файлове

Improve handling of incomplete words:
- Correctly generate regex for incomplete vs complete words
- Return more than one top word from completer
- Correctly handle no terms

Daoud Clarke преди 3 години
родител
ревизия
fe6ace93e6
променени са 2 файла, в които са добавени 34 реда и са изтрити 19 реда
  1. 13 8
      mwmbl/tinysearchengine/completer.py
  2. 21 11
      mwmbl/tinysearchengine/create_app.py

+ 13 - 8
mwmbl/tinysearchengine/completer.py

@@ -6,21 +6,26 @@ from pandas import DataFrame
 
 
 
 
 class Completer:
 class Completer:
-    def __init__(self, terms: DataFrame):
+    def __init__(self, terms: DataFrame, num_matches: int = 3):
         terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict()
         terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict()
         self.terms = list(terms_dict.keys())
         self.terms = list(terms_dict.keys())
         self.counts = list(terms_dict.values())
         self.counts = list(terms_dict.values())
+        self.num_matches = num_matches
         print("Terms", self.terms[:100], self.counts[:100])
         print("Terms", self.terms[:100], self.counts[:100])
 
 
-    def complete(self, term):
+    def complete(self, term) -> list[str]:
         term_length = len(term)
         term_length = len(term)
-        start = bisect_left(self.terms, term, key=lambda x: x[:term_length])
-        end = bisect_right(self.terms, term, key=lambda x: x[:term_length])
+        start_index = bisect_left(self.terms, term, key=lambda x: x[:term_length])
+        end_index = bisect_right(self.terms, term, key=lambda x: x[:term_length])
 
 
-        matching_terms = zip(self.counts[start:end], self.terms[start:end])
-        top_count, top_term = max(matching_terms)
-        print("Top term", top_term, top_count)
-        return top_term
+        matching_terms = zip(self.counts[start_index:end_index], self.terms[start_index:end_index])
+        top_terms = sorted(matching_terms, reverse=True)[:self.num_matches]
+        print("Top terms, counts", top_terms)
+        if not top_terms:
+            return []
+
+        counts, terms = zip(*top_terms)
+        return list(terms)
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':

+ 21 - 11
mwmbl/tinysearchengine/create_app.py

@@ -31,7 +31,8 @@ def create(tiny_index: TinyIndex, completer: Completer):
     def search(s: str):
     def search(s: str):
         results, terms = get_results(s)
         results, terms = get_results(s)
 
 
-        pattern = get_query_regex(terms)
+        is_complete = s.endswith(' ')
+        pattern = get_query_regex(terms, is_complete)
         formatted_results = []
         formatted_results = []
         for result in results:
         for result in results:
             formatted_result = {}
             formatted_result = {}
@@ -51,17 +52,23 @@ def create(tiny_index: TinyIndex, completer: Completer):
         logger.info("Return results: %r", formatted_results)
         logger.info("Return results: %r", formatted_results)
         return formatted_results
         return formatted_results
 
 
-    def get_query_regex(terms):
-        term_patterns = [rf'\b{term}\b' for term in terms]
+    def get_query_regex(terms, is_complete):
+        if not terms:
+            return ''
+
+        if is_complete:
+            term_patterns = [rf'\b{term}\b' for term in terms]
+        else:
+            term_patterns = [rf'\b{term}\b' for term in terms[:-1]] + [rf'\b{terms[-1]}']
         pattern = '|'.join(term_patterns)
         pattern = '|'.join(term_patterns)
         return pattern
         return pattern
 
 
-    def score_result(terms, result: Document):
+    def score_result(terms, result: Document, is_complete: bool):
         domain = urlparse(result.url).netloc
         domain = urlparse(result.url).netloc
         domain_score = DOMAINS.get(domain, 0.0)
         domain_score = DOMAINS.get(domain, 0.0)
 
 
         result_string = f"{result.title.strip()} {result.extract.strip()}"
         result_string = f"{result.title.strip()} {result.extract.strip()}"
-        query_regex = get_query_regex(terms)
+        query_regex = get_query_regex(terms, is_complete)
         matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
         matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
         match_strings = {x.group(0).lower() for x in matches}
         match_strings = {x.group(0).lower() for x in matches}
         match_length = sum(len(x) for x in match_strings)
         match_length = sum(len(x) for x in match_strings)
@@ -78,8 +85,8 @@ def create(tiny_index: TinyIndex, completer: Completer):
         score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
         score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
         return score
         return score
 
 
-    def order_results(terms: list[str], results: list[Document]):
-        results_and_scores = [(score_result(terms, result), result) for result in results]
+    def order_results(terms: list[str], results: list[Document], is_complete: bool):
+        results_and_scores = [(score_result(terms, result, is_complete), result) for result in results]
         ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
         ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
         filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
         filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
         return filtered_results
         return filtered_results
@@ -95,12 +102,15 @@ def create(tiny_index: TinyIndex, completer: Completer):
 
 
     def get_results(q):
     def get_results(q):
         terms = [x.lower() for x in q.replace('.', ' ').split()]
         terms = [x.lower() for x in q.replace('.', ' ').split()]
-        if not q.endswith(' '):
-            terms[-1] = completer.complete(terms[-1])
+        is_complete = q.endswith(' ')
+        if len(terms) > 0 and not is_complete:
+            retrieval_terms = terms[:-1] + completer.complete(terms[-1])
+        else:
+            retrieval_terms = terms
 
 
         pages = []
         pages = []
         seen_items = set()
         seen_items = set()
-        for term in terms:
+        for term in retrieval_terms:
             items = tiny_index.retrieve(term)
             items = tiny_index.retrieve(term)
             if items is not None:
             if items is not None:
                 for item in items:
                 for item in items:
@@ -109,6 +119,6 @@ def create(tiny_index: TinyIndex, completer: Completer):
                             pages.append(item)
                             pages.append(item)
                             seen_items.add(item.title)
                             seen_items.add(item.title)
 
 
-        ordered_results = order_results(terms, pages)
+        ordered_results = order_results(terms, pages, is_complete)
         return ordered_results, terms
         return ordered_results, terms
     return app
     return app