|
@@ -31,7 +31,8 @@ def create(tiny_index: TinyIndex, completer: Completer):
|
|
def search(s: str):
|
|
def search(s: str):
|
|
results, terms = get_results(s)
|
|
results, terms = get_results(s)
|
|
|
|
|
|
- pattern = get_query_regex(terms)
|
|
|
|
|
|
+ is_complete = s.endswith(' ')
|
|
|
|
+ pattern = get_query_regex(terms, is_complete)
|
|
formatted_results = []
|
|
formatted_results = []
|
|
for result in results:
|
|
for result in results:
|
|
formatted_result = {}
|
|
formatted_result = {}
|
|
@@ -51,17 +52,23 @@ def create(tiny_index: TinyIndex, completer: Completer):
|
|
logger.info("Return results: %r", formatted_results)
|
|
logger.info("Return results: %r", formatted_results)
|
|
return formatted_results
|
|
return formatted_results
|
|
|
|
|
|
def get_query_regex(terms, is_complete):
    """Build an alternation regex matching any of the query terms.

    Args:
        terms: Query terms, in the order they were typed.
        is_complete: True when the query ended with a space, i.e. the last
            term is finished. When False the last term is matched as a word
            prefix so partially typed words still match.

    Returns:
        A regex source string with one alternative per term, or '' when
        there are no terms.
    """
    if not terms:
        return ''

    # Terms are derived from the raw query string, so escape them: otherwise
    # input such as 'c++' or '(foo' raises re.error, and other metacharacters
    # would be interpreted rather than matched literally.
    escaped_terms = [re.escape(term) for term in terms]

    term_patterns = [rf'\b{term}\b' for term in escaped_terms]
    if not is_complete:
        # Drop the trailing word boundary so the last term matches as a prefix.
        term_patterns[-1] = rf'\b{escaped_terms[-1]}'

    return '|'.join(term_patterns)
|
|
|
|
|
|
- def score_result(terms, result: Document):
|
|
|
|
|
|
+ def score_result(terms, result: Document, is_complete: bool):
|
|
domain = urlparse(result.url).netloc
|
|
domain = urlparse(result.url).netloc
|
|
domain_score = DOMAINS.get(domain, 0.0)
|
|
domain_score = DOMAINS.get(domain, 0.0)
|
|
|
|
|
|
result_string = f"{result.title.strip()} {result.extract.strip()}"
|
|
result_string = f"{result.title.strip()} {result.extract.strip()}"
|
|
- query_regex = get_query_regex(terms)
|
|
|
|
|
|
+ query_regex = get_query_regex(terms, is_complete)
|
|
matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
|
|
matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
|
|
match_strings = {x.group(0).lower() for x in matches}
|
|
match_strings = {x.group(0).lower() for x in matches}
|
|
match_length = sum(len(x) for x in match_strings)
|
|
match_length = sum(len(x) for x in match_strings)
|
|
@@ -78,8 +85,8 @@ def create(tiny_index: TinyIndex, completer: Completer):
|
|
score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
|
|
score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
|
|
return score
|
|
return score
|
|
|
|
|
|
def order_results(terms: list[str], results: list[Document], is_complete: bool):
    """Rank documents by score, best first, keeping only those above SCORE_THRESHOLD.

    Args:
        terms: Query terms passed through to score_result.
        results: Candidate documents to rank.
        is_complete: Whether the last query term is finished; passed through
            to score_result.

    Returns:
        The documents whose score exceeds SCORE_THRESHOLD, in descending
        score order.
    """
    scored = []
    for document in results:
        scored.append((score_result(terms, document, is_complete), document))

    # Sort on the score only, so documents themselves are never compared.
    scored.sort(key=itemgetter(0), reverse=True)

    return [document for score, document in scored if score > SCORE_THRESHOLD]
|
|
@@ -95,12 +102,15 @@ def create(tiny_index: TinyIndex, completer: Completer):
|
|
|
|
|
|
def get_results(q):
|
|
def get_results(q):
|
|
terms = [x.lower() for x in q.replace('.', ' ').split()]
|
|
terms = [x.lower() for x in q.replace('.', ' ').split()]
|
|
- if not q.endswith(' '):
|
|
|
|
- terms[-1] = completer.complete(terms[-1])
|
|
|
|
|
|
+ is_complete = q.endswith(' ')
|
|
|
|
+ if len(terms) > 0 and not is_complete:
|
|
|
|
+ retrieval_terms = terms[:-1] + completer.complete(terms[-1])
|
|
|
|
+ else:
|
|
|
|
+ retrieval_terms = terms
|
|
|
|
|
|
pages = []
|
|
pages = []
|
|
seen_items = set()
|
|
seen_items = set()
|
|
- for term in terms:
|
|
|
|
|
|
+ for term in retrieval_terms:
|
|
items = tiny_index.retrieve(term)
|
|
items = tiny_index.retrieve(term)
|
|
if items is not None:
|
|
if items is not None:
|
|
for item in items:
|
|
for item in items:
|
|
@@ -109,6 +119,6 @@ def create(tiny_index: TinyIndex, completer: Completer):
|
|
pages.append(item)
|
|
pages.append(item)
|
|
seen_items.add(item.title)
|
|
seen_items.add(item.title)
|
|
|
|
|
|
- ordered_results = order_results(terms, pages)
|
|
|
|
|
|
+ ordered_results = order_results(terms, pages, is_complete)
|
|
return ordered_results, terms
|
|
return ordered_results, terms
|
|
return app
|
|
return app
|