From 4cbed29c08e28f8481942fb1402f04d5e047d1fb Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sun, 19 Dec 2021 20:48:28 +0000
Subject: [PATCH] Show the extract

---
 analyse/inspect_index.py | 16 ++++++++++++++++
 create_app.py            | 30 +++++++++++++++++++++---------
 index.py                 |  1 +
 3 files changed, 38 insertions(+), 9 deletions(-)
 create mode 100644 analyse/inspect_index.py

diff --git a/analyse/inspect_index.py b/analyse/inspect_index.py
new file mode 100644
index 0000000..4286a86
--- /dev/null
+++ b/analyse/inspect_index.py
@@ -0,0 +1,16 @@
+from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from paths import INDEX_PATH
+
+
+def run():
+    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+    for i in range(100):
+        items = tiny_index.retrieve('eggless')
+        # items = tiny_index.convert_items(page)
+        if items:
+            print("Items", items)
+            break
+
+
+if __name__ == '__main__':
+    run()
diff --git a/create_app.py b/create_app.py
index c15a8e9..37165ab 100644
--- a/create_app.py
+++ b/create_app.py
@@ -27,15 +27,16 @@ def create(tiny_index: TinyIndex):
         formatted_results = []
         for result in results:
             pattern = get_query_regex(terms)
-            title = result.title
-            matches = re.finditer(pattern, title, re.IGNORECASE)
-            all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(title)]
+            title_and_extract = f"{result.title} - {result.extract}"
+            matches = re.finditer(pattern, title_and_extract, re.IGNORECASE)
+            all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(title_and_extract)]
             formatted_result = []
+            title_length = len(result.title)
             for i in range(len(all_spans) - 1):
                 is_bold = i % 2 == 1
                 start = all_spans[i]
                 end = all_spans[i + 1]
-                formatted_result.append({'value': title[start:end], 'is_bold': is_bold})
+                formatted_result.append({'value': title_and_extract[start:end], 'is_bold': is_bold})
             formatted_results.append({'title': formatted_result, 'url': result.url})

         logger.info("Return results: %r", formatted_results)
@@ -46,19 +47,22 @@ def create(tiny_index: TinyIndex):
         pattern = '|'.join(term_patterns)
         return pattern

-    def score_result(terms, r):
+    def score_result(terms, result: Document):
+        print("Score result", result)
+        result_string = f"{result.title} {result.extract}"
         query_regex = get_query_regex(terms)
-        matches = re.findall(query_regex, r, flags=re.IGNORECASE)
+        matches = re.findall(query_regex, result_string, flags=re.IGNORECASE)
         match_strings = {x.lower() for x in matches}
         match_length = sum(len(x) for x in match_strings)
-        num_words = len(re.findall(r'\b\w+\b', r))
+        num_words = len(re.findall(r'\b\w+\b', result_string))
         total_possible_match_length = sum(len(x) for x in terms)
         return (match_length + 1./num_words) / (total_possible_match_length + 1)

     def order_results(terms: list[str], results: list[Document]):
-        results_and_scores = [(score_result(terms, result.title), result) for result in results]
+        results_and_scores = [(score_result(terms, result), result) for result in results]
         ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
+        print("Ordered results", ordered_results)
         filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
         # ordered_results = sorted(results, key=lambda result: score_result(terms, result.title), reverse=True)
         # print("Order results", query, ordered_results, sep='\n')
@@ -75,15 +79,23 @@ def create(tiny_index: TinyIndex):
         # print("Results", results)
         return [q, results]

+    # TODO: why does 'leek and potato soup' result not get returned for 'potato soup' query?
     def get_results(q):
         terms = [x.lower() for x in q.replace('.', ' ').split()]
         # completed = complete_term(terms[-1])
         # terms = terms[:-1] + [completed]
         pages = []
+        seen_items = set()
         for term in terms:
             items = tiny_index.retrieve(term)
+            print("Items", items)
             if items is not None:
-                pages += [item for item in items if term in item.title.lower()]
+                for item in items:
+                    if term in item.title.lower() or term in item.extract.lower():
+                        if item.title not in seen_items:
+                            pages.append(item)
+                            seen_items.add(item.title)
+
         ordered_results = order_results(terms, pages)
         return ordered_results, terms
diff --git a/index.py b/index.py
index 8ab62e2..b38317a 100644
--- a/index.py
+++ b/index.py
@@ -188,6 +188,7 @@ def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
         extract_tokens = tokenize(nlp, extract)
+        print("Extract tokens", extract_tokens)
         tokens = title_tokens | url_tokens | extract_tokens
         yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)
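
Notes (not part of the patch):

The create_app.py highlighting hunk splits the combined title and extract into
alternating plain/bold segments: the finditer match spans become the odd
segments, and the gaps between them the even ones. Below is a minimal
standalone sketch of that technique; the sample title, extract, and terms are
invented, and get_query_regex is not shown in this diff, so a plain escaped
alternation stands in for it.

import re

# Invented sample data, for illustration only.
title = "Eggless pancakes"
extract = "How to make fluffy pancakes without eggs."
terms = ["eggless", "pancakes"]

# Stand-in for get_query_regex, which is not shown in the diff.
pattern = '|'.join(re.escape(term) for term in terms)

# Same construction as the patch: [0, m1.start, m1.end, m2.start, ..., len],
# so slicing between consecutive entries alternates unmatched/matched text.
title_and_extract = f"{title} - {extract}"
matches = re.finditer(pattern, title_and_extract, re.IGNORECASE)
all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(title_and_extract)]

formatted_result = []
for i in range(len(all_spans) - 1):
    is_bold = i % 2 == 1  # odd segments sit inside a match span
    start, end = all_spans[i], all_spans[i + 1]
    formatted_result.append({'value': title_and_extract[start:end], 'is_bold': is_bold})

print(formatted_result)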
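
The reworked score_result scores the same combined string: unique matched
strings contribute their total length, normalised by the best possible match
length, with a small 1/num_words term favouring shorter documents. The sketch
below replays that formula, minus the debug print, using the same stand-in
regex and an invented Document as above.

import re
from dataclasses import dataclass


@dataclass
class Document:  # minimal stand-in for the real Document class
    title: str
    url: str
    extract: str


def score_result(terms, result):
    result_string = f"{result.title} {result.extract}"
    # Stand-in for get_query_regex, which is not shown in the diff.
    query_regex = '|'.join(re.escape(term) for term in terms)
    matches = re.findall(query_regex, result_string, flags=re.IGNORECASE)
    match_strings = {x.lower() for x in matches}  # repeated matches count once
    match_length = sum(len(x) for x in match_strings)
    num_words = len(re.findall(r'\b\w+\b', result_string))
    total_possible_match_length = sum(len(x) for x in terms)
    return (match_length + 1./num_words) / (total_possible_match_length + 1)


doc = Document("Leek and potato soup", "https://example.com/soup", "An easy recipe.")
print(score_result(["potato", "soup"], doc))  # ~0.92: both terms match

On these numbers a result matching every query term scores close to 1, so the
TODO above more likely points at retrieval or the SCORE_THRESHOLD filter than
at the formula itself.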