mwmbl/create_app.py

96 lines
3.4 KiB
Python
Raw Normal View History

2021-12-17 21:31:26 +00:00
import re
2021-12-16 21:36:01 +00:00
from logging import getLogger
2021-12-18 22:35:59 +00:00
from operator import itemgetter
from typing import List
import Levenshtein
from fastapi import FastAPI
2021-12-14 22:01:59 +00:00
from starlette.responses import RedirectResponse, FileResponse, HTMLResponse
from starlette.staticfiles import StaticFiles
from index import TinyIndex, Document
2021-12-16 21:36:01 +00:00
# Module-level logger; the /search handler logs each response it returns through it.
logger = getLogger(__name__)
2021-12-18 22:35:59 +00:00
# Results whose title score is not strictly greater than this value are
# dropped by order_results before anything is returned to the client.
SCORE_THRESHOLD = 0.25
def create(tiny_index: TinyIndex):
    """Build the FastAPI application that serves search over ``tiny_index``.

    Three routes are registered on a fresh ``FastAPI`` instance:

    * ``GET /search?s=...``   - ranked results; each title is split into
      segments flagged with ``is_bold`` where a query term matched.
    * ``GET /complete?q=...`` - ``[q, suggestions]`` built from the same
      ranked results, or ``[]`` when nothing matched.
    * ``GET /``               - the ``static/index.html`` page.

    Any other path is served from the ``static`` directory.

    Args:
        tiny_index: Index that is queried once per search term through
            ``tiny_index.retrieve(term)``.

    Returns:
        The configured ``FastAPI`` application.
    """
    app = FastAPI()

    @app.get("/search")
    def search(s: str):
        """Return ranked results for ``s`` with matched title terms marked bold."""
        results, terms = get_results(s)
        # The pattern depends only on the query, so build it once rather than
        # once per result.
        pattern = re.compile(get_query_regex(terms), re.IGNORECASE)

        formatted_results = []
        for result in results:
            title = result.title
            # Flatten the match spans into cut points:
            # [0, start1, end1, start2, end2, ..., len(title)].
            cut_points = [0]
            for match in pattern.finditer(title):
                cut_points.extend(match.span())
            cut_points.append(len(title))

            # Consecutive cut points alternate plain / matched text, so the
            # odd-numbered segments are the matched ones. Empty segments are
            # kept so the alternation stays intact for the client.
            segments = [
                {'value': title[start:end], 'is_bold': i % 2 == 1}
                for i, (start, end) in enumerate(zip(cut_points, cut_points[1:]))
            ]
            formatted_results.append({'title': segments, 'url': result.url})

        logger.info("Return results: %r", formatted_results)
        return formatted_results

    def get_query_regex(terms):
        """Return regex source matching any of ``terms`` as a whole token.

        Terms come straight from the user's query, so each one is escaped:
        unescaped input such as ``c++`` or ``(foo`` would otherwise raise
        ``re.error`` or be interpreted as regex syntax.

        ``(?<!\\w)`` / ``(?!\\w)`` behave exactly like ``\\b`` for terms that
        start and end with a word character, and additionally let terms that
        start or end with punctuation (``c++``, ``c#``) match.
        """
        term_patterns = [rf'(?<!\w){re.escape(term)}(?!\w)' for term in terms]
        return '|'.join(term_patterns)

    def score_result(terms, r):
        """Score the title ``r`` against ``terms``; higher is better.

        The score is the summed length of the distinct terms found in the
        title, plus a ``1 / word_count`` bonus that favours shorter titles,
        divided by the summed length of all terms plus one.
        """
        query_regex = get_query_regex(terms)
        matches = re.findall(query_regex, r, flags=re.IGNORECASE)
        match_strings = {x.lower() for x in matches}
        match_length = sum(len(x) for x in match_strings)

        num_words = len(re.findall(r'\b\w+\b', r))
        # A title without any word characters gets no brevity bonus instead
        # of raising ZeroDivisionError.
        brevity_bonus = 1. / num_words if num_words else 0.

        total_possible_match_length = sum(len(x) for x in terms)
        return (match_length + brevity_bonus) / (total_possible_match_length + 1)

    def order_results(terms: list[str], results: list[Document]):
        """Return ``results`` ordered by descending title score.

        Results scoring at or below ``SCORE_THRESHOLD`` are dropped. The sort
        is stable, so equally scored results keep their input order.
        """
        results_and_scores = [(score_result(terms, result.title), result) for result in results]
        ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
        return [result for score, result in ordered_results if score > SCORE_THRESHOLD]

    @app.get("/complete")
    def complete(q: str):
        """Return ``[q, suggestions]`` for the query, or ``[]`` if nothing matched."""
        ordered_results, _ = get_results(q)
        # NOTE(review): title and URL are joined with an empty separator, so
        # they run together in the suggestion text. This looks like a
        # delimiter that was lost somewhere - confirm what the client expects
        # before changing it.
        results = [item.title.replace("\n", "") + '' +
                   item.url.replace("\n", "") for item in ordered_results]
        if not results:
            return []
        return [q, results]

    def get_results(q):
        """Look up every term of ``q`` in the index and rank the matching pages.

        Returns:
            ``(ordered_results, terms)`` where ``terms`` are the lower-cased,
            de-duplicated query terms (dots are treated as separators).
        """
        # dict.fromkeys drops repeated terms while keeping first-seen order;
        # a repeated term would otherwise be looked up twice and inflate the
        # denominator used for scoring.
        terms = list(dict.fromkeys(x.lower() for x in q.replace('.', ' ').split()))

        pages = []
        seen = set()
        for term in terms:
            items = tiny_index.retrieve(term)
            if items is None:
                continue
            for item in items:
                if term not in item.title.lower():
                    continue
                # A page that matches several terms is retrieved once per
                # term; keep only its first occurrence. Title and URL are the
                # only fields either endpoint exposes, so they identify it.
                key = (item.url, item.title)
                if key not in seen:
                    seen.add(key)
                    pages.append(item)

        ordered_results = order_results(terms, pages)
        return ordered_results, terms

    @app.get('/')
    def index():
        """Serve the front-end page."""
        return FileResponse('static/index.html')

    # Paths not handled by one of the routes above are served from ./static.
    app.mount('/', StaticFiles(directory="static"), name="static")
    return app