123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- import re
- from logging import getLogger
- from typing import List
- import Levenshtein
- from fastapi import FastAPI
- from starlette.responses import RedirectResponse, FileResponse, HTMLResponse
- from starlette.staticfiles import StaticFiles
- from index import TinyIndex, Document
- logger = getLogger(__name__)
- def create(tiny_index: TinyIndex):
- app = FastAPI()
- @app.get("/search")
- def search(s: str):
- results, terms = get_results(s)
- formatted_results = []
- for result in results:
- pattern = get_query_regex(terms)
- title = result.title
- matches = re.finditer(pattern, title, re.IGNORECASE)
- all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(title)]
- formatted_result = []
- for i in range(len(all_spans) - 1):
- is_bold = i % 2 == 1
- start = all_spans[i]
- end = all_spans[i + 1]
- formatted_result.append({'value': title[start:end], 'is_bold': is_bold})
- formatted_results.append({'title': formatted_result, 'url': result.url})
- logger.info("Return results: %r", formatted_results)
- return formatted_results
- def get_query_regex(terms):
- term_patterns = [rf'\b{term}\b' for term in terms]
- pattern = '|'.join(term_patterns)
- return pattern
- def score_result(terms, r):
- query_regex = get_query_regex(terms)
- matches = re.findall(query_regex, r, flags=re.IGNORECASE)
- match_strings = {x.lower() for x in matches}
- match_length = sum(len(x) for x in match_strings)
- num_words = len(re.findall(r'\b\w+\b', r))
- return match_length + 1./num_words
- def order_results(terms: list[str], results: list[Document]):
- ordered_results = sorted(results, key=lambda result: score_result(terms, result.title), reverse=True)
- # print("Order results", query, ordered_results, sep='\n')
- return ordered_results
- @app.get("/complete")
- def complete(q: str):
- ordered_results, terms = get_results(q)
- results = [item.title.replace("\n", "") + ' — ' +
- item.url.replace("\n", "") for item in ordered_results]
- if len(results) == 0:
- # print("No results")
- return []
- # print("Results", results)
- return [q, results]
- def get_results(q):
- terms = [x.lower() for x in q.replace('.', ' ').split()]
- # completed = complete_term(terms[-1])
- # terms = terms[:-1] + [completed]
- pages = []
- for term in terms:
- items = tiny_index.retrieve(term)
- if items is not None:
- pages += [item for item in items if term in item.title.lower()]
- ordered_results = order_results(terms, pages)
- return ordered_results, terms
- @app.get('/')
- def index():
- return FileResponse('static/index.html')
- app.mount('/', StaticFiles(directory="static"), name="static")
- return app
|