2021-12-17 21:31:26 +00:00
|
|
|
import re
|
2021-12-16 21:36:01 +00:00
|
|
|
from logging import getLogger
|
2021-12-18 22:35:59 +00:00
|
|
|
from operator import itemgetter
|
2021-12-23 21:30:51 +00:00
|
|
|
from pathlib import Path
|
2021-06-05 21:22:31 +00:00
|
|
|
|
|
|
|
from fastapi import FastAPI
|
2021-12-19 21:09:00 +00:00
|
|
|
from starlette.responses import FileResponse
|
2021-06-05 21:22:31 +00:00
|
|
|
from starlette.staticfiles import StaticFiles
|
|
|
|
|
2021-12-22 23:21:23 +00:00
|
|
|
from tinysearchengine.indexer import TinyIndex, Document
|
2021-06-05 21:22:31 +00:00
|
|
|
|
2021-12-16 21:36:01 +00:00
|
|
|
# Module-level logger, named after this module via __name__.
logger = getLogger(__name__)
|
|
|
|
|
|
|
|
|
2021-12-23 21:30:51 +00:00
|
|
|
# The 'static' directory located next to this source file; index.html and the
# mounted static assets are served from here.
STATIC_FILES_PATH = Path(__file__).parent / 'static'
|
2021-12-18 22:35:59 +00:00
|
|
|
# Results whose score is not strictly greater than this value are dropped
# by order_results.
SCORE_THRESHOLD = 0.25
|
|
|
|
|
|
|
|
|
2021-06-05 21:22:31 +00:00
|
|
|
# Application factory: builds a FastAPI app whose /search and /complete
# handlers query the given tiny_index, serves index.html at '/', mounts the
# static files directory, and returns the app.
def create(tiny_index: TinyIndex):
|
|
|
|
# The application instance that the @app.get route decorators below register
# their handlers on.
app = FastAPI()
|
|
|
|
|
|
|
|
@app.get("/search")
def search(s: str):
    """Return the ranked results for query *s* with matched terms marked bold.

    Args:
        s: the raw query string.

    Returns:
        A list with one dict per result. Each dict has the keys ``'title'``
        and ``'extract'`` (each a list of ``{'value': str, 'is_bold': bool}``
        segments that concatenate back to the original text) and ``'url'``.
    """
    results, terms = get_results(s)

    # The highlight pattern depends only on the query, so build it once
    # instead of once per result.
    pattern = get_query_regex(terms)

    formatted_results = []
    for result in results:
        formatted_result = {}
        for content_type, content in [('title', result.title), ('extract', result.extract)]:
            # Walk the matches left to right, emitting the plain text before
            # each match followed by the (bold) match itself. Segments
            # therefore always alternate plain/bold and start and end with a
            # plain segment, which may be empty.
            content_result = []
            cursor = 0
            for match in re.finditer(pattern, content, re.IGNORECASE):
                start, end = match.span()
                content_result.append({'value': content[cursor:start], 'is_bold': False})
                content_result.append({'value': content[start:end], 'is_bold': True})
                cursor = end
            content_result.append({'value': content[cursor:], 'is_bold': False})
            formatted_result[content_type] = content_result
        formatted_result['url'] = result.url
        formatted_results.append(formatted_result)

    logger.info("Return results: %r", formatted_results)
    return formatted_results
|
2021-06-05 21:22:31 +00:00
|
|
|
|
2021-12-18 12:42:04 +00:00
|
|
|
def get_query_regex(terms):
    r"""Build a regex alternation matching any of *terms* between word boundaries.

    Every term is passed through ``re.escape`` so that regex metacharacters
    in user-supplied text (for example ``c++``, ``(`` or ``a|b``) are matched
    literally rather than raising ``re.error`` or changing the meaning of the
    pattern.

    Args:
        terms: iterable of query term strings.

    Returns:
        A pattern string of the form ``\bterm1\b|\bterm2\b``, or an empty
        string when *terms* is empty.
    """
    return '|'.join(rf'\b{re.escape(term)}\b' for term in terms)
|
|
|
|
|
2021-12-19 20:48:28 +00:00
|
|
|
def score_result(terms, result: Document):
    """Score how well *result* matches the query *terms*.

    The score is::

        (matched_length + 1 / last_match_end) / (total_term_length + 1)

    where ``matched_length`` is the summed length of the distinct
    (case-insensitive) strings matched in "<title> <extract>",
    ``last_match_end`` is the end offset of the first occurrence of the last
    newly seen distinct match (1 when nothing matched), and
    ``total_term_length`` is the summed length of all terms. Matching more of
    the query raises the score, and the ``1 / last_match_end`` term gives a
    small bonus when the matches finish earlier in the text.

    Args:
        terms: list of query term strings.
        result: document providing ``title`` and ``extract`` strings.

    Returns:
        The score as a float; ``0.0`` when *terms* is empty.
    """
    if not terms:
        # An empty term list produces an empty pattern, which matches the
        # empty string at offset 0 and would make 1 / last_match_end divide
        # by zero. With nothing to match, the result simply scores zero.
        return 0.0

    result_string = f"{result.title.strip()} {result.extract.strip()}"
    query_regex = get_query_regex(terms)

    match_length = 0
    last_match_char = 1
    seen_matches = set()
    for match in re.finditer(query_regex, result_string, flags=re.IGNORECASE):
        value = match.group(0).lower()
        # Only the first occurrence of each distinct match counts; zero-width
        # matches carry no information and could end at offset 0.
        if not value or value in seen_matches:
            continue
        seen_matches.add(value)
        match_length += len(value)
        last_match_char = match.end()

    total_possible_match_length = sum(len(x) for x in terms)
    return (match_length + 1. / last_match_char) / (total_possible_match_length + 1)
|
2021-12-18 12:42:04 +00:00
|
|
|
|
|
|
|
def order_results(terms: list[str], results: list[Document]):
    """Rank *results* by score, best first, keeping only those above the threshold.

    Args:
        terms: the query terms each result is scored against.
        results: candidate documents.

    Returns:
        The documents whose score is strictly greater than SCORE_THRESHOLD,
        ordered from highest to lowest score.
    """
    scored = [(score_result(terms, doc), doc) for doc in results]
    # In-place stable sort on the score component only.
    scored.sort(key=itemgetter(0), reverse=True)

    kept = []
    for score, doc in scored:
        if score > SCORE_THRESHOLD:
            kept.append(doc)
    return kept
|
2021-06-05 21:22:31 +00:00
|
|
|
|
|
|
|
@app.get("/complete")
def complete(q: str):
    """Return completion suggestions for the query *q*.

    Args:
        q: the raw query string.

    Returns:
        ``[q, suggestions]`` where each suggestion is "<title> — <url>" with
        newlines removed, or an empty list when there are no results.
    """
    matches, _terms = get_results(q)

    suggestions = []
    for page in matches:
        title = page.title.replace("\n", "")
        url = page.url.replace("\n", "")
        suggestions.append(title + ' — ' + url)

    return [q, suggestions] if suggestions else []
|
2021-06-05 21:22:31 +00:00
|
|
|
|
2021-12-14 22:01:59 +00:00
|
|
|
def get_results(q):
    """Look up the query *q* in the index and return ranked, de-duplicated results.

    The query is split on whitespace (full stops count as separators) and
    lower-cased. For each term, the items retrieved from the index are kept
    only if the term occurs in their lower-cased title or extract, and only
    the first item with a given title is kept.

    Args:
        q: the raw query string.

    Returns:
        A tuple ``(ordered_results, terms)``.
    """
    terms = list(map(str.lower, q.replace('.', ' ').split()))

    candidates = []
    seen_titles = set()
    for term in terms:
        retrieved = tiny_index.retrieve(term)
        if retrieved is None:
            continue
        for doc in retrieved:
            mentions_term = term in doc.title.lower() or term in doc.extract.lower()
            if not mentions_term or doc.title in seen_titles:
                continue
            candidates.append(doc)
            seen_titles.add(doc.title)

    return order_results(terms, candidates), terms
|
2021-06-05 21:22:31 +00:00
|
|
|
|
|
|
|
@app.get('/')
def index():
    """Serve index.html from the static files directory."""
    landing_page = STATIC_FILES_PATH / 'index.html'
    return FileResponse(landing_page)
|
2021-06-05 21:22:31 +00:00
|
|
|
|
2021-12-23 21:30:51 +00:00
|
|
|
# Mount the static files directory at the root path under the name "static".
# NOTE(review): this is registered after the explicit routes, presumably so
# that they take precedence over the catch-all mount — confirm against
# Starlette's route matching order.
app.mount('/', StaticFiles(directory=STATIC_FILES_PATH), name="static")
|
2021-06-05 21:22:31 +00:00
|
|
|
# Hand the configured application back to the caller.
return app
|