Compare commits
2 commits
allow-extr
...
main
Author | SHA1 | Date | |
---|---|---|---|
|
cfe18162f1 | ||
|
b868b6284b |
8 changed files with 22 additions and 47 deletions
|
@ -18,7 +18,7 @@ def run():
|
|||
with TinyIndex(Document, INDEX_PATH) as tiny_index:
|
||||
completer = Completer()
|
||||
ranker = HeuristicRanker(tiny_index, completer)
|
||||
items = ranker.search('jasper fforde', [])
|
||||
items = ranker.search('jasper fforde')
|
||||
print()
|
||||
if items:
|
||||
for i, item in enumerate(islice(items, 10)):
|
||||
|
|
Binary file not shown.
|
@ -148,13 +148,6 @@ body {
|
|||
font-size: .9rem;
|
||||
}
|
||||
|
||||
.via {
|
||||
font-size: .9rem;
|
||||
font-weight: var(--default-font-weight);
|
||||
/* italics */
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
.result .title, .result .title>* {
|
||||
color: var(--primary-color);
|
||||
font-size: 1.1rem;
|
||||
|
|
|
@ -3,7 +3,7 @@ import re
|
|||
from mwmbl.tokenizer import tokenize, clean_unicode
|
||||
|
||||
|
||||
def format_result_with_pattern(pattern, result, source):
|
||||
def format_result_with_pattern(pattern, result):
|
||||
formatted_result = {}
|
||||
for content_type, content_raw in [('title', result.title), ('extract', result.extract)]:
|
||||
content = clean_unicode(content_raw)
|
||||
|
@ -17,7 +17,6 @@ def format_result_with_pattern(pattern, result, source):
|
|||
content_result.append({'value': content[start:end], 'is_bold': is_bold})
|
||||
formatted_result[content_type] = content_result
|
||||
formatted_result['url'] = result.url
|
||||
formatted_result['source'] = source
|
||||
return formatted_result
|
||||
|
||||
|
||||
|
@ -35,8 +34,8 @@ def get_query_regex(terms, is_complete, is_url):
|
|||
return pattern
|
||||
|
||||
|
||||
def format_result(result, query, source):
|
||||
def format_result(result, query):
|
||||
tokens = tokenize(query)
|
||||
pattern = get_query_regex(tokens, True, False)
|
||||
return format_result_with_pattern(pattern, result, source)
|
||||
return format_result_with_pattern(pattern, result)
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ SCORE_FOR_SAME_DOMAIN = 0.01
|
|||
EXTRA_LINK_MULTIPLIER = 0.001
|
||||
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
|
||||
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
|
||||
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$")
|
||||
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$|omgoat\.org$")
|
||||
CORE_DOMAINS = {
|
||||
'github.com',
|
||||
'en.wikipedia.org',
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<div class="result-container">
|
||||
<div class="result-link">
|
||||
<a href="{{result.url}}">
|
||||
<p><span class='link'>{{result.url}}</span> <span class="via">— found via {{result.source|title}}</span></p>
|
||||
<p class='link'>{{result.url}}</p>
|
||||
<p class='title'>{{result.title|strengthen}}</p>
|
||||
</a>
|
||||
<p class='extract'>{{result.extract|strengthen}}</p>
|
||||
|
|
|
@ -3,7 +3,6 @@ import re
|
|||
from abc import abstractmethod
|
||||
from logging import getLogger
|
||||
from operator import itemgetter
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from mwmbl.format import format_result_with_pattern, get_query_regex
|
||||
|
@ -119,22 +118,17 @@ class Ranker:
|
|||
def order_results(self, terms, pages, is_complete):
|
||||
pass
|
||||
|
||||
def search(self, s: str, additional_results: list[Document], source: Optional[str] = None):
|
||||
mwmbl_results, terms, _ = self.get_results(s)
|
||||
def search(self, s: str):
|
||||
results, terms, _ = self.get_results(s)
|
||||
|
||||
is_complete = s.endswith(' ')
|
||||
pattern = get_query_regex(terms, is_complete, False)
|
||||
formatted_results = []
|
||||
seen_urls = set()
|
||||
for results, source in [(additional_results, source), (mwmbl_results, 'mwmbl')]:
|
||||
for result in results:
|
||||
if result.url in seen_urls:
|
||||
continue
|
||||
formatted_result = format_result_with_pattern(pattern, result, source)
|
||||
formatted_results.append(formatted_result)
|
||||
seen_urls.add(result.url)
|
||||
for result in results:
|
||||
formatted_result = format_result_with_pattern(pattern, result)
|
||||
formatted_results.append(formatted_result)
|
||||
|
||||
logger.info("Return results: %d", len(formatted_results))
|
||||
logger.info("Return results: %r", formatted_results)
|
||||
return formatted_results
|
||||
|
||||
def complete(self, q: str):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from itertools import groupby
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, ParseResult
|
||||
|
||||
import justext
|
||||
import requests
|
||||
|
@ -66,12 +66,13 @@ def home_fragment(request):
|
|||
"query": query,
|
||||
"activity": activity,
|
||||
})
|
||||
current_url = request.htmx.current_url
|
||||
# Replace query string with new query
|
||||
stripped_url = current_url[:current_url.index("?")] if "?" in current_url else current_url
|
||||
query_string = "?q=" + query if len(query) > 0 else ""
|
||||
new_url = stripped_url + query_string
|
||||
# Set the htmx replace header
|
||||
|
||||
# Encode the new query string
|
||||
if query:
|
||||
new_query_string = urlencode({"q": query}, doseq=True)
|
||||
new_url = "/?" + new_query_string
|
||||
else:
|
||||
new_url = "/"
|
||||
response["HX-Replace-Url"] = new_url
|
||||
return response
|
||||
|
||||
|
@ -88,19 +89,7 @@ class Activity:
|
|||
def _get_results_and_activity(request):
|
||||
query = request.GET.get("q")
|
||||
if query:
|
||||
# There may be extra results in the request that we need to add in
|
||||
# format is ?enhanced=google&title=title1&url=url1&extract=extract1&title=title2&url=url2&extract=extract2
|
||||
source = request.GET.get("enhanced", "unknown")
|
||||
titles = request.GET.getlist(f"title")
|
||||
urls = request.GET.getlist(f"url")
|
||||
extracts = request.GET.getlist(f"extract")
|
||||
|
||||
additional_results = [
|
||||
Document(title=title, url=url, extract=extract, score=0.0)
|
||||
for title, url, extract in zip(titles, urls, extracts)
|
||||
]
|
||||
|
||||
results = ranker.search(query, additional_results=additional_results, source=source)
|
||||
results = ranker.search(query)
|
||||
activity = None
|
||||
else:
|
||||
results = None
|
||||
|
@ -137,5 +126,5 @@ def fetch_url(request):
|
|||
|
||||
result = Document(title=title, url=url, extract=extract, score=0.0)
|
||||
return render(request, "result.html", {
|
||||
"result": format_result(result, query, "user"),
|
||||
"result": format_result(result, query),
|
||||
})
|
||||
|
|
Loading…
Reference in a new issue