Format fetched url
Commit 8676abbc63 (parent 0a4e1e4aee)
3 changed files with 49 additions and 34 deletions
mwmbl/crawler/app.py

@@ -18,6 +18,7 @@ from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
 from mwmbl.database import Database
+from mwmbl.format import format_result
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
 from mwmbl.settings import (
@@ -32,6 +33,7 @@ from mwmbl.settings import (
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
     DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
+from mwmbl.tinysearchengine.indexer import Document


 def get_bucket(name):
@@ -85,7 +87,7 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
         return url_db.create_tables()

     @router.get('/fetch')
-    def fetch_url(url: str):
+    def fetch_url(url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
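The hunk above makes query a second required query-string parameter, so a bare /fetch?url=... call will now fail validation. A minimal sketch of exercising the endpoint (the base URL is an assumption for a local instance, not part of this diff):

    import requests

    # Hypothetical local deployment; adjust host/port for your setup.
    response = requests.get(
        'http://localhost:5000/fetch',
        params={'url': 'https://example.com/', 'query': 'example domain'},
    )
    print(response.json())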
@@ -94,11 +96,8 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
         if len(extract) > NUM_EXTRACT_CHARS:
             extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'

-        return {
-            'url': url,
-            'title': title,
-            'extract': extract,
-        }
+        result = Document(title=title, url=url, extract=extract, score=0.0)
+        return format_result(result, query)

     @router.post('/batches/')
     def create_batch(batch: Batch):
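Where the handler previously returned the flat url/title/extract dict deleted above, it now wraps the page in a Document and returns format_result's segmented form: title and extract become lists of alternating plain and bold spans, while url stays a single string (see mwmbl/format.py below). For a query like 'example' the response takes roughly this shape (values invented for illustration):

    {
        'title': [
            {'value': 'An ', 'is_bold': False},
            {'value': 'example', 'is_bold': True},
            {'value': ' page', 'is_bold': False},
        ],
        'extract': [{'value': '...', 'is_bold': False}],
        'url': 'https://example.com/',
    }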
mwmbl/format.py (new file, 40 lines)

@@ -0,0 +1,40 @@
+import re
+
+from mwmbl.tokenizer import tokenize
+
+
+def format_result_with_pattern(pattern, result):
+    formatted_result = {}
+    for content_type, content in [('title', result.title), ('extract', result.extract)]:
+        matches = re.finditer(pattern, content, re.IGNORECASE)
+        all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
+        content_result = []
+        for i in range(len(all_spans) - 1):
+            is_bold = i % 2 == 1
+            start = all_spans[i]
+            end = all_spans[i + 1]
+            content_result.append({'value': content[start:end], 'is_bold': is_bold})
+        formatted_result[content_type] = content_result
+    formatted_result['url'] = result.url
+    return formatted_result
+
+
+def get_query_regex(terms, is_complete, is_url):
+    if not terms:
+        return ''
+
+    word_sep = r'\b' if is_url else ''
+    if is_complete:
+        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
+    else:
+        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
+            rf'{word_sep}{re.escape(terms[-1])}']
+    pattern = '|'.join(term_patterns)
+    return pattern
+
+
+def format_result(result, query):
+    tokens = tokenize(query)
+    pattern = get_query_regex(tokens, True, False)
+    return format_result_with_pattern(pattern, result)
+
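Taken together: format_result tokenizes the query, builds one alternation regex over the terms (always the complete, non-URL form), and slices each field at the match boundaries so that the odd-indexed segments are exactly the matches. A worked example, assuming mwmbl's tokenize lower-cases and splits the query on whitespace (the Document values are invented):

    from mwmbl.format import format_result
    from mwmbl.tinysearchengine.indexer import Document

    doc = Document(title='An example page', url='https://example.com/',
                   extract='This page is an example.', score=0.0)
    print(format_result(doc, 'example'))
    # {'title': [{'value': 'An ', 'is_bold': False},
    #            {'value': 'example', 'is_bold': True},
    #            {'value': ' page', 'is_bold': False}],
    #  'extract': [{'value': 'This page is an ', 'is_bold': False},
    #              {'value': 'example', 'is_bold': True},
    #              {'value': '.', 'is_bold': False}],
    #  'url': 'https://example.com/'}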
mwmbl/tinysearchengine/rank.py

@@ -5,6 +5,7 @@ from logging import getLogger
 from operator import itemgetter
 from urllib.parse import urlparse

+from mwmbl.format import format_result_with_pattern, get_query_regex
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
@@ -21,20 +22,6 @@ DOMAIN_SCORE_SMOOTHING = 50
 HTTPS_STRING = 'https://'


-def _get_query_regex(terms, is_complete, is_url):
-    if not terms:
-        return ''
-
-    word_sep = r'\b' if is_url else ''
-    if is_complete:
-        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
-    else:
-        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
-            rf'{word_sep}{re.escape(terms[-1])}']
-    pattern = '|'.join(term_patterns)
-    return pattern
-
-
 def score_result(terms: list[str], result: Document, is_complete: bool):
     features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete)

@@ -93,7 +80,7 @@ def get_domain_score(url):


 def get_match_features(terms, result_string, is_complete, is_url):
-    query_regex = _get_query_regex(terms, is_complete, is_url)
+    query_regex = get_query_regex(terms, is_complete, is_url)
     matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
     # match_strings = {x.group(0).lower() for x in matches}
     # match_length = sum(len(x) for x in match_strings)
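get_match_features passes is_url through to the shared get_query_regex: with is_url=True each term is wrapped in \b word boundaries, and an incomplete query leaves the trailing boundary off the final term so it matches as a prefix; with is_url=False the separator is empty, so matching is plain substring and the is_complete flag does not change the pattern. A quick illustration (a reading of the function above, not part of the diff):

    from mwmbl.format import get_query_regex

    print(get_query_regex(['foo', 'bar'], True, True))    # \bfoo\b|\bbar\b
    print(get_query_regex(['foo', 'bar'], False, True))   # \bfoo\b|\bbar
    print(get_query_regex(['foo', 'bar'], False, False))  # foo|bar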
@@ -135,21 +122,10 @@ class Ranker:
         results, terms, _ = self.get_results(s)

         is_complete = s.endswith(' ')
-        pattern = _get_query_regex(terms, is_complete, False)
+        pattern = get_query_regex(terms, is_complete, False)
         formatted_results = []
         for result in results:
-            formatted_result = {}
-            for content_type, content in [('title', result.title), ('extract', result.extract)]:
-                matches = re.finditer(pattern, content, re.IGNORECASE)
-                all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
-                content_result = []
-                for i in range(len(all_spans) - 1):
-                    is_bold = i % 2 == 1
-                    start = all_spans[i]
-                    end = all_spans[i + 1]
-                    content_result.append({'value': content[start:end], 'is_bold': is_bold})
-                formatted_result[content_type] = content_result
-            formatted_result['url'] = result.url
+            formatted_result = format_result_with_pattern(pattern, result)
             formatted_results.append(formatted_result)

         logger.info("Return results: %r", formatted_results)
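The block deleted from the Ranker method above is the same span-slicing logic now housed in format_result_with_pattern, so the refactor should be behavior-preserving. A sketch of a quick check (names are from this diff; the sample Document is invented):

    from mwmbl.format import format_result_with_pattern, get_query_regex
    from mwmbl.tinysearchengine.indexer import Document

    doc = Document(title='Example news site', url='https://example.com/',
                   extract='All the news, all the time.', score=0.0)
    pattern = get_query_regex(['news'], True, False)
    formatted = format_result_with_pattern(pattern, doc)
    # The matched term should come back as a bold span in the title.
    assert {'value': 'news', 'is_bold': True} in formatted['title']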