Format fetched url

This commit is contained in:
Daoud Clarke 2022-12-24 19:59:15 +00:00
parent 0a4e1e4aee
commit 8676abbc63
3 changed files with 49 additions and 34 deletions

View file

@ -18,6 +18,7 @@ from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revis
from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
from mwmbl.database import Database
from mwmbl.format import format_result
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
from mwmbl.settings import (
@ -32,6 +33,7 @@ from mwmbl.settings import (
PUBLIC_USER_ID_LENGTH,
FILE_NAME_SUFFIX,
DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
from mwmbl.tinysearchengine.indexer import Document
def get_bucket(name):
@ -85,7 +87,7 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
return url_db.create_tables()
@router.get('/fetch')
def fetch_url(url: str):
def fetch_url(url: str, query: str):
response = requests.get(url)
paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@ -94,11 +96,8 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
if len(extract) > NUM_EXTRACT_CHARS:
extract = extract[:NUM_EXTRACT_CHARS - 1] + ''
return {
'url': url,
'title': title,
'extract': extract,
}
result = Document(title=title, url=url, extract=extract, score=0.0)
return format_result(result, query)
@router.post('/batches/')
def create_batch(batch: Batch):

40
mwmbl/format.py Normal file
View file

@ -0,0 +1,40 @@
import re
from mwmbl.tokenizer import tokenize
def format_result_with_pattern(pattern, result):
    """Split a result's title and extract into plain and highlighted segments.

    Each of ``result.title`` and ``result.extract`` is searched
    case-insensitively for ``pattern``. The text is then cut at the match
    boundaries into a list of ``{'value': str, 'is_bold': bool}`` dicts in
    which matched text is marked bold and the text between matches is not.
    Segments always alternate plain/bold/plain, so a match at the very start
    or end of the text produces an empty plain segment next to it.

    Zero-width matches are ignored. In particular an empty ``pattern`` (what
    an empty query produces) yields a single plain segment holding the whole
    text instead of one segment per character.

    Args:
        pattern: Regular expression whose matches should be highlighted.
        result: Object exposing ``title``, ``extract`` and ``url`` attributes.

    Returns:
        A dict with ``'title'`` and ``'extract'`` segment lists and the
        unchanged ``'url'``.
    """
    formatted_result = {}
    for content_type, content in [('title', result.title), ('extract', result.extract)]:
        # Boundaries alternate between "plain starts here" and "bold starts
        # here": every kept match contributes its start and its end.
        boundaries = [0]
        for match in re.finditer(pattern, content, re.IGNORECASE):
            if match.start() == match.end():
                # An empty match highlights nothing; keeping it would only
                # fragment the text into needless segments.
                continue
            boundaries.extend(match.span())
        boundaries.append(len(content))

        formatted_result[content_type] = [
            {'value': content[start:end], 'is_bold': index % 2 == 1}
            for index, (start, end) in enumerate(zip(boundaries, boundaries[1:]))
        ]
    formatted_result['url'] = result.url
    return formatted_result
def get_query_regex(terms, is_complete, is_url):
    """Build an alternation regex that matches any of the query terms.

    Every term is escaped so it is matched literally. When ``is_url`` is true
    each term is wrapped in ``\\b`` word-boundary anchors; otherwise no
    anchors are added. When ``is_complete`` is false the closing anchor is
    left off the final term, so a term the user is still typing can match as
    a prefix.

    Args:
        terms: Sequence of query term strings.
        is_complete: Whether the last term is finished.
        is_url: Whether the pattern is meant for matching against a URL.

    Returns:
        The ``|``-joined pattern, or an empty string when there are no terms.
    """
    if not terms:
        return ''

    boundary = r'\b' if is_url else ''
    final_index = len(terms) - 1
    alternatives = []
    for position, term in enumerate(terms):
        # Only the last term of an incomplete query loses its closing anchor.
        is_open_ended = position == final_index and not is_complete
        closing = '' if is_open_ended else boundary
        alternatives.append(f'{boundary}{re.escape(term)}{closing}')
    return '|'.join(alternatives)
def format_result(result, query):
    """Highlight the terms of ``query`` in the title and extract of ``result``.

    The query is tokenized, the tokens are combined into a regex with every
    token treated as complete and without URL-style matching, and the result
    is formatted against that regex.

    Args:
        result: Object exposing ``title``, ``extract`` and ``url`` attributes.
        query: Raw query string.

    Returns:
        Whatever ``format_result_with_pattern`` returns for the built pattern.
    """
    query_terms = tokenize(query)
    highlight_regex = get_query_regex(query_terms, is_complete=True, is_url=False)
    return format_result_with_pattern(highlight_regex, result)

View file

@ -5,6 +5,7 @@ from logging import getLogger
from operator import itemgetter
from urllib.parse import urlparse
from mwmbl.format import format_result_with_pattern, get_query_regex
from mwmbl.tokenizer import tokenize, get_bigrams
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.hn_top_domains_filtered import DOMAINS
@ -21,20 +22,6 @@ DOMAIN_SCORE_SMOOTHING = 50
HTTPS_STRING = 'https://'
def _get_query_regex(terms, is_complete, is_url):
    """Return a regex alternation over the literal query terms.

    Terms are escaped before use. With ``is_url`` set, ``\\b`` anchors are
    placed around each term; without it no anchors are used. If
    ``is_complete`` is false the last term gets no trailing anchor.

    Args:
        terms: Sequence of query term strings.
        is_complete: Whether the last term is finished.
        is_url: Whether the pattern is meant for matching against a URL.

    Returns:
        The ``|``-joined pattern, or an empty string when ``terms`` is empty.
    """
    if not terms:
        return ''

    boundary = r'\b' if is_url else ''
    escaped_terms = [re.escape(term) for term in terms]
    # Every term before the last is always anchored on both sides.
    alternatives = [boundary + escaped + boundary for escaped in escaped_terms[:-1]]
    # The last term keeps its trailing anchor only for a complete query.
    trailing = boundary if is_complete else ''
    alternatives.append(boundary + escaped_terms[-1] + trailing)
    return '|'.join(alternatives)
def score_result(terms: list[str], result: Document, is_complete: bool):
features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete)
@ -93,7 +80,7 @@ def get_domain_score(url):
def get_match_features(terms, result_string, is_complete, is_url):
query_regex = _get_query_regex(terms, is_complete, is_url)
query_regex = get_query_regex(terms, is_complete, is_url)
matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
# match_strings = {x.group(0).lower() for x in matches}
# match_length = sum(len(x) for x in match_strings)
@ -135,21 +122,10 @@ class Ranker:
results, terms, _ = self.get_results(s)
is_complete = s.endswith(' ')
pattern = _get_query_regex(terms, is_complete, False)
pattern = get_query_regex(terms, is_complete, False)
formatted_results = []
for result in results:
formatted_result = {}
for content_type, content in [('title', result.title), ('extract', result.extract)]:
matches = re.finditer(pattern, content, re.IGNORECASE)
all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
content_result = []
for i in range(len(all_spans) - 1):
is_bold = i % 2 == 1
start = all_spans[i]
end = all_spans[i + 1]
content_result.append({'value': content[start:end], 'is_bold': is_bold})
formatted_result[content_type] = content_result
formatted_result['url'] = result.url
formatted_result = format_result_with_pattern(pattern, result)
formatted_results.append(formatted_result)
logger.info("Return results: %r", formatted_results)