Format fetched url

2022-12-24 19:59:15 +00:00 · 2022-12-24 19:59:15 +00:00 · 8676abbc63
commit 8676abbc63
parent 0a4e1e4aee
3 changed files with 49 additions and 34 deletions
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@@ -18,6 +18,7 @@ from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revis
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
 from mwmbl.database import Database
+from mwmbl.format import format_result
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
 from mwmbl.settings import (
@@ -32,6 +33,7 @@ from mwmbl.settings import (
    PUBLIC_USER_ID_LENGTH,
    FILE_NAME_SUFFIX,
    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
+from mwmbl.tinysearchengine.indexer import Document


 def get_bucket(name):
@@ -85,7 +87,7 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
            return url_db.create_tables()

    @router.get('/fetch')
-    def fetch_url(url: str):
+    def fetch_url(url: str, query: str):
        response = requests.get(url)
        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@@ -94,11 +96,8 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
        if len(extract) > NUM_EXTRACT_CHARS:
            extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'

-        return {
-            'url': url,
-            'title': title,
-            'extract': extract,
-        }
+        result = Document(title=title, url=url, extract=extract, score=0.0)
+        return format_result(result, query)

    @router.post('/batches/')
    def create_batch(batch: Batch):
--- a/mwmbl/format.py
+++ b/mwmbl/format.py
@@ -0,0 +1,40 @@
+import re
+
+from mwmbl.tokenizer import tokenize
+
+
+def format_result_with_pattern(pattern, result):
+    formatted_result = {}
+    for content_type, content in [('title', result.title), ('extract', result.extract)]:
+        matches = re.finditer(pattern, content, re.IGNORECASE)
+        all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
+        content_result = []
+        for i in range(len(all_spans) - 1):
+            is_bold = i % 2 == 1
+            start = all_spans[i]
+            end = all_spans[i + 1]
+            content_result.append({'value': content[start:end], 'is_bold': is_bold})
+        formatted_result[content_type] = content_result
+    formatted_result['url'] = result.url
+    return formatted_result
+
+
+def get_query_regex(terms, is_complete, is_url):
+    if not terms:
+        return ''
+
+    word_sep = r'\b' if is_url else ''
+    if is_complete:
+        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
+    else:
+        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
+            rf'{word_sep}{re.escape(terms[-1])}']
+    pattern = '|'.join(term_patterns)
+    return pattern
+
+
+def format_result(result, query):
+    tokens = tokenize(query)
+    pattern = get_query_regex(tokens, True, False)
+    return format_result_with_pattern(pattern, result)
+
--- a/mwmbl/tinysearchengine/rank.py
+++ b/mwmbl/tinysearchengine/rank.py
@@ -5,6 +5,7 @@ from logging import getLogger
 from operator import itemgetter
 from urllib.parse import urlparse

+from mwmbl.format import format_result_with_pattern, get_query_regex
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
@@ -21,20 +22,6 @@ DOMAIN_SCORE_SMOOTHING = 50
 HTTPS_STRING = 'https://'


-def _get_query_regex(terms, is_complete, is_url):
-    if not terms:
-        return ''
-
-    word_sep = r'\b' if is_url else ''
-    if is_complete:
-        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
-    else:
-        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
-            rf'{word_sep}{re.escape(terms[-1])}']
-    pattern = '|'.join(term_patterns)
-    return pattern
-
-
 def score_result(terms: list[str], result: Document, is_complete: bool):
    features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete)

@@ -93,7 +80,7 @@ def get_domain_score(url):


 def get_match_features(terms, result_string, is_complete, is_url):
-    query_regex = _get_query_regex(terms, is_complete, is_url)
+    query_regex = get_query_regex(terms, is_complete, is_url)
    matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
    # match_strings = {x.group(0).lower() for x in matches}
    # match_length = sum(len(x) for x in match_strings)
@@ -135,21 +122,10 @@ class Ranker:
        results, terms, _ = self.get_results(s)

        is_complete = s.endswith(' ')
-        pattern = _get_query_regex(terms, is_complete, False)
+        pattern = get_query_regex(terms, is_complete, False)
        formatted_results = []
        for result in results:
-            formatted_result = {}
-            for content_type, content in [('title', result.title), ('extract', result.extract)]:
-                matches = re.finditer(pattern, content, re.IGNORECASE)
-                all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
-                content_result = []
-                for i in range(len(all_spans) - 1):
-                    is_bold = i % 2 == 1
-                    start = all_spans[i]
-                    end = all_spans[i + 1]
-                    content_result.append({'value': content[start:end], 'is_bold': is_bold})
-                formatted_result[content_type] = content_result
-            formatted_result['url'] = result.url
+            formatted_result = format_result_with_pattern(pattern, result)
            formatted_results.append(formatted_result)

        logger.info("Return results: %r", formatted_results)