Add endpoint to fetch a URL and return title and extract

Daoud Clarke 2022-12-21 21:15:34 +00:00
parent c7571120cc
commit 0a4e1e4aee
4 changed files with 55 additions and 5 deletions

mwmbl/crawler/app.py

@@ -8,8 +8,12 @@ from typing import Union
 from uuid import uuid4
 
 import boto3
+import justext
 import requests
 from fastapi import HTTPException, APIRouter
+from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
+    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
+    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
@@ -27,7 +31,7 @@ from mwmbl.settings import (
     PUBLIC_URL_PREFIX,
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
-    DATE_REGEX)
+    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
 
 
 def get_bucket(name):
@@ -45,6 +49,32 @@ def upload(data: bytes, name: str):
 last_batch = None
 
 
+def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
+                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
+                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
+                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
+                     encoding=None, default_encoding=DEFAULT_ENCODING,
+                     enc_errors=DEFAULT_ENC_ERRORS):
+    """
+    Converts an HTML page into a list of classified paragraphs. Each paragraph
+    is represented as an instance of class ``justext.paragraph.Paragraph``.
+    """
+    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
+
+    titles = dom.xpath("//title")
+    title = titles[0].text if len(titles) > 0 else None
+
+    dom = preprocessor(dom)
+
+    paragraphs = ParagraphMaker.make_paragraphs(dom)
+
+    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
+                        stopwords_low, stopwords_high, max_link_density, no_headings)
+    revise_paragraph_classification(paragraphs, max_heading_distance)
+
+    return paragraphs, title
+
+
 def get_router(batch_cache: BatchCache, url_queue: Queue):
     router = APIRouter(prefix="/crawler", tags=["crawler"])
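
The helper above mirrors jusText's own classification pipeline, but it reads the page <title> out of the DOM before the preprocessor strips the document head, and returns the title alongside the paragraphs. A minimal sketch of calling it directly, assuming jusText 3.0.0 is installed and the helper is importable from this module (the sample HTML is made up for illustration):

    import justext
    from mwmbl.crawler.app import justext_with_dom

    html = b'<html><head><title>Example page</title></head><body><p>Some body text.</p></body></html>'
    paragraphs, title = justext_with_dom(html, justext.get_stoplist('English'))
    print(title)  # 'Example page'
    # Whether a paragraph comes back as 'good' depends on jusText's length and
    # stopword heuristics, so a tiny sample like this is mostly classed as boilerplate.
    print([p.class_type for p in paragraphs])
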
@@ -54,6 +84,22 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
             url_db = URLDatabase(db.connection)
             return url_db.create_tables()
 
+    @router.get('/fetch')
+    def fetch_url(url: str):
+        response = requests.get(url)
+        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
+        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
+
+        extract = ' '.join([p.text for p in good_paragraphs])
+        if len(extract) > NUM_EXTRACT_CHARS:
+            extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
+
+        return {
+            'url': url,
+            'title': title,
+            'extract': extract,
+        }
+
     @router.post('/batches/')
     def create_batch(batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
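
With the router mounted under its '/crawler' prefix, the new route is reachable as GET /crawler/fetch?url=<target>. A quick smoke test, assuming the FastAPI app is served locally on port 8000 (the host, port, and target URL here are assumptions for illustration, not part of this commit):

    import requests

    resp = requests.get(
        'http://localhost:8000/crawler/fetch',
        params={'url': 'https://example.com/'},
    )
    print(resp.json())  # {'url': ..., 'title': ..., 'extract': ...}

The extract is built only from paragraphs jusText classifies as 'good', then truncated to NUM_EXTRACT_CHARS with a trailing ellipsis.
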

mwmbl/settings.py

@@ -23,6 +23,9 @@ DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
 PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
 FILE_NAME_SUFFIX = '.json.gz'
+NUM_TITLE_CHARS = 65
+NUM_EXTRACT_CHARS = 155
+
 SCORE_FOR_ROOT_PATH = 0.1
 SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
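
NUM_EXTRACT_CHARS caps the extract returned by the new fetch endpoint (NUM_TITLE_CHARS is imported alongside it in the crawler app). The truncation keeps the first NUM_EXTRACT_CHARS - 1 characters and appends a one-character ellipsis, so the result never exceeds 155 characters. A small self-contained check (the input string is made up):

    NUM_EXTRACT_CHARS = 155

    extract = 'word ' * 100  # 500 characters, standing in for joined paragraph text
    if len(extract) > NUM_EXTRACT_CHARS:
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
    assert len(extract) == NUM_EXTRACT_CHARS  # 154 chars of text plus the ellipsis
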

poetry.lock generated

@@ -176,7 +176,7 @@ name = "en-core-web-sm"
 version = "3.2.0"
 description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer."
 category = "main"
-optional = true
+optional = false
 python-versions = "*"
 
 [package.dependencies]
@@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
 [package.source]
 type = "url"
 url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
+
 [[package]]
 name = "fastapi"
 version = "0.70.1"
@@ -928,12 +929,12 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""}
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein", "en-core-web-sm"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
 
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.10,<3.11"
-content-hash = "cd17d671d8fa708240c3bf7bf5240bcf4bbcebea4538e5ad091bbcf9d10c5574"
+content-hash = "073bb40814f94368f5a6658acbff4b0b7cf720644593f016d1a28f7a6f86042d"
 
 [metadata.files]
 anyio = [

pyproject.toml

@@ -18,6 +18,7 @@ boto3 = "^1.20.37"
 requests = "^2.27.1"
 psycopg2-binary = "^2.9.3"
 spacy = "==3.2.1"
+jusText = "==3.0.0"
 
 # Optional dependencies do not get installed by default. Look under tool.poetry.extras section
 # to see which extras to use.
@@ -26,7 +27,6 @@ warcio = {version= "==1.7.4", optional = true}
 idna = {version= "==3.3", optional = true}
 beautifulsoup4 = {version= "==4.10.0", optional = true}
 lxml = {version= "==4.6.4", optional = true}
-jusText = {version= "==3.0.0", optional = true}
 langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}