From 0a4e1e4aee25ad13798603ca14fbbb25ff2c61ef Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Wed, 21 Dec 2022 21:15:34 +0000
Subject: [PATCH] Add endpoint to fetch a URL and return title and extract

---
 mwmbl/crawler/app.py | 48 +++++++++++++++++++++++++++++++++++++++++++-
 mwmbl/settings.py    |  3 +++
 poetry.lock          |  7 ++++---
 pyproject.toml       |  2 +-
 4 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py
index c7aa9de..3e1805b 100644
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@@ -8,8 +8,12 @@ from typing import Union
 from uuid import uuid4
 
 import boto3
+import justext
 import requests
 from fastapi import HTTPException, APIRouter
+from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
+    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
+    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
@@ -27,7 +31,7 @@ from mwmbl.settings import (
     PUBLIC_URL_PREFIX,
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
-    DATE_REGEX)
+    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
 
 
 def get_bucket(name):
@@ -45,6 +49,32 @@ def upload(data: bytes, name: str):
 last_batch = None
 
 
+def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
+                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
+                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
+                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
+                     encoding=None, default_encoding=DEFAULT_ENCODING,
+                     enc_errors=DEFAULT_ENC_ERRORS):
+    """
+    Converts an HTML page into a list of classified paragraphs. Each paragraph
+    is represented as instance of class ``justext.paragraph.Paragraph``.
+    """
+    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
+
+    titles = dom.xpath("//title")
+    title = titles[0].text if len(titles) > 0 else None
+
+    dom = preprocessor(dom)
+
+    paragraphs = ParagraphMaker.make_paragraphs(dom)
+
+    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
+                        stopwords_low, stopwords_high, max_link_density, no_headings)
+    revise_paragraph_classification(paragraphs, max_heading_distance)
+
+    return paragraphs, title
+
+
 def get_router(batch_cache: BatchCache, url_queue: Queue):
     router = APIRouter(prefix="/crawler", tags=["crawler"])
 
@@ -54,6 +84,22 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
         url_db = URLDatabase(db.connection)
         return url_db.create_tables()
 
+    @router.get('/fetch')
+    def fetch_url(url: str):
+        response = requests.get(url)
+        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
+        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
+
+        extract = ' '.join([p.text for p in good_paragraphs])
+        if len(extract) > NUM_EXTRACT_CHARS:
+            extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
+
+        return {
+            'url': url,
+            'title': title,
+            'extract': extract,
+        }
+
     @router.post('/batches/')
     def create_batch(batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
diff --git a/mwmbl/settings.py b/mwmbl/settings.py
index 9335314..b6ee8e6 100644
--- a/mwmbl/settings.py
+++ b/mwmbl/settings.py
@@ -23,6 +23,9 @@ DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
 PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
 FILE_NAME_SUFFIX = '.json.gz'
 
+NUM_TITLE_CHARS = 65
+NUM_EXTRACT_CHARS = 155
+
 SCORE_FOR_ROOT_PATH = 0.1
 SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
diff --git a/poetry.lock b/poetry.lock
index 5372b42..61c994b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -176,7 +176,7 @@ name = "en-core-web-sm"
 version = "3.2.0"
 description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer."
 category = "main"
-optional = true
+optional = false
 python-versions = "*"
 
 [package.dependencies]
@@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
 [package.source]
 type = "url"
 url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
+
 [[package]]
 name = "fastapi"
 version = "0.70.1"
@@ -928,12 +929,12 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""}
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein", "en-core-web-sm"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
 
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.10,<3.11"
-content-hash = "cd17d671d8fa708240c3bf7bf5240bcf4bbcebea4538e5ad091bbcf9d10c5574"
+content-hash = "073bb40814f94368f5a6658acbff4b0b7cf720644593f016d1a28f7a6f86042d"
 
 [metadata.files]
 anyio = [
diff --git a/pyproject.toml b/pyproject.toml
index 53c1eb4..9f17bda 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ boto3 = "^1.20.37"
 requests = "^2.27.1"
 psycopg2-binary = "^2.9.3"
 spacy = "==3.2.1"
+jusText = "==3.0.0"
 
 # Optional dependencies do not get installed by default. Look under tool.poetry.extras section
 # to see which extras to use.
@@ -26,7 +27,6 @@ warcio = {version= "==1.7.4", optional = true}
 idna = {version= "==3.3", optional = true}
 beautifulsoup4 = {version= "==4.10.0", optional = true}
 lxml = {version= "==4.6.4", optional = true}
-jusText = {version= "==3.0.0", optional = true}
 langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}