Add endpoint to fetch a URL and return title and extract

Daoud Clarke 2022-12-21 21:15:34 +00:00
parent c7571120cc
commit 0a4e1e4aee
4 changed files with 55 additions and 5 deletions

mwmbl/crawler/app.py

@@ -8,8 +8,12 @@ from typing import Union
 from uuid import uuid4
 
 import boto3
+import justext
 import requests
 from fastapi import HTTPException, APIRouter
+from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
+    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
+    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
@@ -27,7 +31,7 @@ from mwmbl.settings import (
     PUBLIC_URL_PREFIX,
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
-    DATE_REGEX)
+    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
 
 
 def get_bucket(name):
@@ -45,6 +49,32 @@ def upload(data: bytes, name: str):
 last_batch = None
 
 
+def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
+                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
+                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
+                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
+                     encoding=None, default_encoding=DEFAULT_ENCODING,
+                     enc_errors=DEFAULT_ENC_ERRORS):
+    """
+    Converts an HTML page into a list of classified paragraphs. Each paragraph
+    is represented as an instance of class ``justext.paragraph.Paragraph``.
+    """
+    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
+
+    titles = dom.xpath("//title")
+    title = titles[0].text if len(titles) > 0 else None
+
+    dom = preprocessor(dom)
+
+    paragraphs = ParagraphMaker.make_paragraphs(dom)
+
+    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
+                        stopwords_low, stopwords_high, max_link_density, no_headings)
+    revise_paragraph_classification(paragraphs, max_heading_distance)
+
+    return paragraphs, title
+
+
 def get_router(batch_cache: BatchCache, url_queue: Queue):
     router = APIRouter(prefix="/crawler", tags=["crawler"])
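
The helper above mirrors jusText's own classification pipeline, but it reads the page <title> out of the DOM before the preprocessor strips the document head, and returns the title alongside the paragraphs. A minimal sketch of calling it directly, assuming jusText 3.0.0 is installed and the helper is importable from this module (the sample HTML is made up for illustration):

    import justext
    from mwmbl.crawler.app import justext_with_dom

    html = b'<html><head><title>Example page</title></head><body><p>Some body text.</p></body></html>'
    paragraphs, title = justext_with_dom(html, justext.get_stoplist('English'))
    print(title)  # 'Example page'
    # Whether a paragraph comes back as 'good' depends on jusText's length and
    # stopword heuristics, so a tiny sample like this is mostly classed as boilerplate.
    print([p.class_type for p in paragraphs])
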
@@ -54,6 +84,22 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
             url_db = URLDatabase(db.connection)
             return url_db.create_tables()
 
+    @router.get('/fetch')
+    def fetch_url(url: str):
+        response = requests.get(url)
+        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
+        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
+
+        extract = ' '.join([p.text for p in good_paragraphs])
+        if len(extract) > NUM_EXTRACT_CHARS:
+            extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
+
+        return {
+            'url': url,
+            'title': title,
+            'extract': extract,
+        }
+
     @router.post('/batches/')
     def create_batch(batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
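
With the router mounted under its '/crawler' prefix, the new route is reachable as GET /crawler/fetch?url=<target>. A quick smoke test, assuming the FastAPI app is served locally on port 8000 (the host, port, and target URL here are assumptions for illustration, not part of this commit):

    import requests

    resp = requests.get(
        'http://localhost:8000/crawler/fetch',
        params={'url': 'https://example.com/'},
    )
    print(resp.json())  # {'url': ..., 'title': ..., 'extract': ...}

The extract is built only from paragraphs jusText classifies as 'good', then truncated to NUM_EXTRACT_CHARS with a trailing ellipsis.
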

mwmbl/settings.py

@@ -23,6 +23,9 @@ DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
 PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
 FILE_NAME_SUFFIX = '.json.gz'
+NUM_TITLE_CHARS = 65
+NUM_EXTRACT_CHARS = 155
+
 SCORE_FOR_ROOT_PATH = 0.1
 SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
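
NUM_EXTRACT_CHARS caps the extract returned by the new fetch endpoint (NUM_TITLE_CHARS is imported alongside it in the crawler app). The truncation keeps the first NUM_EXTRACT_CHARS - 1 characters and appends a one-character ellipsis, so the result never exceeds 155 characters. A small self-contained check (the input string is made up):

    NUM_EXTRACT_CHARS = 155

    extract = 'word ' * 100  # 500 characters, standing in for joined paragraph text
    if len(extract) > NUM_EXTRACT_CHARS:
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
    assert len(extract) == NUM_EXTRACT_CHARS  # 154 chars of text plus the ellipsis
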

poetry.lock generated

@@ -176,7 +176,7 @@ name = "en-core-web-sm"
 version = "3.2.0"
 description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer."
 category = "main"
-optional = true
+optional = false
 python-versions = "*"
 
 [package.dependencies]
@@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
 [package.source]
 type = "url"
 url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
+
 [[package]]
 name = "fastapi"
 version = "0.70.1"
@@ -928,12 +929,12 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""}
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein", "en-core-web-sm"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
 
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.10,<3.11"
-content-hash = "cd17d671d8fa708240c3bf7bf5240bcf4bbcebea4538e5ad091bbcf9d10c5574"
+content-hash = "073bb40814f94368f5a6658acbff4b0b7cf720644593f016d1a28f7a6f86042d"
 
 [metadata.files]
 anyio = [

pyproject.toml

@@ -18,6 +18,7 @@ boto3 = "^1.20.37"
 requests = "^2.27.1"
 psycopg2-binary = "^2.9.3"
 spacy = "==3.2.1"
+jusText = "==3.0.0"
 
 # Optional dependencies do not get installed by default. Look under tool.poetry.extras section
 # to see which extras to use.
@@ -26,7 +27,6 @@ warcio = {version= "==1.7.4", optional = true}
 idna = {version= "==3.3", optional = true}
 beautifulsoup4 = {version= "==4.10.0", optional = true}
 lxml = {version= "==4.6.4", optional = true}
-jusText = {version= "==3.0.0", optional = true}
 langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}