Add endpoint to fetch a URL and return title and extract

parent: c7571120cc
commit: 0a4e1e4aee

4 changed files with 55 additions and 5 deletions
mwmbl/crawler/app.py

@@ -8,8 +8,12 @@ from typing import Union
 from uuid import uuid4

 import boto3
+import justext
 import requests
 from fastapi import HTTPException, APIRouter
+from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
+    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
+    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor

 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
@@ -27,7 +31,7 @@ from mwmbl.settings import (
     PUBLIC_URL_PREFIX,
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
-    DATE_REGEX)
+    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)


 def get_bucket(name):
@@ -45,6 +49,32 @@ def upload(data: bytes, name: str):
 last_batch = None


+def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
+                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
+                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
+                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
+                     encoding=None, default_encoding=DEFAULT_ENCODING,
+                     enc_errors=DEFAULT_ENC_ERRORS):
+    """
+    Converts an HTML page into a list of classified paragraphs. Each paragraph
+    is represented as instance of class ``justext.paragraph.Paragraph``.
+    """
+    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
+
+    titles = dom.xpath("//title")
+    title = titles[0].text if len(titles) > 0 else None
+
+    dom = preprocessor(dom)
+
+    paragraphs = ParagraphMaker.make_paragraphs(dom)
+
+    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
+                        stopwords_low, stopwords_high, max_link_density, no_headings)
+    revise_paragraph_classification(paragraphs, max_heading_distance)
+
+    return paragraphs, title
+
+
 def get_router(batch_cache: BatchCache, url_queue: Queue):
     router = APIRouter(prefix="/crawler", tags=["crawler"])
@@ -54,6 +84,22 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
         url_db = URLDatabase(db.connection)
         return url_db.create_tables()

+    @router.get('/fetch')
+    def fetch_url(url: str):
+        response = requests.get(url)
+        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
+        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
+
+        extract = ' '.join([p.text for p in good_paragraphs])
+        if len(extract) > NUM_EXTRACT_CHARS:
+            extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
+
+        return {
+            'url': url,
+            'title': title,
+            'extract': extract,
+        }
+
     @router.post('/batches/')
     def create_batch(batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
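Once the router is mounted, the new endpoint answers a plain GET with the page's title and a short extract built from its 'good' paragraphs. A sketch of exercising it, assuming a locally running instance (host and port are assumptions, not part of this commit):

    import requests

    # Host and port are hypothetical; point this at a running mwmbl instance.
    resp = requests.get("http://localhost:8080/crawler/fetch",
                        params={"url": "https://example.com"})
    print(resp.json())  # {'url': 'https://example.com', 'title': ..., 'extract': ...}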
mwmbl/settings.py

@@ -23,6 +23,9 @@ DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
 PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
 FILE_NAME_SUFFIX = '.json.gz'

+NUM_TITLE_CHARS = 65
+NUM_EXTRACT_CHARS = 155
+
 SCORE_FOR_ROOT_PATH = 0.1
 SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
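As a sanity check on the truncation rule in fetch_url above: with NUM_EXTRACT_CHARS = 155, an over-long extract keeps its first 154 characters and gains a one-character ellipsis, landing at exactly 155 characters. (NUM_TITLE_CHARS is defined here but not yet applied in the endpoint shown above.)

    # Worked example of the truncation arithmetic; the input string is made up.
    NUM_EXTRACT_CHARS = 155
    extract = 'x' * 200
    if len(extract) > NUM_EXTRACT_CHARS:
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
    assert len(extract) == 155  # 154 kept characters + '…'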
poetry.lock (generated, 7 changes)
@@ -176,7 +176,7 @@ name = "en-core-web-sm"
 version = "3.2.0"
 description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer."
 category = "main"
-optional = true
+optional = false
 python-versions = "*"

 [package.dependencies]

@@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
 [package.source]
 type = "url"
 url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"

 [[package]]
 name = "fastapi"
 version = "0.70.1"
@@ -928,12 +929,12 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""}
 cffi = ["cffi (>=1.11)"]

 [extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein", "en-core-web-sm"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]

 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.10,<3.11"
-content-hash = "cd17d671d8fa708240c3bf7bf5240bcf4bbcebea4538e5ad091bbcf9d10c5574"
+content-hash = "073bb40814f94368f5a6658acbff4b0b7cf720644593f016d1a28f7a6f86042d"

 [metadata.files]
 anyio = [
pyproject.toml

@@ -18,6 +18,7 @@ boto3 = "^1.20.37"
 requests = "^2.27.1"
 psycopg2-binary = "^2.9.3"
 spacy = "==3.2.1"
+jusText = "==3.0.0"

 # Optional dependencies do not get installed by default. Look under tool.poetry.extras section
 # to see which extras to use.

@@ -26,7 +27,6 @@ warcio = {version= "==1.7.4", optional = true}
 idna = {version= "==3.3", optional = true}
 beautifulsoup4 = {version= "==4.10.0", optional = true}
 lxml = {version= "==4.6.4", optional = true}
-jusText = {version= "==3.0.0", optional = true}
 langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
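Taken together, the pyproject.toml and poetry.lock changes move jusText out of the optional indexer extras and into the main dependency group, which is what lets the crawler module import it unconditionally; the new content-hash in the lock file reflects that edit. (Regenerating the lock would normally go through Poetry itself, e.g. `poetry lock`, though the exact command used is not recorded in the commit.)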