Fix fetch url to return HTML instead of JSON

This commit is contained in:
Daoud Clarke 2023-10-30 16:39:58 +00:00
parent fb27053295
commit 6e39893bc1
6 changed files with 59 additions and 62 deletions

View file

@ -3,7 +3,7 @@ import config from "../../../config.js";
import {globalBus} from "../../utils/events.js"; import {globalBus} from "../../utils/events.js";
const FETCH_URL = `${config['publicApiURL']}crawler/fetch?` const FETCH_URL = '/app/fetch?'
const template = () => /*html*/` const template = () => /*html*/`
@ -56,7 +56,7 @@ export default define('add-result', class extends HTMLDivElement {
const url = `${FETCH_URL}url=${encodeURIComponent(value)}&query=${encodeURIComponent(query)}`; const url = `${FETCH_URL}url=${encodeURIComponent(value)}&query=${encodeURIComponent(query)}`;
const response = await fetch(url); const response = await fetch(url);
if (response.status === 200) { if (response.status === 200) {
const data = await response.json(); const data = await response.text();
console.log("Data", data); console.log("Data", data);
const addResultEvent = new CustomEvent('curate-add-result', {detail: data}); const addResultEvent = new CustomEvent('curate-add-result', {detail: data});

View file

@ -115,15 +115,7 @@ class ResultsHandler {
console.log("Add result", e); console.log("Add result", e);
this.__beginCurating(); this.__beginCurating();
const resultData = e.detail; const resultData = e.detail;
const resultHTML = /*html*/` this.results.insertAdjacentHTML('afterbegin', resultData);
<li
is='${result}'
data-url='${escapeString(resultData.url)}'
data-title='${escapeString(JSON.stringify(resultData.title))}'
data-extract='${escapeString(JSON.stringify(resultData.extract))}'
></li>
`;
this.results.insertAdjacentHTML('afterbegin', resultHTML);
const newResults = this.__getResults(); const newResults = this.__getResults();

View file

@ -102,7 +102,7 @@
</ul> </ul>
</mwmbl-results> </mwmbl-results>
</main> </main>
<div is="${addResult}"></div> <div is="mwmbl-add-result"></div>
<footer is="mwmbl-footer"></footer> <footer is="mwmbl-footer"></footer>
</main> </main>
</body> </body>

View file

@ -8,12 +8,8 @@ from typing import Union
from uuid import uuid4 from uuid import uuid4
import boto3 import boto3
import justext
import requests import requests
from fastapi import HTTPException from fastapi import HTTPException
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
from ninja import Router from ninja import Router
from redis import Redis from redis import Redis
@ -21,7 +17,6 @@ from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.stats import MwmblStats, StatsManager from mwmbl.crawler.stats import MwmblStats, StatsManager
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
from mwmbl.database import Database from mwmbl.database import Database
from mwmbl.format import format_result
from mwmbl.indexer.batch_cache import BatchCache from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
from mwmbl.settings import ( from mwmbl.settings import (
@ -35,9 +30,7 @@ from mwmbl.settings import (
PUBLIC_URL_PREFIX, PUBLIC_URL_PREFIX,
PUBLIC_USER_ID_LENGTH, PUBLIC_USER_ID_LENGTH,
FILE_NAME_SUFFIX, FILE_NAME_SUFFIX,
DATE_REGEX, NUM_EXTRACT_CHARS) DATE_REGEX)
from mwmbl.tinysearchengine.indexer import Document
stats_manager = StatsManager(Redis.from_url(os.environ.get("REDIS_URL"))) stats_manager = StatsManager(Redis.from_url(os.environ.get("REDIS_URL")))
@ -57,32 +50,6 @@ def upload(data: bytes, name: str):
last_batch = None last_batch = None
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
                     encoding=None, default_encoding=DEFAULT_ENCODING,
                     enc_errors=DEFAULT_ENC_ERRORS):
    """
    Convert an HTML page into a list of classified paragraphs plus its title.

    Works like ``justext.justext`` but parses the DOM only once and also
    returns the document's <title> text alongside the paragraphs.

    Returns:
        tuple: ``(paragraphs, title)`` where each paragraph is an instance of
        ``justext.paragraph.Paragraph`` and ``title`` is the text of the first
        <title> element, or ``None`` when the page has no title.
    """
    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
    # Grab the title before preprocessing, which may strip non-body elements.
    title_elements = dom.xpath("//title")
    title = title_elements[0].text if title_elements else None
    dom = preprocessor(dom)
    paragraphs = ParagraphMaker.make_paragraphs(dom)
    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
                        stopwords_low, stopwords_high, max_link_density, no_headings)
    revise_paragraph_classification(paragraphs, max_heading_distance)
    return paragraphs, title
def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router: def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
router = Router(tags=["crawler"]) router = Router(tags=["crawler"])
@ -90,19 +57,6 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
# # # #
# # url_db.create_tables() # # url_db.create_tables()
@router.get('/fetch')
def fetch_url(request, url: str, query: str):
    """
    Fetch a remote page and return its extracted content as a formatted result.

    Args:
        request: incoming HTTP request (unused).
        url: address of the page to fetch and extract.
        query: the search query used when formatting the result.
    """
    # NOTE(review): no timeout — a slow remote host can hang this worker;
    # consider requests.get(url, timeout=...).
    response = requests.get(url)
    paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
    good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
    extract = ' '.join(p.text for p in good_paragraphs)
    if len(extract) > NUM_EXTRACT_CHARS:
        # Truncate and mark the cut with an ellipsis; the slice reserves one
        # character for it. (The original appended an empty string, leaving
        # truncated extracts without any marker.)
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
    result = Document(title=title, url=url, extract=extract, score=0.0)
    return format_result(result, query)
@router.post('/batches/') @router.post('/batches/')
def post_batch(request, batch: Batch): def post_batch(request, batch: Batch):
if len(batch.items) > MAX_BATCH_SIZE: if len(batch.items) > MAX_BATCH_SIZE:

View file

@ -18,7 +18,7 @@ from django.contrib import admin
from django.urls import path, include from django.urls import path, include
from mwmbl.api import api_original as api, api_v1 from mwmbl.api import api_original as api, api_v1
from mwmbl.views import profile, search_results from mwmbl.views import profile, search_results, fetch_url
urlpatterns = [ urlpatterns = [
path('admin/', admin.site.urls), path('admin/', admin.site.urls),
@ -27,6 +27,6 @@ urlpatterns = [
path('accounts/', include('allauth.urls')), path('accounts/', include('allauth.urls')),
path('accounts/profile/', profile, name='profile'), path('accounts/profile/', profile, name='profile'),
path('app/search/', search_results, name="search_results") path('app/search/', search_results, name="search_results"),
path('app/fetch/', fetch_url, name="fetch_url")
] ]

View file

@ -1,8 +1,44 @@
import justext
import requests
from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import login_required
from django.shortcuts import render from django.shortcuts import render
from mwmbl.format import format_result
from mwmbl.search_setup import ranker from mwmbl.search_setup import ranker
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
from mwmbl.settings import NUM_EXTRACT_CHARS
from mwmbl.tinysearchengine.indexer import Document
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
                     encoding=None, default_encoding=DEFAULT_ENCODING,
                     enc_errors=DEFAULT_ENC_ERRORS):
    """
    Classify the paragraphs of an HTML page and extract its title.

    A variant of ``justext.justext`` that reuses a single parsed DOM to return
    the page <title> together with the classified paragraphs.

    Returns:
        tuple: ``(paragraphs, title)``; each paragraph is a
        ``justext.paragraph.Paragraph``, and ``title`` is the text of the
        first <title> element or ``None`` if absent.
    """
    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
    # Read the title before preprocessing, which may prune head elements.
    found_titles = dom.xpath("//title")
    title = found_titles[0].text if found_titles else None
    dom = preprocessor(dom)
    paragraphs = ParagraphMaker.make_paragraphs(dom)
    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
                        stopwords_low, stopwords_high, max_link_density, no_headings)
    revise_paragraph_classification(paragraphs, max_heading_distance)
    return paragraphs, title
@login_required @login_required
def profile(request): def profile(request):
@ -13,3 +49,18 @@ def search_results(request):
query = request.GET["query"] query = request.GET["query"]
results = ranker.search(query) results = ranker.search(query)
return render(request, "results.html", {"results": results}) return render(request, "results.html", {"results": results})
def fetch_url(request):
    """
    Fetch a remote page, extract its main content, and render it as an
    HTML results fragment.

    Query parameters:
        url: address of the page to fetch and extract.
        query: the search query used when formatting the result.
    """
    url = request.GET["url"]
    query = request.GET["query"]
    # NOTE(review): no timeout — a slow remote host can hang this worker;
    # consider requests.get(url, timeout=...).
    response = requests.get(url)
    paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
    good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
    extract = ' '.join(p.text for p in good_paragraphs)
    if len(extract) > NUM_EXTRACT_CHARS:
        # Truncate and mark the cut with an ellipsis; the slice reserves one
        # character for it. (The original appended an empty string, leaving
        # truncated extracts without any marker.)
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
    result = Document(title=title, url=url, extract=extract, score=0.0)
    return render(request, "results.html", {"results": [format_result(result, query)]})