From 6e39893bc170a326dababb19a8961c105f8853cf Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Mon, 30 Oct 2023 16:39:58 +0000
Subject: [PATCH] Fix fetch url to return HTML instead of JSON

---
 .../src/components/molecules/add-result.js    |  4 +-
 front-end/src/components/organisms/results.js | 10 +---
 front-end/src/index.html                      |  2 +-
 mwmbl/crawler/app.py                          | 48 +----------------
 mwmbl/urls.py                                 |  6 +--
 mwmbl/views.py                                | 51 +++++++++++++++++++
 6 files changed, 59 insertions(+), 62 deletions(-)
diff --git a/front-end/src/components/molecules/add-result.js b/front-end/src/components/molecules/add-result.js
index 6e9ce17..b6a790a 100644
--- a/front-end/src/components/molecules/add-result.js
+++ b/front-end/src/components/molecules/add-result.js
@@ -3,7 +3,7 @@ import config from "../../../config.js";
 import {globalBus} from "../../utils/events.js";
 
 
-const FETCH_URL = `${config['publicApiURL']}crawler/fetch?`
+const FETCH_URL = '/app/fetch/?'  // trailing slash matches the Django route 'app/fetch/', avoiding a redirect
 
 
 const template = () => /*html*/`
@@ -56,7 +56,7 @@ export default define('add-result', class extends HTMLDivElement {
     const url = `${FETCH_URL}url=${encodeURIComponent(value)}&query=${encodeURIComponent(query)}`;
     const response = await fetch(url);
     if (response.status === 200) {
-      const data = await response.json();
+      const data = await response.text();
       console.log("Data", data);
 
       const addResultEvent = new CustomEvent('curate-add-result', {detail: data});
diff --git a/front-end/src/components/organisms/results.js b/front-end/src/components/organisms/results.js
index d466c49..ce97295 100644
--- a/front-end/src/components/organisms/results.js
+++ b/front-end/src/components/organisms/results.js
@@ -115,15 +115,7 @@ class ResultsHandler {
       console.log("Add result", e);
       this.__beginCurating();
       const resultData = e.detail;
-      const resultHTML = /*html*/`
-        <li
-          is='${result}'
-          data-url='${escapeString(resultData.url)}'
-          data-title='${escapeString(JSON.stringify(resultData.title))}'
-          data-extract='${escapeString(JSON.stringify(resultData.extract))}'
-        ></li>
-      `;
-      this.results.insertAdjacentHTML('afterbegin', resultHTML);
+      this.results.insertAdjacentHTML('afterbegin', resultData);
 
       const newResults = this.__getResults();
 
diff --git a/front-end/src/index.html b/front-end/src/index.html
index 74e9a21..a08e557 100644
--- a/front-end/src/index.html
+++ b/front-end/src/index.html
@@ -102,7 +102,7 @@
         </ul>
       </mwmbl-results>
     </main>
-    <div is="${addResult}"></div>
+    <div is="mwmbl-add-result"></div>
     <footer is="mwmbl-footer"></footer>
   </main>
 </body>
diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py
index a4f0524..932081c 100644
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@@ -8,12 +8,8 @@ from typing import Union
 from uuid import uuid4
 
 import boto3
-import justext
 import requests
 from fastapi import HTTPException
-from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
-    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
-    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
 from ninja import Router
 from redis import Redis
 
@@ -21,7 +17,6 @@ from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.stats import MwmblStats, StatsManager
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
 from mwmbl.database import Database
-from mwmbl.format import format_result
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
 from mwmbl.settings import (
@@ -35,9 +30,7 @@ from mwmbl.settings import (
     PUBLIC_URL_PREFIX,
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
-    DATE_REGEX, NUM_EXTRACT_CHARS)
-from mwmbl.tinysearchengine.indexer import Document
-
+    DATE_REGEX)
 
 stats_manager = StatsManager(Redis.from_url(os.environ.get("REDIS_URL")))
 
@@ -57,32 +50,6 @@ def upload(data: bytes, name: str):
 last_batch = None
 
 
-def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
-        length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
-        stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
-        max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
-        encoding=None, default_encoding=DEFAULT_ENCODING,
-        enc_errors=DEFAULT_ENC_ERRORS):
-    """
-    Converts an HTML page into a list of classified paragraphs. Each paragraph
-    is represented as instance of class ˙˙justext.paragraph.Paragraph˙˙.
-    """
-    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
-
-    titles = dom.xpath("//title")
-    title = titles[0].text if len(titles) > 0 else None
-
-    dom = preprocessor(dom)
-
-    paragraphs = ParagraphMaker.make_paragraphs(dom)
-
-    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
-        stopwords_low, stopwords_high, max_link_density, no_headings)
-    revise_paragraph_classification(paragraphs, max_heading_distance)
-
-    return paragraphs, title
-
-
 def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
     router = Router(tags=["crawler"])
 
@@ -90,19 +57,6 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
     #       #
     #       #     url_db.create_tables()
 
-    @router.get('/fetch')
-    def fetch_url(request, url: str, query: str):
-        response = requests.get(url)
-        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
-        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
-
-        extract = ' '.join([p.text for p in good_paragraphs])
-        if len(extract) > NUM_EXTRACT_CHARS:
-            extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
-
-        result = Document(title=title, url=url, extract=extract, score=0.0)
-        return format_result(result, query)
-
     @router.post('/batches/')
     def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
diff --git a/mwmbl/urls.py b/mwmbl/urls.py
index 2c6c565..8ce524a 100644
--- a/mwmbl/urls.py
+++ b/mwmbl/urls.py
@@ -18,7 +18,7 @@ from django.contrib import admin
 from django.urls import path, include
 
 from mwmbl.api import api_original as api, api_v1
-from mwmbl.views import profile, search_results
+from mwmbl.views import profile, search_results, fetch_url
 
 urlpatterns = [
     path('admin/', admin.site.urls),
@@ -27,6 +27,6 @@ urlpatterns = [
     path('accounts/', include('allauth.urls')),
 
     path('accounts/profile/', profile, name='profile'),
-    path('app/search/', search_results, name="search_results")
-
+    path('app/search/', search_results, name="search_results"),
+    path('app/fetch/', fetch_url, name="fetch_url")
 ]
diff --git a/mwmbl/views.py b/mwmbl/views.py
index fe5713d..a3fb215 100644
--- a/mwmbl/views.py
+++ b/mwmbl/views.py
@@ -1,8 +1,44 @@
+import justext
+import requests
 from django.contrib.auth.decorators import login_required
 from django.shortcuts import render
 
+from mwmbl.format import format_result
 from mwmbl.search_setup import ranker
 
+from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
+    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
+    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+
+from mwmbl.settings import NUM_EXTRACT_CHARS
+from mwmbl.tinysearchengine.indexer import Document
+
+
+def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
+        length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
+        stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
+        max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
+        encoding=None, default_encoding=DEFAULT_ENCODING,
+        enc_errors=DEFAULT_ENC_ERRORS):
+    """
+    Classify the paragraphs of an HTML page and also return the page title.
+
+    Returns a ``(paragraphs, title)`` tuple; ``title`` is the text of the first
+    ``<title>`` element, or ``None`` when the document has none.
+    """
+    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
+
+    # Read the title before preprocessing (presumably the preprocessor strips <head> — TODO confirm).
+    title_nodes = dom.xpath("//title")
+    title = title_nodes[0].text if title_nodes else None
+
+    paragraphs = ParagraphMaker.make_paragraphs(preprocessor(dom))
+    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
+        stopwords_low, stopwords_high, max_link_density, no_headings)
+    revise_paragraph_classification(paragraphs, max_heading_distance)
+
+    return paragraphs, title
+
 
 @login_required
 def profile(request):
@@ -13,3 +49,18 @@ def search_results(request):
     query = request.GET["query"]
     results = ranker.search(query)
     return render(request, "results.html", {"results": results})
+
+
+def fetch_url(request):
+    """Fetch the page at ``url`` and render its title and extract as a single result."""
+    url = request.GET["url"]
+    query = request.GET["query"]
+    # NOTE(review): url is user-supplied (SSRF risk); the timeout at least bounds a hung fetch.
+    response = requests.get(url, timeout=10)
+    paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
+    extract = ' '.join(p.text for p in paragraphs if p.class_type == 'good')
+    if len(extract) > NUM_EXTRACT_CHARS:
+        # Reserve the last character for the ellipsis so the result stays within the limit.
+        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
+    result = Document(title=title, url=url, extract=extract, score=0.0)
+    return render(request, "results.html", {"results": [format_result(result, query)]})