diff --git a/front-end/src/components/app.js b/front-end/src/components/app.js deleted file mode 100644 index 91ef881..0000000 --- a/front-end/src/components/app.js +++ /dev/null @@ -1,33 +0,0 @@ -import define from '../utils/define.js'; -import addResult from "./molecules/add-result.js"; -import save from "./organisms/save.js"; - -const template = () => /*html*/` -
- -
Login Sign up
-
- mwmbl logo - MWMBL -
- -
-
- -
-
- -`; - -export default define('app', class extends HTMLElement { - constructor() { - super(); - this.__setup(); - } - - __setup() { - this.innerHTML = template(); - } -}); \ No newline at end of file diff --git a/front-end/src/components/molecules/add-result.js b/front-end/src/components/molecules/add-result.js index 6e9ce17..b6a790a 100644 --- a/front-end/src/components/molecules/add-result.js +++ b/front-end/src/components/molecules/add-result.js @@ -3,7 +3,7 @@ import config from "../../../config.js"; import {globalBus} from "../../utils/events.js"; -const FETCH_URL = `${config['publicApiURL']}crawler/fetch?` +const FETCH_URL = '/app/fetch?' const template = () => /*html*/` @@ -56,7 +56,7 @@ export default define('add-result', class extends HTMLDivElement { const url = `${FETCH_URL}url=${encodeURIComponent(value)}&query=${encodeURIComponent(query)}`; const response = await fetch(url); if (response.status === 200) { - const data = await response.json(); + const data = await response.text(); console.log("Data", data); const addResultEvent = new CustomEvent('curate-add-result', {detail: data}); diff --git a/front-end/src/components/molecules/result.js b/front-end/src/components/molecules/result.js index 2a2859e..9ab38ee 100644 --- a/front-end/src/components/molecules/result.js +++ b/front-end/src/components/molecules/result.js @@ -1,26 +1,7 @@ import define from '../../utils/define.js'; import escapeString from '../../utils/escapeString.js'; import { globalBus } from '../../utils/events.js'; -import deleteButton from "./delete-button.js"; -import validateButton from "./validate-button.js"; -import addButton from "./add-button.js"; -const template = ({ data }) => /*html*/` -
-
- - - -
- -
-`; export default define('result', class extends HTMLLIElement { constructor() { @@ -30,11 +11,6 @@ export default define('result', class extends HTMLLIElement { } __setup() { - this.innerHTML = template({ data: { - url: this.dataset.url, - title: this.__handleBold(JSON.parse(this.dataset.title)), - extract: this.__handleBold(JSON.parse(this.dataset.extract)) - }}); this.__events(); } diff --git a/front-end/src/components/organisms/results.js b/front-end/src/components/organisms/results.js index 9cd9993..7a0e6b4 100644 --- a/front-end/src/components/organisms/results.js +++ b/front-end/src/components/organisms/results.js @@ -1,21 +1,13 @@ -import define from '../../utils/define.js'; import {globalBus} from '../../utils/events.js'; -// Components -import result from '../molecules/result.js'; -import emptyResult from '../molecules/empty-result.js'; -import home from './home.js'; -import escapeString from '../../utils/escapeString.js'; -const template = () => /*html*/` - -`; +document.body.addEventListener('htmx:load', function(evt) { -export default define('results', class extends HTMLElement { +}); + + +class ResultsHandler { constructor() { - super(); this.results = null; this.oldIndex = null; this.curating = false; @@ -23,50 +15,12 @@ export default define('results', class extends HTMLElement { } __setup() { - this.innerHTML = template(); - this.results = this.querySelector('.results'); this.__events(); } __events() { - globalBus.on('search', (e) => { - this.results.innerHTML = ''; - let resultsHTML = ''; - if (!e.detail.error) { - // If there is no details the input is empty - if (!e.detail.results) { - resultsHTML = /*html*/` -
  • - `; - } - // If the details array has results display them - else if (e.detail.results.length > 0) { - for(const resultData of e.detail.results) { - resultsHTML += /*html*/` -
  • - `; - } - } - // If the details array is empty there is no result - else { - resultsHTML = /*html*/` -
  • - `; - } - } - else { - // If there is an error display an empty result - resultsHTML = /*html*/` -
  • - `; - } - // Bind HTML to the DOM - this.results.innerHTML = resultsHTML; + document.body.addEventListener('htmx:load', e => { + this.results = document.querySelector('.results'); // Allow the user to re-order search results $(".results").sortable({ @@ -142,15 +96,7 @@ export default define('results', class extends HTMLElement { console.log("Add result", e); this.__beginCurating(); const resultData = e.detail; - const resultHTML = /*html*/` -
  • - `; - this.results.insertAdjacentHTML('afterbegin', resultHTML); + this.results.insertAdjacentHTML('afterbegin', resultData); const newResults = this.__getResults(); @@ -236,4 +182,6 @@ export default define('results', class extends HTMLElement { }); globalBus.dispatch(curationMoveEvent); } -}); \ No newline at end of file +} + +const resultsHandler = new ResultsHandler(); diff --git a/front-end/src/index.html b/front-end/src/index.html index 65bacda..a08e557 100644 --- a/front-end/src/index.html +++ b/front-end/src/index.html @@ -48,6 +48,8 @@ + + + +
    +
    + +
    Login Sign up
    +
    + mwmbl logo + MWMBL +
    + +
    +
    + +
      +
    • +
    +
    +
    +
    + +
    \ No newline at end of file diff --git a/front-end/src/index.js b/front-end/src/index.js index f8f51b3..34cd8cb 100644 --- a/front-end/src/index.js +++ b/front-end/src/index.js @@ -14,7 +14,6 @@ if (!redirected) { // Load components only after redirects are checked. - import('./components/app.js'); import('./components/login.js'); import('./components/register.js'); import("./components/organisms/search-bar.js"); diff --git a/mwmbl/api.py b/mwmbl/api.py index 5fe1129..3a99609 100644 --- a/mwmbl/api.py +++ b/mwmbl/api.py @@ -1,28 +1,10 @@ -from multiprocessing import Queue -from pathlib import Path - -from django.conf import settings from ninja import NinjaAPI from ninja.security import django_auth import mwmbl.crawler.app as crawler -from mwmbl.indexer.batch_cache import BatchCache -from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME from mwmbl.platform import curate +from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache from mwmbl.tinysearchengine import search -from mwmbl.tinysearchengine.completer import Completer -from mwmbl.tinysearchengine.indexer import TinyIndex, Document -from mwmbl.tinysearchengine.rank import HeuristicRanker - - -queued_batches = Queue() -completer = Completer() - -index_path = Path(settings.DATA_PATH) / INDEX_NAME -tiny_index = TinyIndex(item_factory=Document, index_path=index_path) -tiny_index.__enter__() -ranker = HeuristicRanker(tiny_index, completer) -batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME) def create_api(version): diff --git a/mwmbl/apps.py b/mwmbl/apps.py index dff27b6..30a95a7 100644 --- a/mwmbl/apps.py +++ b/mwmbl/apps.py @@ -13,7 +13,7 @@ class MwmblConfig(AppConfig): def ready(self): # Imports here to avoid AppRegistryNotReady exception - from mwmbl.api import queued_batches + from mwmbl.search_setup import queued_batches from mwmbl import background from mwmbl.indexer.paths import INDEX_NAME from mwmbl.indexer.update_urls import update_urls_continuously diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py index a4f0524..932081c 100644 --- a/mwmbl/crawler/app.py +++ b/mwmbl/crawler/app.py @@ -8,12 +8,8 @@ from typing import Union from uuid import uuid4 import boto3 -import justext import requests from fastapi import HTTPException -from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \ - LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \ - STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor from ninja import Router from redis import Redis @@ -21,7 +17,6 @@ from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch from mwmbl.crawler.stats import MwmblStats, StatsManager from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus from mwmbl.database import Database -from mwmbl.format import format_result from mwmbl.indexer.batch_cache import BatchCache from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus from mwmbl.settings import ( @@ -35,9 +30,7 @@ from mwmbl.settings import ( PUBLIC_URL_PREFIX, PUBLIC_USER_ID_LENGTH, FILE_NAME_SUFFIX, - DATE_REGEX, NUM_EXTRACT_CHARS) -from mwmbl.tinysearchengine.indexer import Document - + DATE_REGEX) stats_manager = StatsManager(Redis.from_url(os.environ.get("REDIS_URL"))) @@ -57,32 +50,6 @@ def upload(data: bytes, name: str): last_batch = None -def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT, - length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT, - stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT, - max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT, - encoding=None, default_encoding=DEFAULT_ENCODING, - enc_errors=DEFAULT_ENC_ERRORS): - """ - Converts an HTML page into a list of classified paragraphs. Each paragraph - is represented as instance of class ˙˙justext.paragraph.Paragraph˙˙. - """ - dom = html_to_dom(html_text, default_encoding, encoding, enc_errors) - - titles = dom.xpath("//title") - title = titles[0].text if len(titles) > 0 else None - - dom = preprocessor(dom) - - paragraphs = ParagraphMaker.make_paragraphs(dom) - - classify_paragraphs(paragraphs, stoplist, length_low, length_high, - stopwords_low, stopwords_high, max_link_density, no_headings) - revise_paragraph_classification(paragraphs, max_heading_distance) - - return paragraphs, title - - def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router: router = Router(tags=["crawler"]) @@ -90,19 +57,6 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router: # # # # url_db.create_tables() - @router.get('/fetch') - def fetch_url(request, url: str, query: str): - response = requests.get(url) - paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English")) - good_paragraphs = [p for p in paragraphs if p.class_type == 'good'] - - extract = ' '.join([p.text for p in good_paragraphs]) - if len(extract) > NUM_EXTRACT_CHARS: - extract = extract[:NUM_EXTRACT_CHARS - 1] + '…' - - result = Document(title=title, url=url, extract=extract, score=0.0) - return format_result(result, query) - @router.post('/batches/') def post_batch(request, batch: Batch): if len(batch.items) > MAX_BATCH_SIZE: diff --git a/mwmbl/search_setup.py b/mwmbl/search_setup.py new file mode 100644 index 0000000..f44867a --- /dev/null +++ b/mwmbl/search_setup.py @@ -0,0 +1,19 @@ +from multiprocessing import Queue +from pathlib import Path + +from django.conf import settings + +from mwmbl.indexer.batch_cache import BatchCache +from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME +from mwmbl.tinysearchengine.completer import Completer +from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.tinysearchengine.rank import HeuristicRanker + +queued_batches = Queue() +completer = Completer() +index_path = Path(settings.DATA_PATH) / INDEX_NAME +tiny_index = TinyIndex(item_factory=Document, index_path=index_path) +tiny_index.__enter__() + +ranker = HeuristicRanker(tiny_index, completer) +batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME) diff --git a/mwmbl/templates/results.html b/mwmbl/templates/results.html new file mode 100644 index 0000000..11fa224 --- /dev/null +++ b/mwmbl/templates/results.html @@ -0,0 +1,19 @@ +{% load result_filters %} +{% for result in results %} +
  • +
    +
    + + + +
    + +
    +
  • +{% endfor %} diff --git a/mwmbl/templatetags/__init__.py b/mwmbl/templatetags/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwmbl/templatetags/result_filters.py b/mwmbl/templatetags/result_filters.py new file mode 100644 index 0000000..8acf02d --- /dev/null +++ b/mwmbl/templatetags/result_filters.py @@ -0,0 +1,18 @@ +from django.template import Library +from django.utils.html import conditional_escape +from django.utils.safestring import mark_safe + +register = Library() + + +@register.filter(needs_autoescape=True) +def strengthen(spans, autoescape=True): + escape = conditional_escape if autoescape else lambda x: x + strengthened = [] + for span in spans: + escaped_value = escape(span["value"]) + if span["is_bold"]: + strengthened.append(f"{escaped_value}") + else: + strengthened.append(escaped_value) + return mark_safe("".join(strengthened)) diff --git a/mwmbl/urls.py b/mwmbl/urls.py index b416085..8ce524a 100644 --- a/mwmbl/urls.py +++ b/mwmbl/urls.py @@ -15,12 +15,10 @@ Including another URLconf 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) """ from django.contrib import admin -from django.contrib.auth import login, logout -from django.template.defaulttags import url from django.urls import path, include from mwmbl.api import api_original as api, api_v1 -from mwmbl.views import signup, profile +from mwmbl.views import profile, search_results, fetch_url urlpatterns = [ path('admin/', admin.site.urls), @@ -28,9 +26,7 @@ urlpatterns = [ path('api/v1/', api_v1.urls), path('accounts/', include('allauth.urls')), - # path("accounts/", include("django.contrib.auth.urls")), - # path('accounts/new/', signup, name='signup'), path('accounts/profile/', profile, name='profile'), - # path('login/', login, {'template_name': 'login.html'}, name='login'), - # path('logout/', logout, {'next_page': 'login'}, name='logout'), + path('app/search/', search_results, name="search_results"), + path('app/fetch/', fetch_url, name="fetch_url") ] diff --git a/mwmbl/views.py b/mwmbl/views.py index 0ae3ba8..a3fb215 100644 --- a/mwmbl/views.py +++ b/mwmbl/views.py @@ -1,24 +1,66 @@ -from django.contrib.auth import authenticate, login +import justext +import requests from django.contrib.auth.decorators import login_required -from django.contrib.auth.forms import UserCreationForm -from django.shortcuts import redirect, render +from django.shortcuts import render + +from mwmbl.format import format_result +from mwmbl.search_setup import ranker + +from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \ + LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \ + STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor + +from mwmbl.settings import NUM_EXTRACT_CHARS +from mwmbl.tinysearchengine.indexer import Document -def signup(request): - if request.method == 'POST': - form = UserCreationForm(request.POST) - if form.is_valid(): - form.save() - username = form.cleaned_data.get('username') - raw_password = form.cleaned_data.get('password1') - user = authenticate(username=username, password=raw_password) - login(request, user) - return redirect('/') - else: - form = UserCreationForm() - return render(request, 'signup.html', {'form': form}) +def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT, + length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT, + stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT, + max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT, + encoding=None, default_encoding=DEFAULT_ENCODING, + enc_errors=DEFAULT_ENC_ERRORS): + """ + Converts an HTML page into a list of classified paragraphs. Each paragraph + is represented as instance of class ˙˙justext.paragraph.Paragraph˙˙. + """ + dom = html_to_dom(html_text, default_encoding, encoding, enc_errors) + + titles = dom.xpath("//title") + title = titles[0].text if len(titles) > 0 else None + + dom = preprocessor(dom) + + paragraphs = ParagraphMaker.make_paragraphs(dom) + + classify_paragraphs(paragraphs, stoplist, length_low, length_high, + stopwords_low, stopwords_high, max_link_density, no_headings) + revise_paragraph_classification(paragraphs, max_heading_distance) + + return paragraphs, title @login_required def profile(request): return render(request, 'profile.html') + + +def search_results(request): + query = request.GET["query"] + results = ranker.search(query) + return render(request, "results.html", {"results": results}) + + +def fetch_url(request): + url = request.GET["url"] + query = request.GET["query"] + response = requests.get(url) + paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English")) + good_paragraphs = [p for p in paragraphs if p.class_type == 'good'] + + extract = ' '.join([p.text for p in good_paragraphs]) + if len(extract) > NUM_EXTRACT_CHARS: + extract = extract[:NUM_EXTRACT_CHARS - 1] + '…' + + result = Document(title=title, url=url, extract=extract, score=0.0) + return render(request, "results.html", {"results": [format_result(result, query)]})