Blacklist another domain

Encode URLs properly
Merge pull request #130 from mwmbl/fix-csrf-requirement
2023-11-21 11:24:48 +00:00 · 2023-11-21 10:45:50 +00:00 · 2023-11-19 20:53:55 +00:00 · 2023-11-19 20:48:18 +00:00 · 2023-11-19 10:02:27 +00:00 · 2023-11-19 10:01:48 +00:00
10 changed files with 36 additions and 59 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -3,3 +3,4 @@ Contributions are very welcome!
 Please join the discussion at https://matrix.to/#/#mwmbl:matrix.org and let us know what you're planning to do.

 See https://book.mwmbl.org/page/developers/ for a guide to development.
+
--- a/mwmbl/api.py
+++ b/mwmbl/api.py
@@ -1,27 +0,0 @@
-from ninja import NinjaAPI
-from ninja.security import django_auth
-
-import mwmbl.crawler.app as crawler
-from mwmbl.platform import curate
-from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
-from mwmbl.tinysearchengine import search
-
-
-def create_api(version):
-    # Set csrf to True to all cookie-based authentication
-    api = NinjaAPI(version=version, csrf=True)
-
-    search_router = search.create_router(ranker)
-    api.add_router("/search/", search_router)
-
-    crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
-    api.add_router("/crawler/", crawler_router)
-
-    curation_router = curate.create_router(index_path)
-    api.add_router("/curation/", curation_router, auth=django_auth)
-    return api
-
-
-# Work around because Django-Ninja doesn't allow using multiple URLs for the same thing
-api_original = create_api("0.1")
-api_v1 = create_api("1.0.0")
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@@ -10,7 +10,7 @@ from uuid import uuid4
 import boto3
 import requests
 from fastapi import HTTPException
-from ninja import Router
+from ninja import NinjaAPI
 from redis import Redis

 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -50,12 +50,8 @@ def upload(data: bytes, name: str):
 last_batch = None


-def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
-    router = Router(tags=["crawler"])
-
-    # TODO: # ensure tables are created before crawler code is used:
-    #       #
-    #       #     url_db.create_tables()
+def create_router(batch_cache: BatchCache, queued_batches: Queue, version: str) -> NinjaAPI:
+    router = NinjaAPI(urls_namespace=f"crawler-{version}")

    @router.post('/batches/')
    def post_batch(request, batch: Batch):
--- a/mwmbl/platform/curate.py
+++ b/mwmbl/platform/curate.py
@@ -2,7 +2,7 @@ from logging import getLogger
 from typing import Any
 from urllib.parse import parse_qs

-from ninja import Router
+from ninja import Router, NinjaAPI

 from mwmbl.indexer.update_urls import get_datetime_from_timestamp
 from mwmbl.models import UserCuration
@@ -19,8 +19,8 @@ MAX_CURATED_SCORE = 1_111_111.0
 logger = getLogger(__name__)


-def create_router(index_path: str) -> Router:
-    router = Router(tags=["user"])
+def create_router(index_path: str, version: str) -> NinjaAPI:
+    router = NinjaAPI(urls_namespace=f"curate-{version}", csrf=True)

    @router.post("/begin")
    def user_begin_curate(request, curate_begin: make_curation_type(CurateBegin)):
--- a/mwmbl/settings.py
+++ b/mwmbl/settings.py
@@ -32,7 +32,7 @@ SCORE_FOR_SAME_DOMAIN = 0.01
 EXTRA_LINK_MULTIPLIER = 0.001
 UNKNOWN_DOMAIN_MULTIPLIER = 0.001
 EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
-DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$")
+DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$|omgoat\.org$")
 CORE_DOMAINS = {
    'github.com',
    'en.wikipedia.org',
--- a/mwmbl/settings_bg_prod.py
+++ b/mwmbl/settings_bg_prod.py
@@ -1,8 +1,3 @@
-from mwmbl.settings_common import *
+from mwmbl.settings_prod import *

-DEBUG = False
-ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org"]
-
-DATA_PATH = "/app/storage"
 RUN_BACKGROUND_PROCESSES = True
-NUM_PAGES = 10240000
--- a/mwmbl/settings_common.py
+++ b/mwmbl/settings_common.py
@@ -9,7 +9,6 @@ https://docs.djangoproject.com/en/4.2/topics/settings/
 For the full list of settings and their values, see
 https://docs.djangoproject.com/en/4.2/ref/settings/
 """
-
 from pathlib import Path

 # Build paths inside the project like this: BASE_DIR / 'subdir'.
--- a/mwmbl/tinysearchengine/search.py
+++ b/mwmbl/tinysearchengine/search.py
@@ -1,6 +1,6 @@
 from logging import getLogger

-from ninja import Router
+from ninja import NinjaAPI

 from mwmbl.tinysearchengine.rank import HeuristicRanker

@@ -10,8 +10,8 @@ logger = getLogger(__name__)
 SCORE_THRESHOLD = 0.25


-def create_router(ranker: HeuristicRanker) -> Router:
-    router = Router(tags=["search"])
+def create_router(ranker: HeuristicRanker, version: str) -> NinjaAPI:
+    router = NinjaAPI(urls_namespace=f"search-{version}")

    @router.get("")
    def search(request, s: str):
--- a/mwmbl/urls.py
+++ b/mwmbl/urls.py
@@ -17,15 +17,27 @@ Including another URLconf
 from django.contrib import admin
 from django.urls import path, include

-from mwmbl.api import api_v1
+import mwmbl.crawler.app as crawler
+from mwmbl.platform import curate
+from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
+from mwmbl.tinysearchengine import search
 from mwmbl.views import home_fragment, fetch_url, index

 urlpatterns = [
    path('admin/', admin.site.urls),
-    path('api/v1/', api_v1.urls),
    path('accounts/', include('allauth.urls')),

-    path('', index, name="home"),
+    path('', index, name="index"),
    path('app/home/', home_fragment, name="home"),
-    path('app/fetch/', fetch_url, name="fetch_url")
+    path('app/fetch/', fetch_url, name="fetch_url"),
+
+    # TODO: this is the old API, deprecated and to be removed once all clients have moved over
+    path("search/", search.create_router(ranker, "0.1").urls),
+    path("crawler/", crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches, version="0.1").urls),
+    path("curation/", curate.create_router(index_path, version="0.1").urls),
+
+    # New API
+    path("api/v1/search/", search.create_router(ranker, "1.0.0").urls),
+    path("api/v1/crawler/", crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches, version="1.0.0").urls),
+    path("api/v1/curation/", curate.create_router(index_path, version="1.0.0").urls),
 ]
--- a/mwmbl/views.py
+++ b/mwmbl/views.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from datetime import datetime
 from itertools import groupby
-from urllib.parse import urlparse, parse_qs
+from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, ParseResult

 import justext
 import requests
@@ -66,12 +66,13 @@ def home_fragment(request):
        "query": query,
        "activity": activity,
    })
-    current_url = request.htmx.current_url
-    # Replace query string with new query
-    stripped_url = current_url[:current_url.index("?")] if "?" in current_url else current_url
-    query_string = "?q=" + query if len(query) > 0 else ""
-    new_url = stripped_url + query_string
-    # Set the htmx replace header
+
+    # Encode the new query string
+    if query:
+        new_query_string = urlencode({"q": query}, doseq=True)
+        new_url = "/?" + new_query_string
+    else:
+        new_url = "/"
    response["HX-Replace-Url"] = new_url
    return response
Author	SHA1	Message	Date
Daoud Clarke	cfe18162f1	Blacklist another domain	2023-11-21 11:24:48 +00:00
Daoud Clarke	b868b6284b	Encode URLs properly	2023-11-21 10:45:50 +00:00
Daoud Clarke	c1489a27cf	Merge pull request #130 from mwmbl/fix-csrf-requirement Use CSRF only for curation requests	2023-11-19 20:53:55 +00:00
Daoud Clarke	a2fd3d95d8	Use CSRF only for curation requests	2023-11-19 20:48:18 +00:00
Daoud Clarke	5874720801	Merge pull request #129 from mwmbl/allow-running-old-api Allow running old API	2023-11-19 10:02:27 +00:00
Daoud Clarke	da787a67db	Unused setting	2023-11-19 10:01:48 +00:00
Daoud Clarke	56ee43e730	Remove unused settings	2023-11-19 10:01:04 +00:00
Daoud Clarke	69f6a16cce	Reinstate old API	2023-11-19 10:00:31 +00:00
Daoud Clarke	8c45b94aa6	Outdated settings file	2023-11-18 20:21:57 +00:00
Daoud Clarke	3c61f5818d	Whitespace to allow git push	2023-11-18 20:15:39 +00:00
Daoud Clarke	a3cc316d15	Merge pull request #128 from mwmbl/beta Allow users to curate search results	2023-11-18 20:14:50 +00:00
Daoud Clarke	36df016445	Merge pull request #127 from mwmbl/add-term-info-to-index Add term info to index	2023-11-18 18:56:53 +00:00