Compare commits


14 commits

Author | SHA1 | Message | Date
Daoud Clarke | cfe18162f1 | Blacklist another domain | 2023-11-21 11:24:48 +00:00
Daoud Clarke | b868b6284b | Encode URLs properly | 2023-11-21 10:45:50 +00:00
Daoud Clarke | c1489a27cf | Merge pull request #130 from mwmbl/fix-csrf-requirement: Use CSRF only for curation requests | 2023-11-19 20:53:55 +00:00
Daoud Clarke | a2fd3d95d8 | Use CSRF only for curation requests | 2023-11-19 20:48:18 +00:00
Daoud Clarke | 5874720801 | Merge pull request #129 from mwmbl/allow-running-old-api: Allow running old API | 2023-11-19 10:02:27 +00:00
Daoud Clarke | da787a67db | Unused setting | 2023-11-19 10:01:48 +00:00
Daoud Clarke | 56ee43e730 | Remove unused settings | 2023-11-19 10:01:04 +00:00
Daoud Clarke | 69f6a16cce | Reinstate old API | 2023-11-19 10:00:31 +00:00
Daoud Clarke | 8c45b94aa6 | Outdated settings file | 2023-11-18 20:21:57 +00:00
Daoud Clarke | 3c61f5818d | Whitespace to allow git push | 2023-11-18 20:15:39 +00:00
Daoud Clarke | a3cc316d15 | Merge pull request #128 from mwmbl/beta: Allow users to curate search results | 2023-11-18 20:14:50 +00:00
Daoud Clarke | 36df016445 | Merge pull request #127 from mwmbl/add-term-info-to-index: Add term info to index | 2023-11-18 18:56:53 +00:00
Daoud Clarke | 204304e18e | Add term info to index | 2023-11-18 18:49:41 +00:00
Daoud Clarke | a2b872008f | Add a script to evaluate how much it costs to add the term to the index (old sizes mean 33.3673, SEM 0.08148019988498635; new sizes mean 32.1322, SEM 0.07700185221489449; these are the mean number of documents per page and its standard error, before and after adding term info) | 2023-11-16 17:42:18 +00:00
25 changed files with 273 additions and 319 deletions

View file

@@ -3,3 +3,4 @@ Contributions are very welcome!
Please join the discussion at https://matrix.to/#/#mwmbl:matrix.org and let us know what you're planning to do.
See https://book.mwmbl.org/page/developers/ for a guide to development.

analyse/add_term_info.py (new file, 51 additions)
View file

@@ -0,0 +1,51 @@
"""
Investigate adding term information to the database.
How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random

import numpy as np
from scipy.stats import sem

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
from zstandard import ZstdCompressor

from mwmbl.utils import add_term_info

random = Random(1)

INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"


def run():
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # Get some random integers between 0 and index.num_pages:
        pages = random.sample(range(index.num_pages), 10000)

        old_sizes = []
        new_sizes = []

        for i in pages:
            page = index.get_page(i)
            term_documents = []
            for document in page:
                term_document = add_term_info(document, index, i)
                term_documents.append(term_document)

            value_tuples = [astuple(value) for value in term_documents]
            num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)
            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

    print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
    print("New sizes mean", np.mean(new_sizes), sem(new_sizes))


if __name__ == '__main__':
    run()

View file

@@ -1,57 +0,0 @@
"""
Index batches stored locally on the filesystem for the purpose of evaluation.
"""
import glob
import gzip
import json
import logging
import os
import sys
from datetime import datetime

import spacy

from mwmbl.crawler import HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer import index_batches
from mwmbl.tinysearchengine import TinyIndex, Document

LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
NUM_BATCHES = 10000
EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
NUM_PAGES = 1_024_000
PAGE_SIZE = 4096

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


def get_batches():
    for path in sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]:
        data = json.load(gzip.open(path))
        yield HashedBatch.parse_obj(data)


def run():
    try:
        os.remove(EVALUATE_INDEX_PATH)
    except FileNotFoundError:
        pass

    TinyIndex.create(item_factory=Document, index_path=EVALUATE_INDEX_PATH, num_pages=NUM_PAGES, page_size=PAGE_SIZE)

    batches = get_batches()

    start = datetime.now()
    with Database() as db:
        nlp = spacy.load("en_core_web_sm")
        url_db = URLDatabase(db.connection)
        index_batches(batches, EVALUATE_INDEX_PATH, nlp, url_db)
    end = datetime.now()

    total_time = (end - start).total_seconds()
    print("total_seconds:", total_time)


if __name__ == '__main__':
    run()

View file

@@ -1,60 +0,0 @@
import logging
import sys

import numpy as np
import spacy

from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.indexer import tokenize_document
from mwmbl.indexer import INDEX_PATH
from mwmbl.tinysearchengine import TinyIndex, Document

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

nlp = spacy.load("en_core_web_sm")


def store():
    document = Document(
        title='A nation in search of the new black | Theatre | The Guardian',
        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
        score=1.0
    )
    with TinyIndex(Document, INDEX_PATH, 'w') as tiny_index:
        tokenized = tokenize_document(document.url, document.title, document.extract, 1, nlp)
        print("Tokenized", tokenized)
        # for token in tokenized.tokens:
        #
        #     tiny_index.index(token, document)


def get_items():
    with TinyIndex(Document, INDEX_PATH) as tiny_index:
        items = tiny_index.retrieve('wikipedia')
        if items:
            for item in items:
                print("Items", item)


def run(index_path):
    with TinyIndex(Document, index_path) as tiny_index:
        sizes = {}
        for i in range(tiny_index.num_pages):
            page = tiny_index.get_page(i)
            if page:
                sizes[i] = len(page)
            if len(page) > 50:
                print("Page", len(page), page)
                # for item in page:
                #     if ' search' in item.title:
                #         print("Page", i, item)

        print("Max", max(sizes.values()))
        print("Top", sorted(sizes.values())[-100:])
        print("Mean", np.mean(list(sizes.values())))


if __name__ == '__main__':
    # store()
    run(EVALUATE_INDEX_PATH)
    # get_items()

Binary file not shown.

View file

@@ -1,27 +0,0 @@
from ninja import NinjaAPI
from ninja.security import django_auth

import mwmbl.crawler.app as crawler
from mwmbl.platform import curate
from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
from mwmbl.tinysearchengine import search


def create_api(version):
    # Set csrf to True to all cookie-based authentication
    api = NinjaAPI(version=version, csrf=True)

    search_router = search.create_router(ranker)
    api.add_router("/search/", search_router)

    crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
    api.add_router("/crawler/", crawler_router)

    curation_router = curate.create_router(index_path)
    api.add_router("/curation/", curation_router, auth=django_auth)

    return api


# Work around because Django-Ninja doesn't allow using multiple URLs for the same thing
api_original = create_api("0.1")
api_v1 = create_api("1.0.0")

View file

@@ -6,6 +6,10 @@ from pathlib import Path
from django.apps import AppConfig
from django.conf import settings
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.indexdb import IndexDatabase
class MwmblConfig(AppConfig):
name = "mwmbl"
@@ -31,6 +35,12 @@ class MwmblConfig(AppConfig):
TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
page_size=PAGE_SIZE)
with Database() as db:
url_db = URLDatabase(db.connection)
url_db.create_tables()
index_db = IndexDatabase(db.connection)
index_db.create_tables()
if settings.RUN_BACKGROUND_PROCESSES:
new_item_queue = Queue()
Process(target=background.run, args=(settings.DATA_PATH,)).start()

View file

@@ -1,7 +1,9 @@
"""
Script that updates data in a background process.
"""
from logging import getLogger
import logging
import sys
from logging import getLogger, basicConfig
from pathlib import Path
from time import sleep
@@ -11,6 +13,8 @@ from mwmbl.indexer import index_batches, historical
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME
basicConfig(stream=sys.stdout, level=logging.INFO)
logger = getLogger(__name__)

View file

@@ -10,7 +10,7 @@ from uuid import uuid4
import boto3
import requests
from fastapi import HTTPException
from ninja import Router
from ninja import NinjaAPI
from redis import Redis
from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -50,12 +50,8 @@ def upload(data: bytes, name: str):
last_batch = None
def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
router = Router(tags=["crawler"])
# TODO: # ensure tables are created before crawler code is used:
# #
# # url_db.create_tables()
def create_router(batch_cache: BatchCache, queued_batches: Queue, version: str) -> NinjaAPI:
router = NinjaAPI(urls_namespace=f"crawler-{version}")
@router.post('/batches/')
def post_batch(request, batch: Batch):

View file

@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
score = link_counts.get(url, DEFAULT_SCORE)
yield tokenize_document(url, title_cleaned, extract, score, nlp)
yield tokenize_document(url, title_cleaned, extract, score)
if i % 1000 == 0:
print("Processed", i)
@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
return set(first_tokens + bigrams)
def tokenize_document(url, title_cleaned, extract, score, nlp):
def tokenize_document(url, title_cleaned, extract, score):
title_tokens = tokenize(title_cleaned)
prepared_url = prepare_url_for_tokenizing(unquote(url))
url_tokens = tokenize(prepared_url)

View file

@@ -16,6 +16,7 @@ from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
from mwmbl.utils import add_term_info, add_term_infos
logger = getLogger(__name__)
@@ -31,22 +32,20 @@ def run(batch_cache: BatchCache, index_path: str):
def process(batches: Collection[HashedBatch]):
with Database() as db:
nlp = spacy.load("en_core_web_sm")
url_db = URLDatabase(db.connection)
index_batches(batches, index_path, nlp, url_db)
index_batches(batches, index_path, url_db)
logger.info("Indexed pages")
process_batch.run(batch_cache, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED, process)
def index_batches(batch_data: Collection[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
def index_batches(batch_data: Collection[HashedBatch], index_path: str, url_db: URLDatabase):
document_tuples = list(get_documents_from_batches(batch_data))
urls = [url for title, url, extract in document_tuples]
logger.info(f"Got {len(urls)} document tuples")
url_scores = url_db.get_url_scores(urls)
logger.info(f"Got {len(url_scores)} scores")
logger.info(f"Indexing {len(urls)} document tuples and {len(url_scores)} URL scores")
documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
page_documents = preprocess_documents(documents, index_path, nlp)
page_documents = preprocess_documents(documents, index_path)
index_pages(index_path, page_documents)
@@ -58,24 +57,27 @@ def index_pages(index_path, page_documents):
seen_urls = set()
seen_titles = set()
sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
for document in sorted_documents:
# TODO: for now we add the term here, until all the documents in the index have terms
sorted_documents_with_terms = add_term_infos(sorted_documents, indexer, page)
for document in sorted_documents_with_terms:
if document.title in seen_titles or document.url in seen_urls:
continue
new_documents.append(document)
seen_urls.add(document.url)
seen_titles.add(document.title)
logger.info(f"Storing {len(new_documents)} documents for page {page}, originally {len(existing_documents)}")
indexer.store_in_page(page, new_documents)
def preprocess_documents(documents, index_path, nlp):
def preprocess_documents(documents, index_path):
page_documents = defaultdict(list)
with TinyIndex(Document, index_path, 'w') as indexer:
for document in documents:
tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
# logger.debug(f"Tokenized: {tokenized}")
page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
for page in page_indexes:
page_documents[page].append(document)
tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
for token in tokenized.tokens:
page = indexer.get_key_page_index(token)
term_document = Document(document.title, document.url, document.extract, document.score, token)
page_documents[page].append(term_document)
print(f"Preprocessed for {len(page_documents)} pages")
return page_documents
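
Note (not part of the diff): the routing in preprocess_documents above relies on indexer.get_key_page_index(token), which maps every token to a fixed page of the index, so a document is now stored once per page that one of its tokens hashes to, tagged with that token. The real function lives in mwmbl/tinysearchengine/indexer.py and is not shown here; the following is only an illustrative sketch of the idea, and the actual hash may differ:

import hashlib

def get_key_page_index(key: str, num_pages: int) -> int:
    # Illustrative only: a stable hash of the token reduced modulo the page count,
    # so the same token always maps to the same page.
    digest = hashlib.md5(key.encode('utf8')).digest()
    return int.from_bytes(digest, 'big') % num_pages

print(get_key_page_index("wikipedia", 1_024_000))  # deterministic page number for the term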

View file

@@ -86,7 +86,7 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
parsed_link = urlparse(link)
if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
logger.debug(f"Excluding link for blacklisted domain: {parsed_link}")
return
extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0

View file

@@ -1,7 +1,8 @@
from logging import getLogger
from typing import Any
from urllib.parse import parse_qs
from ninja import Router
from ninja import Router, NinjaAPI
from mwmbl.indexer.update_urls import get_datetime_from_timestamp
from mwmbl.models import UserCuration
@@ -9,13 +10,17 @@ from mwmbl.platform.data import CurateBegin, CurateMove, CurateDelete, CurateAdd
make_curation_type
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize
from mwmbl.utils import add_term_info, add_term_infos
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0
def create_router(index_path: str) -> Router:
router = Router(tags=["user"])
logger = getLogger(__name__)
def create_router(index_path: str, version: str) -> NinjaAPI:
router = NinjaAPI(urls_namespace=f"curate-{version}", csrf=True)
@router.post("/begin")
def user_begin_curate(request, curate_begin: make_curation_type(CurateBegin)):
@@ -58,20 +63,24 @@ def create_router(index_path: str) -> Router:
raise ValueError(f"Should be one query value in the URL: {curation.url}")
query = queries[0]
print("Query", query)
tokens = tokenize(query)
print("Tokens", tokens)
term = " ".join(tokens)
print("Key", term)
documents = [
Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
for i, result in enumerate(curation.results)
]
page_index = indexer.get_key_page_index(term)
print("Page index", page_index)
print("Storing documents", documents)
indexer.store_in_page(page_index, documents)
existing_documents_no_terms = indexer.get_page(page_index)
existing_documents = add_term_infos(existing_documents_no_terms, indexer, page_index)
other_documents = [doc for doc in existing_documents if doc.term != term]
logger.info(f"Found {len(other_documents)} other documents for term {term} at page {page_index} "
f"with terms { {doc.term for doc in other_documents} }")
all_documents = documents + other_documents
logger.info(f"Storing {len(all_documents)} documents at page {page_index}")
indexer.store_in_page(page_index, all_documents)
return {"curation": "ok"}

View file

@@ -32,7 +32,7 @@ SCORE_FOR_SAME_DOMAIN = 0.01
EXTRA_LINK_MULTIPLIER = 0.001
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$")
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$|omgoat\.org$")
CORE_DOMAINS = {
'github.com',
'en.wikipedia.org',

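A quick check of the newly blacklisted domain from the 'Blacklist another domain' commit (not part of the diff); this assumes is_domain_blacklisted applies the pattern to the link's host with a regex search, which the mix of unanchored and $-anchored alternatives suggests:

import re

DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$|omgoat\.org$")

print(bool(DOMAIN_BLACKLIST_REGEX.search("www.omgoat.org")))    # True: now excluded
print(bool(DOMAIN_BLACKLIST_REGEX.search("en.wikipedia.org")))  # False: unaffected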
View file

@@ -1,8 +0,0 @@
from mwmbl.settings_common import *
from mwmbl.settings_prod import *
DEBUG = False
ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org"]
DATA_PATH = "/app/storage"
RUN_BACKGROUND_PROCESSES = True
NUM_PAGES = 10240000

View file

@@ -9,7 +9,6 @@ https://docs.djangoproject.com/en/4.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.

View file

@@ -1,95 +1,22 @@
{% load django_vite %}
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Metas -->
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
{% include "title.html" %}
<meta name="description" content="The free, open-source and non-profit search engine.">
<!-- Favicons -->
<link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">
<!-- Fonts import -->
<link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/inter/inter.css">
</noscript>
<!-- CSS Stylesheets (this is critical CSS) -->
<link rel="stylesheet" type="text/css" href="/static/css/reset.css">
<link rel="stylesheet" type="text/css" href="/static/css/theme.css">
<link rel="stylesheet" type="text/css" href="/static/css/global.css">
<!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
<link rel="preload" href="/static/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/phosphor/icons.css">
</noscript>
<!-- Custom Element Polyfill for Safari -->
<script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>
<!-- OpenSearch -->
<link rel="search" type="application/opensearchdescription+xml" href="/static/assets/opensearch.xml" title="Mwmbl Search">
<script src="https://unpkg.com/htmx.org@1.9.6"></script>
{% vite_hmr_client %}
</head>
<body>
<mwmbl-app></mwmbl-app>
<header class="search-menu compact">
<a href="/" class="branding">
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
<span class="brand-title">Mwmbl</span>
</a>
<form class="search-bar">
<i class="ph-magnifying-glass-bold"></i>
<input
type='search'
name='q'
class='search-bar-input'
placeholder='Search on Mwmbl...'
title='Use "CTRL+K" or "/" to focus.'
autocomplete='off'
value='{{ query|default_if_none:"" }}'
hx-get="/app/home/"
hx-trigger="keyup changed delay:100ms"
hx-target=".main"
>
</form>
<div is="mwmbl-save"></div>
{% if user.is_authenticated %}
<p class="login-info">Logged in as {{ user.username }}</p>
<a class="button" href="/accounts/logout/">Log out</a>
<head>
<meta charset="utf-8">
<title>{% block title %}Simple is Better Than Complex{% endblock %}</title>
</head>
<body>
<header>
<h1>My Site</h1>
{% if user.is_authenticated %}
<a href="{% url 'account_logout' %}">logout</a>
{% else %}
<a class="button" href="/accounts/login/">Login</a>
<a class="button" href="/accounts/signup/">Sign up</a>
<a href="{% url 'account_login' %}">login</a> / <a href="{% url 'signup' %}">signup</a>
{% endif %}
<hr>
</header>
{% block content %}
{% endblock %}
<div class="footer">
<ul class="footer-list">
{% for link in footer_links %}
<li class="footer-item">
<a href="{{ link.href }}" class="footer-link" target="__blank">
<i class="{{ link.icon }}"></i>
<span>{{ link.name }}</span>
</a>
</li>
{% endfor %}
</ul>
</div>
{% vite_asset 'index.js' %}
{% vite_legacy_polyfills %}
{% vite_legacy_asset 'index-legacy.js' %}
</body>
<main>
{% block content %}
{% endblock %}
</main>
</body>
</html>

View file

@@ -1,11 +0,0 @@
{% extends "base.html" %}
{% load humanize %}
{% block content %}
<div class="main">
<h1>Curations for <a href="{{ url }}">{{ query }}</a></h1>
{% for user_curation in curations %}
<p>{{ user_curation.user.username }} {{ user_curation.curation_type }} {{ user_curation.timestamp | naturaltime}}</p>
{% endfor %}
</div>
{% endblock %}

View file

@@ -3,7 +3,6 @@
<div class="main">
{% if query %}
<button class="button curate-add" is="mwmbl-add-button"> Add new</button>
{# <a href="{% url "history:url" %}" class="button curate-add">View history</a>#}
{% if results %}
<ul class='results'>
{% for result in results %}

View file

@@ -1,6 +1,97 @@
{% extends "base.html" %}
{% load django_vite %}
<!DOCTYPE html>
<html lang="en">
{% block content %}
{% include "home.html" %}
<div is="mwmbl-add-result"></div>
{% endblock %}
<head>
<!-- Metas -->
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
{% include "title.html" %}
<meta name="description" content="The free, open-source and non-profit search engine.">
<!-- Favicons -->
<link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">
<!-- Fonts import -->
<link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/inter/inter.css">
</noscript>
<!-- CSS Stylesheets (this is critical CSS) -->
<link rel="stylesheet" type="text/css" href="/static/css/reset.css">
<link rel="stylesheet" type="text/css" href="/static/css/theme.css">
<link rel="stylesheet" type="text/css" href="/static/css/global.css">
<!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
<link rel="preload" href="/static/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/phosphor/icons.css">
</noscript>
<!-- Custom Element Polyfill for Safari -->
<script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>
<!-- OpenSearch -->
<link rel="search" type="application/opensearchdescription+xml" href="/static/assets/opensearch.xml" title="Mwmbl Search">
<script src="https://unpkg.com/htmx.org@1.9.6"></script>
{% vite_hmr_client %}
</head>
<body>
<mwmbl-app></mwmbl-app>
<header class="search-menu compact">
<a href="/" class="branding">
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
<span class="brand-title">Mwmbl</span>
</a>
<form class="search-bar">
<i class="ph-magnifying-glass-bold"></i>
<input
type='search'
name='q'
class='search-bar-input'
placeholder='Search on Mwmbl...'
title='Use "CTRL+K" or "/" to focus.'
autocomplete='off'
value='{{ query|default_if_none:"" }}'
hx-get="/app/home/"
hx-trigger="keyup changed delay:100ms"
hx-target=".main"
>
</form>
<div is="mwmbl-save"></div>
{% if user.is_authenticated %}
<p class="login-info">Logged in as {{ user.username }}</p>
<a class="button" href="/accounts/logout/">Log out</a>
{% else %}
<a class="button" href="/accounts/login/">Login</a>
<a class="button" href="/accounts/signup/">Sign up</a>
{% endif %}
</header>
<main>
{% include "home.html" %}
</main>
<div is="mwmbl-add-result"></div>
<div class="footer">
<ul class="footer-list">
{% for link in footer_links %}
<li class="footer-item">
<a href="{{ link.href }}" class="footer-link" target="__blank">
<i class="{{ link.icon }}"></i>
<span>{{ link.name }}</span>
</a>
</li>
{% endfor %}
</ul>
</div>
{% vite_asset 'index.js' %}
{% vite_legacy_polyfills %}
{% vite_legacy_asset 'index-legacy.js' %}
</body>
</html>

View file

@@ -79,6 +79,7 @@ class TinyIndexMetadata:
values = json.loads(data[constant_length:].decode('utf8'))
return TinyIndexMetadata(**values)
# Find the optimal amount of data that fits onto a page
# We do this by leveraging binary search to quickly find the index where:
# - index+1 cannot fit onto a page
@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
# No better match, use our index
return mid, compressed_data
def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
# Find max number of items that fit on a page
return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
@@ -186,7 +189,6 @@ class TinyIndex(Generic[T]):
except ZstdError:
logger.exception(f"Error decompressing page data, content: {page_data}")
return []
# logger.debug(f"Decompressed data: {decompressed_data}")
return json.loads(decompressed_data.decode('utf8'))
def store_in_page(self, page_index: int, values: list[T]):
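
For context on the comments above (not part of the diff): _trim_items_to_page and _binary_search_fitting_size find the largest prefix of items whose compressed serialisation still fits in one fixed-size page, which is exactly what the new analyse/add_term_info.py script measures. A minimal standalone sketch of the same idea, with illustrative names and JSON plus zstd serialisation assumed:

import json
from zstandard import ZstdCompressor

def largest_fitting_prefix(compressor: ZstdCompressor, page_size: int, items: list) -> tuple[int, bytes]:
    # Binary search for the largest n such that items[:n] compresses to at most page_size bytes.
    low, high = 0, len(items)
    best_n, best_data = 0, compressor.compress(json.dumps([]).encode('utf8'))
    while low <= high:
        mid = (low + high) // 2
        data = compressor.compress(json.dumps(items[:mid]).encode('utf8'))
        if len(data) <= page_size:
            best_n, best_data = mid, data
            low = mid + 1
        else:
            high = mid - 1
    return best_n, best_data

num_fitting, page_bytes = largest_fitting_prefix(ZstdCompressor(), 4096, [["title", "url", "extract", 1.0]] * 100)
print(num_fitting, len(page_bytes))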

View file

@@ -1,6 +1,6 @@
from logging import getLogger
from ninja import Router
from ninja import NinjaAPI
from mwmbl.tinysearchengine.rank import HeuristicRanker
@@ -10,8 +10,8 @@ logger = getLogger(__name__)
SCORE_THRESHOLD = 0.25
def create_router(ranker: HeuristicRanker) -> Router:
router = Router(tags=["search"])
def create_router(ranker: HeuristicRanker, version: str) -> NinjaAPI:
router = NinjaAPI(urls_namespace=f"search-{version}")
@router.get("")
def search(request, s: str):

View file

@@ -17,16 +17,27 @@ Including another URLconf
from django.contrib import admin
from django.urls import path, include
from mwmbl.api import api_v1
from mwmbl.views import home_fragment, fetch_url, index, page_history
import mwmbl.crawler.app as crawler
from mwmbl.platform import curate
from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
from mwmbl.tinysearchengine import search
from mwmbl.views import home_fragment, fetch_url, index
urlpatterns = [
path('admin/', admin.site.urls),
path('api/v1/', api_v1.urls),
path('accounts/', include('allauth.urls')),
path('', index, name="home"),
path('', index, name="index"),
path('app/home/', home_fragment, name="home"),
path('app/fetch/', fetch_url, name="fetch_url"),
path('app/history/', page_history, name="history"),
# TODO: this is the old API, deprecated and to be removed once all clients have moved over
path("search/", search.create_router(ranker, "0.1").urls),
path("crawler/", crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches, version="0.1").urls),
path("curation/", curate.create_router(index_path, version="0.1").urls),
# New API
path("api/v1/search/", search.create_router(ranker, "1.0.0").urls),
path("api/v1/crawler/", crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches, version="1.0.0").urls),
path("api/v1/curation/", curate.create_router(index_path, version="1.0.0").urls),
]

View file

@@ -1,5 +1,8 @@
import re
from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
DOMAIN_REGEX = re.compile(r".*://([^/]*)")
@@ -17,3 +20,23 @@ def get_domain(url):
    if results is None or len(results.groups()) == 0:
        raise ValueError(f"Unable to parse domain from URL {url}")
    return results.group(1)


def add_term_info(document: Document, index: TinyIndex, page_index: int):
    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
    for token in tokenized.tokens:
        token_page_index = index.get_key_page_index(token)
        if token_page_index == page_index:
            return Document(document.title, document.url, document.extract, document.score, token)
    raise ValueError("Could not find token in page index")


def add_term_infos(documents: list[Document], index: TinyIndex, page_index: int):
    for document in documents:
        if document.term is not None:
            yield document
            continue
        try:
            yield add_term_info(document, index, page_index)
        except ValueError:
            continue
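
A note on the two helpers above (not part of the diff): add_term_infos is a generator that back-fills the term field for documents stored before this change, and silently drops any document whose tokens no longer map to the given page. A hedged usage sketch, mirroring how index_pages and the curation endpoint call it; the index path matches the dev index used by analyse/add_term_info.py and the page number is arbitrary:

from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.utils import add_term_infos

INDEX_PATH = "devdata/index-v2.tinysearch"  # dev index path, as in analyse/add_term_info.py
PAGE_INDEX = 0                              # any page in the index

with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
    existing = indexer.get_page(PAGE_INDEX)  # may contain documents stored without a term
    with_terms = list(add_term_infos(existing, indexer, PAGE_INDEX))
    indexer.store_in_page(PAGE_INDEX, with_terms)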

View file

@@ -1,7 +1,7 @@
from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
from urllib.parse import urlparse, parse_qs
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, ParseResult
import justext
import requests
@@ -66,12 +66,13 @@ def home_fragment(request):
"query": query,
"activity": activity,
})
current_url = request.htmx.current_url
# Replace query string with new query
stripped_url = current_url[:current_url.index("?")] if "?" in current_url else current_url
query_string = "?q=" + query if len(query) > 0 else ""
new_url = stripped_url + query_string
# Set the htmx replace header
# Encode the new query string
if query:
new_query_string = urlencode({"q": query}, doseq=True)
new_url = "/?" + new_query_string
else:
new_url = "/"
response["HX-Replace-Url"] = new_url
return response
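
This hunk is what the 'Encode URLs properly' commit refers to: the HX-Replace-Url value is now built with urlencode rather than string concatenation, so queries containing characters such as '&', '=' or spaces survive the round trip. A quick illustration (not part of the diff):

from urllib.parse import urlencode

query = "cats & dogs"
print("/?" + urlencode({"q": query}, doseq=True))  # /?q=cats+%26+dogs, whereas naive "/?q=" + query yields an ambiguous URL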
@@ -127,15 +128,3 @@ def fetch_url(request):
    return render(request, "result.html", {
        "result": format_result(result, query),
    })


def page_history(request):
    url = request.GET["url"]
    parsed_url_query = parse_qs(urlparse(url).query)
    query = parsed_url_query.get("q", [""])[0]

    curations = UserCuration.objects.filter(url=url).order_by("-timestamp")

    return render(request, "history.html", {
        "curations": curations,
        "url": url,
        "query": query,
    })