Compare commits


14 commits

Author | SHA1 | Message | Date
Daoud Clarke | cfe18162f1 | Blacklist another domain | 2023-11-21 11:24:48 +00:00
Daoud Clarke | b868b6284b | Encode URLs properly | 2023-11-21 10:45:50 +00:00
Daoud Clarke | c1489a27cf | Merge pull request #130 from mwmbl/fix-csrf-requirement: Use CSRF only for curation requests | 2023-11-19 20:53:55 +00:00
Daoud Clarke | a2fd3d95d8 | Use CSRF only for curation requests | 2023-11-19 20:48:18 +00:00
Daoud Clarke | 5874720801 | Merge pull request #129 from mwmbl/allow-running-old-api: Allow running old API | 2023-11-19 10:02:27 +00:00
Daoud Clarke | da787a67db | Unused setting | 2023-11-19 10:01:48 +00:00
Daoud Clarke | 56ee43e730 | Remove unused settings | 2023-11-19 10:01:04 +00:00
Daoud Clarke | 69f6a16cce | Reinstate old API | 2023-11-19 10:00:31 +00:00
Daoud Clarke | 8c45b94aa6 | Outdated settings file | 2023-11-18 20:21:57 +00:00
Daoud Clarke | 3c61f5818d | Whitespace to allow git push | 2023-11-18 20:15:39 +00:00
Daoud Clarke | a3cc316d15 | Merge pull request #128 from mwmbl/beta: Allow users to curate search results | 2023-11-18 20:14:50 +00:00
Daoud Clarke | 36df016445 | Merge pull request #127 from mwmbl/add-term-info-to-index: Add term info to index | 2023-11-18 18:56:53 +00:00
Daoud Clarke | 204304e18e | Add term info to index | 2023-11-18 18:49:41 +00:00
Daoud Clarke | a2b872008f | Add a script to evaluate how much it costs to add the term to the index (old sizes mean 33.3673, SEM 0.08148019988498635; new sizes mean 32.1322, SEM 0.07700185221489449; these are the mean number of documents per page and its standard error, before and after adding term info) | 2023-11-16 17:42:18 +00:00
25 changed files with 273 additions and 319 deletions

View file

@@ -3,3 +3,4 @@ Contributions are very welcome!
Please join the discussion at https://matrix.to/#/#mwmbl:matrix.org and let us know what you're planning to do.
See https://book.mwmbl.org/page/developers/ for a guide to development.

analyse/add_term_info.py (new file, 51 additions)
View file

@@ -0,0 +1,51 @@
"""
Investigate adding term information to the database.
How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random

import numpy as np
from scipy.stats import sem

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
from zstandard import ZstdCompressor

from mwmbl.utils import add_term_info

random = Random(1)

INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"


def run():
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # Get some random integers between 0 and index.num_pages:
        pages = random.sample(range(index.num_pages), 10000)

        old_sizes = []
        new_sizes = []

        for i in pages:
            page = index.get_page(i)
            term_documents = []
            for document in page:
                term_document = add_term_info(document, index, i)
                term_documents.append(term_document)

            value_tuples = [astuple(value) for value in term_documents]
            num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)
            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

    print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
    print("New sizes mean", np.mean(new_sizes), sem(new_sizes))


if __name__ == '__main__':
    run()

View file

@@ -1,57 +0,0 @@
"""
Index batches stored locally on the filesystem for the purpose of evaluation.
"""
import glob
import gzip
import json
import logging
import os
import sys
from datetime import datetime

import spacy

from mwmbl.crawler import HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer import index_batches
from mwmbl.tinysearchengine import TinyIndex, Document

LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
NUM_BATCHES = 10000
EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
NUM_PAGES = 1_024_000
PAGE_SIZE = 4096

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


def get_batches():
    for path in sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]:
        data = json.load(gzip.open(path))
        yield HashedBatch.parse_obj(data)


def run():
    try:
        os.remove(EVALUATE_INDEX_PATH)
    except FileNotFoundError:
        pass

    TinyIndex.create(item_factory=Document, index_path=EVALUATE_INDEX_PATH, num_pages=NUM_PAGES, page_size=PAGE_SIZE)

    batches = get_batches()

    start = datetime.now()
    with Database() as db:
        nlp = spacy.load("en_core_web_sm")
        url_db = URLDatabase(db.connection)
        index_batches(batches, EVALUATE_INDEX_PATH, nlp, url_db)
    end = datetime.now()

    total_time = (end - start).total_seconds()
    print("total_seconds:", total_time)


if __name__ == '__main__':
    run()

View file

@@ -1,60 +0,0 @@
import logging
import sys

import numpy as np
import spacy

from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.indexer import tokenize_document
from mwmbl.indexer import INDEX_PATH
from mwmbl.tinysearchengine import TinyIndex, Document

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

nlp = spacy.load("en_core_web_sm")


def store():
    document = Document(
        title='A nation in search of the new black | Theatre | The Guardian',
        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
        score=1.0
    )
    with TinyIndex(Document, INDEX_PATH, 'w') as tiny_index:
        tokenized = tokenize_document(document.url, document.title, document.extract, 1, nlp)
        print("Tokenized", tokenized)
        # for token in tokenized.tokens:
        #
        #     tiny_index.index(token, document)


def get_items():
    with TinyIndex(Document, INDEX_PATH) as tiny_index:
        items = tiny_index.retrieve('wikipedia')
        if items:
            for item in items:
                print("Items", item)


def run(index_path):
    with TinyIndex(Document, index_path) as tiny_index:
        sizes = {}
        for i in range(tiny_index.num_pages):
            page = tiny_index.get_page(i)
            if page:
                sizes[i] = len(page)
            if len(page) > 50:
                print("Page", len(page), page)
                # for item in page:
                #     if ' search' in item.title:
                #         print("Page", i, item)

        print("Max", max(sizes.values()))
        print("Top", sorted(sizes.values())[-100:])
        print("Mean", np.mean(list(sizes.values())))


if __name__ == '__main__':
    # store()
    run(EVALUATE_INDEX_PATH)
    # get_items()

Binary file not shown.

View file

@@ -1,27 +0,0 @@
from ninja import NinjaAPI
from ninja.security import django_auth

import mwmbl.crawler.app as crawler
from mwmbl.platform import curate
from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
from mwmbl.tinysearchengine import search


def create_api(version):
    # Set csrf to True to all cookie-based authentication
    api = NinjaAPI(version=version, csrf=True)

    search_router = search.create_router(ranker)
    api.add_router("/search/", search_router)

    crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
    api.add_router("/crawler/", crawler_router)

    curation_router = curate.create_router(index_path)
    api.add_router("/curation/", curation_router, auth=django_auth)

    return api


# Work around because Django-Ninja doesn't allow using multiple URLs for the same thing
api_original = create_api("0.1")
api_v1 = create_api("1.0.0")

View file

@@ -6,6 +6,10 @@ from pathlib import Path
from django.apps import AppConfig
from django.conf import settings
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.indexdb import IndexDatabase
class MwmblConfig(AppConfig):
name = "mwmbl"
@@ -31,6 +35,12 @@ class MwmblConfig(AppConfig):
TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
page_size=PAGE_SIZE)
with Database() as db:
url_db = URLDatabase(db.connection)
url_db.create_tables()
index_db = IndexDatabase(db.connection)
index_db.create_tables()
if settings.RUN_BACKGROUND_PROCESSES:
new_item_queue = Queue()
Process(target=background.run, args=(settings.DATA_PATH,)).start()

View file

@@ -1,7 +1,9 @@
"""
Script that updates data in a background process.
"""
from logging import getLogger
import logging
import sys
from logging import getLogger, basicConfig
from pathlib import Path
from time import sleep
@@ -11,6 +13,8 @@ from mwmbl.indexer import index_batches, historical
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME
basicConfig(stream=sys.stdout, level=logging.INFO)
logger = getLogger(__name__)

View file

@@ -10,7 +10,7 @@ from uuid import uuid4
import boto3
import requests
from fastapi import HTTPException
from ninja import Router
from ninja import NinjaAPI
from redis import Redis
from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -50,12 +50,8 @@ def upload(data: bytes, name: str):
last_batch = None
def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
router = Router(tags=["crawler"])
# TODO: # ensure tables are created before crawler code is used:
# #
# # url_db.create_tables()
def create_router(batch_cache: BatchCache, queued_batches: Queue, version: str) -> NinjaAPI:
router = NinjaAPI(urls_namespace=f"crawler-{version}")
@router.post('/batches/')
def post_batch(request, batch: Batch):

View file

@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
score = link_counts.get(url, DEFAULT_SCORE)
yield tokenize_document(url, title_cleaned, extract, score, nlp)
yield tokenize_document(url, title_cleaned, extract, score)
if i % 1000 == 0:
print("Processed", i)
@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
return set(first_tokens + bigrams)
def tokenize_document(url, title_cleaned, extract, score, nlp):
def tokenize_document(url, title_cleaned, extract, score):
title_tokens = tokenize(title_cleaned)
prepared_url = prepare_url_for_tokenizing(unquote(url))
url_tokens = tokenize(prepared_url)

View file

@@ -16,6 +16,7 @@ from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
from mwmbl.utils import add_term_info, add_term_infos
logger = getLogger(__name__)
@@ -31,22 +32,20 @@ def run(batch_cache: BatchCache, index_path: str):
def process(batches: Collection[HashedBatch]):
with Database() as db:
nlp = spacy.load("en_core_web_sm")
url_db = URLDatabase(db.connection)
index_batches(batches, index_path, nlp, url_db)
index_batches(batches, index_path, url_db)
logger.info("Indexed pages")
process_batch.run(batch_cache, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED, process)
def index_batches(batch_data: Collection[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
def index_batches(batch_data: Collection[HashedBatch], index_path: str, url_db: URLDatabase):
document_tuples = list(get_documents_from_batches(batch_data))
urls = [url for title, url, extract in document_tuples]
logger.info(f"Got {len(urls)} document tuples")
url_scores = url_db.get_url_scores(urls)
logger.info(f"Got {len(url_scores)} scores")
logger.info(f"Indexing {len(urls)} document tuples and {len(url_scores)} URL scores")
documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
page_documents = preprocess_documents(documents, index_path, nlp)
page_documents = preprocess_documents(documents, index_path)
index_pages(index_path, page_documents)
@@ -58,24 +57,27 @@ def index_pages(index_path, page_documents):
seen_urls = set()
seen_titles = set()
sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
for document in sorted_documents:
# TODO: for now we add the term here, until all the documents in the index have terms
sorted_documents_with_terms = add_term_infos(sorted_documents, indexer, page)
for document in sorted_documents_with_terms:
if document.title in seen_titles or document.url in seen_urls:
continue
new_documents.append(document)
seen_urls.add(document.url)
seen_titles.add(document.title)
logger.info(f"Storing {len(new_documents)} documents for page {page}, originally {len(existing_documents)}")
indexer.store_in_page(page, new_documents)
def preprocess_documents(documents, index_path, nlp):
def preprocess_documents(documents, index_path):
page_documents = defaultdict(list)
with TinyIndex(Document, index_path, 'w') as indexer:
for document in documents:
tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
# logger.debug(f"Tokenized: {tokenized}")
page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
for page in page_indexes:
page_documents[page].append(document)
tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
for token in tokenized.tokens:
page = indexer.get_key_page_index(token)
term_document = Document(document.title, document.url, document.extract, document.score, token)
page_documents[page].append(term_document)
print(f"Preprocessed for {len(page_documents)} pages")
return page_documents
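
Note (not part of the diff): the routing in preprocess_documents above relies on indexer.get_key_page_index(token), which maps every token to a fixed page of the index, so a document is now stored once per page that one of its tokens hashes to, tagged with that token. The real function lives in mwmbl/tinysearchengine/indexer.py and is not shown here; the following is only an illustrative sketch of the idea, and the actual hash may differ:

import hashlib

def get_key_page_index(key: str, num_pages: int) -> int:
    # Illustrative only: a stable hash of the token reduced modulo the page count,
    # so the same token always maps to the same page.
    digest = hashlib.md5(key.encode('utf8')).digest()
    return int.from_bytes(digest, 'big') % num_pages

print(get_key_page_index("wikipedia", 1_024_000))  # deterministic page number for the term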

View file

@@ -86,7 +86,7 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
parsed_link = urlparse(link)
if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
logger.debug(f"Excluding link for blacklisted domain: {parsed_link}")
return
extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0

View file

@@ -1,7 +1,8 @@
from logging import getLogger
from typing import Any
from urllib.parse import parse_qs
from ninja import Router
from ninja import Router, NinjaAPI
from mwmbl.indexer.update_urls import get_datetime_from_timestamp
from mwmbl.models import UserCuration
@@ -9,13 +10,17 @@ from mwmbl.platform.data import CurateBegin, CurateMove, CurateDelete, CurateAdd
make_curation_type
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize
from mwmbl.utils import add_term_info, add_term_infos
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0
def create_router(index_path: str) -> Router:
router = Router(tags=["user"])
logger = getLogger(__name__)
def create_router(index_path: str, version: str) -> NinjaAPI:
router = NinjaAPI(urls_namespace=f"curate-{version}", csrf=True)
@router.post("/begin")
def user_begin_curate(request, curate_begin: make_curation_type(CurateBegin)):
@@ -58,20 +63,24 @@ def create_router(index_path: str) -> Router:
raise ValueError(f"Should be one query value in the URL: {curation.url}")
query = queries[0]
print("Query", query)
tokens = tokenize(query)
print("Tokens", tokens)
term = " ".join(tokens)
print("Key", term)
documents = [
Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
for i, result in enumerate(curation.results)
]
page_index = indexer.get_key_page_index(term)
print("Page index", page_index)
print("Storing documents", documents)
indexer.store_in_page(page_index, documents)
existing_documents_no_terms = indexer.get_page(page_index)
existing_documents = add_term_infos(existing_documents_no_terms, indexer, page_index)
other_documents = [doc for doc in existing_documents if doc.term != term]
logger.info(f"Found {len(other_documents)} other documents for term {term} at page {page_index} "
f"with terms { {doc.term for doc in other_documents} }")
all_documents = documents + other_documents
logger.info(f"Storing {len(all_documents)} documents at page {page_index}")
indexer.store_in_page(page_index, all_documents)
return {"curation": "ok"}

View file

@@ -32,7 +32,7 @@ SCORE_FOR_SAME_DOMAIN = 0.01
EXTRA_LINK_MULTIPLIER = 0.001
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$")
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$|omgoat\.org$")
CORE_DOMAINS = {
'github.com',
'en.wikipedia.org',

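A quick check of the newly blacklisted domain from the 'Blacklist another domain' commit (not part of the diff); this assumes is_domain_blacklisted applies the pattern to the link's host with a regex search, which the mix of unanchored and $-anchored alternatives suggests:

import re

DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$|omgoat\.org$")

print(bool(DOMAIN_BLACKLIST_REGEX.search("www.omgoat.org")))    # True: now excluded
print(bool(DOMAIN_BLACKLIST_REGEX.search("en.wikipedia.org")))  # False: unaffected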
View file

@@ -1,8 +0,0 @@
from mwmbl.settings_common import *
from mwmbl.settings_prod import *
DEBUG = False
ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org"]
DATA_PATH = "/app/storage"
RUN_BACKGROUND_PROCESSES = True
NUM_PAGES = 10240000

View file

@@ -9,7 +9,6 @@ https://docs.djangoproject.com/en/4.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.

View file

@@ -1,95 +1,22 @@
{% load django_vite %}
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Metas -->
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
{% include "title.html" %}
<meta name="description" content="The free, open-source and non-profit search engine.">
<!-- Favicons -->
<link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">
<!-- Fonts import -->
<link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/inter/inter.css">
</noscript>
<!-- CSS Stylesheets (this is critical CSS) -->
<link rel="stylesheet" type="text/css" href="/static/css/reset.css">
<link rel="stylesheet" type="text/css" href="/static/css/theme.css">
<link rel="stylesheet" type="text/css" href="/static/css/global.css">
<!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
<link rel="preload" href="/static/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/phosphor/icons.css">
</noscript>
<!-- Custom Element Polyfill for Safari -->
<script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>
<!-- OpenSearch -->
<link rel="search" type="application/opensearchdescription+xml" href="/static/assets/opensearch.xml" title="Mwmbl Search">
<script src="https://unpkg.com/htmx.org@1.9.6"></script>
{% vite_hmr_client %}
</head>
<body>
<mwmbl-app></mwmbl-app>
<header class="search-menu compact">
<a href="/" class="branding">
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
<span class="brand-title">Mwmbl</span>
</a>
<form class="search-bar">
<i class="ph-magnifying-glass-bold"></i>
<input
type='search'
name='q'
class='search-bar-input'
placeholder='Search on Mwmbl...'
title='Use "CTRL+K" or "/" to focus.'
autocomplete='off'
value='{{ query|default_if_none:"" }}'
hx-get="/app/home/"
hx-trigger="keyup changed delay:100ms"
hx-target=".main"
>
</form>
<div is="mwmbl-save"></div>
{% if user.is_authenticated %}
<p class="login-info">Logged in as {{ user.username }}</p>
<a class="button" href="/accounts/logout/">Log out</a>
<head>
<meta charset="utf-8">
<title>{% block title %}Simple is Better Than Complex{% endblock %}</title>
</head>
<body>
<header>
<h1>My Site</h1>
{% if user.is_authenticated %}
<a href="{% url 'account_logout' %}">logout</a>
{% else %}
<a class="button" href="/accounts/login/">Login</a>
<a class="button" href="/accounts/signup/">Sign up</a>
<a href="{% url 'account_login' %}">login</a> / <a href="{% url 'signup' %}">signup</a>
{% endif %}
<hr>
</header>
{% block content %}
{% endblock %}
<div class="footer">
<ul class="footer-list">
{% for link in footer_links %}
<li class="footer-item">
<a href="{{ link.href }}" class="footer-link" target="__blank">
<i class="{{ link.icon }}"></i>
<span>{{ link.name }}</span>
</a>
</li>
{% endfor %}
</ul>
</div>
{% vite_asset 'index.js' %}
{% vite_legacy_polyfills %}
{% vite_legacy_asset 'index-legacy.js' %}
</body>
<main>
{% block content %}
{% endblock %}
</main>
</body>
</html>

View file

@@ -1,11 +0,0 @@
{% extends "base.html" %}
{% load humanize %}
{% block content %}
<div class="main">
<h1>Curations for <a href="{{ url }}">{{ query }}</a></h1>
{% for user_curation in curations %}
<p>{{ user_curation.user.username }} {{ user_curation.curation_type }} {{ user_curation.timestamp | naturaltime}}</p>
{% endfor %}
</div>
{% endblock %}

View file

@@ -3,7 +3,6 @@
<div class="main">
{% if query %}
<button class="button curate-add" is="mwmbl-add-button"> Add new</button>
{# <a href="{% url "history:url" %}" class="button curate-add">View history</a>#}
{% if results %}
<ul class='results'>
{% for result in results %}

View file

@@ -1,6 +1,97 @@
{% extends "base.html" %}
{% load django_vite %}
<!DOCTYPE html>
<html lang="en">
{% block content %}
{% include "home.html" %}
<div is="mwmbl-add-result"></div>
{% endblock %}
<head>
<!-- Metas -->
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
{% include "title.html" %}
<meta name="description" content="The free, open-source and non-profit search engine.">
<!-- Favicons -->
<link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">
<!-- Fonts import -->
<link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/inter/inter.css">
</noscript>
<!-- CSS Stylesheets (this is critical CSS) -->
<link rel="stylesheet" type="text/css" href="/static/css/reset.css">
<link rel="stylesheet" type="text/css" href="/static/css/theme.css">
<link rel="stylesheet" type="text/css" href="/static/css/global.css">
<!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
<link rel="preload" href="/static/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/phosphor/icons.css">
</noscript>
<!-- Custom Element Polyfill for Safari -->
<script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>
<!-- OpenSearch -->
<link rel="search" type="application/opensearchdescription+xml" href="/static/assets/opensearch.xml" title="Mwmbl Search">
<script src="https://unpkg.com/htmx.org@1.9.6"></script>
{% vite_hmr_client %}
</head>
<body>
<mwmbl-app></mwmbl-app>
<header class="search-menu compact">
<a href="/" class="branding">
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
<span class="brand-title">Mwmbl</span>
</a>
<form class="search-bar">
<i class="ph-magnifying-glass-bold"></i>
<input
type='search'
name='q'
class='search-bar-input'
placeholder='Search on Mwmbl...'
title='Use "CTRL+K" or "/" to focus.'
autocomplete='off'
value='{{ query|default_if_none:"" }}'
hx-get="/app/home/"
hx-trigger="keyup changed delay:100ms"
hx-target=".main"
>
</form>
<div is="mwmbl-save"></div>
{% if user.is_authenticated %}
<p class="login-info">Logged in as {{ user.username }}</p>
<a class="button" href="/accounts/logout/">Log out</a>
{% else %}
<a class="button" href="/accounts/login/">Login</a>
<a class="button" href="/accounts/signup/">Sign up</a>
{% endif %}
</header>
<main>
{% include "home.html" %}
</main>
<div is="mwmbl-add-result"></div>
<div class="footer">
<ul class="footer-list">
{% for link in footer_links %}
<li class="footer-item">
<a href="{{ link.href }}" class="footer-link" target="__blank">
<i class="{{ link.icon }}"></i>
<span>{{ link.name }}</span>
</a>
</li>
{% endfor %}
</ul>
</div>
{% vite_asset 'index.js' %}
{% vite_legacy_polyfills %}
{% vite_legacy_asset 'index-legacy.js' %}
</body>
</html>

View file

@@ -79,6 +79,7 @@ class TinyIndexMetadata:
values = json.loads(data[constant_length:].decode('utf8'))
return TinyIndexMetadata(**values)
# Find the optimal amount of data that fits onto a page
# We do this by leveraging binary search to quickly find the index where:
# - index+1 cannot fit onto a page
@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
# No better match, use our index
return mid, compressed_data
def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
# Find max number of items that fit on a page
return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
@@ -186,7 +189,6 @@ class TinyIndex(Generic[T]):
except ZstdError:
logger.exception(f"Error decompressing page data, content: {page_data}")
return []
# logger.debug(f"Decompressed data: {decompressed_data}")
return json.loads(decompressed_data.decode('utf8'))
def store_in_page(self, page_index: int, values: list[T]):
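
For context on the comments above (not part of the diff): _trim_items_to_page and _binary_search_fitting_size find the largest prefix of items whose compressed serialisation still fits in one fixed-size page, which is exactly what the new analyse/add_term_info.py script measures. A minimal standalone sketch of the same idea, with illustrative names and JSON plus zstd serialisation assumed:

import json
from zstandard import ZstdCompressor

def largest_fitting_prefix(compressor: ZstdCompressor, page_size: int, items: list) -> tuple[int, bytes]:
    # Binary search for the largest n such that items[:n] compresses to at most page_size bytes.
    low, high = 0, len(items)
    best_n, best_data = 0, compressor.compress(json.dumps([]).encode('utf8'))
    while low <= high:
        mid = (low + high) // 2
        data = compressor.compress(json.dumps(items[:mid]).encode('utf8'))
        if len(data) <= page_size:
            best_n, best_data = mid, data
            low = mid + 1
        else:
            high = mid - 1
    return best_n, best_data

num_fitting, page_bytes = largest_fitting_prefix(ZstdCompressor(), 4096, [["title", "url", "extract", 1.0]] * 100)
print(num_fitting, len(page_bytes))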

View file

@@ -1,6 +1,6 @@
from logging import getLogger
from ninja import Router
from ninja import NinjaAPI
from mwmbl.tinysearchengine.rank import HeuristicRanker
@@ -10,8 +10,8 @@ logger = getLogger(__name__)
SCORE_THRESHOLD = 0.25
def create_router(ranker: HeuristicRanker) -> Router:
router = Router(tags=["search"])
def create_router(ranker: HeuristicRanker, version: str) -> NinjaAPI:
router = NinjaAPI(urls_namespace=f"search-{version}")
@router.get("")
def search(request, s: str):

View file

@@ -17,16 +17,27 @@ Including another URLconf
from django.contrib import admin
from django.urls import path, include
from mwmbl.api import api_v1
from mwmbl.views import home_fragment, fetch_url, index, page_history
import mwmbl.crawler.app as crawler
from mwmbl.platform import curate
from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
from mwmbl.tinysearchengine import search
from mwmbl.views import home_fragment, fetch_url, index
urlpatterns = [
path('admin/', admin.site.urls),
path('api/v1/', api_v1.urls),
path('accounts/', include('allauth.urls')),
path('', index, name="home"),
path('', index, name="index"),
path('app/home/', home_fragment, name="home"),
path('app/fetch/', fetch_url, name="fetch_url"),
path('app/history/', page_history, name="history"),
# TODO: this is the old API, deprecated and to be removed once all clients have moved over
path("search/", search.create_router(ranker, "0.1").urls),
path("crawler/", crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches, version="0.1").urls),
path("curation/", curate.create_router(index_path, version="0.1").urls),
# New API
path("api/v1/search/", search.create_router(ranker, "1.0.0").urls),
path("api/v1/crawler/", crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches, version="1.0.0").urls),
path("api/v1/curation/", curate.create_router(index_path, version="1.0.0").urls),
]

View file

@@ -1,5 +1,8 @@
import re
from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
DOMAIN_REGEX = re.compile(r".*://([^/]*)")
@@ -17,3 +20,23 @@ def get_domain(url):
    if results is None or len(results.groups()) == 0:
        raise ValueError(f"Unable to parse domain from URL {url}")
    return results.group(1)


def add_term_info(document: Document, index: TinyIndex, page_index: int):
    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
    for token in tokenized.tokens:
        token_page_index = index.get_key_page_index(token)
        if token_page_index == page_index:
            return Document(document.title, document.url, document.extract, document.score, token)
    raise ValueError("Could not find token in page index")


def add_term_infos(documents: list[Document], index: TinyIndex, page_index: int):
    for document in documents:
        if document.term is not None:
            yield document
            continue
        try:
            yield add_term_info(document, index, page_index)
        except ValueError:
            continue
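
A note on the two helpers above (not part of the diff): add_term_infos is a generator that back-fills the term field for documents stored before this change, and silently drops any document whose tokens no longer map to the given page. A hedged usage sketch, mirroring how index_pages and the curation endpoint call it; the index path matches the dev index used by analyse/add_term_info.py and the page number is arbitrary:

from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.utils import add_term_infos

INDEX_PATH = "devdata/index-v2.tinysearch"  # dev index path, as in analyse/add_term_info.py
PAGE_INDEX = 0                              # any page in the index

with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
    existing = indexer.get_page(PAGE_INDEX)  # may contain documents stored without a term
    with_terms = list(add_term_infos(existing, indexer, PAGE_INDEX))
    indexer.store_in_page(PAGE_INDEX, with_terms)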

View file

@@ -1,7 +1,7 @@
from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
from urllib.parse import urlparse, parse_qs
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, ParseResult
import justext
import requests
@@ -66,12 +66,13 @@ def home_fragment(request):
"query": query,
"activity": activity,
})
current_url = request.htmx.current_url
# Replace query string with new query
stripped_url = current_url[:current_url.index("?")] if "?" in current_url else current_url
query_string = "?q=" + query if len(query) > 0 else ""
new_url = stripped_url + query_string
# Set the htmx replace header
# Encode the new query string
if query:
new_query_string = urlencode({"q": query}, doseq=True)
new_url = "/?" + new_query_string
else:
new_url = "/"
response["HX-Replace-Url"] = new_url
return response
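
This hunk is what the 'Encode URLs properly' commit refers to: the HX-Replace-Url value is now built with urlencode rather than string concatenation, so queries containing characters such as '&', '=' or spaces survive the round trip. A quick illustration (not part of the diff):

from urllib.parse import urlencode

query = "cats & dogs"
print("/?" + urlencode({"q": query}, doseq=True))  # /?q=cats+%26+dogs, whereas naive "/?q=" + query yields an ambiguous URL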
@@ -127,15 +128,3 @@ def fetch_url(request):
    return render(request, "result.html", {
        "result": format_result(result, query),
    })


def page_history(request):
    url = request.GET["url"]
    parsed_url_query = parse_qs(urlparse(url).query)
    query = parsed_url_query.get("q", [""])[0]

    curations = UserCuration.objects.filter(url=url).order_by("-timestamp")

    return render(request, "history.html", {
        "curations": curations,
        "url": url,
        "query": query,
    })