Compare commits: main...fastapi-us (1 commit)

Commit: e75b7121b0
110 changed files with 1699 additions and 26242 deletions
.github/workflows/ci.yml (vendored, 4 changes)
@@ -53,7 +53,5 @@ jobs:
       # run test suite
       #----------------------------------------------
       - name: Run tests
-        env:
-          DJANGO_SETTINGS_MODULE: mwmbl.settings_dev
         run: |
-          poetry run pytest
+          poetry run pytest

.gitignore (vendored, 1 change)
@@ -17,7 +17,6 @@ __pycache__/
 build/
 develop-eggs/
 dist/
-front-end/dist/
 downloads/
 eggs/
 .eggs/

@@ -3,4 +3,3 @@ Contributions are very welcome!
Please join the discussion at https://matrix.to/#/#mwmbl:matrix.org and let us know what you're planning to do.

See https://book.mwmbl.org/page/developers/ for a guide to development.

Dockerfile (14 changes)
@@ -1,10 +1,3 @@
-FROM node:hydrogen-bullseye as front-end
-
-COPY front-end /front-end
-WORKDIR /front-end
-RUN npm install && npm run build
-
-
 FROM python:3.10.2-bullseye as base

 ENV PYTHONFAULTHANDLER=1 \
@@ -46,15 +39,12 @@ RUN apt-get update && apt-get install -y postgresql-client
# Copy only the required /venv directory from the builder image that contains mwmbl and its dependencies
COPY --from=builder /venv /venv

# Copy the front end build
COPY --from=front-end /front-end/dist /front-end-build

ADD nginx.conf.sigil /app
# ADD app.json /app

# Set up a volume where the data will live
VOLUME ["/data"]

EXPOSE 5000

CMD ["/venv/bin/mwmbl-tinysearchengine"]
# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]

@@ -1,51 +0,0 @@
"""
Investigate adding term information to the database.

How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random

import numpy as np
from scipy.stats import sem

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple

from zstandard import ZstdCompressor

from mwmbl.utils import add_term_info

random = Random(1)

INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"


def run():
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # Get some random integers between 0 and index.num_pages:
        pages = random.sample(range(index.num_pages), 10000)

        old_sizes = []
        new_sizes = []

        for i in pages:
            page = index.get_page(i)
            term_documents = []
            for document in page:
                term_document = add_term_info(document, index, i)
                term_documents.append(term_document)

            value_tuples = [astuple(value) for value in term_documents]
            num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)

            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

    print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
    print("New sizes mean", np.mean(new_sizes), sem(new_sizes))


if __name__ == '__main__':
    run()

@@ -7,8 +7,8 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse

-from mwmbl.crawler import HashedBatch
-from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR


 # TODO: remove this line - temporary override

@@ -1,6 +1,6 @@
 import json

-from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
+from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
 from mwmbl.hn_top_domains_filtered import DOMAINS

@@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
 """
 import sqlite3

-from mwmbl.indexer import URLS_PATH
+from mwmbl.indexer.paths import URLS_PATH
 from mwmbl.app import get_config_and_index

analyse/index_local.py (new file, 58 lines)
@@ -0,0 +1,58 @@
"""
Index batches stored locally on the filesystem for the purpose of evaluation.
"""
import glob
import gzip
import json
import logging
import os
import sys
from pathlib import Path
from datetime import datetime

import spacy

from mwmbl.crawler.batch import HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.index_batches import index_batches
from mwmbl.tinysearchengine.indexer import TinyIndex, Document

LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
NUM_BATCHES = 10000
EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
NUM_PAGES = 1_024_000
PAGE_SIZE = 4096


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


def get_batches():
    for path in sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]:
        data = json.load(gzip.open(path))
        yield HashedBatch.parse_obj(data)


def run():
    try:
        os.remove(EVALUATE_INDEX_PATH)
    except FileNotFoundError:
        pass
    TinyIndex.create(item_factory=Document, index_path=EVALUATE_INDEX_PATH, num_pages=NUM_PAGES, page_size=PAGE_SIZE)

    batches = get_batches()

    start = datetime.now()
    with Database() as db:
        nlp = spacy.load("en_core_web_sm")
        url_db = URLDatabase(db.connection)
        index_batches(batches, EVALUATE_INDEX_PATH, nlp, url_db)
    end = datetime.now()

    total_time = (end - start).total_seconds()
    print("total_seconds:", total_time)


if __name__ == '__main__':
    run()

@@ -1,7 +1,7 @@
 """
 Count unique URLs in the index.
 """
-from mwmbl.tinysearchengine import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document


 def run():

analyse/inspect_index.py (new file, 60 lines)
@@ -0,0 +1,60 @@
import logging
import sys

import numpy as np
import spacy

from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
nlp = spacy.load("en_core_web_sm")


def store():
    document = Document(
        title='A nation in search of the new black | Theatre | The Guardian',
        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
        score=1.0
    )
    with TinyIndex(Document, INDEX_PATH, 'w') as tiny_index:
        tokenized = tokenize_document(document.url, document.title, document.extract, 1, nlp)
        print("Tokenized", tokenized)
        # for token in tokenized.tokens:
        #
        #     tiny_index.index(token, document)


def get_items():
    with TinyIndex(Document, INDEX_PATH) as tiny_index:
        items = tiny_index.retrieve('wikipedia')
        if items:
            for item in items:
                print("Items", item)


def run(index_path):
    with TinyIndex(Document, index_path) as tiny_index:
        sizes = {}
        for i in range(tiny_index.num_pages):
            page = tiny_index.get_page(i)
            if page:
                sizes[i] = len(page)
                if len(page) > 50:
                    print("Page", len(page), page)
                    # for item in page:
                    #     if ' search' in item.title:
                    #         print("Page", i, item)
        print("Max", max(sizes.values()))
        print("Top", sorted(sizes.values())[-100:])
        print("Mean", np.mean(list(sizes.values())))


if __name__ == '__main__':
    # store()
    run(EVALUATE_INDEX_PATH)
    # get_items()

@@ -4,10 +4,12 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse

+import requests
+
-from mwmbl.indexer import CRAWL_GLOB
+from mwmbl.indexer.paths import CRAWL_GLOB


 API_ENDPOINT = "http://95.216.215.29/batches/historical"

@@ -2,9 +2,9 @@ import logging
 import sys
 from itertools import islice

-from mwmbl.indexer import INDEX_PATH
+from mwmbl.indexer.paths import INDEX_PATH
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tinysearchengine.rank import HeuristicRanker

 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

@@ -3,7 +3,7 @@ Send a batch to a running instance.
 """
 import requests

-from mwmbl.crawler import Batch, Item, ItemContent
+from mwmbl.crawler.batch import Batch, Item, ItemContent


 URL = 'http://localhost:5000/crawler/batches/'

@@ -4,7 +4,7 @@ from datetime import datetime
 from pathlib import Path
 from queue import Queue

-from mwmbl.indexer import record_urls_in_database
+from mwmbl.indexer.update_urls import record_urls_in_database


 def run_update_urls_on_fixed_batches():

app.json (deleted, 7 lines)
@@ -1,7 +0,0 @@
{
  "scripts": {
    "dokku": {
      "predeploy": "rm -rf /app/static/* && cp -r /front-end-build/* /app/static/"
    }
  }
}

Binary file not shown.
@@ -1,341 +0,0 @@
body {
    display: flex;
    flex-direction: column;
    overflow-y: scroll;
    background-color: var(--light-color);
    min-height: 100vh;
    height: fit-content;
    padding-top: 25px;
    transition: padding 300ms ease;
}

@media (prefers-reduced-motion) {
    body {
        transition: none;
    }
}

.branding {
    display: flex;
    align-items: center;
    margin: 25px;
}

@media screen and (max-width: 600px) {
    .branding {
        display: none;
    }
}

.brand-title {
    text-align: center;
    font-weight: var(--black-font-weight);
    font-size: 1.5rem;
    margin: 10px 15px 10px 10px;
}

.brand-icon {
    height: 2.5rem;
}

.search-menu {
    position: sticky;
    top: 0;
    display: flex;
    flex-direction: column;
    align-items: center;
    max-width: 800px;
    margin: 0 auto;
    width: 100%;
    padding: 10px;
    background-color: rgba(248, 248, 248, .9);
    z-index: 10;
}

.search-menu.compact {
    flex-direction: row;
}

.search-menu.compact .branding {
    margin: 0 25px 0 0;
}

.search-menu.compact .brand-title {
    font-size: 1.2rem;
}

.search-menu.compact .brand-icon {
    height: 2rem;
}

.search-bar {
    position: relative;
    width: 100%;
}

.search-bar-input {
    background-color: var(--gray-color);
    border: none;
    padding: 15px 15px 15px 50px;
    border-radius: 10px;
    outline: none;
    font-size: var(--default-font-size);
    width: 100%;
    font-weight: var(--bold-font-weight);
    box-shadow: 0 0 0 0 var(--primary-color);
    transition:
        box-shadow 200ms ease-in-out;
}

.search-bar-input::placeholder {
    color: var(--dark-color);
    opacity: .3;
}

.search-bar-input:focus {
    box-shadow: 0 0 0 0.2rem var(--primary-color);
}

.search-bar i {
    position: absolute;
    top: 50%;
    left: 15px;
    transform: translateY(-50%);
    color: var(--dark-color);
    opacity: .3;
    font-size: 1.5rem;
    pointer-events: none;
}

.main, footer {
    display: block;
    max-width: 800px;
    width: 100%;
    margin: 0 auto;
}

.results {
    max-width: 100%;
    list-style-type: none;
    padding: 0;
}

.result {
    min-height: 120px;
}

.result-container {
    text-decoration: none;
    color: var(--dark-color);
    padding: 15px;
    border-radius: 10px;
    outline: 3px solid transparent;
    outline-offset: 3px;
    transition:
        background-color 200ms ease-in-out,
        outline 100ms ease-in-out;
}

.result-container:hover,.result-container:focus {
    background-color: var(--gray-color);
}

.result-container:focus {
    outline: 3px solid var(--primary-color);
}

.result .link {
    font-size: .9rem;
}

.result .title, .result .title>* {
    color: var(--primary-color);
    font-size: 1.1rem;
}

.result .extract {
    opacity: .8;
    font-size: .9rem;
}

.empty-result, .home {
    text-align: center;
    opacity: .5;
    font-weight: var(--bold-font-weight);
}

.footer {
    position: sticky;
    top: 100vh;
    margin-bottom: 25px;
    padding: 10px;
}

.footer-text {
    text-align: center;
    opacity: .5;
    font-weight: var(--bold-font-weight);
    margin-bottom: 10px;
}

.footer-list {
    list-style-type: none;
    padding: 0;
    margin: 0;
    display: flex;
    justify-content: center;
    gap: 10px;
}

.footer-link {
    display: flex;
    align-items: center;
    text-decoration: none;
    padding: 10px;
    color: var(--dark-color);
    border-radius: 10px;
    background-color: var(--gray-color);
    box-shadow: 0 0 0 0 var(--primary-color);
    transition:
        box-shadow 200ms ease-in-out;
}

.footer-link:hover {
    box-shadow: 0 0 0 0.2rem var(--dark-color);
}

.footer-link i {
    font-size: 1.2rem;
    margin-right: 5px;
    color: inherit;
}

.footer-link>span {
    color: inherit;
    font-size: var(--default-font-size);
    font-weight: var(--bold-font-weight);
}

@media screen and (min-width:576px) {
    .brand-title {
        margin: 0 25px 0 15px;
    }
}

.noscript {
    display: flex;
    flex-direction: column;
    height: calc(100vh - 25px);
    width: 100%;
    justify-content: center;
    align-items: center;
}

a {
    font-weight: var(--bold-font-weight);
    color: var(--primary-color);
    text-decoration: none;
}

.curation-buttons {
    display: grid;
    grid-auto-flow: column;
    grid-column-gap: 20px;
    grid-auto-columns: max-content;
}

.result-container .button {
    background-color: var(--dark-gray-color);
    color: white;
    padding: 5px 10px;
    margin: 0;
    font-size: var(--small-font-size);
    font-weight: var(--bold-font-weight);
}

.validated {
    background-color: green !important;
}

.modal {
    /*display: none; !* Hidden by default *!*/
    position: fixed; /* Stay in place */
    z-index: 100; /* Sit on top */
    left: 0;
    top: 0;
    width: 100%; /* Full width */
    height: 100%; /* Full height */
    overflow: auto; /* Enable scroll if needed */
    background-color: rgb(0,0,0); /* Fallback color */
    background-color: rgba(0,0,0,0.4); /* Black w/ opacity */
}

/* Modal Content/Box */
.modal-content {
    background-color: #fefefe;
    margin: 15% auto; /* 15% from the top and centered */
    padding: 20px;
    border: 1px solid #888;
    max-width: 800px;
    width: 80%; /* Could be more or less, depending on screen size */
}

/* The Close Button */
.close {
    color: #aaa;
    float: right;
    font-size: 28px;
    font-weight: bold;
}

.close:hover,
.close:focus {
    color: black;
    text-decoration: none;
    cursor: pointer;
}

.button {
    background-color: var(--primary-color);
    border: none;
    color: white;
    padding: 10px 20px;
    margin: 10px;
    text-align: center;
    text-decoration: none;
    display: inline-block;
    font-size: var(--default-font-size);
    border-radius: 50px;
    cursor: pointer;
    flex-shrink: 0;
    transition: background-color 200ms ease-in-out;
}

@media screen and (max-width: 600px) {
    .button {
        padding: 5px 10px;
        font-size: var(--small-font-size);
        margin: 5px;
    }
}

.button:hover {
    background-color: var(--dark-color);
}

.login-info {
    padding: 10px;
}

/* Sortable styling is not working in HTML 5 yet */
/*.sortable-drag {*/
/*    opacity: 1.0;*/
/*}*/

/*.sortable-ghost {*/
/*    opacity: 1.0;*/
/*}*/

/*.sortable-chosen {*/
/*    opacity: 0;*/
/*}*/

@@ -1,33 +0,0 @@
/*
    Josh's Custom CSS Reset
    https://www.joshwcomeau.com/css/custom-css-reset/
*/
*, *::before, *::after {
    box-sizing: border-box;
    font-family: var(--regular-font);
    color: var(--dark-color);
    font-size: var(--default-font-size);
}
* {
    margin: 0;
}
html, body {
    height: 100%;
}
body {
    line-height: 1.5;
    -webkit-font-smoothing: antialiased;
}
img, picture, video, canvas, svg {
    display: block;
    max-width: 100%;
}
input, button, textarea, select {
    font: inherit;
}
p, h1, h2, h3, h4, h5, h6 {
    overflow-wrap: break-word;
}
#root, #__next {
    isolation: isolate;
}

@@ -1,20 +0,0 @@
:root {

    /* This is the theme file, use it to define theme variables. */

    /* Colors: */
    --dark-color: #0A1931;
    --primary-color: #185ADB;
    --gray-color: #EEEEEE;
    --light-color: #F8F8F8;
    --dark-gray-color: #767676;

    /* Fonts: */
    --regular-font: 'Inter', sans-serif;
    --small-font-size: 12px;
    --default-font-size: 16px;
    --default-font-weight: 400;
    --bold-font-weight: 700;
    --black-font-weight: 900;

}

12 binary files not shown.
@@ -1,50 +0,0 @@
@font-face {
    font-family: 'Inter';
    font-style: normal;
    font-weight: 400;
    font-display: swap;
    src: url("Inter-Regular.woff2?v=3.19") format("woff2"),
         url("Inter-Regular.woff?v=3.19") format("woff");
}
@font-face {
    font-family: 'Inter';
    font-style: italic;
    font-weight: 400;
    font-display: swap;
    src: url("Inter-Italic.woff2?v=3.19") format("woff2"),
         url("Inter-Italic.woff?v=3.19") format("woff");
}

@font-face {
    font-family: 'Inter';
    font-style: normal;
    font-weight: 700;
    font-display: swap;
    src: url("Inter-Bold.woff2?v=3.19") format("woff2"),
         url("Inter-Bold.woff?v=3.19") format("woff");
}
@font-face {
    font-family: 'Inter';
    font-style: italic;
    font-weight: 700;
    font-display: swap;
    src: url("Inter-BoldItalic.woff2?v=3.19") format("woff2"),
         url("Inter-BoldItalic.woff?v=3.19") format("woff");
}

@font-face {
    font-family: 'Inter';
    font-style: normal;
    font-weight: 900;
    font-display: swap;
    src: url("Inter-Black.woff2?v=3.19") format("woff2"),
         url("Inter-Black.woff?v=3.19") format("woff");
}
@font-face {
    font-family: 'Inter';
    font-style: italic;
    font-weight: 900;
    font-display: swap;
    src: url("Inter-BlackItalic.woff2?v=3.19") format("woff2"),
         url("Inter-BlackItalic.woff?v=3.19") format("woff");
}

Binary file not shown.
@@ -1,122 +0,0 @@
/*--------------------------------

Phosphor Web Font

-------------------------------- */
@font-face {
    font-family: 'Phosphor';
    src: url("Phosphor.woff2") format("woff2");
    font-weight: normal;
    font-style: normal;
    font-display: swap;
}
/*------------------------
    base class definition
-------------------------*/
[class^="ph-"],
[class*=" ph-"] {
    display: inline-flex;
}

[class^="ph-"]:before,
[class*=" ph-"]:before {
    font: normal normal normal 1em/1 "Phosphor";
    color: inherit;
    flex-shrink: 0;
    speak: none;
    text-transform: none;
    text-decoration: inherit;
    text-align: center;
    /* Better Font Rendering */
    -webkit-font-smoothing: antialiased;
    -moz-osx-font-smoothing: grayscale;
}
/*------------------------
    change icon size
-------------------------*/
/* relative units */
.ph-xxs {
    font-size: 0.5em;
}
.ph-xs {
    font-size: 0.75em;
}
.ph-sm {
    font-size: 0.875em;
}
.ph-lg {
    font-size: 1.3333em;
    line-height: 0.75em;
    vertical-align: -0.0667em;
}
.ph-xl {
    font-size: 1.5em;
    line-height: 0.6666em;
    vertical-align: -0.075em;
}
.ph-1x {
    font-size: 1em;
}
.ph-2x {
    font-size: 2em;
}
.ph-3x {
    font-size: 3em;
}
.ph-4x {
    font-size: 4em;
}
.ph-5x {
    font-size: 5em;
}
.ph-6x {
    font-size: 6em;
}
.ph-7x {
    font-size: 7em;
}
.ph-8x {
    font-size: 8em;
}
.ph-9x {
    font-size: 9em;
}
.ph-10x {
    font-size: 10em;
}
.ph-fw {
    text-align: center;
    width: 1.25em;
}
/*------------------------
    icons (to add an icon you want to use,
    copy it from the unused.css file)
-------------------------*/

.ph-magnifying-glass-bold::before {
    content: "\f8bf";
}

.ph-github-logo-bold::before {
    content: "\f852";
}

.ph-info-bold::before {
    content: "\f88f";
}

.ph-book-bold::before {
    content: "\f6fb";
}

.ph-browser-bold::before {
    content: "\f70d";
}

.ph-youtube-logo-bold::before {
    content: "\fa5d";
}

.ph-chat-circle-text-bold::before {
    content: "\f74c";
}

File diff suppressed because it is too large.
@@ -1,14 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg width="100%" height="100%" viewBox="0 0 9375 9375" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:1.5;">
    <style>
        path {
            fill: #000;
        }
        @media ( prefers-color-scheme: dark ) {
            path {
                fill: #fff !important;
            }
        }
    </style>
<path d="M6128.72,8251.56c495.65,0 919.697,-176.222 1272.13,-528.659c352.437,-352.438 528.659,-776.484 528.659,-1272.13l-0,-3358.75c-0,-94.644 -35.492,-176.841 -106.482,-246.581c-70.985,-69.739 -153.801,-104.612 -248.445,-104.612c-99.634,-0 -184.314,34.873 -254.054,104.612c-69.746,69.74 -104.612,151.937 -104.612,246.581l-0,3358.75c-0,301.373 -105.857,557.923 -317.571,769.63c-211.708,211.714 -468.251,317.571 -769.63,317.571c-298.89,0 -554.808,-105.857 -767.766,-317.571c-212.958,-211.707 -319.434,-468.257 -319.434,-769.63l-0,-3358.75c-0,-94.644 -34.873,-176.841 -104.613,-246.581c-69.739,-69.739 -154.426,-104.612 -254.054,-104.612c-94.649,-0 -176.841,34.873 -246.58,104.612c-69.74,69.74 -104.613,151.937 -104.613,246.581l0,3358.75c0,301.373 -106.476,557.923 -319.434,769.63c-212.959,211.714 -468.883,317.571 -767.766,317.571c-301.379,0 -557.923,-105.857 -769.636,-317.571c-211.708,-211.707 -317.565,-468.257 -317.565,-769.63l0,-3358.75c0,-94.644 -34.873,-176.841 -104.612,-246.581c-69.74,-69.739 -154.427,-104.612 -254.054,-104.612c-94.65,-0 -176.841,34.873 -246.581,104.612c-69.739,69.74 -104.612,151.937 -104.612,246.581l-0,3358.75c-0,326.283 80.327,627.662 240.976,904.131c160.656,276.469 378.593,495.031 653.817,655.686c275.224,160.649 575.984,240.977 902.267,240.977c291.416,0 563.525,-64.761 816.335,-194.277c252.81,-129.517 460.158,-307.608 622.058,-534.263c166.878,226.655 376.722,404.746 629.532,534.263c252.809,129.516 524.919,194.277 816.335,194.277Zm-0.96,-1617.39l-0.582,-0c-99.627,-0 -184.314,-34.873 -254.054,-104.612c-69.739,-69.74 -104.612,-151.938 -104.612,-246.581l-0,-3358.74c-0,-301.373 -105.857,-557.923 -317.565,-769.63c-210.698,-210.699 -465.799,-316.549 -765.32,-317.559c-299.521,1.01 -554.622,106.86 -765.314,317.559c-211.714,211.707 -317.571,468.257 -317.571,769.63l0,3358.75c0,94.644 -34.866,176.841 -104.606,246.581c-69.739,69.739 -154.426,104.612 -254.054,104.612l-8.638,0c-94.643,0 -176.841,-34.873 -246.58,-104.612c-69.74,-69.74 -104.613,-151.937 -104.613,-246.581l0,-3358.75c0,-301.373 -106.476,-557.923 -319.434,-769.63c-212.959,-211.714 -468.876,-317.571 -767.766,-317.571c-301.379,-0 -557.922,105.857 -769.63,317.571c-211.714,211.707 -317.571,468.257 -317.571,769.63l0,3358.75c0,94.644 -34.867,176.841 -104.612,246.581c-69.74,69.739 -154.42,104.612 -254.054,104.612c-94.644,0 -176.841,-34.873 -246.581,-104.612c-69.739,-69.74 -104.606,-151.937 -104.606,-246.581l0,-3358.75c0,-326.283 80.321,-627.662 240.977,-904.131c160.649,-276.469 378.586,-495.031 653.816,-655.686c275.224,-160.649 575.978,-240.977 902.261,-240.977c291.416,-0 563.526,64.761 816.335,194.277c252.81,129.517 460.164,307.608 622.058,534.263c166.878,-226.655 376.722,-404.746 629.532,-534.263c252.809,-129.516 524.919,-194.277 816.335,-194.277l8.638,-0c164.822,-0 323.472,20.718 475.941,62.154l5.239,1.431c41.114,11.263 81.609,24.024 121.497,38.284c72.687,25.87 143.907,56.675 213.652,92.408c250.636,128.408 456.592,304.549 617.866,528.412l4.328,5.665c166.872,-226.58 376.667,-404.598 629.396,-534.077c252.809,-129.516 524.925,-194.277 816.335,-194.277c495.657,-0 919.704,176.222 1272.14,528.659c352.437,352.438 528.653,776.484 528.653,1272.13l0,3358.75c0,94.644 -35.492,176.841 -106.476,246.581c-70.984,69.739 -153.801,104.612 -248.451,104.612c-99.627,0 -184.314,-34.873 -254.054,-104.612c-69.739,-69.74 -104.612,-151.937 -104.612,-246.581l-0,-3358.75c-0,-301.373 -105.851,-557.923 -317.565,-769.63c-211.713,-211.714 -468.257,-317.571 -769.636,-317.571c-298.883,-0 -554.807,105.857 -767.766,317.571c-212.952,211.707 -319.434,468.257 
-319.434,769.63l-0,3358.75c-0,94.644 -34.867,176.841 -104.606,246.581c-69.746,69.739 -154.427,104.612 -254.055,104.612l-0.582,-0.006Z" style="stroke:#185ADB;stroke-width:4.17px;"/></svg>
Before: 4.2 KiB

@@ -1,4 +0,0 @@
<svg width="300" height="300" viewBox="0 0 300 300" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="25.909" y="49.7723" width="250.569" height="200.455" fill="white"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M300 195C300 252.951 252.951 300 195 300H105C47.049 300 0 252.951 0 195V105C0 47.0489 47.049 0 105 0H195C252.951 0 300 47.0489 300 105V195ZM187.005 200.017H186.99C184.431 200.017 182.255 199.121 180.463 197.329C178.671 195.537 177.775 193.425 177.775 190.993V104.696C177.775 96.9523 175.055 90.3607 169.616 84.9212C164.202 79.5076 157.648 76.7879 149.952 76.762C142.256 76.7879 135.702 79.5076 130.288 84.9212C124.849 90.3607 122.129 96.9523 122.129 104.696V190.993C122.129 193.425 121.233 195.537 119.441 197.329C117.649 199.121 115.473 200.017 112.914 200.017H112.692C110.26 200.017 108.148 199.121 106.356 197.329C104.564 195.537 103.668 193.425 103.668 190.993V104.696C103.668 96.9523 100.933 90.3607 95.461 84.9212C89.9894 79.4815 83.414 76.7617 75.7345 76.7617C67.991 76.7617 61.3995 79.4815 55.96 84.9212C50.5203 90.3607 47.8005 96.9523 47.8005 104.696V190.993C47.8005 193.425 46.9047 195.537 45.1127 197.329C43.3208 199.121 41.1451 200.017 38.5851 200.017C36.1534 200.017 34.0415 199.121 32.2496 197.329C30.4578 195.537 29.5619 193.425 29.5619 190.993V104.696C29.5619 96.3123 31.6257 88.5688 35.7535 81.4654C39.8811 74.362 45.4806 68.7463 52.5523 64.6186C59.6237 60.4909 67.3511 58.427 75.7345 58.427C83.2219 58.427 90.2134 60.091 96.7089 63.4187C103.204 66.7464 108.532 71.3222 112.692 77.1457C116.979 71.3222 122.371 66.7464 128.867 63.4187C135.362 60.091 142.354 58.427 149.841 58.427H150.063C154.298 58.427 158.374 58.9594 162.292 60.024L162.426 60.0607C163.483 60.3501 164.523 60.678 165.548 61.0444C167.415 61.7091 169.245 62.5006 171.037 63.4187C177.477 66.7179 182.769 71.2436 186.912 76.9954L187.024 77.141C191.311 71.3193 196.701 66.7454 203.195 63.4187C209.691 60.091 216.682 58.427 224.169 58.427C236.905 58.427 247.8 62.9548 256.855 72.0101C265.91 81.0655 270.438 91.9607 270.438 104.696V190.993C270.438 193.425 269.526 195.537 267.702 197.329C265.879 199.121 263.751 200.017 261.319 200.017C258.759 200.017 256.583 199.121 254.791 197.329C252.999 195.537 252.103 193.425 252.103 190.993V104.696C252.103 96.9523 249.384 90.3607 243.944 84.9212C238.504 79.4815 231.913 76.7617 224.169 76.7617C216.49 76.7617 209.915 79.4815 204.443 84.9212C198.971 90.3607 196.236 96.9523 196.236 104.696V190.993C196.236 193.425 195.34 195.537 193.548 197.329C191.756 199.121 189.58 200.017 187.02 200.017L187.005 200.017ZM187.03 241.573C199.765 241.573 210.66 237.045 219.716 227.99C228.771 218.935 233.299 208.039 233.299 195.304V109.007C233.299 106.575 232.387 104.463 230.563 102.671C228.739 100.879 226.611 99.9832 224.179 99.9832C221.619 99.9832 219.444 100.879 217.652 102.671C215.86 104.463 214.964 106.575 214.964 109.007V195.304C214.964 203.048 212.244 209.639 206.804 215.079C201.365 220.518 194.773 223.238 187.03 223.238C179.35 223.238 172.775 220.518 167.303 215.079C161.832 209.639 159.096 203.048 159.096 195.304V109.007C159.096 106.575 158.2 104.463 156.408 102.671C154.616 100.879 152.44 99.9832 149.881 99.9832C147.449 99.9832 145.337 100.879 143.545 102.671C141.753 104.463 140.857 106.575 140.857 109.007V195.304C140.857 203.048 138.122 209.639 132.65 215.079C127.178 220.518 120.603 223.238 112.923 223.238C105.18 223.238 98.5884 220.518 93.1488 215.079C87.7093 209.639 84.9894 203.048 84.9894 195.304V109.007C84.9894 106.575 84.0934 104.463 82.3016 102.671C80.5097 100.879 78.3338 99.9832 75.7741 99.9832C73.3422 99.9832 71.2304 100.879 69.4386 102.671C67.6467 104.463 66.7507 106.575 66.7507 109.007V195.304C66.7507 203.688 68.8146 211.431 72.9422 218.535C77.07 225.638 82.6696 
231.254 89.741 235.381C96.8125 239.509 104.54 241.573 112.923 241.573C120.411 241.573 127.402 239.909 133.898 236.581C140.393 233.254 145.721 228.678 149.881 222.854C154.168 228.678 159.56 233.254 166.056 236.581C172.551 239.909 179.543 241.573 187.03 241.573V241.573Z" fill="#185ADB"/>
</svg>
Before: 3.9 KiB

File diff suppressed because one or more lines are too long
@@ -1,18 +0,0 @@
/**
 * This file is made for tweaking parameters on the front-end
 * without having to dive in the source code.
 *
 * THIS IS NOT A PLACE TO PUT SENSIBLE DATA LIKE API KEYS.
 * THIS FILE IS PUBLIC.
 */

export default {
    componentPrefix: 'mwmbl',
    publicApiURL: '/api/v1/',
    // publicApiURL: 'http://localhost:5000/',
    searchQueryParam: 'q',
    commands: {
        'go: ': 'https://',
        'search: google.com ': 'https://www.google.com/search?q=',
    }
}

front-end/package-lock.json (generated, 1269 changes)
File diff suppressed because it is too large.
@@ -1,19 +0,0 @@
{
  "name": "front-end",
  "private": true,
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "vite build",
    "preview": "vite preview"
  },
  "devDependencies": {
    "@vitejs/plugin-legacy": "^2.3.1",
    "terser": "^5.16.1",
    "vite": "^3.2.3"
  },
  "dependencies": {
    "chart.js": "^4.4.0",
    "sortablejs": "^1.15.0"
  }
}

@@ -1,21 +0,0 @@
import define from "../../utils/define.js";


export default define('add-button', class extends HTMLButtonElement {
    constructor() {
        super();
        this.__setup();
    }

    __setup() {
        this.__events();
    }

    __events() {
        this.addEventListener('click', (e) => {
            console.log("Add button");
            document.querySelector('.modal').style.display = 'block';
            document.querySelector('.modal input').focus();
        })
    }
}, { extends: 'button' });

@@ -1,69 +0,0 @@
import define from '../../utils/define.js';
import config from "../../../config.js";
import {globalBus} from "../../utils/events.js";


const FETCH_URL = '/app/fetch?'


const template = () => /*html*/`
    <form class="modal-content">
        <span class="close">×</span>
        <input class="add-result" placeholder="Enter a URL...">
        <button>Save</button>
    </form>
`;

export default define('add-result', class extends HTMLDivElement {
    constructor() {
        super();
        this.classList.add('modal');
        this.__setup();
    }

    __setup() {
        this.innerHTML = template();
        this.__events();
        this.style.display = 'none';
    }

    __events() {
        this.querySelector('.close').addEventListener('click', e => {
            if (e.target === this) {
                this.style.display = 'none';
            }
        });

        this.addEventListener('click', e => {
            this.style.display = 'none';
        });

        this.querySelector('form').addEventListener('click', e => {
            // Clicking on the form shouldn't close it
            e.stopPropagation();
        });

        this.addEventListener('submit', this.__urlSubmitted.bind(this));
    }

    async __urlSubmitted(e) {
        e.preventDefault();
        const value = this.querySelector('input').value;
        console.log("Input value", value);

        const query = document.querySelector('.search-bar input').value;

        const url = `${FETCH_URL}url=${encodeURIComponent(value)}&query=${encodeURIComponent(query)}`;
        const response = await fetch(url);
        if (response.status === 200) {
            const data = await response.text();
            console.log("Data", data);

            const addResultEvent = new CustomEvent('curate-add-result', {detail: data});
            globalBus.dispatch(addResultEvent);
        } else {
            console.log("Bad response", response);
            // TODO
        }
    }
}, { extends: 'div' });

@@ -1,35 +0,0 @@
import define from "../../utils/define.js";
import {globalBus} from "../../utils/events.js";


export default define('delete-button', class extends HTMLButtonElement {
    constructor() {
        super();
        this.__setup();
    }

    __setup() {
        this.__events();
    }

    __events() {
        this.addEventListener('click', (e) => {
            console.log("Delete button");

            const result = this.closest('.result');
            const parent = result.parentNode;

            const index = Array.prototype.indexOf.call(parent.getElementsByClassName('result'), result);
            console.log("Delete index", index);

            const beginCuratingEvent = new CustomEvent('curate-delete-result', {
                detail: {
                    data: {
                        delete_index: index
                    }
                }
            });
            globalBus.dispatch(beginCuratingEvent);
        })
    }
}, { extends: 'button' });

@@ -1,45 +0,0 @@
import define from '../../utils/define.js';
import escapeString from '../../utils/escapeString.js';
import { globalBus } from '../../utils/events.js';


export default define('result', class extends HTMLLIElement {
    constructor() {
        super();
        this.classList.add('result');
        this.__setup();
    }

    __setup() {
        this.__events();
    }

    __events() {
        this.addEventListener('keydown', (e) => {
            if (this.firstElementChild === document.activeElement) {
                if (e.key === 'ArrowDown') {
                    e.preventDefault();
                    this?.nextElementSibling?.firstElementChild.focus();
                }
                if (e.key === 'ArrowUp') {
                    e.preventDefault();
                    if (this.previousElementSibling)
                        this.previousElementSibling?.firstElementChild.focus();
                    else {
                        const focusSearchEvent = new CustomEvent('focus-search');
                        globalBus.dispatch(focusSearchEvent);
                    }
                }
            }
        })
    }

    __handleBold(input) {
        let text = '';
        for (const part of input) {
            if (part.is_bold) text += `<strong>${escapeString(part.value)}</strong>`;
            else text += escapeString(part.value);
        }
        return text;
    }
}, { extends: 'li' });

@@ -1,53 +0,0 @@
import define from "../../utils/define.js";
import {globalBus} from "../../utils/events.js";


const VALIDATED_CLASS = "validated";

export default define('validate-button', class extends HTMLButtonElement {
    constructor() {
        super();
        this.__setup();
    }

    __setup() {
        this.__events();
    }

    __events() {
        this.addEventListener('click', (e) => {
            console.log("Validate button");

            const result = this.closest('.result');
            const parent = result.parentNode;

            const index = Array.prototype.indexOf.call(parent.getElementsByClassName('result'), result);
            console.log("Validate index", index);

            const curationValidateEvent = new CustomEvent('curate-validate-result', {
                detail: {
                    data: {
                        validate_index: index
                    }
                }
            });
            globalBus.dispatch(curationValidateEvent);
        })
    }

    isValidated() {
        return this.classList.contains(VALIDATED_CLASS);
    }

    validate() {
        this.classList.add(VALIDATED_CLASS);
    }

    unvalidate() {
        this.classList.remove(VALIDATED_CLASS);
    }

    toggleValidate() {
        this.classList.toggle(VALIDATED_CLASS);
    }
}, { extends: 'button' });

@@ -1,191 +0,0 @@
import {globalBus} from '../../utils/events.js';
import Sortable from 'sortablejs';

class ResultsHandler {
    constructor() {
        this.results = null;
        this.oldIndex = null;
        this.curating = false;
        this.__setup();
    }

    __setup() {
        this.__events();
        this.__initializeResults();
    }

    __events() {
        document.body.addEventListener('htmx:load', e => {
            this.__initializeResults();
        });

        // Focus first element when coming from the search bar
        globalBus.on('focus-result', () => {
            this.results.firstElementChild.firstElementChild.focus();
        });

        globalBus.on('curate-delete-result', (e) => {
            console.log("Curate delete result event", e);
            this.__beginCurating.bind(this)();

            const children = this.results.getElementsByClassName('result');
            let deleteIndex = e.detail.data.delete_index;
            const child = children[deleteIndex];
            this.results.removeChild(child);
            const newResults = this.__getResults();

            const curationSaveEvent = new CustomEvent('save-curation', {
                detail: {
                    type: 'delete',
                    data: {
                        timestamp: Date.now(),
                        url: document.location.href,
                        results: newResults,
                        curation: {
                            delete_index: deleteIndex
                        }
                    }
                }
            });
            globalBus.dispatch(curationSaveEvent);
        });

        globalBus.on('curate-validate-result', (e) => {
            console.log("Curate validate result event", e);
            this.__beginCurating.bind(this)();

            const children = this.results.getElementsByClassName('result');
            const validateChild = children[e.detail.data.validate_index];
            validateChild.querySelector('.curate-approve').toggleValidate();

            const newResults = this.__getResults();

            const curationStartEvent = new CustomEvent('save-curation', {
                detail: {
                    type: 'validate',
                    data: {
                        timestamp: Date.now(),
                        url: document.location.href,
                        results: newResults,
                        curation: e.detail.data
                    }
                }
            });
            globalBus.dispatch(curationStartEvent);
        });

        globalBus.on('begin-curating-results', (e) => {
            // We might not be online, or logged in, so save the curation in local storage in case:
            console.log("Begin curation event", e);
            this.__beginCurating.bind(this)();
        });

        globalBus.on('curate-add-result', (e) => {
            console.log("Add result", e);
            this.__beginCurating();
            const resultData = e.detail;
            this.results.insertAdjacentHTML('afterbegin', resultData);

            const newResults = this.__getResults();
            const url = newResults[0].url;

            let detail = {
                type: 'add',
                data: {
                    timestamp: Date.now(),
                    url: document.location.href,
                    results: newResults,
                    curation: {
                        insert_index: 0,
                        url: url
                    }
                }
            };
            console.log("Detail", detail);
            const curationSaveEvent = new CustomEvent('save-curation', {
                detail: detail
            });
            globalBus.dispatch(curationSaveEvent);
        });
    }

    __initializeResults() {
        this.results = document.querySelector('.results');

        if (this.results) {
            const sortable = new Sortable(this.results, {
                "onStart": this.__sortableActivate.bind(this),
                "onEnd": this.__sortableDeactivate.bind(this),
                "handle": ".handle",
            });
        }

        this.curating = false;
    }

    __sortableActivate(event) {
        console.log("Sortable activate", event);
        this.__beginCurating();
        this.oldIndex = event.oldIndex;
    }

    __beginCurating() {
        if (!this.curating) {
            const results = this.__getResults();
            const curationStartEvent = new CustomEvent('save-curation', {
                detail: {
                    type: 'begin',
                    data: {
                        timestamp: Date.now(),
                        url: document.location.href,
                        results: results,
                        curation: {}
                    }
                }
            });
            globalBus.dispatch(curationStartEvent);
            this.curating = true;
        }
    }

    __getResults() {
        const resultsElements = document.querySelectorAll('.results .result:not(.ui-sortable-placeholder)');
        const results = [];
        for (let resultElement of resultsElements) {
            const result = {
                url: resultElement.querySelector('a').href,
                title: resultElement.querySelector('.title').innerText,
                extract: resultElement.querySelector('.extract').innerText,
                curated: resultElement.querySelector('.curate-approve').isValidated()
            }
            results.push(result);
        }
        console.log("Results", results);
        return results;
    }

    __sortableDeactivate(event) {
        const newIndex = event.newIndex;
        console.log('Sortable deactivate', this.oldIndex, newIndex);

        const newResults = this.__getResults();

        const curationMoveEvent = new CustomEvent('save-curation', {
            detail: {
                type: 'move',
                data: {
                    timestamp: Date.now(),
                    url: document.location.href,
                    results: newResults,
                    curation: {
                        old_index: this.oldIndex,
                        new_index: newIndex,
                    }
                }
            }
        });
        globalBus.dispatch(curationMoveEvent);
    }
}

const resultsHandler = new ResultsHandler();

@@ -1,112 +0,0 @@
import define from '../../utils/define.js';
import {globalBus} from "../../utils/events.js";
import config from "../../../config.js";


const CURATION_KEY_PREFIX = "curation-";
const CURATION_URL = config.publicApiURL + "curation/";


const template = () => /*html*/`
    <span></span>
`;


export default define('save', class extends HTMLDivElement {
    constructor() {
        super();
        this.currentCurationId = null;
        this.classList.add('save');
        this.sendId = 0;
        this.sending = false;
        this.__setup();
    }

    __setup() {
        this.innerHTML = template();
        this.__events();
        // TODO: figure out when to call __sendToApi()
        // setInterval(this.__sendToApi.bind(this), 1000);
    }

    __events() {
        globalBus.on('save-curation', (e) => {
            // We might not be online, or logged in, so save the curation in local storage in case:
            console.log("Curation event", e);
            this.__setCuration(e.detail);
            this.__sendToApi();
        });
    }

    __setCuration(curation) {
        this.sendId += 1;
        const key = CURATION_KEY_PREFIX + this.sendId;
        localStorage.setItem(key, JSON.stringify(curation));
    }

    __getOldestCurationKey() {
        let oldestId = Number.MAX_SAFE_INTEGER;
        let oldestKey = null;
        for (let i=0; i<localStorage.length; ++i) {
            const key = localStorage.key(i);
            if (key.startsWith(CURATION_KEY_PREFIX)) {
                const timestamp = parseInt(key.substring(CURATION_KEY_PREFIX.length));
                if (timestamp < oldestId) {
                    oldestKey = key;
                    oldestId = timestamp;
                }
            }
        }
        return oldestKey;
    }

    async __sendToApi() {
        if (this.sending) {
            return;
        }
        this.sending = true;
        const csrftoken = document.cookie
            .split('; ')
            .find((row) => row.startsWith('csrftoken='))
            ?.split('=')[1];

        if (!csrftoken) {
            console.log("No auth");
            return;
        }

        const key = this.__getOldestCurationKey();
        if (key !== null) {
            const value = JSON.parse(localStorage.getItem(key));
            console.log("Value", value);
            const url = CURATION_URL + value['type'];

            const data = value['data'];
            console.log("Data", data);
            const response = await fetch(url, {
                method: 'POST',
                cache: 'no-cache',
                headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrftoken},
                credentials: "same-origin",
                mode: "same-origin",
                body: JSON.stringify(data),
            });

            console.log("Save curation API response", response);

            if (response.status === 200) {
                localStorage.removeItem(key);
            } else {
                console.log("Bad response, skipping");
                return;
            }

            const responseData = await response.json();
            console.log("Response data", responseData);
            // There may be more to send, wait a second and see
            setTimeout(this.__sendToApi.bind(this), 1000);
        }
        this.sending = false;
    }
}, { extends: 'div' });

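For illustration only (not part of the diff): a minimal sketch of how a curation reaches this save component. Another component dispatches a `save-curation` event on the global bus; `save.js` then queues it under a `curation-<n>` key in localStorage and drains the queue to the curation API. The import path and the empty `results` array here are hypothetical placeholders; the payload shape follows the `detail` objects built in results.js above.

import {globalBus} from './utils/events.js'; // illustrative path

// Hypothetical caller: queue a "validate" curation for the save component to send.
globalBus.dispatch(new CustomEvent('save-curation', {
    detail: {
        type: 'validate',                     // becomes the API path suffix: curation/validate
        data: {
            timestamp: Date.now(),
            url: document.location.href,
            results: [],                      // the current result list, as built by __getResults()
            curation: {validate_index: 0}
        }
    }
}));
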
@@ -1,26 +0,0 @@
/**
 * This file is mainly used as an entry point
 * to import components or define globals.
 *
 * Please do not pollute this file if you can make
 * util or component files instead.
 */
import 'vite/modulepreload-polyfill';

// Waiting for top-level await to be better supported.
(async () => {
    // Check if a suggestion redirect is needed.
    const { redirectToSuggestions } = await import("./utils/suggestions.js");
    const redirected = redirectToSuggestions();

    if (!redirected) {
        // Load components only after redirects are checked.
        import("./components/organisms/results.js");
        import("./components/organisms/save.js");
        import("./components/molecules/add-button.js");
        import("./components/molecules/add-result.js");
        import("./components/molecules/delete-button.js");
        import("./components/molecules/result.js");
        import("./components/molecules/validate-button.js");
    }
})();

@@ -1,69 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Mwmbl Stats</title>

    <!-- Favicons -->
    <link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">

    <!-- Fonts import -->
    <link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
    <noscript>
        <link rel="stylesheet" href="/static/fonts/inter/inter.css">
    </noscript>

    <!-- CSS Stylesheets (this is critical CSS) -->
    <link rel="stylesheet" type="text/css" href="/static/css/reset.css">
    <link rel="stylesheet" type="text/css" href="/static/css/theme.css">
    <link rel="stylesheet" type="text/css" href="/static/css/global.css">
    <link rel="stylesheet" type="text/css" href="stats.css">
</head>
<body>
<section>
    <div class="info">
        <h1>Mwmbl Stats</h1>
        <p>
            Mwmbl is a <a href="https://matrix.to/#/#mwmbl:matrix.org">community</a> devoted to building a
            <a href="https://en.wikipedia.org/wiki/Free_and_open-source_software">free</a> search engine. You can try it
            out <a href="/">here</a> or help us improve the index by
            <a href="https://en.wikipedia.org/wiki/Web_crawler">crawling</a> the web with our
            <a href="https://addons.mozilla.org/en-GB/firefox/addon/mwmbl-web-crawler/">Firefox extension</a>
            or <a href="https://github.com/mwmbl/crawler-script">command line script</a>.
        </p>
    </div>
</section>
<section>
    <div class="info">
        <h1>Number of users crawling today: <span id="num-users"></span></h1>
        <div class="wrap">
            <canvas id="users-by-day"></canvas>
        </div>
    </div>
    <div class="info">
        <h1>Number of URLs crawled today: <span id="num-urls"></span></h1>
        <div class="wrap">
            <canvas id="urls-by-day"></canvas>
        </div>
    </div>
    <div class="info">
        <div class="wrap">
            <canvas id="urls-by-hour"></canvas>
        </div>
    </div>
</section>
<section>
    <div class="info tall">
        <div class="wrap tall">
            <canvas id="urls-by-user"></canvas>
        </div>
    </div>
    <div class="info tall">
        <div class="wrap tall">
            <canvas id="urls-by-domain"></canvas>
        </div>
    </div>
</section>
<script src="./stats.js" type="module"></script>
</body>
</html>

@@ -1,33 +0,0 @@

body {
    background: #eeeeee;
}

section {
    display: flex;
    flex-wrap: wrap;
}

.info {
    flex: 1 500px;
    margin: 10px;
    padding: 50px;
    background: #ffffff;
    border-radius: 50px;
}

.wrap {
    height: 512px;
}

#users-by-day-info {
    width: 100%;
}

#url-info {
    height: 3000px;
}

.tall {
    height: 3000px;
}

@@ -1,113 +0,0 @@
import {Chart} from "chart.js/auto";

(async () => {
    Chart.defaults.font.size = 16;

    function createChart(elementId, labels, label) {
        const canvas = document.getElementById(elementId);
        return new Chart(canvas, {
            type: 'line',
            data: {
                labels: labels,
                datasets: [{
                    label: label,
                    borderWidth: 1
                }]
            },
            options: {
                scales: {
                    y: {
                        beginAtZero: true
                    }
                },
                maintainAspectRatio: false
            }
        });
    }

    const urlsCrawledDailyChart = createChart('urls-by-day', null, "URLs crawled by day");
    const urlsCrawledHourlyChart = createChart('urls-by-hour', [...Array(24).keys()], "URLs crawled today by hour")
    const usersCrawledDailyChart = createChart('users-by-day', null, "Number of users crawling by day")

    const urlsByUserCanvas = document.getElementById('urls-by-user');
    const byUserChart = new Chart(urlsByUserCanvas, {
        type: 'bar',
        data: {
            datasets: [{
                label: "Top users",
                borderWidth: 1
                // barThickness: 15
            }]
        },
        options: {
            scales: {
                x: {
                    beginAtZero: true
                }
            },
            indexAxis: 'y',
            maintainAspectRatio: false
        }
    });

    const urlsByDomainCanvas = document.getElementById('urls-by-domain');
    const byDomainChart = new Chart(urlsByDomainCanvas, {
        type: 'bar',
        data: {
            datasets: [{
                label: "Top domains",
                borderWidth: 1
            }]
        },
        options: {
            scales: {
                x: {
                    beginAtZero: true
                }
            },
            indexAxis: 'y',
            maintainAspectRatio: false
        }
    });


    function updateStats() {
        fetch("https://api.mwmbl.org/crawler/stats").then(result => {
            result.json().then(stats => {
                console.log("Stats", stats);

                const urlCountSpan = document.getElementById("num-urls");
                urlCountSpan.innerText = stats.urls_crawled_today;

                const numUsers = Object.values(stats.users_crawled_daily)[Object.keys(stats.users_crawled_daily).length - 1];
                const userCountSpan = document.getElementById("num-users");
                userCountSpan.innerText = numUsers;

                usersCrawledDailyChart.data.labels = Object.keys(stats.users_crawled_daily);
                usersCrawledDailyChart.data.datasets[0].data = Object.values(stats.users_crawled_daily);
                usersCrawledDailyChart.update();

                urlsCrawledHourlyChart.data.datasets[0].data = stats.urls_crawled_hourly;
                urlsCrawledHourlyChart.update();

                urlsCrawledDailyChart.data.labels = Object.keys(stats.urls_crawled_daily);
                urlsCrawledDailyChart.data.datasets[0].data = Object.values(stats.urls_crawled_daily);
                urlsCrawledDailyChart.update();

                byUserChart.data.labels = Object.keys(stats.top_users);
                byUserChart.data.datasets[0].data = Object.values(stats.top_users);
                byUserChart.update();

                byDomainChart.data.labels = Object.keys(stats.top_domains);
                byDomainChart.data.datasets[0].data = Object.values(stats.top_domains);
                byDomainChart.update();
            })
        });
    }

    updateStats();
    setInterval(() => {
        updateStats();
    }, 5000);

})();

@@ -1,13 +0,0 @@
/**
 * A debounce function to reduce input spam
 * @param {*} callback Function that will be called
 * @param {*} timeout Minimum amount of time between calls
 * @returns The debounced function
 */
export default (callback, timeout = 100) => {
    let timer;
    return (...args) => {
        clearTimeout(timer);
        timer = setTimeout(() => { callback.apply(this, args); }, timeout);
    };
}
@@ -1,15 +0,0 @@
import config from '../../config.js';

/** Define a web component, this is a wrapper
 * around the `customElements.define` native function.
 * @function define
 * @param {string} name Name of the component (will be prefixed by the config `componentPrefix`)
 * @param {CustomElementConstructor} constructor
 * @param {ElementDefinitionOptions} [options]
 * @returns {string} Returns the element name ready for the DOM (e.g. `<search-bar></search-bar>`)
 */
export default (name, constructor, options) => {
    const componentName = `${config.componentPrefix}-${name}`;
    if (!customElements.get(componentName)) customElements.define(componentName, constructor, options);
    return componentName;
}
@@ -1,10 +0,0 @@
/**
 * Escapes string with HTML Characters Codes.
 * @param {string} input String to escape
 * @returns {string}
 */
export default (input) => {
    return String(input).replace(/[^\w. ]/gi, (character) => {
        return `&#${character.charCodeAt(0)};`;
    });
}
@@ -1,30 +0,0 @@
/**
 * A class destined to be used as an event bus.
 *
 * It is simply a trick using a div element
 * to carry events.
 */
class Bus {
    constructor() {
        this.element = document.createElement('div');
    }

    on(eventName, callback) {
        this.element.addEventListener(eventName, callback);
    }

    dispatch(event) {
        this.element.dispatchEvent(event);
    }
}

/**
 * A global event bus that can be used to
 * dispatch events in between components
 */
const globalBus = new Bus();

export {
    Bus,
    globalBus,
}
@@ -1,24 +0,0 @@
/**
 * Handle redirect requests from the suggestion back-end.
 */

import config from "../../config.js";

const redirectToSuggestions = () => {
    const search = decodeURIComponent(document.location.search).replace(/\+/g, ' ').substr(3);
    console.log("Search", search);
    for (const [command, urlTemplate] of Object.entries(config.commands)) {
        console.log("Command", command);
        if (search.startsWith(command)) {
            const newUrl = urlTemplate + search.substr(command.length);
            window.location.replace(newUrl);
            return true;
        }
    }
    return false;
}

export {
    redirectToSuggestions
};
@@ -1,24 +0,0 @@
import legacy from '@vitejs/plugin-legacy'
import { resolve } from 'path'

export default {
    root: './src',
    base: '/static',
    publicDir: '../assets',
    build: {
        outDir: '../dist',
        manifest: true,
        rollupOptions: {
            input: {
                index: resolve(__dirname, 'src/index.js'),
                stats: resolve(__dirname, 'src/stats/index.html'),
            },
        },
        minify: false,
    },
    plugins: [
        legacy({
            targets: ['defaults', 'not IE 11'],
        }),
    ]
}
22
manage.py
22
manage.py
@@ -1,22 +0,0 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    """Run administrative tasks."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
@@ -1,8 +0,0 @@
from django.contrib.admin import ModelAdmin
from django.contrib.auth.admin import UserAdmin
from django.contrib import admin

from mwmbl.models import MwmblUser, UserCuration

admin.site.register(MwmblUser, UserAdmin)
admin.site.register(UserCuration, ModelAdmin)
@@ -1,48 +0,0 @@
import os
import shutil
from multiprocessing import Process, Queue
from pathlib import Path

from django.apps import AppConfig
from django.conf import settings

from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.indexdb import IndexDatabase


class MwmblConfig(AppConfig):
    name = "mwmbl"
    verbose_name = "Mwmbl Application"

    def ready(self):
        # Imports here to avoid AppRegistryNotReady exception
        from mwmbl.search_setup import queued_batches
        from mwmbl import background
        from mwmbl.indexer.paths import INDEX_NAME
        from mwmbl.indexer.update_urls import update_urls_continuously
        from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
        from mwmbl.url_queue import update_queue_continuously

        index_path = Path(settings.DATA_PATH) / INDEX_NAME
        try:
            existing_index = TinyIndex(item_factory=Document, index_path=index_path)
            if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != settings.NUM_PAGES:
                raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
                                 f"({existing_index.num_pages}) do not match")
        except FileNotFoundError:
            print("Creating a new index")
            TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
                             page_size=PAGE_SIZE)

        with Database() as db:
            url_db = URLDatabase(db.connection)
            url_db.create_tables()
            index_db = IndexDatabase(db.connection)
            index_db.create_tables()

        if settings.RUN_BACKGROUND_PROCESSES:
            new_item_queue = Queue()
            Process(target=background.run, args=(settings.DATA_PATH,)).start()
            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()
@@ -1,16 +0,0 @@
"""
ASGI config for app project.

It exposes the ASGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""

import os

from django.core.asgi import get_asgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')

application = get_asgi_application()
@@ -1,9 +1,7 @@
"""
Script that updates data in a background process.
"""
import logging
import sys
from logging import getLogger, basicConfig
from logging import getLogger
from pathlib import Path
from time import sleep

@@ -13,8 +11,6 @@ from mwmbl.indexer import index_batches, historical
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME


basicConfig(stream=sys.stdout, level=logging.INFO)
logger = getLogger(__name__)
@@ -1,22 +1,23 @@
import gzip
import hashlib
import json
import os
from datetime import datetime, timezone, date
from queue import Queue, Empty
from typing import Union
from uuid import uuid4

import boto3
import justext
import requests
from fastapi import HTTPException
from ninja import NinjaAPI
from redis import Redis
from fastapi import HTTPException, APIRouter
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor

from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.stats import MwmblStats, StatsManager
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
from mwmbl.database import Database
from mwmbl.format import format_result
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
from mwmbl.settings import (

@@ -30,9 +31,9 @@ from mwmbl.settings import (
    PUBLIC_URL_PREFIX,
    PUBLIC_USER_ID_LENGTH,
    FILE_NAME_SUFFIX,
    DATE_REGEX)

stats_manager = StatsManager(Redis.from_url(os.environ.get("REDIS_URL")))
    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
from mwmbl.tinysearchengine.indexer import Document
from mwmbl.url_queue import URLQueue


def get_bucket(name):
@@ -50,11 +51,56 @@ def upload(data: bytes, name: str):
last_batch = None


def create_router(batch_cache: BatchCache, queued_batches: Queue, version: str) -> NinjaAPI:
    router = NinjaAPI(urls_namespace=f"crawler-{version}")
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
                     encoding=None, default_encoding=DEFAULT_ENCODING,
                     enc_errors=DEFAULT_ENC_ERRORS):
    """
    Converts an HTML page into a list of classified paragraphs. Each paragraph
    is represented as instance of class ``justext.paragraph.Paragraph``.
    """
    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)

    titles = dom.xpath("//title")
    title = titles[0].text if len(titles) > 0 else None

    dom = preprocessor(dom)

    paragraphs = ParagraphMaker.make_paragraphs(dom)

    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
                        stopwords_low, stopwords_high, max_link_density, no_headings)
    revise_paragraph_classification(paragraphs, max_heading_distance)

    return paragraphs, title


def get_router(batch_cache: BatchCache, queued_batches: Queue):
    router = APIRouter(prefix="/crawler", tags=["crawler"])

    @router.on_event("startup")
    async def on_startup():
        with Database() as db:
            url_db = URLDatabase(db.connection)
            return url_db.create_tables()

    @router.get('/fetch')
    def fetch_url(url: str, query: str):
        response = requests.get(url)
        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']

        extract = ' '.join([p.text for p in good_paragraphs])
        if len(extract) > NUM_EXTRACT_CHARS:
            extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'

        result = Document(title=title, url=url, extract=extract, score=0.0)
        return format_result(result, query)

    @router.post('/batches/')
    def post_batch(request, batch: Batch):
    def post_batch(batch: Batch):
        if len(batch.items) > MAX_BATCH_SIZE:
            raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
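
For orientation, a minimal usage sketch (not part of the diff) of the justext_with_dom helper added above, mirroring what the /fetch handler does; the URL is a stand-in:

import justext
import requests

response = requests.get("https://example.com/")  # stand-in URL for illustration
paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
# keep only paragraphs justext classifies as real content
extract = " ".join(p.text for p in paragraphs if p.class_type == "good")
print(title, extract[:80])
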
@@ -82,9 +128,6 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue, version: str)
        # Using an approach from https://stackoverflow.com/a/30476450
        epoch_time = (now - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds()
        hashed_batch = HashedBatch(user_id_hash=user_id_hash, timestamp=epoch_time, items=batch.items)

        stats_manager.record_batch(hashed_batch)

        data = gzip.compress(hashed_batch.json().encode('utf8'))
        upload(data, filename)
@@ -108,7 +151,7 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue, version: str)
        }

    @router.post('/batches/new')
    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
        user_id_hash = _get_user_id_hash(batch_request)
        try:
            urls = queued_batches.get(block=False)
@@ -123,14 +166,14 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue, version: str)
        return urls

    @router.get('/batches/{date_str}/users/{public_user_id}')
    def get_batches_for_date_and_user(request, date_str, public_user_id):
    def get_batches_for_date_and_user(date_str, public_user_id):
        check_date_str(date_str)
        check_public_user_id(public_user_id)
        prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
        return get_batch_ids_for_prefix(prefix)

    @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
    def get_batch_from_id(request, date_str, public_user_id, batch_id):
    def get_batch_from_id(date_str, public_user_id, batch_id):
        url = get_batch_url(batch_id, date_str, public_user_id)
        data = json.loads(gzip.decompress(requests.get(url).content))
        return {
@@ -138,22 +181,18 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue, version: str)
            'batch': data,
        }

    @router.get('/latest-batch')
    def get_latest_batch(request) -> list[HashedBatch]:
    @router.get('/latest-batch', response_model=list[HashedBatch])
    def get_latest_batch():
        return [] if last_batch is None else [last_batch]

    @router.get('/batches/{date_str}/users')
    def get_user_id_hashes_for_date(request, date_str: str):
    def get_user_id_hashes_for_date(date_str: str):
        check_date_str(date_str)
        prefix = f'1/{VERSION}/{date_str}/1/'
        return get_subfolders(prefix)

    @router.get('/stats')
    def get_stats(request) -> MwmblStats:
        return stats_manager.get_stats()

    @router.get('/')
    def status(request):
    def status():
        return {
            'status': 'ok'
        }
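
Across these hunks the Ninja create_router (whose handlers all take a `request` first argument) becomes a plain FastAPI APIRouter. A hedged sketch of how the converted router would be mounted, assuming the get_router defined above and a BatchCache instance (here called batch_cache, as in main.py later in this diff):

from queue import Queue
from fastapi import FastAPI

app = FastAPI()
# batch_cache is assumed: a BatchCache instance as constructed elsewhere in this diff
app.include_router(get_router(batch_cache, queued_batches=Queue()))
# the router's prefix means e.g. GET /crawler/ is now served by the status() handler above
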
@@ -1,21 +1,21 @@
from typing import Optional

from ninja import Schema
from pydantic import BaseModel


class ItemContent(Schema):
class ItemContent(BaseModel):
    title: str
    extract: str
    links: list[str]
    extra_links: Optional[list[str]]


class ItemError(Schema):
class ItemError(BaseModel):
    name: str
    message: Optional[str]


class Item(Schema):
class Item(BaseModel):
    url: str
    status: Optional[int]
    timestamp: int

@@ -23,16 +23,16 @@ class Item(Schema):
    error: Optional[ItemError]


class Batch(Schema):
class Batch(BaseModel):
    user_id: str
    items: list[Item]


class NewBatchRequest(Schema):
class NewBatchRequest(BaseModel):
    user_id: str


class HashedBatch(Schema):
class HashedBatch(BaseModel):
    user_id_hash: str
    timestamp: int
    items: list[Item]
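
The Schema-to-BaseModel switch keeps the field definitions and JSON shape identical; only the Ninja dependency goes away. A quick sketch (values invented) showing a HashedBatch still serialises the way post_batch expects:

import gzip

from mwmbl.crawler.batch import HashedBatch

hashed_batch = HashedBatch(user_id_hash="abc123", timestamp=1700000000, items=[])
data = gzip.compress(hashed_batch.json().encode('utf8'))  # same call as in post_batch above
print(len(data))
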
@@ -1,133 +0,0 @@
import gzip
from datetime import datetime, timedelta
from glob import glob
from itertools import islice
from logging import getLogger
from urllib.parse import urlparse

from pydantic import BaseModel
from redis import Redis

from mwmbl.crawler.batch import HashedBatch
from mwmbl.indexer.update_urls import get_datetime_from_timestamp

logger = getLogger(__name__)

URL_DATE_COUNT_KEY = "url-count-{date}"
URL_HOUR_COUNT_KEY = "url-count-hour-{hour}"
USERS_KEY = "users-{date}"
USER_COUNT_KEY = "user-count-{date}"
HOST_COUNT_KEY = "host-count-{date}"
SHORT_EXPIRE_SECONDS = 60 * 60 * 24
LONG_EXPIRE_SECONDS = 60 * 60 * 24 * 30


class MwmblStats(BaseModel):
    urls_crawled_today: int
    urls_crawled_daily: dict[str, int]
    urls_crawled_hourly: list[int]
    users_crawled_daily: dict[str, int]
    top_users: dict[str, int]
    top_domains: dict[str, int]


class StatsManager:
    def __init__(self, redis: Redis):
        self.redis = redis

    def record_batch(self, hashed_batch: HashedBatch):
        date_time = get_datetime_from_timestamp(hashed_batch.timestamp)

        num_crawled_urls = sum(1 for item in hashed_batch.items if item.content is not None)

        url_count_key = URL_DATE_COUNT_KEY.format(date=date_time.date())
        self.redis.incrby(url_count_key, num_crawled_urls)
        self.redis.expire(url_count_key, LONG_EXPIRE_SECONDS)

        print("Date time", date_time)
        hour = datetime(date_time.year, date_time.month, date_time.day, date_time.hour)
        hour_key = URL_HOUR_COUNT_KEY.format(hour=hour)
        self.redis.incrby(hour_key, num_crawled_urls)
        self.redis.expire(hour_key, SHORT_EXPIRE_SECONDS)

        users_key = USERS_KEY.format(date=date_time.date())
        self.redis.sadd(users_key, hashed_batch.user_id_hash)
        self.redis.expire(users_key, LONG_EXPIRE_SECONDS)

        user_count_key = USER_COUNT_KEY.format(date=date_time.date())
        self.redis.zincrby(user_count_key, num_crawled_urls, hashed_batch.user_id_hash)
        self.redis.expire(user_count_key, SHORT_EXPIRE_SECONDS)

        host_key = HOST_COUNT_KEY.format(date=date_time.date())
        for item in hashed_batch.items:
            if item.content is None:
                continue

            host = urlparse(item.url).netloc
            self.redis.zincrby(host_key, 1, host)
        self.redis.expire(host_key, SHORT_EXPIRE_SECONDS)

    def get_stats(self) -> MwmblStats:
        date_time = datetime.now()
        date = date_time.date()

        urls_crawled_daily = {}
        users_crawled_daily = {}
        for i in range(29, -1, -1):
            date_i = date - timedelta(days=i)
            url_count_key = URL_DATE_COUNT_KEY.format(date=date_i)
            url_count = self.redis.get(url_count_key)
            if url_count is None:
                url_count = 0
            urls_crawled_daily[str(date_i)] = url_count

            user_day_count_key = USERS_KEY.format(date=date_i)
            user_day_count = self.redis.scard(user_day_count_key)
            users_crawled_daily[str(date_i)] = user_day_count

        hour_counts = []
        for i in range(date_time.hour + 1):
            hour = datetime(date_time.year, date_time.month, date_time.day, i)
            hour_key = URL_HOUR_COUNT_KEY.format(hour=hour)
            hour_count = self.redis.get(hour_key)
            if hour_count is None:
                hour_count = 0
            hour_counts.append(hour_count)

        user_count_key = USER_COUNT_KEY.format(date=date_time.date())
        user_counts = self.redis.zrevrange(user_count_key, 0, 100, withscores=True)

        host_key = HOST_COUNT_KEY.format(date=date_time.date())
        host_counts = self.redis.zrevrange(host_key, 0, 100, withscores=True)

        urls_crawled_today = list(urls_crawled_daily.values())[-1]
        return MwmblStats(
            urls_crawled_today=urls_crawled_today,
            urls_crawled_daily=urls_crawled_daily,
            urls_crawled_hourly=hour_counts,
            users_crawled_daily=users_crawled_daily,
            top_users=user_counts,
            top_domains=host_counts,
        )


def get_test_batches():
    for path in glob("./devdata/batches/**/*.json.gz", recursive=True):
        print("Processing path", path)
        with gzip.open(path) as gzip_file:
            yield HashedBatch.parse_raw(gzip_file.read())


if __name__ == '__main__':
    redis = Redis(host='localhost', port=6379, decode_responses=True)
    stats = StatsManager(redis)
    batches = get_test_batches()
    start = datetime.now()
    processed = 0
    for batch in islice(batches, 10000):
        stats.record_batch(batch)
        processed += 1
    total_time = (datetime.now() - start).total_seconds()
    print("Processed", processed)
    print("Total time", total_time)
    print("Time per batch", total_time/processed)
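
The deleted StatsManager above is built on one simple Redis pattern: a counter key per day or hour, incremented per batch and given a TTL so stale keys expire on their own. A standalone sketch of that pattern, reusing the key format and long expiry constant from the file (it assumes a Redis server on localhost):

from datetime import date

from redis import Redis

redis = Redis(host='localhost', port=6379, decode_responses=True)

def record_crawled_urls(num_crawled_urls: int) -> None:
    url_count_key = "url-count-{date}".format(date=date.today())
    redis.incrby(url_count_key, num_crawled_urls)          # create-or-increment the day's counter
    redis.expire(url_count_key, 60 * 60 * 24 * 30)         # LONG_EXPIRE_SECONDS: drop after 30 days

record_crawled_urls(42)
print(redis.get("url-count-{date}".format(date=date.today())))
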
@@ -1,13 +1,16 @@
"""
Database storing info on URLs
"""
import random
from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timedelta
from enum import Enum
from logging import getLogger

from psycopg2.extras import execute_values

from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.settings import CORE_DOMAINS
# Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
from mwmbl.utils import batch

@@ -129,12 +132,10 @@ class URLDatabase:
        sql = f"""
        SELECT url, status, user_id_hash, score, updated FROM urls
        WHERE status = %(status)s
        ORDER BY score DESC
        LIMIT %(num_urls)s
        """

        # TODO: reinstate this line once performance issue is resolved:
        # ORDER BY score DESC

        with self.connection.cursor() as cursor:
            cursor.execute(sql, {'status': status.value, 'num_urls': num_urls})
            results = cursor.fetchall()
@@ -9,6 +9,7 @@ import os
from logging import getLogger
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse

from pydantic import ValidationError
@@ -1,33 +0,0 @@
from datetime import timedelta

from requests_cache import CachedSession

from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.settings import BLACKLIST_DOMAINS_URL, EXCLUDED_DOMAINS, DOMAIN_BLACKLIST_REGEX


def get_blacklist_domains():
    with CachedSession(expire_after=timedelta(days=1)) as session:
        response = session.get(BLACKLIST_DOMAINS_URL)
        return set(response.text.split())


def is_domain_blacklisted(domain: str, blacklist_domains: set[str]):
    if domain in EXCLUDED_DOMAINS or DOMAIN_BLACKLIST_REGEX.search(domain) is not None \
            or domain in blacklist_domains:
        return True

    if domain in DOMAINS:
        return False

    # TODO: this is to filter out spammy domains that look like:
    #   brofqpxj.uelinc.com
    #   gzsmjc.fba01.com
    #   59648.etnomurcia.com
    #
    # Eventually we can figure out a better way to identify SEO spam
    domain_parts = domain.split('.')
    if (len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) in {6, 8}) or (
        set(domain_parts[0]) <= set("1234567890")
    ):
        return True
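
The heuristic behind the TODO in the deleted module is easy to miss among the early returns; isolated, it behaves as below (the sample domains are the ones from the comment):

def looks_like_seo_spam(domain: str) -> bool:
    domain_parts = domain.split('.')
    # three-part .com domains with a 6- or 8-character first label, or an all-numeric first label
    return (len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) in {6, 8}) \
        or set(domain_parts[0]) <= set("1234567890")

assert looks_like_seo_spam("gzsmjc.fba01.com")
assert looks_like_seo_spam("59648.etnomurcia.com")
assert not looks_like_seo_spam("github.com")
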
@@ -1,10 +1,13 @@
"""
Create a search index
"""
from collections import Counter
from typing import Iterable
from urllib.parse import unquote

from mwmbl.tinysearchengine.indexer import TokenizedDocument
import pandas as pd

from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
from mwmbl.tokenizer import tokenize, get_bigrams

DEFAULT_SCORE = 0

@@ -49,7 +52,7 @@ def prepare_url_for_tokenizing(url: str):
def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
    for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
        score = link_counts.get(url, DEFAULT_SCORE)
        yield tokenize_document(url, title_cleaned, extract, score)
        yield tokenize_document(url, title_cleaned, extract, score, nlp)

        if i % 1000 == 0:
            print("Processed", i)

@@ -61,7 +64,7 @@ def get_index_tokens(tokens):
    return set(first_tokens + bigrams)


def tokenize_document(url, title_cleaned, extract, score):
def tokenize_document(url, title_cleaned, extract, score, nlp):
    title_tokens = tokenize(title_cleaned)
    prepared_url = prepare_url_for_tokenizing(unquote(url))
    url_tokens = tokenize(prepared_url)
@@ -16,7 +16,6 @@ from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
from mwmbl.utils import add_term_info, add_term_infos

logger = getLogger(__name__)

@@ -32,20 +31,22 @@ def run(batch_cache: BatchCache, index_path: str):

    def process(batches: Collection[HashedBatch]):
        with Database() as db:
            nlp = spacy.load("en_core_web_sm")
            url_db = URLDatabase(db.connection)
            index_batches(batches, index_path, url_db)
            index_batches(batches, index_path, nlp, url_db)
            logger.info("Indexed pages")

    process_batch.run(batch_cache, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED, process)


def index_batches(batch_data: Collection[HashedBatch], index_path: str, url_db: URLDatabase):
def index_batches(batch_data: Collection[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
    document_tuples = list(get_documents_from_batches(batch_data))
    urls = [url for title, url, extract in document_tuples]
    logger.info(f"Got {len(urls)} document tuples")
    url_scores = url_db.get_url_scores(urls)
    logger.info(f"Indexing {len(urls)} document tuples and {len(url_scores)} URL scores")
    logger.info(f"Got {len(url_scores)} scores")
    documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
    page_documents = preprocess_documents(documents, index_path)
    page_documents = preprocess_documents(documents, index_path, nlp)
    index_pages(index_path, page_documents)


@@ -57,27 +58,24 @@ def index_pages(index_path, page_documents):
            seen_urls = set()
            seen_titles = set()
            sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
            # TODO: for now we add the term here, until all the documents in the index have terms
            sorted_documents_with_terms = add_term_infos(sorted_documents, indexer, page)
            for document in sorted_documents_with_terms:
            for document in sorted_documents:
                if document.title in seen_titles or document.url in seen_urls:
                    continue
                new_documents.append(document)
                seen_urls.add(document.url)
                seen_titles.add(document.title)
            logger.info(f"Storing {len(new_documents)} documents for page {page}, originally {len(existing_documents)}")
            indexer.store_in_page(page, new_documents)


def preprocess_documents(documents, index_path):
def preprocess_documents(documents, index_path, nlp):
    page_documents = defaultdict(list)
    with TinyIndex(Document, index_path, 'w') as indexer:
        for document in documents:
            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
            for token in tokenized.tokens:
                page = indexer.get_key_page_index(token)
                term_document = Document(document.title, document.url, document.extract, document.score, token)
                page_documents[page].append(term_document)
            tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
            # logger.debug(f"Tokenized: {tokenized}")
            page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
            for page in page_indexes:
                page_documents[page].append(document)
    print(f"Preprocessed for {len(page_documents)} pages")
    return page_documents
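
Note the change of bucketing in preprocess_documents: rather than storing a per-term copy of each document, the same Document is now appended to every page that one of its tokens hashes to. A toy sketch of the structure, with invented page numbers standing in for indexer.get_key_page_index:

from collections import defaultdict

fake_page_for_token = {"rust": 7, "compiler": 7, "borrow": 12}  # stand-in for get_key_page_index

page_documents = defaultdict(list)
document = ("Rust compiler", "https://example.com/rust")
for token in ["rust", "compiler", "borrow"]:
    page_documents[fake_page_for_token[token]].append(document)

print(dict(page_documents))  # page 7 gets the document twice, page 12 once
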
@@ -1,10 +1,12 @@
import os
import pickle
from collections import defaultdict
from datetime import datetime, timezone, timedelta
from logging import getLogger
from multiprocessing import Queue
from pathlib import Path
from time import sleep
from typing import Collection
from typing import Iterable, Collection
from urllib.parse import urlparse

from mwmbl.crawler.batch import HashedBatch

@@ -13,11 +15,10 @@ from mwmbl.database import Database
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.indexer import process_batch
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.blacklist import get_blacklist_domains, is_domain_blacklisted
from mwmbl.indexer.index_batches import get_url_error_status
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.indexer.paths import BATCH_DIR_NAME
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, SCORE_FOR_SAME_DOMAIN, \
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
from mwmbl.utils import get_domain

@@ -39,11 +40,7 @@ def run(batch_cache: BatchCache, new_item_queue: Queue):


def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
    start = datetime.now()
    blacklist_domains = get_blacklist_domains()
    blacklist_retrieval_time = datetime.now() - start
    logger.info(f"Recording URLs in database for {len(batches)} batches, with {len(blacklist_domains)} blacklist "
                f"domains, retrieved in {blacklist_retrieval_time.total_seconds()} seconds")
    logger.info(f"Recording URLs in database for {len(batches)} batches")
    with Database() as db:
        url_db = URLDatabase(db.connection)
        url_scores = defaultdict(float)

@@ -66,13 +63,13 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
                        continue
                    score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                    for link in item.content.links:
                        process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                                     url_timestamps, url_users, False, blacklist_domains)
                        process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                                     url_timestamps, url_users, False)

                    if item.content.extra_links:
                        for link in item.content.extra_links:
                            process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                                         url_timestamps, url_users, True, blacklist_domains)
                            process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                                         url_timestamps, url_users, True)

        found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                      for url in url_scores.keys() | url_statuses.keys()]

@@ -83,20 +80,19 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
    logger.info(f"Put {len(urls)} new items in the URL queue")


def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):
    parsed_link = urlparse(link)
    if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
        logger.debug(f"Excluding link for blacklisted domain: {parsed_link}")
    if parsed_link.netloc in EXCLUDED_DOMAINS:
        return

    extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
    url_scores[link] += score * unknown_domain_multiplier * extra_multiplier
    url_users[link] = user_id_hash
    url_users[link] = batch.user_id_hash
    url_timestamps[link] = timestamp
    domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
    url_scores[domain] += SCORE_FOR_ROOT_PATH * unknown_domain_multiplier
    url_users[domain] = user_id_hash
    url_users[domain] = batch.user_id_hash
    url_timestamps[domain] = timestamp
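
A worked sketch of the arithmetic in process_link: each link's score grows by a base score (same- versus different-domain) scaled by the unknown-domain and extra-link multipliers. The constants shown match the settings hunk near the end of this diff, except SCORE_FOR_DIFFERENT_DOMAIN, which is assumed to be 1.0 purely for illustration:

SCORE_FOR_SAME_DOMAIN = 0.01
SCORE_FOR_DIFFERENT_DOMAIN = 1.0  # assumption for illustration; the real value lives in mwmbl.settings
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
EXTRA_LINK_MULTIPLIER = 0.001

# an ordinary cross-domain link from an unknown domain:
print(SCORE_FOR_DIFFERENT_DOMAIN * UNKNOWN_DOMAIN_MULTIPLIER * 1.0)                     # 0.001
# the same link reported as an "extra" link is further down-weighted:
print(SCORE_FOR_DIFFERENT_DOMAIN * UNKNOWN_DOMAIN_MULTIPLIER * EXTRA_LINK_MULTIPLIER)   # 1e-06
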
@@ -1,13 +1,96 @@
import django
import argparse
import logging
import sys
from multiprocessing import Process, Queue
from pathlib import Path

import uvicorn
from django.core.management import call_command
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware

from mwmbl import background
from mwmbl.crawler import app as crawler
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.platform import user
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.tinysearchengine.rank import HeuristicRanker
from mwmbl.url_queue import update_queue_continuously

FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)


MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'


def setup_args():
    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
    parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
    parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
    parser.add_argument("--background", help="Enable running the background tasks to process batches",
                        action='store_true')
    args = parser.parse_args()
    return args


def run():
    django.setup()
    call_command("collectstatic", "--clear", "--noinput")
    call_command("migrate")
    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=5000)
    args = setup_args()

    index_path = Path(args.data) / INDEX_NAME
    try:
        existing_index = TinyIndex(item_factory=Document, index_path=index_path)
        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
            raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
                             f"({existing_index.num_pages}) do not match")
    except FileNotFoundError:
        print("Creating a new index")
        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)

    new_item_queue = Queue()
    queued_batches = Queue()
    # curation_queue = Queue()

    if args.background:
        Process(target=background.run, args=(args.data,)).start()
        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()

    completer = Completer()

    with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
        ranker = HeuristicRanker(tiny_index, completer)
        # model = pickle.load(open(MODEL_PATH, 'rb'))
        # ranker = LTRRanker(model, tiny_index, completer)

        # Initialize FastApi instance
        app = FastAPI()

        # Try disabling since this is handled by nginx
        # app.add_middleware(
        #     CORSMiddleware,
        #     allow_origins=["*"],
        #     allow_credentials=True,
        #     allow_methods=["*"],
        #     allow_headers=["*"],
        # )

        search_router = search.create_router(ranker)
        app.include_router(search_router)

        batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
        crawler_router = crawler.get_router(batch_cache, queued_batches)
        app.include_router(crawler_router)

        user_router = user.create_router(index_path)
        app.include_router(user_router)

        # Initialize uvicorn server using global app instance and server config params
        uvicorn.run(app, host="0.0.0.0", port=args.port)


if __name__ == "__main__":
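
A detail worth noting in setup_args above: the index is a fixed array of 4096-byte pages, so --num-pages directly fixes the index file size:

PAGE_SIZE = 4096  # bytes, per the --num-pages help text above
print(2560 * PAGE_SIZE / 2**20)  # the default of 2560 pages -> a 10.0 MiB index
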
@@ -1,58 +0,0 @@
# Generated by Django 4.2.6 on 2023-10-25 11:55

from django.conf import settings
import django.contrib.auth.models
import django.contrib.auth.validators
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        migrations.CreateModel(
            name='MwmblUser',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('password', models.CharField(max_length=128, verbose_name='password')),
                ('last_login', models.DateTimeField(blank=True, null=True, verbose_name='last login')),
                ('is_superuser', models.BooleanField(default=False, help_text='Designates that this user has all permissions without explicitly assigning them.', verbose_name='superuser status')),
                ('username', models.CharField(error_messages={'unique': 'A user with that username already exists.'}, help_text='Required. 150 characters or fewer. Letters, digits and @/./+/-/_ only.', max_length=150, unique=True, validators=[django.contrib.auth.validators.UnicodeUsernameValidator()], verbose_name='username')),
                ('first_name', models.CharField(blank=True, max_length=150, verbose_name='first name')),
                ('last_name', models.CharField(blank=True, max_length=150, verbose_name='last name')),
                ('email', models.EmailField(blank=True, max_length=254, verbose_name='email address')),
                ('is_staff', models.BooleanField(default=False, help_text='Designates whether the user can log into this admin site.', verbose_name='staff status')),
                ('is_active', models.BooleanField(default=True, help_text='Designates whether this user should be treated as active. Unselect this instead of deleting accounts.', verbose_name='active')),
                ('date_joined', models.DateTimeField(default=django.utils.timezone.now, verbose_name='date joined')),
                ('groups', models.ManyToManyField(blank=True, help_text='The groups this user belongs to. A user will get all permissions granted to each of their groups.', related_name='user_set', related_query_name='user', to='auth.group', verbose_name='groups')),
                ('user_permissions', models.ManyToManyField(blank=True, help_text='Specific permissions for this user.', related_name='user_set', related_query_name='user', to='auth.permission', verbose_name='user permissions')),
            ],
            options={
                'verbose_name': 'user',
                'verbose_name_plural': 'users',
                'abstract': False,
            },
            managers=[
                ('objects', django.contrib.auth.models.UserManager()),
            ],
        ),
        migrations.CreateModel(
            name='UserCuration',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('timestamp', models.DateTimeField()),
                ('url', models.CharField(max_length=300)),
                ('results', models.JSONField()),
                ('curation_type', models.CharField(max_length=20)),
                ('curation', models.JSONField()),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]
@@ -1,15 +0,0 @@
from django.db import models
from django.contrib.auth.models import AbstractUser


class MwmblUser(AbstractUser):
    pass


class UserCuration(models.Model):
    user = models.ForeignKey(MwmblUser, on_delete=models.CASCADE)
    timestamp = models.DateTimeField()
    url = models.CharField(max_length=300)
    results = models.JSONField()
    curation_type = models.CharField(max_length=20)
    curation = models.JSONField()
@@ -1,89 +0,0 @@
from logging import getLogger
from typing import Any
from urllib.parse import parse_qs

from ninja import Router, NinjaAPI

from mwmbl.indexer.update_urls import get_datetime_from_timestamp
from mwmbl.models import UserCuration
from mwmbl.platform.data import CurateBegin, CurateMove, CurateDelete, CurateAdd, CurateValidate, \
    make_curation_type
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize
from mwmbl.utils import add_term_info, add_term_infos

RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0


logger = getLogger(__name__)


def create_router(index_path: str, version: str) -> NinjaAPI:
    router = NinjaAPI(urls_namespace=f"curate-{version}", csrf=True)

    @router.post("/begin")
    def user_begin_curate(request, curate_begin: make_curation_type(CurateBegin)):
        return _curate(request, "curate_begin", curate_begin)

    @router.post("/move")
    def user_move_result(request, curate_move: make_curation_type(CurateMove)):
        return _curate(request, "curate_move", curate_move)

    @router.post("/delete")
    def user_delete_result(request, curate_delete: make_curation_type(CurateDelete)):
        return _curate(request, "curate_delete", curate_delete)

    @router.post("/add")
    def user_add_result(request, curate_add: make_curation_type(CurateAdd)):
        return _curate(request, "curate_add", curate_add)

    @router.post("/validate")
    def user_add_result(request, curate_validate: make_curation_type(CurateValidate)):
        return _curate(request, "curate_validate", curate_validate)

    def _curate(request, curation_type: str, curation: Any):
        user_curation = UserCuration(
            user=request.user,
            timestamp=get_datetime_from_timestamp(curation.timestamp / 1000.0),
            url=curation.url,
            results=curation.dict()["results"],
            curation_type=curation_type,
            curation=curation.curation.dict(),
        )
        user_curation.save()

        with TinyIndex(Document, index_path, 'w') as indexer:
            query_string = parse_qs(curation.url)
            if len(query_string) > 1:
                raise ValueError(f"Should be one query string in the URL: {curation.url}")

            queries = next(iter(query_string.values()))
            if len(queries) > 1:
                raise ValueError(f"Should be one query value in the URL: {curation.url}")

            query = queries[0]
            tokens = tokenize(query)
            term = " ".join(tokens)

            documents = [
                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
                for i, result in enumerate(curation.results)
            ]

            page_index = indexer.get_key_page_index(term)
            existing_documents_no_terms = indexer.get_page(page_index)
            existing_documents = add_term_infos(existing_documents_no_terms, indexer, page_index)
            other_documents = [doc for doc in existing_documents if doc.term != term]
            logger.info(f"Found {len(other_documents)} other documents for term {term} at page {page_index} "
                        f"with terms { {doc.term for doc in other_documents} }")

            all_documents = documents + other_documents
            logger.info(f"Storing {len(all_documents)} documents at page {page_index}")
            indexer.store_in_page(page_index, all_documents)

        return {"curation": "ok"}

    return router
@@ -1,46 +0,0 @@
from datetime import datetime
from typing import TypeVar, Generic

from ninja import Schema


class Result(Schema):
    url: str
    title: str
    extract: str
    curated: bool


class CurateBegin(Schema):
    pass


class CurateMove(Schema):
    old_index: int
    new_index: int


class CurateDelete(Schema):
    delete_index: int


class CurateAdd(Schema):
    insert_index: int
    url: str


class CurateValidate(Schema):
    validate_index: int
    is_validated: bool


T = TypeVar('T', CurateBegin, CurateAdd, CurateDelete, CurateMove, CurateValidate)


def make_curation_type(t):
    class Curation(Schema):
        timestamp: int
        url: str
        results: list[Result]
        curation: t
    return Curation
269
mwmbl/platform/user.py
Normal file
269
mwmbl/platform/user.py
Normal file
@@ -0,0 +1,269 @@
import json
import os
import uuid
from typing import TypeVar, Generic, AsyncGenerator, Optional
from urllib.parse import urljoin, parse_qs

import requests
from fastapi import APIRouter, Response, Depends, Request
from fastapi_users import UUIDIDMixin, BaseUserManager
from fastapi_users.authentication import CookieTransport, AuthenticationBackend
from fastapi_users.authentication.strategy import AccessTokenDatabase, DatabaseStrategy
from fastapi_users_db_sqlalchemy import SQLAlchemyBaseUserTableUUID, SQLAlchemyUserDatabase
from fastapi_users_db_sqlalchemy.access_token import SQLAlchemyBaseAccessTokenTableUUID, SQLAlchemyAccessTokenDatabase
from pydantic import BaseModel
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession
from sqlalchemy.orm import DeclarativeBase

from mwmbl.settings import DATABASE_URL
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
from mwmbl.tokenizer import tokenize


LEMMY_URL = os.environ["LEMMY_URL"]
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0


class Base(DeclarativeBase):
    pass


class User(SQLAlchemyBaseUserTableUUID, Base):
    pass


class AccessToken(SQLAlchemyBaseAccessTokenTableUUID, Base):
    pass


engine = create_async_engine(DATABASE_URL)
async_session_maker = async_sessionmaker(engine, expire_on_commit=False)
cookie_transport = CookieTransport(cookie_max_age=3600)


async def create_db_and_tables():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)


async def get_async_session() -> AsyncGenerator[AsyncSession, None]:
    async with async_session_maker() as session:
        yield session


async def get_user_db(session: AsyncSession = Depends(get_async_session)):
    yield SQLAlchemyUserDatabase(session, User)


async def get_access_token_db(session: AsyncSession = Depends(get_async_session)):
    yield SQLAlchemyAccessTokenDatabase(session, AccessToken)


def get_database_strategy(
    access_token_db: AccessTokenDatabase[AccessToken] = Depends(get_access_token_db),
) -> DatabaseStrategy:
    return DatabaseStrategy(access_token_db, lifetime_seconds=3600)


auth_backend = AuthenticationBackend(
    name="db",
    transport=cookie_transport,
    get_strategy=get_database_strategy,
)


class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
    reset_password_token_secret = ""  # TODO
    verification_token_secret = ""  # TODO

    async def on_after_register(self, user: User, request: Optional[Request] = None):
        print(f"User {user.id} has registered.")

    async def on_after_forgot_password(
        self, user: User, token: str, request: Optional[Request] = None
    ):
        print(f"User {user.id} has forgot their password. Reset token: {token}")

    async def on_after_request_verify(
        self, user: User, token: str, request: Optional[Request] = None
    ):
        print(f"Verification requested for user {user.id}. Verification token: {token}")


async def get_user_manager(user_db=Depends(get_user_db)):
    yield UserManager(user_db)


class Register(BaseModel):
    username: str
    email: str
    password: str
    password_verify: str


class Login(BaseModel):
    username_or_email: str
    password: str


class Result(BaseModel):
    url: str
    title: str
    extract: str
    curated: bool


class BeginCurate(BaseModel):
    auth: str
    url: str
    results: list[Result]


class CurateMove(BaseModel):
    old_index: int
    new_index: int


class CurateDelete(BaseModel):
    delete_index: int


class CurateAdd(BaseModel):
    insert_index: int
    url: str


class CurateValidate(BaseModel):
    validate_index: int
    is_validated: bool


T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate)


class Curation(BaseModel, Generic[T]):
    auth: str
    curation_id: int
    url: str
    results: list[Result]
    curation: T


def create_router(index_path: str) -> APIRouter:
    router = APIRouter(prefix="/user", tags=["user"])

    # TODO: reinstate
    # community_id = get_community_id()
    community_id = 0

    @router.post("/register")
    def user_register(register: Register) -> Response:
        lemmy_register = {
            "username": register.username,
            "email": register.email,
            "password": register.password,
            "password_verify": register.password_verify,
            "answer": "not applicable",
            "captcha_answer": None,
            "captcha_uuid": None,
            "honeypot": None,
            "show_nsfw": False,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/user/register"), json=lemmy_register)
        if request.status_code != 200:
            return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    @router.post("/login")
    def user_login(login: Login) -> Response:
        request = requests.post(urljoin(LEMMY_URL, "api/v3/user/login"), json=login.dict())
        return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    @router.post("/curation/begin")
    def user_begin_curate(begin_curate: BeginCurate):
        results = begin_curate.dict()["results"]
        body = json.dumps({"original_results": results}, indent=2)
        create_post = {
            "auth": begin_curate.auth,
            "body": body,
            "community_id": community_id,
            "honeypot": None,
            "language_id": None,
            "name": begin_curate.url,
            "nsfw": None,
            "url": begin_curate.url,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/post"), json=create_post)
        if request.status_code != 200:
            return Response(content=request.content, status_code=request.status_code, media_type="text/json")
        data = request.json()
        curation_id = data["post_view"]["post"]["id"]
        return {"curation_id": curation_id}

    @router.post("/curation/move")
    def user_move_result(curate_move: Curation[CurateMove]):
        return _curate("curate_move", curate_move)

    @router.post("/curation/delete")
    def user_delete_result(curate_delete: Curation[CurateDelete]):
        return _curate("curate_delete", curate_delete)

    @router.post("/curation/add")
    def user_add_result(curate_add: Curation[CurateAdd]):
        return _curate("curate_add", curate_add)

    @router.post("/curation/validate")
    def user_add_result(curate_validate: Curation[CurateValidate]):
        return _curate("curate_validate", curate_validate)

    def _curate(curation_type: str, curation: Curation):
        content = json.dumps({
            "curation_type": curation_type,
            "curation": curation.curation.dict(),
        }, indent=2)
        create_comment = {
            "auth": curation.auth,
            "content": json.dumps(content, indent=2),
            "form_id": None,
            "language_id": None,
            "parent_id": None,
            "post_id": curation.curation_id,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)

        with TinyIndex(Document, index_path, 'w') as indexer:
            query_string = parse_qs(curation.url)
            if len(query_string) > 1:
                raise ValueError(f"Should be one query string in the URL: {curation.url}")

            queries = next(iter(query_string.values()))
            if len(queries) > 1:
                raise ValueError(f"Should be one query value in the URL: {curation.url}")

            query = queries[0]
            print("Query", query)
            tokens = tokenize(query)
            print("Tokens", tokens)
            term = " ".join(tokens)
            print("Key", term)

            documents = [
                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
                for i, result in enumerate(curation.results)
            ]
            page_index = indexer.get_key_page_index(term)
            print("Page index", page_index)
            print("Storing documents", documents)
            indexer.store_in_page(page_index, documents)

        return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    return router


def get_community_id() -> str:
    request = requests.get(urljoin(LEMMY_URL, "api/v3/community?name=main"))
    community = request.json()
    return community["community_view"]["community"]["id"]
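
The new module defines the User table, token strategy and auth_backend but never mounts the stock fastapi-users routes in this file. For orientation, a hedged sketch of the usual wiring (not shown anywhere in this diff), assuming the User, get_user_manager and auth_backend defined above:

import uuid

from fastapi import FastAPI
from fastapi_users import FastAPIUsers

fastapi_users = FastAPIUsers[User, uuid.UUID](get_user_manager, [auth_backend])

app = FastAPI()
app.include_router(fastapi_users.get_auth_router(auth_backend), prefix="/auth", tags=["auth"])
current_active_user = fastapi_users.current_user(active=True)  # dependency for protected routes
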
@@ -1,19 +0,0 @@
from multiprocessing import Queue
from pathlib import Path

from django.conf import settings

from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine.rank import HeuristicRanker

queued_batches = Queue()
completer = Completer()
index_path = Path(settings.DATA_PATH) / INDEX_NAME
tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
tiny_index.__enter__()

ranker = HeuristicRanker(tiny_index, completer)
batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
@@ -32,7 +32,6 @@ SCORE_FOR_SAME_DOMAIN = 0.01
EXTRA_LINK_MULTIPLIER = 0.001
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$|omgoat\.org$")
CORE_DOMAINS = {
    'github.com',
    'en.wikipedia.org',

@@ -43,5 +42,3 @@ CORE_DOMAINS = {
    'arxiv.org',
    'www.python.org',
}

BLACKLIST_DOMAINS_URL = "https://get.domainsblacklists.com/blacklist.txt"

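To make the blacklist regex concrete, a quick sketch of how it matches (the domains here are hypothetical):

import re

DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx|jksu\.org|lwhyl\.org$|rgcd\.cn$|hzqwyou\.cn$|omgoat\.org$")

print(bool(DOMAIN_BLACKLIST_REGEX.search("video.rgcd.cn")))  # True: suffix alternative anchored with $
print(bool(DOMAIN_BLACKLIST_REGEX.search("example.org")))    # False: no alternative matches
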
@@ -1,3 +0,0 @@
from mwmbl.settings_prod import *

RUN_BACKGROUND_PROCESSES = True

@@ -1,165 +0,0 @@
"""
Django settings for mwmbl project.

Generated by 'django-admin startproject' using Django 4.2.4.

For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/


# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'django.contrib.humanize',
    'mwmbl',
    'django_htmx',
    'django_vite',
    'allauth',
    'allauth.account',
    'allauth.socialaccount',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',

    "django_htmx.middleware.HtmxMiddleware",
    "allauth.account.middleware.AccountMiddleware",
]

ROOT_URLCONF = 'mwmbl.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'mwmbl.wsgi.application'


# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/

STATIC_URL = 'static/'

DJANGO_VITE_DEV_MODE = False


# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

AUTHENTICATION_BACKENDS = [
    # Needed to login by username in Django admin, regardless of `allauth`
    'django.contrib.auth.backends.ModelBackend',

    # `allauth` specific authentication methods, such as login by email
    'allauth.account.auth_backends.AuthenticationBackend',
]


AUTH_USER_MODEL = "mwmbl.MwmblUser"


ACCOUNT_EMAIL_REQUIRED = True
ACCOUNT_EMAIL_VERIFICATION = "mandatory"

DEFAULT_FROM_EMAIL = "admin@mwmbl.org"

LOGIN_REDIRECT_URL = "/"

FOOTER_LINKS = [
    {
        "name": "Matrix",
        "icon": "ph-chat-circle-text-bold",
        "href": "https://matrix.to/#/#mwmbl:matrix.org",
    },
    {
        "name": "Book",
        "icon": "ph-book-bold",
        "href": "https://book.mwmbl.org",
    },
    {
        "name": "Blog",
        "icon": "ph-browser-bold",
        "href": "https://blog.mwmbl.org",
    },
    {
        "name": "GitHub",
        "icon": "ph-github-logo-bold",
        "href": "https://github.com/mwmbl/mwmbl",
    },
    {
        "name": "YouTube",
        "icon": "ph-youtube-logo-bold",
        "href": "https://www.youtube.com/channel/UCFLbqrH63-icAHxQ1eFfAvA",
    },
]

@@ -1,31 +0,0 @@
from mwmbl.settings_common import *


# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'


DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}


STATIC_ROOT = ""
DJANGO_VITE_ASSETS_PATH = Path(__file__).parent.parent / "front-end" / "dist"
DJANGO_VITE_MANIFEST_PATH = DJANGO_VITE_ASSETS_PATH / "manifest.json"

STATICFILES_DIRS = [str(DJANGO_VITE_ASSETS_PATH)]


DEBUG = True
ALLOWED_HOSTS = ["localhost", "127.0.0.1"]

EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'

DATA_PATH = "./devdata"
RUN_BACKGROUND_PROCESSES = False
NUM_PAGES = 2560

@@ -1,34 +0,0 @@
import os

import dj_database_url

from mwmbl.settings_common import *


SECRET_KEY = os.environ["DJANGO_SECRET_KEY"]


STATIC_ROOT = "/app/static/"

DJANGO_VITE_ASSETS_PATH = "/front-end-build/"
DJANGO_VITE_MANIFEST_PATH = Path(DJANGO_VITE_ASSETS_PATH) / "manifest.json"
STATICFILES_DIRS = [DJANGO_VITE_ASSETS_PATH]

DATABASES = {'default': dj_database_url.config(default=os.environ["DATABASE_URL"])}

DEBUG = True  # TODO set back to False
ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org", "beta.mwmbl.org"]
CSRF_TRUSTED_ORIGINS = [f"https://{domain}" for domain in ALLOWED_HOSTS]


# Sendgrid email settings
EMAIL_HOST = 'smtp.sendgrid.net'
EMAIL_HOST_USER = 'apikey'
EMAIL_HOST_PASSWORD = os.getenv('EMAIL_HOST_PASSWORD')
EMAIL_PORT = 587
EMAIL_USE_TLS = True


DATA_PATH = "/app/storage"
RUN_BACKGROUND_PROCESSES = False
NUM_PAGES = 10240000

@@ -1,22 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>{% block title %}Simple is Better Than Complex{% endblock %}</title>
</head>
<body>
    <header>
        <h1>My Site</h1>
        {% if user.is_authenticated %}
            <a href="{% url 'account_logout' %}">logout</a>
        {% else %}
            <a href="{% url 'account_login' %}">login</a> / <a href="{% url 'signup' %}">signup</a>
        {% endif %}
        <hr>
    </header>
    <main>
        {% block content %}
        {% endblock %}
    </main>
</body>
</html>

@@ -1,32 +0,0 @@
{% load humanize %}
{% include "title.html" %}
<div class="main">
    {% if query %}
        <button class="button curate-add" is="mwmbl-add-button">+ Add new</button>
        {% if results %}
            <ul class='results'>
                {% for result in results %}
                    {% include "result.html" %}
                {% endfor %}
            </ul>
        {% else %}
            <ul>
                <li class="home">
                    <h1>
                        No results found for "{{query}}".
                    </h1>
                </li>
            </ul>
        {% endif %}
    {% else %}
        {% for item in activity %}
            <ul>
                <li class="activity">
                    <h1>
                        {{ item.user }} made {{ item.num_curations | apnumber }} changes to <a href="{{ item.url }}">{{ item.query }}</a> {{ item.timestamp | naturaltime }}.
                    </h1>
                </li>
            </ul>
        {% endfor %}
    {% endif %}
</div>

@@ -1,97 +0,0 @@
{% load django_vite %}
<!DOCTYPE html>
<html lang="en">

<head>
    <!-- Metas -->
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    {% include "title.html" %}
    <meta name="description" content="The free, open-source and non-profit search engine.">

    <!-- Favicons -->
    <link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">

    <!-- Fonts import -->
    <link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
    <noscript>
        <link rel="stylesheet" href="/static/fonts/inter/inter.css">
    </noscript>

    <!-- CSS Stylesheets (this is critical CSS) -->
    <link rel="stylesheet" type="text/css" href="/static/css/reset.css">
    <link rel="stylesheet" type="text/css" href="/static/css/theme.css">
    <link rel="stylesheet" type="text/css" href="/static/css/global.css">

    <!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
    <link rel="preload" href="/static/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
    <noscript>
        <link rel="stylesheet" href="/static/fonts/phosphor/icons.css">
    </noscript>

    <!-- Custom Element Polyfill for Safari -->
    <script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>

    <!-- OpenSearch -->
    <link rel="search" type="application/opensearchdescription+xml" href="/static/assets/opensearch.xml" title="Mwmbl Search">

    <script src="https://unpkg.com/htmx.org@1.9.6"></script>

    {% vite_hmr_client %}
</head>

<body>
    <mwmbl-app></mwmbl-app>
    <header class="search-menu compact">
        <a href="/" class="branding">
            <img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
            <span class="brand-title">Mwmbl</span>
        </a>
        <form class="search-bar">
            <i class="ph-magnifying-glass-bold"></i>
            <input
                type='search'
                name='q'
                class='search-bar-input'
                placeholder='Search on Mwmbl...'
                title='Use "CTRL+K" or "/" to focus.'
                autocomplete='off'
                value='{{ query|default_if_none:"" }}'
                hx-get="/app/home/"
                hx-trigger="keyup changed delay:100ms"
                hx-target=".main"
            >
        </form>
        <div is="mwmbl-save"></div>
        {% if user.is_authenticated %}
            <p class="login-info">Logged in as {{ user.username }}</p>
            <a class="button" href="/accounts/logout/">Log out</a>
        {% else %}
            <a class="button" href="/accounts/login/">Login</a>
            <a class="button" href="/accounts/signup/">Sign up</a>
        {% endif %}
    </header>
    <main>
        {% include "home.html" %}
    </main>
    <div is="mwmbl-add-result"></div>
    <div class="footer">
        <ul class="footer-list">
            {% for link in footer_links %}
                <li class="footer-item">
                    <a href="{{ link.href }}" class="footer-link" target="__blank">
                        <i class="{{ link.icon }}"></i>
                        <span>{{ link.name }}</span>
                    </a>
                </li>
            {% endfor %}
        </ul>
    </div>
    {% vite_asset 'index.js' %}
    {% vite_legacy_polyfills %}
    {% vite_legacy_asset 'index-legacy.js' %}
</body>

</html>

@@ -1,26 +0,0 @@
{% extends 'base.html' %}

{% block content %}
    <h2>Log in to My Site</h2>
    {% if form.errors %}
        <p style="color: red">Your username and password didn't match. Please try again.</p>
    {% endif %}
    <form method="post">
        {% csrf_token %}
        <input type="hidden" name="next" value="{{ next }}" />
        {% for field in form %}
            <p>
                {{ field.label_tag }}<br>
                {{ field }}<br>
                {% for error in field.errors %}
                    <p style="color: red">{{ error }}</p>
                {% endfor %}
                {% if field.help_text %}
                    <p><small style="color: grey">{{ field.help_text }}</small></p>
                {% endif %}
            </p>
        {% endfor %}
        <button type="submit">Log in</button>
        <a href="{% url 'signup' %}">New to My Site? Sign up</a>
    </form>
{% endblock %}

@@ -1,17 +0,0 @@
{% load result_filters %}
<li class="result" is="mwmbl-result">
    <div class="result-container">
        <div class="result-link">
            <a href="{{result.url}}">
                <p class='link'>{{result.url}}</p>
                <p class='title'>{{result.title|strengthen}}</p>
            </a>
            <p class='extract'>{{result.extract|strengthen}}</p>
        </div>
        <div class="curation-buttons">
            <span class="button handle">↕ Move</span>
            <button class="button curate-delete" is="mwmbl-delete-button">✕ Delete</button>
            <button class="button curate-approve" is="mwmbl-validate-button">✓ Looks good</button>
        </div>
    </div>
</li>

@@ -1,10 +0,0 @@
{% extends 'base.html' %}

{% block content %}
    <h2>Sign up</h2>
    <form method="post">
        {% csrf_token %}
        {{ form.as_p }}
        <button type="submit">Sign up</button>
    </form>
{% endblock %}

@@ -1,6 +0,0 @@
<!-- Page title -->
{% if query %}
    <title>Mwmbl - {{ query }}</title>
{% else %}
    <title>Mwmbl - Search</title>
{% endif %}

@@ -1,18 +0,0 @@
from django.template import Library
from django.utils.html import conditional_escape
from django.utils.safestring import mark_safe

register = Library()


@register.filter(needs_autoescape=True)
def strengthen(spans, autoescape=True):
    escape = conditional_escape if autoescape else lambda x: x
    strengthened = []
    for span in spans:
        escaped_value = escape(span["value"])
        if span["is_bold"]:
            strengthened.append(f"<strong>{escaped_value}</strong>")
        else:
            strengthened.append(escaped_value)
    return mark_safe("".join(strengthened))

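The strengthen filter expects a list of span dictionaries rather than a plain string; a small sketch of its input and output (the sample spans are hypothetical):

spans = [
    {"value": "Rust", "is_bold": True},
    {"value": " programming language", "is_bold": False},
]
# strengthen(spans) escapes each value and wraps bold spans in <strong>,
# rendering: <strong>Rust</strong> programming language
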
@@ -79,7 +79,6 @@ class TinyIndexMetadata:
        values = json.loads(data[constant_length:].decode('utf8'))
        return TinyIndexMetadata(**values)


# Find the optimal amount of data that fits onto a page
# We do this by leveraging binary search to quickly find the index where:
# - index+1 cannot fit onto a page

@@ -107,12 +106,10 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
    # No better match, use our index
    return mid, compressed_data


def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items: list[T]):
    # Find max number of items that fit on a page
    return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))


def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
    num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)

@@ -189,6 +186,7 @@ class TinyIndex(Generic[T]):
        except ZstdError:
            logger.exception(f"Error decompressing page data, content: {page_data}")
            return []
        # logger.debug(f"Decompressed data: {decompressed_data}")
        return json.loads(decompressed_data.decode('utf8'))

    def store_in_page(self, page_index: int, values: list[T]):

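The binary search referenced in the comments above finds the largest prefix of items whose compressed serialisation still fits in a fixed-size page. A standalone sketch of the idea, simplified from (not identical to) the implementation:

import json
from zstandard import ZstdCompressor

def max_items_fitting(compressor: ZstdCompressor, page_size: int, items: list) -> int:
    # Invariant: lo items are known to fit; more than hi items are known not to.
    lo, hi = 0, len(items)
    while lo < hi:
        mid = (lo + hi + 1) // 2
        compressed = compressor.compress(json.dumps(items[:mid]).encode('utf8'))
        if len(compressed) <= page_size:
            lo = mid       # mid items fit; try to fit more
        else:
            hi = mid - 1   # too large; try fewer
    return lo
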
@@ -6,6 +6,7 @@ from operator import itemgetter
from urllib.parse import urlparse

from mwmbl.format import format_result_with_pattern, get_query_regex
from mwmbl.platform.user import MAX_CURATED_SCORE
from mwmbl.tokenizer import tokenize, get_bigrams
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.hn_top_domains_filtered import DOMAINS

@@ -1,6 +1,6 @@
from logging import getLogger

from ninja import NinjaAPI
from fastapi import APIRouter

from mwmbl.tinysearchengine.rank import HeuristicRanker

@@ -10,15 +10,15 @@ logger = getLogger(__name__)
SCORE_THRESHOLD = 0.25


def create_router(ranker: HeuristicRanker, version: str) -> NinjaAPI:
    router = NinjaAPI(urls_namespace=f"search-{version}")
def create_router(ranker: HeuristicRanker) -> APIRouter:
    router = APIRouter(prefix="/search", tags=["search"])

    @router.get("")
    def search(request, s: str):
    def search(s: str):
        return ranker.search(s)

    @router.get("/complete")
    def complete(request, q: str):
    def complete(q: str):
        return ranker.complete(q)

    return router

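A hedged sketch of how the converted router might be mounted and exercised under FastAPI; the app wiring and the ranker instance here are illustrative, not taken from this diff:

from fastapi import FastAPI
from fastapi.testclient import TestClient

app = FastAPI()
app.include_router(create_router(ranker))  # assumes a HeuristicRanker instance is available

client = TestClient(app)
print(client.get("/search", params={"s": "tiny search"}).json())
print(client.get("/search/complete", params={"q": "tin"}).json())
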
@@ -1,5 +1,6 @@
import time
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
from logging import getLogger
from multiprocessing import Queue

@@ -10,7 +11,6 @@ from typing import KeysView, Union
from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS
from mwmbl.database import Database
from mwmbl.hn_top_domains_filtered import DOMAINS as TOP_DOMAINS
from mwmbl.indexer.blacklist import is_domain_blacklisted, get_blacklist_domains
from mwmbl.settings import CORE_DOMAINS
from mwmbl.utils import batch, get_domain

@@ -46,15 +46,13 @@ class URLQueue:

    def initialize(self):
        logger.info(f"Initializing URL queue")
        blacklist_domains = get_blacklist_domains()
        with Database() as db:
            url_db = URLDatabase(db.connection)
            found_urls = url_db.get_urls(URLStatus.NEW, INITIALIZE_URLS)
            self._process_found_urls(found_urls, blacklist_domains)
            self._process_found_urls(found_urls)
        logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")

    def update(self):
        blacklist_domains = get_blacklist_domains()
        num_processed = 0
        while True:
            try:

@@ -62,10 +60,10 @@ class URLQueue:
                num_processed += 1
            except Empty:
                break
        self._process_found_urls(new_batch, blacklist_domains)
        self._process_found_urls(new_batch)
        return num_processed

    def _process_found_urls(self, found_urls: list[FoundURL], blacklist_domains: set[str]):
    def _process_found_urls(self, found_urls: list[FoundURL]):
        min_updated_date = datetime.utcnow() - timedelta(hours=REASSIGN_MIN_HOURS)

        logger.info(f"Found URLS: {len(found_urls)}")

@@ -73,7 +71,7 @@ class URLQueue:
            found_url.status == URLStatus.ASSIGNED.value and found_url.timestamp < min_updated_date)]
        logger.info(f"Valid URLs: {len(valid_urls)}")

        self._sort_urls(valid_urls, blacklist_domains)
        self._sort_urls(valid_urls)
        logger.info(f"Queue size: {self.num_queued_batches}")
        while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) >= self._min_top_domains:
            total_top_urls = sum(len(urls) for urls in self._top_urls.values())

@@ -85,14 +83,12 @@ class URLQueue:
            self._batch_urls()
            logger.info(f"Queue size after batching: {self.num_queued_batches}")

    def _sort_urls(self, valid_urls: list[FoundURL], blacklist_domains: set[str]):
    def _sort_urls(self, valid_urls: list[FoundURL]):
        for found_url in valid_urls:
            try:
                domain = get_domain(found_url.url)
            except ValueError:
                continue
            if is_domain_blacklisted(domain, blacklist_domains):
                continue
            url_store = self._top_urls if domain in TOP_DOMAINS else self._other_urls
            url_store[domain][found_url.url] = found_url.score

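_sort_urls buckets URLs by domain, keeping curated top domains separate from the rest so that batching can draw from both pools. A simplified, standalone sketch of that bucketing (the data and domain parsing here are illustrative):

from collections import defaultdict
from urllib.parse import urlparse

TOP_DOMAINS = {"github.com", "en.wikipedia.org"}  # stand-in for the curated list

top_urls = defaultdict(dict)
other_urls = defaultdict(dict)

found = [("https://github.com/mwmbl/mwmbl", 0.9), ("https://example.com/page", 0.1)]
for url, score in found:
    domain = urlparse(url).netloc
    store = top_urls if domain in TOP_DOMAINS else other_urls
    store[domain][url] = score

print(dict(top_urls))    # {'github.com': {'https://github.com/mwmbl/mwmbl': 0.9}}
print(dict(other_urls))  # {'example.com': {'https://example.com/page': 0.1}}
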
@@ -1,43 +0,0 @@
"""
URL configuration for app project.

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
    1. Add an import: from my_app import views
    2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
    1. Add an import: from other_app.views import Home
    2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include

import mwmbl.crawler.app as crawler
from mwmbl.platform import curate
from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
from mwmbl.tinysearchengine import search
from mwmbl.views import home_fragment, fetch_url, index

urlpatterns = [
    path('admin/', admin.site.urls),
    path('accounts/', include('allauth.urls')),

    path('', index, name="index"),
    path('app/home/', home_fragment, name="home"),
    path('app/fetch/', fetch_url, name="fetch_url"),

    # TODO: this is the old API, deprecated and to be removed once all clients have moved over
    path("search/", search.create_router(ranker, "0.1").urls),
    path("crawler/", crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches, version="0.1").urls),
    path("curation/", curate.create_router(index_path, version="0.1").urls),

    # New API
    path("api/v1/search/", search.create_router(ranker, "1.0.0").urls),
    path("api/v1/crawler/", crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches, version="1.0.0").urls),
    path("api/v1/curation/", curate.create_router(index_path, version="1.0.0").urls),
]