Merge pull request #128 from mwmbl/beta

Allow users to curate search results
This commit is contained in:
Daoud Clarke 2023-11-18 20:14:50 +00:00 committed by GitHub
commit a3cc316d15
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
66 changed files with 1721 additions and 1023 deletions

1
.gitignore vendored
View file

@ -17,6 +17,7 @@ __pycache__/
build/
develop-eggs/
dist/
front-end/dist/
downloads/
eggs/
.eggs/

View file

@ -50,7 +50,7 @@ COPY --from=builder /venv /venv
COPY --from=front-end /front-end/dist /front-end-build
ADD nginx.conf.sigil /app
ADD app.json /app
# ADD app.json /app
# Set up a volume where the data will live
VOLUME ["/data"]

51
analyse/add_term_info.py Normal file
View file

@ -0,0 +1,51 @@
"""
Investigate adding term information to the database.
How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random
import numpy as np
from scipy.stats import sem
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
from zstandard import ZstdCompressor
from mwmbl.utils import add_term_info
random = Random(1)
INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
def run(num_samples=10000):
    """
    Measure how many documents fit on a page once term info is added.

    Samples pages from the index at INDEX_PATH, passes every document on each
    sampled page through ``add_term_info``, and prints the mean and standard
    error of the per-page document count before and after.

    :param num_samples: maximum number of pages to sample; capped at the
        number of pages in the index.
    """
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more pages than the index has.
        sample_size = min(num_samples, index.num_pages)
        pages = random.sample(range(index.num_pages), sample_size)

        old_sizes = []
        new_sizes = []
        for i in pages:
            page = index.get_page(i)
            term_documents = [add_term_info(document, index, i) for document in page]
            value_tuples = [astuple(value) for value in term_documents]
            # Only the number of items that fit matters here; the compressed
            # payload returned alongside it is not used.
            num_fitting, _ = _trim_items_to_page(compressor, index.page_size, value_tuples)
            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

    print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
    print("New sizes mean", np.mean(new_sizes), sem(new_sizes))
if __name__ == '__main__':
run()

View file

@ -1,57 +0,0 @@
"""
Index batches stored locally on the filesystem for the purpose of evaluation.
"""
import glob
import gzip
import json
import logging
import os
import sys
from datetime import datetime
import spacy
from mwmbl.crawler import HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer import index_batches
from mwmbl.tinysearchengine import TinyIndex, Document
LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
NUM_BATCHES = 10000
EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
NUM_PAGES = 1_024_000
PAGE_SIZE = 4096
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
def get_batches():
    """
    Yield parsed batches from the local batch files.

    Reads at most NUM_BATCHES gzip-compressed JSON files matching
    LOCAL_BATCHES_PATH, in sorted path order, and yields each one parsed with
    ``HashedBatch.parse_obj``.
    """
    paths = sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]
    for path in paths:
        # Close each file as soon as it is read; otherwise thousands of
        # handles stay open until the garbage collector reclaims them.
        with gzip.open(path) as batch_file:
            data = json.load(batch_file)
        yield HashedBatch.parse_obj(data)
def run():
    """
    Rebuild the evaluation index from the local batches and time the indexing.

    Removes any existing index at EVALUATE_INDEX_PATH, creates a fresh one,
    indexes the batches from ``get_batches`` and prints the elapsed seconds.
    """
    # A missing index file just means there is nothing to clean up.
    try:
        os.remove(EVALUATE_INDEX_PATH)
    except FileNotFoundError:
        pass

    TinyIndex.create(
        item_factory=Document,
        index_path=EVALUATE_INDEX_PATH,
        num_pages=NUM_PAGES,
        page_size=PAGE_SIZE,
    )
    batches = get_batches()

    started_at = datetime.now()
    with Database() as db:
        nlp = spacy.load("en_core_web_sm")
        url_db = URLDatabase(db.connection)
        index_batches(batches, EVALUATE_INDEX_PATH, nlp, url_db)
    finished_at = datetime.now()

    elapsed = finished_at - started_at
    print("total_seconds:", elapsed.total_seconds())
if __name__ == '__main__':
run()

View file

@ -1,60 +0,0 @@
import logging
import sys
import numpy as np
import spacy
from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.indexer import tokenize_document
from mwmbl.indexer import INDEX_PATH
from mwmbl.tinysearchengine import TinyIndex, Document
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
nlp = spacy.load("en_core_web_sm")
def store():
    """
    Tokenize a sample document and print the result.

    The index at INDEX_PATH is opened in 'w' mode, but nothing is written to
    it: the indexing step is disabled in this scratch function.
    """
    sample = Document(
        title='A nation in search of the new black | Theatre | The Guardian',
        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
        score=1.0
    )
    with TinyIndex(Document, INDEX_PATH, 'w') as tiny_index:
        tokens = tokenize_document(sample.url, sample.title, sample.extract, 1, nlp)
        print("Tokenized", tokens)
        # Indexing each token is intentionally left disabled:
        # for token in tokens.tokens:
        #     tiny_index.index(token, sample)
def get_items():
    """Print every item the index at INDEX_PATH returns for the key 'wikipedia'."""
    with TinyIndex(Document, INDEX_PATH) as tiny_index:
        found = tiny_index.retrieve('wikipedia')
        # Nothing to print when the lookup comes back empty.
        if not found:
            return
        for entry in found:
            print("Items", entry)
def run(index_path):
    """
    Print page-size statistics for the index at ``index_path``.

    Every page is visited; pages holding more than 50 items are printed in
    full. Afterwards the maximum, the 100 largest and the mean page sizes
    (over non-empty pages only) are printed.

    :param index_path: path of the TinyIndex file to inspect.
    """
    sizes = {}
    with TinyIndex(Document, index_path) as tiny_index:
        for i in range(tiny_index.num_pages):
            page = tiny_index.get_page(i)
            if page:
                sizes[i] = len(page)
                if len(page) > 50:
                    print("Page", len(page), page)

    if not sizes:
        # max() raises ValueError on an empty sequence, so report an empty
        # index explicitly instead of crashing.
        print("No non-empty pages found")
        return

    # Sort once and reuse the result for all three statistics.
    ordered_sizes = sorted(sizes.values())
    print("Max", ordered_sizes[-1])
    print("Top", ordered_sizes[-100:])
    print("Mean", np.mean(ordered_sizes))
if __name__ == '__main__':
# store()
run(EVALUATE_INDEX_PATH)
# get_items()

Binary file not shown.

View file

@ -21,6 +21,12 @@ body {
margin: 25px;
}
@media screen and (max-width: 600px) {
.branding {
display: none;
}
}
.brand-title {
text-align: center;
font-weight: var(--black-font-weight);
@ -62,12 +68,9 @@ body {
height: 2rem;
}
mwmbl-search-bar {
width: 100%;
}
.search-bar {
position: relative;
width: 100%;
}
.search-bar-input {
@ -104,7 +107,7 @@ mwmbl-search-bar {
pointer-events: none;
}
mwmbl-results, footer {
.main, footer {
display: block;
max-width: 800px;
width: 100%;
@ -114,11 +117,14 @@ mwmbl-results, footer {
.results {
max-width: 100%;
list-style-type: none;
padding: 10px;
padding: 0;
}
.result a {
display: block;
.result {
min-height: 120px;
}
.result-container {
text-decoration: none;
color: var(--dark-color);
padding: 15px;
@ -130,11 +136,11 @@ mwmbl-results, footer {
outline 100ms ease-in-out;
}
.result:hover a, .result a:focus {
.result-container:hover,.result-container:focus {
background-color: var(--gray-color);
}
.result a:focus {
.result-container:focus {
outline: 3px solid var(--primary-color);
}
@ -158,7 +164,7 @@ mwmbl-results, footer {
font-weight: var(--bold-font-weight);
}
footer {
.footer {
position: sticky;
top: 100vh;
margin-bottom: 25px;
@ -228,5 +234,108 @@ footer {
a {
font-weight: var(--bold-font-weight);
color: var(--primary-color);
text-decoration: underline;
}
text-decoration: none;
}
.curation-buttons {
display: grid;
grid-auto-flow: column;
grid-column-gap: 20px;
grid-auto-columns: max-content;
}
.result-container .button {
background-color: var(--dark-gray-color);
color: white;
padding: 5px 10px;
margin: 0;
font-size: var(--small-font-size);
font-weight: var(--bold-font-weight);
}
.validated {
background-color: green !important;
}
.modal {
/*display: none; !* Hidden by default *!*/
position: fixed; /* Stay in place */
z-index: 100; /* Sit on top */
left: 0;
top: 0;
width: 100%; /* Full width */
height: 100%; /* Full height */
overflow: auto; /* Enable scroll if needed */
background-color: rgb(0,0,0); /* Fallback color */
background-color: rgba(0,0,0,0.4); /* Black w/ opacity */
}
/* Modal Content/Box */
.modal-content {
background-color: #fefefe;
margin: 15% auto; /* 15% from the top and centered */
padding: 20px;
border: 1px solid #888;
max-width: 800px;
width: 80%; /* Could be more or less, depending on screen size */
}
/* The Close Button */
.close {
color: #aaa;
float: right;
font-size: 28px;
font-weight: bold;
}
.close:hover,
.close:focus {
color: black;
text-decoration: none;
cursor: pointer;
}
.button {
background-color: var(--primary-color);
border: none;
color: white;
padding: 10px 20px;
margin: 10px;
text-align: center;
text-decoration: none;
display: inline-block;
font-size: var(--default-font-size);
border-radius: 50px;
cursor: pointer;
flex-shrink: 0;
transition: background-color 200ms ease-in-out;
}
@media screen and (max-width: 600px) {
.button {
padding: 5px 10px;
font-size: var(--small-font-size);
margin: 5px;
}
}
.button:hover {
background-color: var(--dark-color);
}
.login-info {
padding: 10px;
}
/* Sortable styling is not working in HTML 5 yet */
/*.sortable-drag {*/
/* opacity: 1.0;*/
/*}*/
/*.sortable-ghost {*/
/* opacity: 1.0;*/
/*}*/
/*.sortable-chosen {*/
/* opacity: 0;*/
/*}*/

View file

@ -7,9 +7,11 @@
--primary-color: #185ADB;
--gray-color: #EEEEEE;
--light-color: #F8F8F8;
--dark-gray-color: #767676;
/* Fonts: */
--regular-font: 'Inter', sans-serif;
--small-font-size: 12px;
--default-font-size: 16px;
--default-font-weight: 400;
--bold-font-weight: 700;

View file

@ -103,4 +103,20 @@ Phosphor Web Font
.ph-info-bold::before {
content: "\f88f";
}
}
.ph-book-bold::before {
content: "\f6fb";
}
.ph-browser-bold::before {
content: "\f70d";
}
.ph-youtube-logo-bold::before {
content: "\fa5d";
}
.ph-chat-circle-text-bold::before {
content: "\f74c";
}

View file

@ -13290,10 +13290,6 @@
content: "\f6fa";
}
.ph-book-bold::before {
content: "\f6fb";
}
.ph-book-bookmark-bold::before {
content: "\f6fc";
}
@ -13362,10 +13358,6 @@
content: "\f70c";
}
.ph-browser-bold::before {
content: "\f70d";
}
.ph-browsers-bold::before {
content: "\f70e";
}
@ -13614,10 +13606,6 @@
content: "\f74b";
}
.ph-chat-circle-text-bold::before {
content: "\f74c";
}
.ph-chat-dots-bold::before {
content: "\f74d";
}
@ -16750,10 +16738,6 @@
content: "\fa5c";
}
.ph-youtube-logo-bold::before {
content: "\fa5d";
}
.ph-activity-fill::before {
content: "\fa5e";
}

File diff suppressed because one or more lines are too long

View file

@ -8,20 +8,9 @@
export default {
componentPrefix: 'mwmbl',
publicApiURL: 'https://api.mwmbl.org/',
publicApiURL: '/api/v1/',
// publicApiURL: 'http://localhost:5000/',
searchQueryParam: 'q',
footerLinks: [
{
name: 'Github',
icon: 'ph-github-logo-bold',
href: 'https://github.com/mwmbl/mwmbl'
},
{
name: 'Wiki',
icon: 'ph-info-bold',
href: 'https://github.com/mwmbl/mwmbl/wiki'
}
],
commands: {
'go: ': 'https://',
'search: google.com ': 'https://www.google.com/search?q=',

View file

@ -6,7 +6,8 @@
"": {
"name": "front-end",
"dependencies": {
"chart.js": "^4.4.0"
"chart.js": "^4.4.0",
"sortablejs": "^1.15.0"
},
"devDependencies": {
"@vitejs/plugin-legacy": "^2.3.1",
@ -598,16 +599,10 @@
}
},
"node_modules/nanoid": {
"version": "3.3.6",
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
"integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==",
"version": "3.3.4",
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.4.tgz",
"integrity": "sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw==",
"dev": true,
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/ai"
}
],
"bin": {
"nanoid": "bin/nanoid.cjs"
},
@ -628,9 +623,9 @@
"dev": true
},
"node_modules/postcss": {
"version": "8.4.31",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz",
"integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==",
"version": "8.4.19",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.19.tgz",
"integrity": "sha512-h+pbPsyhlYj6N2ozBmHhHrs9DzGmbaarbLvWipMRO7RLS+v4onj26MPFXA5OBYFxyqYhUJK456SwDcY9H2/zsA==",
"dev": true,
"funding": [
{
@ -640,14 +635,10 @@
{
"type": "tidelift",
"url": "https://tidelift.com/funding/github/npm/postcss"
},
{
"type": "github",
"url": "https://github.com/sponsors/ai"
}
],
"dependencies": {
"nanoid": "^3.3.6",
"nanoid": "^3.3.4",
"picocolors": "^1.0.0",
"source-map-js": "^1.0.2"
},
@ -693,6 +684,11 @@
"fsevents": "~2.3.2"
}
},
"node_modules/sortablejs": {
"version": "1.15.0",
"resolved": "https://registry.npmjs.org/sortablejs/-/sortablejs-1.15.0.tgz",
"integrity": "sha512-bv9qgVMjUMf89wAvM6AxVvS/4MX3sPeN0+agqShejLU5z5GX4C75ow1O2e5k4L6XItUyAK3gH6AxSbXrOM5e8w=="
},
"node_modules/source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
@ -765,9 +761,9 @@
}
},
"node_modules/vite": {
"version": "3.2.7",
"resolved": "https://registry.npmjs.org/vite/-/vite-3.2.7.tgz",
"integrity": "sha512-29pdXjk49xAP0QBr0xXqu2s5jiQIXNvE/xwd0vUizYT2Hzqe4BksNNoWllFVXJf4eLZ+UlVQmXfB4lWrc+t18g==",
"version": "3.2.5",
"resolved": "https://registry.npmjs.org/vite/-/vite-3.2.5.tgz",
"integrity": "sha512-4mVEpXpSOgrssFZAOmGIr85wPHKvaDAcXqxVxVRZhljkJOMZi1ibLibzjLHzJvcok8BMguLc7g1W6W/GqZbLdQ==",
"dev": true,
"dependencies": {
"esbuild": "^0.15.9",
@ -1145,9 +1141,9 @@
}
},
"nanoid": {
"version": "3.3.6",
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
"integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==",
"version": "3.3.4",
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.4.tgz",
"integrity": "sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw==",
"dev": true
},
"path-parse": {
@ -1163,12 +1159,12 @@
"dev": true
},
"postcss": {
"version": "8.4.31",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz",
"integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==",
"version": "8.4.19",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.19.tgz",
"integrity": "sha512-h+pbPsyhlYj6N2ozBmHhHrs9DzGmbaarbLvWipMRO7RLS+v4onj26MPFXA5OBYFxyqYhUJK456SwDcY9H2/zsA==",
"dev": true,
"requires": {
"nanoid": "^3.3.6",
"nanoid": "^3.3.4",
"picocolors": "^1.0.0",
"source-map-js": "^1.0.2"
}
@ -1199,6 +1195,11 @@
"fsevents": "~2.3.2"
}
},
"sortablejs": {
"version": "1.15.0",
"resolved": "https://registry.npmjs.org/sortablejs/-/sortablejs-1.15.0.tgz",
"integrity": "sha512-bv9qgVMjUMf89wAvM6AxVvS/4MX3sPeN0+agqShejLU5z5GX4C75ow1O2e5k4L6XItUyAK3gH6AxSbXrOM5e8w=="
},
"source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
@ -1252,9 +1253,9 @@
}
},
"vite": {
"version": "3.2.7",
"resolved": "https://registry.npmjs.org/vite/-/vite-3.2.7.tgz",
"integrity": "sha512-29pdXjk49xAP0QBr0xXqu2s5jiQIXNvE/xwd0vUizYT2Hzqe4BksNNoWllFVXJf4eLZ+UlVQmXfB4lWrc+t18g==",
"version": "3.2.5",
"resolved": "https://registry.npmjs.org/vite/-/vite-3.2.5.tgz",
"integrity": "sha512-4mVEpXpSOgrssFZAOmGIr85wPHKvaDAcXqxVxVRZhljkJOMZi1ibLibzjLHzJvcok8BMguLc7g1W6W/GqZbLdQ==",
"dev": true,
"requires": {
"esbuild": "^0.15.9",

View file

@ -13,6 +13,7 @@
"vite": "^3.2.3"
},
"dependencies": {
"chart.js": "^4.4.0"
"chart.js": "^4.4.0",
"sortablejs": "^1.15.0"
}
}

View file

@ -1,26 +0,0 @@
import define from '../utils/define.js';
const template = () => /*html*/`
<header class="search-menu">
<div class="branding">
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
<span class="brand-title">MWMBL</span>
</div>
<mwmbl-search-bar></mwmbl-search-bar>
</header>
<main>
<mwmbl-results></mwmbl-results>
</main>
<footer is="mwmbl-footer"></footer>
`;
/**
 * Top-level 'app' component: fills itself with the page template (header
 * with search bar, results area and footer) as soon as it is constructed.
 */
export default define('app', class extends HTMLElement {
  constructor() {
    super();
    this.__setup();
  }

  /** Render the static page template into this element. */
  __setup() {
    const markup = template();
    this.innerHTML = markup;
  }
});

View file

@ -0,0 +1,21 @@
import define from "../../utils/define.js";
/**
 * 'add-button' component (a customised <button>): clicking it shows the
 * element matching `.modal` and focuses the input inside it.
 */
export default define('add-button', class extends HTMLButtonElement {
  constructor() {
    super();
    this.__setup();
  }

  __setup() {
    this.__events();
  }

  /** Register the click handler that opens the modal. */
  __events() {
    const openModal = () => {
      console.log("Add button");
      document.querySelector('.modal').style.display = 'block';
      document.querySelector('.modal input').focus();
    };
    this.addEventListener('click', openModal);
  }
}, { extends: 'button' });

View file

@ -0,0 +1,69 @@
import define from '../../utils/define.js';
import config from "../../../config.js";
import {globalBus} from "../../utils/events.js";
const FETCH_URL = '/app/fetch?'
const template = () => /*html*/`
<form class="modal-content">
<span class="close">&times;</span>
<input class="add-result" placeholder="Enter a URL...">
<button>Save</button>
</form>
`;
/**
 * 'add-result' component (a customised <div>): a modal with a form for
 * entering a URL to add to the current results.
 *
 * On submit the URL and the current search query are sent to FETCH_URL; on a
 * 200 response the returned text is broadcast on the global bus as a
 * 'curate-add-result' event.
 */
export default define('add-result', class extends HTMLDivElement {
  constructor() {
    super();
    this.classList.add('modal');
    this.__setup();
  }

  /** Render the form, wire up the events and start hidden. */
  __setup() {
    this.innerHTML = template();
    this.__events();
    this.style.display = 'none';
  }

  __events() {
    // The close button lives inside the form, whose click handler stops
    // propagation, so the backdrop handler below never sees this click.
    // It therefore has to hide the modal itself.
    this.querySelector('.close').addEventListener('click', () => {
      this.style.display = 'none';
    });

    // Clicking the backdrop (anywhere outside the form) closes the modal.
    this.addEventListener('click', () => {
      this.style.display = 'none';
    });

    this.querySelector('form').addEventListener('click', e => {
      // Clicking on the form shouldn't close it
      e.stopPropagation();
    });

    this.addEventListener('submit', this.__urlSubmitted.bind(this));
  }

  /**
   * Handle submission of the URL form.
   *
   * @param {SubmitEvent} e - the form's submit event; its default action
   *   (a page navigation) is suppressed.
   */
  async __urlSubmitted(e) {
    e.preventDefault();
    const value = this.querySelector('input').value;
    console.log("Input value", value);

    // The current search query is read from the page's search bar.
    const query = document.querySelector('.search-bar input').value;

    const url = `${FETCH_URL}url=${encodeURIComponent(value)}&query=${encodeURIComponent(query)}`;
    const response = await fetch(url);
    if (response.status === 200) {
      const data = await response.text();
      console.log("Data", data);

      const addResultEvent = new CustomEvent('curate-add-result', {detail: data});
      globalBus.dispatch(addResultEvent);
    } else {
      console.log("Bad response", response);
      // TODO
    }
  }
}, { extends: 'div' });

View file

@ -0,0 +1,35 @@
import define from "../../utils/define.js";
import {globalBus} from "../../utils/events.js";
/**
 * 'delete-button' component (a customised <button>): clicking it announces
 * on the global bus which result should be deleted.
 */
export default define('delete-button', class extends HTMLButtonElement {
  constructor() {
    super();
    this.__setup();
  }

  __setup() {
    this.__events();
  }

  /** Register the click handler that dispatches 'curate-delete-result'. */
  __events() {
    this.addEventListener('click', () => {
      console.log("Delete button");

      // Position of the enclosing result among all `.result` elements under
      // the same parent.
      const resultElement = this.closest('.result');
      const siblings = resultElement.parentNode.getElementsByClassName('result');
      const index = Array.from(siblings).indexOf(resultElement);
      console.log("Delete index", index);

      const detail = {data: {delete_index: index}};
      globalBus.dispatch(new CustomEvent('curate-delete-result', {detail}));
    });
  }
}, { extends: 'button' });

View file

@ -1,17 +0,0 @@
import define from '../../utils/define.js';
const template = () => /*html*/`
<p>We could not find anything for your search...</p>
`;
/**
 * 'empty-result' component (a customised <li>): shows the "nothing found"
 * message rendered by the template.
 */
export default define('empty-result', class extends HTMLLIElement {
  constructor() {
    super();
    this.classList.add('empty-result');
    this.__setup();
  }

  /** Render the static message into this list item. */
  __setup() {
    const markup = template();
    this.innerHTML = markup;
  }
}, { extends: 'li' });

View file

@ -2,13 +2,6 @@ import define from '../../utils/define.js';
import escapeString from '../../utils/escapeString.js';
import { globalBus } from '../../utils/events.js';
const template = ({ data }) => /*html*/`
<a href='${data.url}'>
<p class='link'>${data.url}</p>
<p class='title'>${data.title}</p>
<p class='extract'>${data.extract}</p>
</a>
`;
export default define('result', class extends HTMLLIElement {
constructor() {
@ -18,11 +11,6 @@ export default define('result', class extends HTMLLIElement {
}
__setup() {
this.innerHTML = template({ data: {
url: this.dataset.url,
title: this.__handleBold(JSON.parse(this.dataset.title)),
extract: this.__handleBold(JSON.parse(this.dataset.extract))
}});
this.__events();
}

View file

@ -0,0 +1,53 @@
import define from "../../utils/define.js";
import {globalBus} from "../../utils/events.js";
const VALIDATED_CLASS = "validated";
/**
 * 'validate-button' component (a customised <button>): clicking it announces
 * on the global bus which result should be validated. The validated state
 * itself is tracked with the VALIDATED_CLASS CSS class on the button.
 */
export default define('validate-button', class extends HTMLButtonElement {
  constructor() {
    super();
    this.__setup();
  }

  __setup() {
    this.__events();
  }

  /** Register the click handler that dispatches 'curate-validate-result'. */
  __events() {
    this.addEventListener('click', () => {
      console.log("Validate button");

      // Position of the enclosing result among all `.result` elements under
      // the same parent.
      const resultElement = this.closest('.result');
      const siblings = resultElement.parentNode.getElementsByClassName('result');
      const index = Array.from(siblings).indexOf(resultElement);
      console.log("Validate index", index);

      const detail = {data: {validate_index: index}};
      globalBus.dispatch(new CustomEvent('curate-validate-result', {detail}));
    });
  }

  /** @returns {boolean} whether the button currently carries the validated class. */
  isValidated() {
    const {classList} = this;
    return classList.contains(VALIDATED_CLASS);
  }

  /** Mark the button as validated. */
  validate() {
    const {classList} = this;
    classList.add(VALIDATED_CLASS);
  }

  /** Clear the validated mark. */
  unvalidate() {
    const {classList} = this;
    classList.remove(VALIDATED_CLASS);
  }

  /** Flip the validated mark. */
  toggleValidate() {
    const {classList} = this;
    classList.toggle(VALIDATED_CLASS);
  }
}, { extends: 'button' });

View file

@ -1,36 +0,0 @@
import define from '../../utils/define.js';
import config from '../../../config.js';
const template = ({ data }) => /*html*/`
<p class="footer-text">Find more on</p>
<ul class="footer-list">
${data.links.map(link => /*html*/`
<li class="footer-item">
<a href="${link.href}" class="footer-link" target="_blank">
<i class="${link.icon}"></i>
<span>${link.name}</span>
</a>
</li>
`).join('')}
</ul>
`;
/**
 * 'footer' component (a customised <footer>): renders the list of links
 * taken from `config.footerLinks`.
 */
export default define('footer', class extends HTMLElement {
  constructor() {
    super();
    this.__setup();
  }

  /** Render the footer links, then wire up events. */
  __setup() {
    const data = {links: config.footerLinks};
    this.innerHTML = template({data});
    this.__events();
  }

  /** No events are registered for the footer. */
  __events() {
  }
}, { extends: 'footer' });

View file

@ -1,22 +0,0 @@
import define from '../../utils/define.js';
const template = () => /*html*/`
<h1>
Welcome to mwmbl, the free, open-source and non-profit search engine.
</h1>
<p>
You can start searching by using the search bar above!
</p>
`;
/**
 * 'home' component (a customised <li>): shows the welcome message rendered
 * by the template.
 */
export default define('home', class extends HTMLLIElement {
  constructor() {
    super();
    this.classList.add('home');
    this.__setup();
  }

  /** Render the static welcome text into this list item. */
  __setup() {
    const markup = template();
    this.innerHTML = markup;
  }
}, { extends: 'li' });

View file

@ -1,75 +1,191 @@
import define from '../../utils/define.js';
import { globalBus } from '../../utils/events.js';
import {globalBus} from '../../utils/events.js';
import Sortable from 'sortablejs';
// Components
import result from '../molecules/result.js';
import emptyResult from '../molecules/empty-result.js';
import home from './home.js';
import escapeString from '../../utils/escapeString.js';
const template = () => /*html*/`
<ul class='results'>
<li is='${home}'></li>
</ul>
`;
export default define('results', class extends HTMLElement {
class ResultsHandler {
constructor() {
super();
this.results = null;
this.oldIndex = null;
this.curating = false;
this.__setup();
}
__setup() {
this.innerHTML = template();
this.results = this.querySelector('.results');
this.__events();
this.__initializeResults();
}
__events() {
globalBus.on('search', (e) => {
this.results.innerHTML = '';
let resultsHTML = '';
if (!e.detail.error) {
// If there is no details the input is empty
if (!e.detail.results) {
resultsHTML = /*html*/`
<li is='${home}'></li>
`;
}
// If the details array has results display them
else if (e.detail.results.length > 0) {
for(const resultData of e.detail.results) {
resultsHTML += /*html*/`
<li
is='${result}'
data-url='${escapeString(resultData.url)}'
data-title='${escapeString(JSON.stringify(resultData.title))}'
data-extract='${escapeString(JSON.stringify(resultData.extract))}'
></li>
`;
}
}
// If the details array is empty there is no result
else {
resultsHTML = /*html*/`
<li is='${emptyResult}'></li>
`;
}
}
else {
// If there is an error display an empty result
resultsHTML = /*html*/`
<li is='${emptyResult}'></li>
`;
}
// Bind HTML to the DOM
this.results.innerHTML = resultsHTML;
document.body.addEventListener('htmx:load', e => {
this.__initializeResults();
});
// Focus first element when coming from the search bar
globalBus.on('focus-result', () => {
this.results.firstElementChild.firstElementChild.focus();
})
});
globalBus.on('curate-delete-result', (e) => {
console.log("Curate delete result event", e);
this.__beginCurating.bind(this)();
const children = this.results.getElementsByClassName('result');
let deleteIndex = e.detail.data.delete_index;
const child = children[deleteIndex];
this.results.removeChild(child);
const newResults = this.__getResults();
const curationSaveEvent = new CustomEvent('save-curation', {
detail: {
type: 'delete',
data: {
timestamp: Date.now(),
url: document.location.href,
results: newResults,
curation: {
delete_index: deleteIndex
}
}
}
});
globalBus.dispatch(curationSaveEvent);
});
globalBus.on('curate-validate-result', (e) => {
console.log("Curate validate result event", e);
this.__beginCurating.bind(this)();
const children = this.results.getElementsByClassName('result');
const validateChild = children[e.detail.data.validate_index];
validateChild.querySelector('.curate-approve').toggleValidate();
const newResults = this.__getResults();
const curationStartEvent = new CustomEvent('save-curation', {
detail: {
type: 'validate',
data: {
timestamp: Date.now(),
url: document.location.href,
results: newResults,
curation: e.detail.data
}
}
});
globalBus.dispatch(curationStartEvent);
});
globalBus.on('begin-curating-results', (e) => {
// We might not be online, or logged in, so save the curation in local storage in case:
console.log("Begin curation event", e);
this.__beginCurating.bind(this)();
});
globalBus.on('curate-add-result', (e) => {
console.log("Add result", e);
this.__beginCurating();
const resultData = e.detail;
this.results.insertAdjacentHTML('afterbegin', resultData);
const newResults = this.__getResults();
const url = newResults[0].url;
let detail = {
type: 'add',
data: {
timestamp: Date.now(),
url: document.location.href,
results: newResults,
curation: {
insert_index: 0,
url: url
}
}
};
console.log("Detail", detail);
const curationSaveEvent = new CustomEvent('save-curation', {
detail: detail
});
globalBus.dispatch(curationSaveEvent);
});
}
});
__initializeResults() {
this.results = document.querySelector('.results');
if (this.results) {
const sortable = new Sortable(this.results, {
"onStart": this.__sortableActivate.bind(this),
"onEnd": this.__sortableDeactivate.bind(this),
"handle": ".handle",
});
}
this.curating = false;
}
__sortableActivate(event) {
console.log("Sortable activate", event);
this.__beginCurating();
this.oldIndex = event.oldIndex;
}
__beginCurating() {
if (!this.curating) {
const results = this.__getResults();
const curationStartEvent = new CustomEvent('save-curation', {
detail: {
type: 'begin',
data: {
timestamp: Date.now(),
url: document.location.href,
results: results,
curation: {}
}
}
});
globalBus.dispatch(curationStartEvent);
this.curating = true;
}
}
__getResults() {
const resultsElements = document.querySelectorAll('.results .result:not(.ui-sortable-placeholder)');
const results = [];
for (let resultElement of resultsElements) {
const result = {
url: resultElement.querySelector('a').href,
title: resultElement.querySelector('.title').innerText,
extract: resultElement.querySelector('.extract').innerText,
curated: resultElement.querySelector('.curate-approve').isValidated()
}
results.push(result);
}
console.log("Results", results);
return results;
}
__sortableDeactivate(event) {
const newIndex = event.newIndex;
console.log('Sortable deactivate', this.oldIndex, newIndex);
const newResults = this.__getResults();
const curationMoveEvent = new CustomEvent('save-curation', {
detail: {
type: 'move',
data: {
timestamp: Date.now(),
url: document.location.href,
results: newResults,
curation: {
old_index: this.oldIndex,
new_index: newIndex,
}
}
}
});
globalBus.dispatch(curationMoveEvent);
}
}
const resultsHandler = new ResultsHandler();

View file

@ -0,0 +1,112 @@
import define from '../../utils/define.js';
import {globalBus} from "../../utils/events.js";
import config from "../../../config.js";
const CURATION_KEY_PREFIX = "curation-";
const CURATION_URL = config.publicApiURL + "curation/";
const template = () => /*html*/`
<span></span>
`;
/**
 * 'save' component (a customised <div>): persists curation events.
 *
 * Each 'save-curation' event from the global bus is first written to
 * localStorage under `CURATION_KEY_PREFIX + id`, then the stored curations
 * are POSTed to the curation API, oldest first, and removed once the API
 * answers with status 200.
 */
export default define('save', class extends HTMLDivElement {
  constructor() {
    super();
    this.currentCurationId = null;
    this.classList.add('save');
    // Continue numbering after any curations left unsent by an earlier page
    // load; restarting at 0 would overwrite them in localStorage.
    this.sendId = this.__getNewestCurationId();
    this.sending = false;
    this.__setup();
  }

  __setup() {
    this.innerHTML = template();
    this.__events();
    // TODO: figure out when to call __sendToApi()
    // setInterval(this.__sendToApi.bind(this), 1000);
  }

  __events() {
    globalBus.on('save-curation', (e) => {
      // We might not be online, or logged in, so save the curation in local storage in case:
      console.log("Curation event", e);
      this.__setCuration(e.detail);
      this.__sendToApi();
    });
  }

  /**
   * Store a curation in localStorage under the next free id.
   *
   * @param {object} curation - the event detail, with `type` and `data`.
   */
  __setCuration(curation) {
    this.sendId += 1;
    const key = CURATION_KEY_PREFIX + this.sendId;
    localStorage.setItem(key, JSON.stringify(curation));
  }

  /**
   * List the curations currently held in localStorage.
   *
   * @returns {Array<{key: string, id: number}>} one entry per key that
   *   starts with CURATION_KEY_PREFIX and ends in a parseable integer.
   */
  __getStoredCurations() {
    const stored = [];
    for (let i = 0; i < localStorage.length; ++i) {
      const key = localStorage.key(i);
      if (key.startsWith(CURATION_KEY_PREFIX)) {
        const id = parseInt(key.substring(CURATION_KEY_PREFIX.length), 10);
        if (!Number.isNaN(id)) {
          stored.push({key, id});
        }
      }
    }
    return stored;
  }

  /** @returns {number} the largest stored curation id, or 0 when none exist. */
  __getNewestCurationId() {
    let newestId = 0;
    for (const {id} of this.__getStoredCurations()) {
      if (id > newestId) {
        newestId = id;
      }
    }
    return newestId;
  }

  /** @returns {string|null} the key of the stored curation with the smallest id, or null. */
  __getOldestCurationKey() {
    let oldest = null;
    for (const entry of this.__getStoredCurations()) {
      if (oldest === null || entry.id < oldest.id) {
        oldest = entry;
      }
    }
    return oldest === null ? null : oldest.key;
  }

  /**
   * Send the oldest stored curation to the API.
   *
   * Does nothing while another send is in progress or when no `csrftoken`
   * cookie is present. After a successful send, schedules itself again one
   * second later in case more curations are waiting.
   */
  async __sendToApi() {
    if (this.sending) {
      return;
    }
    this.sending = true;

    let sent = false;
    try {
      const csrftoken = document.cookie
        .split('; ')
        .find((row) => row.startsWith('csrftoken='))
        ?.split('=')[1];

      if (!csrftoken) {
        console.log("No auth");
        return;
      }

      const key = this.__getOldestCurationKey();
      if (key === null) {
        return;
      }

      const value = JSON.parse(localStorage.getItem(key));
      console.log("Value", value);
      const url = CURATION_URL + value['type'];

      const data = value['data'];
      console.log("Data", data);
      const response = await fetch(url, {
        method: 'POST',
        cache: 'no-cache',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrftoken},
        credentials: "same-origin",
        mode: "same-origin",
        body: JSON.stringify(data),
      });

      console.log("Save curation API response", response);

      if (response.status !== 200) {
        // Leave the curation in localStorage so a later attempt can retry it.
        console.log("Bad response, skipping");
        return;
      }
      localStorage.removeItem(key);

      const responseData = await response.json();
      console.log("Response data", responseData);
      sent = true;
    } catch (error) {
      // fetch rejects when offline; the curation stays stored for later.
      console.log("Failed to send curation", error);
    } finally {
      // Always release the guard. Without this, any early exit or network
      // error would block every later send until the page is reloaded.
      this.sending = false;
    }

    if (sent) {
      // There may be more to send, wait a second and see
      setTimeout(this.__sendToApi.bind(this), 1000);
    }
  }
}, { extends: 'div' });

View file

@ -1,180 +0,0 @@
import define from '../../utils/define.js';
import config from '../../../config.js';
import { globalBus } from '../../utils/events.js';
import debounce from '../../utils/debounce.js'
const prefersReducedMotion = window.matchMedia('(prefers-reduced-motion)').matches;
const template = () => /*html*/`
<form class="search-bar">
<i class="ph-magnifying-glass-bold"></i>
<input
type='search'
class='search-bar-input'
placeholder='Search on mwmbl...'
title='Use "CTRL+K" or "/" to focus.'
autocomplete='off'
>
</form>
`;
export default define('search-bar', class extends HTMLElement {
constructor() {
super();
this.searchInput = null;
this.searchForm = null;
this.abortController = new AbortController();
this.__setup();
}
__setup() {
this.innerHTML = template();
this.searchInput = this.querySelector('input');
this.searchForm = this.querySelector('form');
this.__events();
}
__dispatchSearch({ results = null, error = null }) {
const searchEvent = new CustomEvent('search', {
detail: {
results,
error,
},
});
globalBus.dispatch(searchEvent)
}
/**
* Updates the overall layout of the page.
*
* `home` centers the search bar on the page.
* `compact` raises it to the top and makes room for displaying results.
*
* @param {'compact' | 'home'} mode
* @return {void}
*/
__setDisplayMode(mode) {
switch (mode) {
case 'compact': {
document.body.style.paddingTop = '25px';
document.querySelector('.search-menu').classList.add('compact');
break;
}
case 'home': {
document.body.style.paddingTop = '30vh';
document.querySelector('.search-menu').classList.remove('compact');
break;
}
}
}
async __executeSearch() {
this.abortController.abort();
this.abortController = new AbortController();
// Get response from API
const response = await fetch(`${config.publicApiURL}search?s=${encodeURIComponent(this.searchInput.value)}`, {
signal: this.abortController.signal
});
// Getting results from API
const search = await (response).json();
return search;
}
/**
 * Synchronises the page with the current search input: title, URL query
 * parameter, layout mode, and the `search` event carrying results or an
 * error.
 */
__handleSearch = async () => {
  // Update page title
  document.title = `MWMBL - ${this.searchInput.value || "Search"}`;
  // Update query params
  const queryParams = new URLSearchParams(document.location.search);
  // Sets query param if search value is not empty
  if (this.searchInput.value) queryParams.set(config.searchQueryParam, this.searchInput.value);
  else queryParams.delete(config.searchQueryParam);
  // New URL with query params. The "?" depends on whether any parameter is
  // left, not on the input value: unrelated parameters may remain after the
  // search parameter has been removed.
  const queryString = queryParams.toString();
  const newURL =
    document.location.protocol
    + "//"
    + document.location.host
    + document.location.pathname
    + (queryString ? `?${queryString}` : '');
  // Replace history state
  window.history.replaceState({ path: newURL }, '', newURL);
  if (this.searchInput.value) {
    this.__setDisplayMode('compact')
    try {
      const search = await this.__executeSearch()
      // Check searchInput.value before setting the results to prevent a
      // race condition where the user has cleared the search input after
      // submitting a search but before its results came back from the API.
      this.__dispatchSearch({ results: this.searchInput.value ? search : null });
    }
    catch(error) {
      // A request aborted because a newer search replaced it is not a
      // failure; the newer search will dispatch its own outcome.
      if (error && error.name === 'AbortError') {
        return;
      }
      this.__dispatchSearch({ error })
    }
  }
  else {
    this.__setDisplayMode('home')
    this.__dispatchSearch({ results: null });
  }
}
__events() {
/**
* Always add the submit event, it makes things feel faster if
* someone does not prefer reduced motion and reflexively hits
* return once they've finished typing.
*/
this.searchForm.addEventListener('submit', (e) => {
e.preventDefault();
this.__handleSearch(e);
});
/**
* Only add the "real time" search behavior when the client does
* not prefer reduced motion; this prevents the page from changing
* while the user is still typing their query.
*/
if (!prefersReducedMotion) {
this.searchInput.addEventListener('input', debounce(this.__handleSearch, 500))
}
// Focus search bar when pressing `ctrl + k` or `/`
document.addEventListener('keydown', (e) => {
if ((e.key === 'k' && e.ctrlKey) || e.key === '/' || e.key === 'Escape') {
e.preventDefault();
this.searchInput.focus();
}
});
// Focus first result when pressing down arrow
this.addEventListener('keydown', (e) => {
if (e.key === 'ArrowDown' && this.searchInput.value) {
e.preventDefault();
const focusResultEvent = new CustomEvent('focus-result');
globalBus.dispatch(focusResultEvent);
}
});
globalBus.on('focus-search', (e) => {
this.searchInput.focus();
});
}
connectedCallback() {
// Focus search input when component is connected
this.searchInput.focus();
const searchQuery = new URLSearchParams(document.location.search).get(config.searchQueryParam);
this.searchInput.value = searchQuery;
/**
* Trigger search handling to coordinate the value pulled from the query string
* across the rest of the UI and to actually retrieve the results if the search
* value is now non-empty.
*/
this.__handleSearch();
}
});

View file

@ -1,63 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Metas -->
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- Page title -->
<title>MWMBL - Search</title>
<meta name="description" content="The free, open-source and non-profit search engine.">
<!-- Favicons -->
<link rel="icon" href="/images/favicon.svg" type="image/svg+xml">
<!-- Fonts import -->
<link rel="preload" href="/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/fonts/inter/inter.css">
</noscript>
<!-- CSS Stylesheets (this is critical CSS) -->
<link rel="stylesheet" type="text/css" href="/css/reset.css">
<link rel="stylesheet" type="text/css" href="/css/theme.css">
<link rel="stylesheet" type="text/css" href="/css/global.css">
<!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
<link rel="preload" href="/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/fonts/phosphor/icons.css">
</noscript>
<!-- Custom Element Polyfill for Safari -->
<script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>
<!-- OpenSearch -->
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="MWMBL Search">
</head>
<body>
<mwmbl-app></mwmbl-app>
<noscript>
<main class="noscript">
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
<h1>
Welcome to mwmbl, the free, open-source and non-profit search engine.
</h1>
<p>This website requires you to support/enable scripts.</p>
<p>
More information on
<a href="https://github.com/mwmbl/mwmbl" target="_blank">
Github
</a>
.
</p>
</main>
</noscript>
<!-- JavaScript entrypoint -->
<script src="./index.js" type="module"></script>
</body>
</html>

View file

@ -5,6 +5,7 @@
* Please do not pollute this file if you can make
* util or component files instead.
*/
import 'vite/modulepreload-polyfill';
// Waiting for top-level await to be better supported.
(async () => {
@ -14,9 +15,12 @@
if (!redirected) {
// Load components only after redirects are checked.
import('./components/app.js');
import("./components/organisms/search-bar.js");
import("./components/organisms/results.js");
import("./components/organisms/footer.js");
import("./components/organisms/save.js");
import("./components/molecules/add-button.js");
import("./components/molecules/add-result.js");
import("./components/molecules/delete-button.js");
import("./components/molecules/result.js");
import("./components/molecules/validate-button.js");
}
})();

View file

@ -5,18 +5,18 @@
<title>Mwmbl Stats</title>
<!-- Favicons -->
<link rel="icon" href="/images/favicon.svg" type="image/svg+xml">
<link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">
<!-- Fonts import -->
<link rel="preload" href="/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/fonts/inter/inter.css">
<link rel="stylesheet" href="/static/fonts/inter/inter.css">
</noscript>
<!-- CSS Stylesheets (this is critical CSS) -->
<link rel="stylesheet" type="text/css" href="/css/reset.css">
<link rel="stylesheet" type="text/css" href="/css/theme.css">
<link rel="stylesheet" type="text/css" href="/css/global.css">
<link rel="stylesheet" type="text/css" href="/static/css/reset.css">
<link rel="stylesheet" type="text/css" href="/static/css/theme.css">
<link rel="stylesheet" type="text/css" href="/static/css/global.css">
<link rel="stylesheet" type="text/css" href="stats.css">
</head>
<body>

View file

@ -7,12 +7,14 @@ export default {
publicDir: '../assets',
build: {
outDir: '../dist',
manifest: true,
rollupOptions: {
input: {
main: resolve(__dirname, 'src/index.html'),
index: resolve(__dirname, 'src/index.js'),
stats: resolve(__dirname, 'src/stats/index.html'),
},
},
minify: false,
},
plugins: [
legacy({

8
mwmbl/admin.py Normal file
View file

@ -0,0 +1,8 @@
from django.contrib.admin import ModelAdmin
from django.contrib.auth.admin import UserAdmin
from django.contrib import admin
from mwmbl.models import MwmblUser, UserCuration
# Expose the custom user model in the Django admin through the stock UserAdmin.
admin.site.register(MwmblUser, UserAdmin)
# Curation records use the default ModelAdmin with no customisation.
admin.site.register(UserCuration, ModelAdmin)

View file

@ -1,36 +1,24 @@
from multiprocessing import Queue
from pathlib import Path
from django.conf import settings
from ninja import NinjaAPI
from ninja.security import django_auth
import mwmbl.crawler.app as crawler
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.platform import curate
from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine.rank import HeuristicRanker
queued_batches = Queue()
completer = Completer()
index_path = Path(settings.DATA_PATH) / INDEX_NAME
tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
tiny_index.__enter__()
ranker = HeuristicRanker(tiny_index, completer)
batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
def create_api(version):
api = NinjaAPI(version=version)
# Set csrf to True to allow cookie-based authentication
api = NinjaAPI(version=version, csrf=True)
search_router = search.create_router(ranker)
api.add_router("/search/", search_router)
crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
api.add_router("/crawler/", crawler_router)
curation_router = curate.create_router(index_path)
api.add_router("/curation/", curation_router, auth=django_auth)
return api

View file

@ -6,12 +6,9 @@ from pathlib import Path
from django.apps import AppConfig
from django.conf import settings
from mwmbl.api import queued_batches
from mwmbl import background
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.url_queue import update_queue_continuously
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.indexdb import IndexDatabase
class MwmblConfig(AppConfig):
@ -19,6 +16,14 @@ class MwmblConfig(AppConfig):
verbose_name = "Mwmbl Application"
def ready(self):
# Imports here to avoid AppRegistryNotReady exception
from mwmbl.search_setup import queued_batches
from mwmbl import background
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.url_queue import update_queue_continuously
index_path = Path(settings.DATA_PATH) / INDEX_NAME
try:
existing_index = TinyIndex(item_factory=Document, index_path=index_path)
@ -30,6 +35,12 @@ class MwmblConfig(AppConfig):
TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
page_size=PAGE_SIZE)
with Database() as db:
url_db = URLDatabase(db.connection)
url_db.create_tables()
index_db = IndexDatabase(db.connection)
index_db.create_tables()
if settings.RUN_BACKGROUND_PROCESSES:
new_item_queue = Queue()
Process(target=background.run, args=(settings.DATA_PATH,)).start()

View file

@ -1,7 +1,9 @@
"""
Script that updates data in a background process.
"""
from logging import getLogger
import logging
import sys
from logging import getLogger, basicConfig
from pathlib import Path
from time import sleep
@ -11,6 +13,8 @@ from mwmbl.indexer import index_batches, historical
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME
basicConfig(stream=sys.stdout, level=logging.INFO)
logger = getLogger(__name__)

View file

@ -8,12 +8,8 @@ from typing import Union
from uuid import uuid4
import boto3
import justext
import requests
from fastapi import HTTPException
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
from ninja import Router
from redis import Redis
@ -21,7 +17,6 @@ from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.stats import MwmblStats, StatsManager
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
from mwmbl.database import Database
from mwmbl.format import format_result
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
from mwmbl.settings import (
@ -35,9 +30,7 @@ from mwmbl.settings import (
PUBLIC_URL_PREFIX,
PUBLIC_USER_ID_LENGTH,
FILE_NAME_SUFFIX,
DATE_REGEX, NUM_EXTRACT_CHARS)
from mwmbl.tinysearchengine.indexer import Document
DATE_REGEX)
stats_manager = StatsManager(Redis.from_url(os.environ.get("REDIS_URL")))
@ -57,32 +50,6 @@ def upload(data: bytes, name: str):
last_batch = None
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
                     encoding=None, default_encoding=DEFAULT_ENCODING,
                     enc_errors=DEFAULT_ENC_ERRORS):
    """
    Classify the paragraphs of an HTML page and pick out its title.

    Returns a ``(paragraphs, title)`` pair: the classified paragraphs (each an
    instance of ``justext.paragraph.Paragraph``) and the text of the first
    ``<title>`` element, or ``None`` when the page has no such element.
    """
    parsed = html_to_dom(html_text, default_encoding, encoding, enc_errors)

    # The title is read before preprocessing replaces the DOM.
    title_nodes = parsed.xpath("//title")
    page_title = title_nodes[0].text if title_nodes else None

    cleaned = preprocessor(parsed)
    classified = ParagraphMaker.make_paragraphs(cleaned)
    classify_paragraphs(classified, stoplist, length_low, length_high,
                        stopwords_low, stopwords_high, max_link_density, no_headings)
    revise_paragraph_classification(classified, max_heading_distance)
    return classified, page_title
def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
router = Router(tags=["crawler"])
@ -90,19 +57,6 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
# #
# # url_db.create_tables()
@router.get('/fetch')
def fetch_url(request, url: str, query: str):
    """
    Download ``url``, extract its title and "good" paragraphs, and return
    the page formatted as a search result for ``query``.
    """
    # NOTE(review): this fetches a caller-supplied URL from the server; confirm
    # that access is restricted or that internal addresses are filtered.
    # Without a timeout an unresponsive host would block this worker forever.
    response = requests.get(url, timeout=10)
    paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
    extract = ' '.join(p.text for p in paragraphs if p.class_type == 'good')
    if len(extract) > NUM_EXTRACT_CHARS:
        # Keep one character of room for the ellipsis that marks the cut.
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
    result = Document(title=title, url=url, extract=extract, score=0.0)
    return format_result(result, query)
@router.post('/batches/')
def post_batch(request, batch: Batch):
if len(batch.items) > MAX_BATCH_SIZE:

View file

@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
score = link_counts.get(url, DEFAULT_SCORE)
yield tokenize_document(url, title_cleaned, extract, score, nlp)
yield tokenize_document(url, title_cleaned, extract, score)
if i % 1000 == 0:
print("Processed", i)
@ -61,7 +61,7 @@ def get_index_tokens(tokens):
return set(first_tokens + bigrams)
def tokenize_document(url, title_cleaned, extract, score, nlp):
def tokenize_document(url, title_cleaned, extract, score):
title_tokens = tokenize(title_cleaned)
prepared_url = prepare_url_for_tokenizing(unquote(url))
url_tokens = tokenize(prepared_url)

View file

@ -16,6 +16,7 @@ from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
from mwmbl.utils import add_term_info, add_term_infos
logger = getLogger(__name__)
@ -31,22 +32,20 @@ def run(batch_cache: BatchCache, index_path: str):
def process(batches: Collection[HashedBatch]):
with Database() as db:
nlp = spacy.load("en_core_web_sm")
url_db = URLDatabase(db.connection)
index_batches(batches, index_path, nlp, url_db)
index_batches(batches, index_path, url_db)
logger.info("Indexed pages")
process_batch.run(batch_cache, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED, process)
def index_batches(batch_data: Collection[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
def index_batches(batch_data: Collection[HashedBatch], index_path: str, url_db: URLDatabase):
document_tuples = list(get_documents_from_batches(batch_data))
urls = [url for title, url, extract in document_tuples]
logger.info(f"Got {len(urls)} document tuples")
url_scores = url_db.get_url_scores(urls)
logger.info(f"Got {len(url_scores)} scores")
logger.info(f"Indexing {len(urls)} document tuples and {len(url_scores)} URL scores")
documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
page_documents = preprocess_documents(documents, index_path, nlp)
page_documents = preprocess_documents(documents, index_path)
index_pages(index_path, page_documents)
@ -58,24 +57,27 @@ def index_pages(index_path, page_documents):
seen_urls = set()
seen_titles = set()
sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
for document in sorted_documents:
# TODO: for now we add the term here, until all the documents in the index have terms
sorted_documents_with_terms = add_term_infos(sorted_documents, indexer, page)
for document in sorted_documents_with_terms:
if document.title in seen_titles or document.url in seen_urls:
continue
new_documents.append(document)
seen_urls.add(document.url)
seen_titles.add(document.title)
logger.info(f"Storing {len(new_documents)} documents for page {page}, originally {len(existing_documents)}")
indexer.store_in_page(page, new_documents)
def preprocess_documents(documents, index_path, nlp):
def preprocess_documents(documents, index_path):
page_documents = defaultdict(list)
with TinyIndex(Document, index_path, 'w') as indexer:
for document in documents:
tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
# logger.debug(f"Tokenized: {tokenized}")
page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
for page in page_indexes:
page_documents[page].append(document)
tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
for token in tokenized.tokens:
page = indexer.get_key_page_index(token)
term_document = Document(document.title, document.url, document.extract, document.score, token)
page_documents[page].append(term_document)
print(f"Preprocessed for {len(page_documents)} pages")
return page_documents

View file

@ -86,7 +86,7 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
parsed_link = urlparse(link)
if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
logger.debug(f"Excluding link for blacklisted domain: {parsed_link}")
return
extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0

View file

@ -1,7 +1,12 @@
import django
import uvicorn
from django.core.management import call_command
def run():
    """Prepare Django and serve the ASGI application with uvicorn."""
    # Django must be configured before any management command can run.
    django.setup()
    # Rebuild the static files directory from scratch, without prompting.
    call_command("collectstatic", "--clear", "--noinput")
    # Apply pending database migrations before accepting traffic.
    call_command("migrate")
    # Listens on all interfaces, port 5000.
    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=5000)

View file

@ -0,0 +1,58 @@
# Generated by Django 4.2.6 on 2023-10-25 11:55
from django.conf import settings
import django.contrib.auth.models
import django.contrib.auth.validators
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
    """Initial migration: creates the MwmblUser and UserCuration tables."""

    initial = True

    dependencies = [
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        # Custom user model.
        migrations.CreateModel(
            name='MwmblUser',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('password', models.CharField(max_length=128, verbose_name='password')),
                ('last_login', models.DateTimeField(blank=True, null=True, verbose_name='last login')),
                ('is_superuser', models.BooleanField(default=False, help_text='Designates that this user has all permissions without explicitly assigning them.', verbose_name='superuser status')),
                ('username', models.CharField(error_messages={'unique': 'A user with that username already exists.'}, help_text='Required. 150 characters or fewer. Letters, digits and @/./+/-/_ only.', max_length=150, unique=True, validators=[django.contrib.auth.validators.UnicodeUsernameValidator()], verbose_name='username')),
                ('first_name', models.CharField(blank=True, max_length=150, verbose_name='first name')),
                ('last_name', models.CharField(blank=True, max_length=150, verbose_name='last name')),
                ('email', models.EmailField(blank=True, max_length=254, verbose_name='email address')),
                ('is_staff', models.BooleanField(default=False, help_text='Designates whether the user can log into this admin site.', verbose_name='staff status')),
                ('is_active', models.BooleanField(default=True, help_text='Designates whether this user should be treated as active. Unselect this instead of deleting accounts.', verbose_name='active')),
                ('date_joined', models.DateTimeField(default=django.utils.timezone.now, verbose_name='date joined')),
                ('groups', models.ManyToManyField(blank=True, help_text='The groups this user belongs to. A user will get all permissions granted to each of their groups.', related_name='user_set', related_query_name='user', to='auth.group', verbose_name='groups')),
                ('user_permissions', models.ManyToManyField(blank=True, help_text='Specific permissions for this user.', related_name='user_set', related_query_name='user', to='auth.permission', verbose_name='user permissions')),
            ],
            options={
                'verbose_name': 'user',
                'verbose_name_plural': 'users',
                'abstract': False,
            },
            managers=[
                ('objects', django.contrib.auth.models.UserManager()),
            ],
        ),
        # One row per curation action; deleted together with its user.
        migrations.CreateModel(
            name='UserCuration',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('timestamp', models.DateTimeField()),
                ('url', models.CharField(max_length=300)),
                ('results', models.JSONField()),
                ('curation_type', models.CharField(max_length=20)),
                ('curation', models.JSONField()),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]

View file

15
mwmbl/models.py Normal file
View file

@ -0,0 +1,15 @@
from django.db import models
from django.contrib.auth.models import AbstractUser
class MwmblUser(AbstractUser):
    """Project user model; adds nothing to Django's AbstractUser."""
    pass
class UserCuration(models.Model):
    """A single curation action performed by a user."""
    # Owning user; the curation row is deleted when the user is deleted.
    user = models.ForeignKey(MwmblUser, on_delete=models.CASCADE)
    timestamp = models.DateTimeField()
    # Limited to 300 characters.
    url = models.CharField(max_length=300)
    results = models.JSONField()
    # Limited to 20 characters.
    curation_type = models.CharField(max_length=20)
    curation = models.JSONField()

89
mwmbl/platform/curate.py Normal file
View file

@ -0,0 +1,89 @@
from logging import getLogger
from typing import Any
from urllib.parse import parse_qs
from ninja import Router
from mwmbl.indexer.update_urls import get_datetime_from_timestamp
from mwmbl.models import UserCuration
from mwmbl.platform.data import CurateBegin, CurateMove, CurateDelete, CurateAdd, CurateValidate, \
make_curation_type
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize
from mwmbl.utils import add_term_info, add_term_infos
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0
logger = getLogger(__name__)
def create_router(index_path: str) -> Router:
    """
    Build the router exposing the curation endpoints.

    Every endpoint records the curation against the requesting user and then
    rewrites the index page for the curated query so that the submitted
    results replace the existing documents stored for that term.

    :param index_path: path of the index file that curations are written to.
    :return: a router with the ``/begin``, ``/move``, ``/delete``, ``/add``
        and ``/validate`` POST endpoints registered.
    """
    router = Router(tags=["user"])

    @router.post("/begin")
    def user_begin_curate(request, curate_begin: make_curation_type(CurateBegin)):
        return _curate(request, "curate_begin", curate_begin)

    @router.post("/move")
    def user_move_result(request, curate_move: make_curation_type(CurateMove)):
        return _curate(request, "curate_move", curate_move)

    @router.post("/delete")
    def user_delete_result(request, curate_delete: make_curation_type(CurateDelete)):
        return _curate(request, "curate_delete", curate_delete)

    @router.post("/add")
    def user_add_result(request, curate_add: make_curation_type(CurateAdd)):
        return _curate(request, "curate_add", curate_add)

    # Previously also named ``user_add_result``, shadowing the handler above.
    @router.post("/validate")
    def user_validate_result(request, curate_validate: make_curation_type(CurateValidate)):
        return _curate(request, "curate_validate", curate_validate)

    def _curate(request, curation_type: str, curation: Any):
        """
        Persist one curation and store its results in the index.

        :raises ValueError: if the curation URL does not carry exactly one
            query parameter with exactly one value.
        """
        # Validate the URL before anything is written, so a malformed request
        # leaves neither a database row nor a half-applied index change.
        # The whole URL is handed to parse_qs; only the parsed value is used,
        # the key is never inspected.
        query_string = parse_qs(curation.url)
        if len(query_string) != 1:
            raise ValueError(f"Should be one query string in the URL: {curation.url}")

        queries = next(iter(query_string.values()))
        if len(queries) > 1:
            raise ValueError(f"Should be one query value in the URL: {curation.url}")

        query = queries[0]
        tokens = tokenize(query)
        term = " ".join(tokens)

        user_curation = UserCuration(
            user=request.user,
            # The submitted timestamp is divided by 1000 before conversion.
            timestamp=get_datetime_from_timestamp(curation.timestamp / 1000.0),
            url=curation.url,
            results=curation.dict()["results"],
            curation_type=curation_type,
            curation=curation.curation.dict(),
        )
        user_curation.save()

        with TinyIndex(Document, index_path, 'w') as indexer:
            # Scores decrease with position so the curated order is kept.
            documents = [
                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
                for i, result in enumerate(curation.results)
            ]

            page_index = indexer.get_key_page_index(term)
            existing_documents_no_terms = indexer.get_page(page_index)
            existing_documents = add_term_infos(existing_documents_no_terms, indexer, page_index)
            # Documents stored for other terms on the same page are kept.
            other_documents = [doc for doc in existing_documents if doc.term != term]
            logger.info(f"Found {len(other_documents)} other documents for term {term} at page {page_index} "
                        f"with terms { {doc.term for doc in other_documents} }")

            all_documents = documents + other_documents
            logger.info(f"Storing {len(all_documents)} documents at page {page_index}")
            indexer.store_in_page(page_index, all_documents)

        return {"curation": "ok"}

    return router

46
mwmbl/platform/data.py Normal file
View file

@ -0,0 +1,46 @@
from datetime import datetime
from typing import TypeVar, Generic
from ninja import Schema
# One search result as submitted with a curation request.
class Result(Schema):
    url: str
    title: str
    extract: str
    # Copied into the stored Document by the curation endpoint.
    curated: bool
# Payload of the "begin" curation; carries no fields of its own.
class CurateBegin(Schema):
    pass
# Payload of the "move" curation.
# Presumably the result's position before and after the move - TODO confirm.
class CurateMove(Schema):
    old_index: int
    new_index: int
# Payload of the "delete" curation.
class CurateDelete(Schema):
    delete_index: int
# Payload of the "add" curation.
class CurateAdd(Schema):
    insert_index: int
    url: str
# Payload of the "validate" curation.
class CurateValidate(Schema):
    validate_index: int
    is_validated: bool
T = TypeVar('T', CurateBegin, CurateAdd, CurateDelete, CurateMove, CurateValidate)
def make_curation_type(t):
    """
    Build the request schema for one kind of curation.

    :param t: schema class describing the curation-specific payload.
    :return: a new Schema subclass with the shared ``timestamp``, ``url`` and
        ``results`` fields plus a ``curation`` field of type ``t``.
    """
    class Curation(Schema):
        timestamp: int
        url: str
        results: list[Result]
        curation: t

    # Each call produces a distinct class; name it after its payload so the
    # generated schemas are not all called "Curation".
    Curation.__name__ = Curation.__qualname__ = f"{t.__name__}Curation"
    return Curation

View file

@ -1,190 +0,0 @@
import json
import os
from typing import TypeVar, Generic
from urllib.parse import urljoin, parse_qs
import requests
from fastapi import APIRouter, Response
from pydantic import BaseModel
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize
LEMMY_URL = os.environ["LEMMY_URL"]
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0
# Registration request; every field is forwarded to Lemmy's register endpoint.
class Register(BaseModel):
    username: str
    email: str
    password: str
    password_verify: str
# Login request; posted unchanged to Lemmy's login endpoint.
class Login(BaseModel):
    username_or_email: str
    password: str
# One search result as submitted with a curation request.
class Result(BaseModel):
    url: str
    title: str
    extract: str
    curated: bool
# Request that opens a curation by creating a Lemmy post.
class BeginCurate(BaseModel):
    # Forwarded to Lemmy as the "auth" field.
    auth: str
    # Used as both the name and the URL of the created post.
    url: str
    # Serialised into the post body as "original_results".
    results: list[Result]
# Payload of the "move" curation.
class CurateMove(BaseModel):
    old_index: int
    new_index: int
# Payload of the "delete" curation.
class CurateDelete(BaseModel):
    delete_index: int
# Payload of the "add" curation.
class CurateAdd(BaseModel):
    insert_index: int
    url: str
# Payload of the "validate" curation.
class CurateValidate(BaseModel):
    validate_index: int
    is_validated: bool
T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate)
# Envelope shared by all curation requests; T is the curation-specific payload.
class Curation(BaseModel, Generic[T]):
    # Forwarded to Lemmy as the "auth" field of the comment.
    auth: str
    # Used as the Lemmy post_id that the comment is attached to.
    curation_id: int
    # The search query is parsed out of this URL.
    url: str
    results: list[Result]
    curation: T
def create_router(index_path: str) -> APIRouter:
    """
    Build the ``/user`` router backed by a Lemmy instance.

    Registration and login are proxied to Lemmy. Curations are recorded as a
    Lemmy post (begin) and comments (move/delete/add/validate), and each
    accepted curation rewrites the index page of the curated query.

    :param index_path: path of the index file that curations are written to.
    """
    router = APIRouter(prefix="/user", tags=["user"])

    # TODO: reinstate
    # community_id = get_community_id()
    community_id = 0

    @router.post("/register")
    def user_register(register: Register) -> Response:
        lemmy_register = {
            "username": register.username,
            "email": register.email,
            "password": register.password,
            "password_verify": register.password_verify,
            "answer": "not applicable",
            "captcha_answer": None,
            "captcha_uuid": None,
            "honeypot": None,
            "show_nsfw": False,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/user/register"), json=lemmy_register)
        # NOTE(review): nothing is returned on success, unlike /login, which
        # relays Lemmy's response - confirm that this is intended.
        if request.status_code != 200:
            return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    @router.post("/login")
    def user_login(login: Login) -> Response:
        request = requests.post(urljoin(LEMMY_URL, "api/v3/user/login"), json=login.dict())
        return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    @router.post("/curation/begin")
    def user_begin_curate(begin_curate: BeginCurate):
        results = begin_curate.dict()["results"]
        body = json.dumps({"original_results": results}, indent=2)
        create_post = {
            "auth": begin_curate.auth,
            "body": body,
            "community_id": community_id,
            "honeypot": None,
            "language_id": None,
            "name": begin_curate.url,
            "nsfw": None,
            "url": begin_curate.url,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/post"), json=create_post)
        if request.status_code != 200:
            return Response(content=request.content, status_code=request.status_code, media_type="text/json")
        data = request.json()
        curation_id = data["post_view"]["post"]["id"]
        return {"curation_id": curation_id}

    @router.post("/curation/move")
    def user_move_result(curate_move: Curation[CurateMove]):
        return _curate("curate_move", curate_move)

    @router.post("/curation/delete")
    def user_delete_result(curate_delete: Curation[CurateDelete]):
        return _curate("curate_delete", curate_delete)

    @router.post("/curation/add")
    def user_add_result(curate_add: Curation[CurateAdd]):
        return _curate("curate_add", curate_add)

    # Previously also named ``user_add_result``, shadowing the handler above.
    @router.post("/curation/validate")
    def user_validate_result(curate_validate: Curation[CurateValidate]):
        return _curate("curate_validate", curate_validate)

    def _curate(curation_type: str, curation: Curation):
        """
        Record a curation as a Lemmy comment and store its results in the index.

        :raises ValueError: if the curation URL does not carry exactly one
            query parameter with exactly one value.
        """
        # Validate the URL before contacting Lemmy or touching the index.
        # The whole URL is handed to parse_qs; only the parsed value is used.
        query_string = parse_qs(curation.url)
        if len(query_string) != 1:
            raise ValueError(f"Should be one query string in the URL: {curation.url}")

        queries = next(iter(query_string.values()))
        if len(queries) > 1:
            raise ValueError(f"Should be one query value in the URL: {curation.url}")

        query = queries[0]
        print("Query", query)
        tokens = tokenize(query)
        print("Tokens", tokens)
        term = " ".join(tokens)
        print("Key", term)

        # Encode once: the comment content is the JSON document itself, not a
        # JSON string literal wrapping it.
        content = json.dumps({
            "curation_type": curation_type,
            "curation": curation.curation.dict(),
        }, indent=2)
        create_comment = {
            "auth": curation.auth,
            "content": content,
            "form_id": None,
            "language_id": None,
            "parent_id": None,
            "post_id": curation.curation_id,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
        # Lemmy is what checks the auth token; if it rejects the comment the
        # index must not be modified.
        if request.status_code != 200:
            return Response(content=request.content, status_code=request.status_code, media_type="text/json")

        with TinyIndex(Document, index_path, 'w') as indexer:
            # Scores decrease with position so the curated order is kept.
            documents = [
                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
                for i, result in enumerate(curation.results)
            ]
            page_index = indexer.get_key_page_index(term)
            print("Page index", page_index)
            print("Storing documents", documents)
            indexer.store_in_page(page_index, documents)

        return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    return router
def get_community_id() -> str:
    """Look up the community named "main" on Lemmy and return its id."""
    response = requests.get(urljoin(LEMMY_URL, "api/v3/community?name=main"))
    return response.json()["community_view"]["community"]["id"]

19
mwmbl/search_setup.py Normal file
View file

@ -0,0 +1,19 @@
from multiprocessing import Queue
from pathlib import Path
from django.conf import settings
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine.rank import HeuristicRanker
# Module-level objects created once at import time and imported by other modules.
queued_batches = Queue()
completer = Completer()
index_path = Path(settings.DATA_PATH) / INDEX_NAME
tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
# Entered manually with no matching __exit__ in this module, so the index
# stays open for as long as the ranker below uses it.
tiny_index.__enter__()
ranker = HeuristicRanker(tiny_index, completer)
batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)

View file

@ -5,4 +5,4 @@ ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org"]
DATA_PATH = "/app/storage"
RUN_BACKGROUND_PROCESSES = True
NUM_PAGES = 10240000
NUM_PAGES = 10240000

View file

@ -19,9 +19,6 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
# Application definition
@ -32,7 +29,13 @@ INSTALLED_APPS = [
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.humanize',
'mwmbl',
'django_htmx',
'django_vite',
'allauth',
'allauth.account',
'allauth.socialaccount',
]
MIDDLEWARE = [
@ -43,6 +46,9 @@ MIDDLEWARE = [
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
"django_htmx.middleware.HtmxMiddleware",
"allauth.account.middleware.AccountMiddleware",
]
ROOT_URLCONF = 'mwmbl.urls'
@ -66,17 +72,6 @@ TEMPLATES = [
WSGI_APPLICATION = 'mwmbl.wsgi.application'
# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
@ -112,11 +107,60 @@ USE_TZ = True
# https://docs.djangoproject.com/en/4.2/howto/static-files/
STATIC_URL = 'static/'
STATICFILES_DIRS = [str(Path(__file__).parent.parent / "front-end" / "dist")]
print("Static files", STATICFILES_DIRS)
DJANGO_VITE_DEV_MODE = False
# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
AUTHENTICATION_BACKENDS = [
# Needed to login by username in Django admin, regardless of `allauth`
'django.contrib.auth.backends.ModelBackend',
# `allauth` specific authentication methods, such as login by email
'allauth.account.auth_backends.AuthenticationBackend',
]
AUTH_USER_MODEL = "mwmbl.MwmblUser"
ACCOUNT_EMAIL_REQUIRED = True
ACCOUNT_EMAIL_VERIFICATION = "mandatory"
DEFAULT_FROM_EMAIL = "admin@mwmbl.org"
LOGIN_REDIRECT_URL = "/"
FOOTER_LINKS = [
{
"name": "Matrix",
"icon": "ph-chat-circle-text-bold",
"href": "https://matrix.to/#/#mwmbl:matrix.org",
},
{
"name": "Book",
"icon": "ph-book-bold",
"href": "https://book.mwmbl.org",
},
{
"name": "Blog",
"icon": "ph-browser-bold",
"href": "https://blog.mwmbl.org",
},
{
"name": "GitHub",
"icon": "ph-github-logo-bold",
"href": "https://github.com/mwmbl/mwmbl",
},
{
"name": "YouTube",
"icon": "ph-youtube-logo-bold",
"href": "https://www.youtube.com/channel/UCFLbqrH63-icAHxQ1eFfAvA",
},
]

View file

@ -1,9 +1,31 @@
from mwmbl.settings_common import *
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
STATIC_ROOT = ""
DJANGO_VITE_ASSETS_PATH = Path(__file__).parent.parent / "front-end" / "dist"
DJANGO_VITE_MANIFEST_PATH = DJANGO_VITE_ASSETS_PATH / "manifest.json"
STATICFILES_DIRS = [str(DJANGO_VITE_ASSETS_PATH)]
DEBUG = True
ALLOWED_HOSTS = ["localhost", "127.0.0.1"]
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
DATA_PATH = "./devdata"
RUN_BACKGROUND_PROCESSES = True
RUN_BACKGROUND_PROCESSES = False
NUM_PAGES = 2560

View file

@ -1,7 +1,33 @@
import os
import dj_database_url
from mwmbl.settings_common import *
DEBUG = False
ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org"]
SECRET_KEY = os.environ["DJANGO_SECRET_KEY"]
STATIC_ROOT = "/app/static/"
DJANGO_VITE_ASSETS_PATH = "/front-end-build/"
DJANGO_VITE_MANIFEST_PATH = Path(DJANGO_VITE_ASSETS_PATH) / "manifest.json"
STATICFILES_DIRS = [DJANGO_VITE_ASSETS_PATH]
DATABASES = {'default': dj_database_url.config(default=os.environ["DATABASE_URL"])}
# Production must not run with DEBUG enabled: Django's debug error pages expose
# settings, stack traces and local variables to anyone who triggers an error.
DEBUG = False
ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org", "beta.mwmbl.org"]
CSRF_TRUSTED_ORIGINS = [f"https://{domain}" for domain in ALLOWED_HOSTS]
# Sendgrid email settings
EMAIL_HOST = 'smtp.sendgrid.net'
EMAIL_HOST_USER = 'apikey'
EMAIL_HOST_PASSWORD = os.getenv('EMAIL_HOST_PASSWORD')
EMAIL_PORT = 587
EMAIL_USE_TLS = True
DATA_PATH = "/app/storage"
RUN_BACKGROUND_PROCESSES = False

22
mwmbl/templates/base.html Normal file
View file

@ -0,0 +1,22 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>{% block title %}Simple is Better Than Complex{% endblock %}</title>
</head>
<body>
  <header>
    <h1>My Site</h1>
    {# Account routes come from allauth (mounted under accounts/), whose URL names are prefixed "account_". #}
    {% if user.is_authenticated %}
      <a href="{% url 'account_logout' %}">logout</a>
    {% else %}
      <a href="{% url 'account_login' %}">login</a> / <a href="{% url 'account_signup' %}">signup</a>
    {% endif %}
    <hr>
  </header>
  <main>
    {% block content %}
    {% endblock %}
  </main>
</body>
</html>

32
mwmbl/templates/home.html Normal file
View file

@ -0,0 +1,32 @@
{% load humanize %}
{% include "title.html" %}
<div class="main">
{% if query %}
<button class="button curate-add" is="mwmbl-add-button"> Add new</button>
{% if results %}
<ul class='results'>
{% for result in results %}
{% include "result.html" %}
{% endfor %}
</ul>
{% else %}
<ul>
<li class="home">
<h1>
No results found for "{{query}}".
</h1>
</li>
</ul>
{% endif %}
{% else %}
{% for item in activity %}
<ul>
<li class="activity">
<h1>
{{ item.user }} made {{ item.num_curations | apnumber }} changes to <a href="{{ item.url }}">{{ item.query }}</a> {{ item.timestamp | naturaltime }}.
</h1>
</li>
</ul>
{% endfor %}
{% endif %}
</div>

View file

@ -0,0 +1,97 @@
{% load django_vite %}
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Metas -->
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
{% include "title.html" %}
<meta name="description" content="The free, open-source and non-profit search engine.">
<!-- Favicons -->
<link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">
<!-- Fonts import -->
<link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/inter/inter.css">
</noscript>
<!-- CSS Stylesheets (this is critical CSS) -->
<link rel="stylesheet" type="text/css" href="/static/css/reset.css">
<link rel="stylesheet" type="text/css" href="/static/css/theme.css">
<link rel="stylesheet" type="text/css" href="/static/css/global.css">
<!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
<link rel="preload" href="/static/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript>
<link rel="stylesheet" href="/static/fonts/phosphor/icons.css">
</noscript>
<!-- Custom Element Polyfill for Safari -->
<script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>
<!-- OpenSearch -->
<link rel="search" type="application/opensearchdescription+xml" href="/static/assets/opensearch.xml" title="Mwmbl Search">
<script src="https://unpkg.com/htmx.org@1.9.6"></script>
{% vite_hmr_client %}
</head>
<body>
<mwmbl-app></mwmbl-app>
<header class="search-menu compact">
<a href="/" class="branding">
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
<span class="brand-title">Mwmbl</span>
</a>
<form class="search-bar">
<i class="ph-magnifying-glass-bold"></i>
<input
type='search'
name='q'
class='search-bar-input'
placeholder='Search on Mwmbl...'
title='Use "CTRL+K" or "/" to focus.'
autocomplete='off'
value='{{ query|default_if_none:"" }}'
hx-get="/app/home/"
hx-trigger="keyup changed delay:100ms"
hx-target=".main"
>
</form>
<div is="mwmbl-save"></div>
{% if user.is_authenticated %}
<p class="login-info">Logged in as {{ user.username }}</p>
<a class="button" href="/accounts/logout/">Log out</a>
{% else %}
<a class="button" href="/accounts/login/">Login</a>
<a class="button" href="/accounts/signup/">Sign up</a>
{% endif %}
</header>
<main>
{% include "home.html" %}
</main>
<div is="mwmbl-add-result"></div>
<div class="footer">
  <ul class="footer-list">
    {% for link in footer_links %}
    <li class="footer-item">
      {# "_blank" is the keyword for a new tab; "__blank" would only name a single reusable window. #}
      {# rel="noopener noreferrer" keeps the external page from reaching back through window.opener. #}
      <a href="{{ link.href }}" class="footer-link" target="_blank" rel="noopener noreferrer">
        <i class="{{ link.icon }}"></i>
        <span>{{ link.name }}</span>
      </a>
    </li>
    {% endfor %}
  </ul>
</div>
{% vite_asset 'index.js' %}
{% vite_legacy_polyfills %}
{% vite_legacy_asset 'index-legacy.js' %}
</body>
</html>

View file

@ -0,0 +1,26 @@
{% extends 'base.html' %}
{% block content %}
  <h2>Log in to My Site</h2>
  {% if form.errors %}
    <p style="color: red">Your username and password didn't match. Please try again.</p>
  {% endif %}
  <form method="post">
    {% csrf_token %}
    <input type="hidden" name="next" value="{{ next }}" />
    {% for field in form %}
      {# A <div> wrapper is used because <p> elements cannot be nested inside another <p>. #}
      <div>
        {{ field.label_tag }}<br>
        {{ field }}<br>
        {% for error in field.errors %}
          <p style="color: red">{{ error }}</p>
        {% endfor %}
        {% if field.help_text %}
          <p><small style="color: grey">{{ field.help_text }}</small></p>
        {% endif %}
      </div>
    {% endfor %}
    <button type="submit">Log in</button>
    {# The signup route is provided by allauth, whose URL names are prefixed "account_". #}
    <a href="{% url 'account_signup' %}">New to My Site? Sign up</a>
  </form>
{% endblock %}

View file

@ -0,0 +1,17 @@
{% load result_filters %}
<li class="result" is="mwmbl-result">
<div class="result-container">
<div class="result-link">
<a href="{{result.url}}">
<p class='link'>{{result.url}}</p>
<p class='title'>{{result.title|strengthen}}</p>
</a>
<p class='extract'>{{result.extract|strengthen}}</p>
</div>
<div class="curation-buttons">
<span class="button handle">↕ Move</span>
<button class="button curate-delete" is="mwmbl-delete-button">✕ Delete</button>
<button class="button curate-approve" is="mwmbl-validate-button">✓ Looks good</button>
</div>
</div>
</li>

View file

@ -0,0 +1,10 @@
{% extends 'base.html' %}
{% block content %}
<h2>Sign up</h2>
<form method="post">
{% csrf_token %}
{{ form.as_p }}
<button type="submit">Sign up</button>
</form>
{% endblock %}

View file

@ -0,0 +1,6 @@
<!-- Page title -->
{% if query %}
<title>Mwmbl - {{ query }}</title>
{% else %}
<title>Mwmbl - Search</title>
{% endif %}

View file

View file

@ -0,0 +1,18 @@
from django.template import Library
from django.utils.html import conditional_escape
from django.utils.safestring import mark_safe
register = Library()
@register.filter(needs_autoescape=True)
def strengthen(spans, autoescape=True):
    """Join highlighted spans into HTML, wrapping bold spans in ``<strong>``.

    Each span is a mapping with a ``"value"`` entry (the text) and an
    ``"is_bold"`` entry (whether to emphasise it). The text is passed through
    ``conditional_escape`` when autoescaping is on and used verbatim otherwise;
    the joined result is marked safe so the ``<strong>`` tags survive rendering.
    """
    def render_span(span):
        text = conditional_escape(span["value"]) if autoescape else span["value"]
        return f"<strong>{text}</strong>" if span["is_bold"] else text

    return mark_safe("".join(render_span(span) for span in spans))

View file

@ -79,6 +79,7 @@ class TinyIndexMetadata:
values = json.loads(data[constant_length:].decode('utf8'))
return TinyIndexMetadata(**values)
# Find the optimal amount of data that fits onto a page
# We do this by leveraging binary search to quickly find the index where:
# - index+1 cannot fit onto a page
@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
# No better match, use our index
return mid, compressed_data
def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
# Find max number of items that fit on a page
return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))
def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
@ -186,7 +189,6 @@ class TinyIndex(Generic[T]):
except ZstdError:
logger.exception(f"Error decompressing page data, content: {page_data}")
return []
# logger.debug(f"Decompressed data: {decompressed_data}")
return json.loads(decompressed_data.decode('utf8'))
def store_in_page(self, page_index: int, values: list[T]):

View file

@ -15,12 +15,17 @@ Including another URLconf
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path
from django.urls import path, include
from mwmbl.api import api_original as api, api_v1
from mwmbl.api import api_v1
from mwmbl.views import home_fragment, fetch_url, index
urlpatterns = [
path('admin/', admin.site.urls),
path('', api.urls),
path('api/v1/', api_v1.urls)
path('api/v1/', api_v1.urls),
path('accounts/', include('allauth.urls')),
path('', index, name="home"),
path('app/home/', home_fragment, name="home"),
path('app/fetch/', fetch_url, name="fetch_url")
]

View file

@ -1,5 +1,8 @@
import re
from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
DOMAIN_REGEX = re.compile(r".*://([^/]*)")
@ -17,3 +20,23 @@ def get_domain(url):
if results is None or len(results.groups()) == 0:
raise ValueError(f"Unable to parse domain from URL {url}")
return results.group(1)
def add_term_info(document: Document, index: TinyIndex, page_index: int):
    """Return a copy of ``document`` carrying the token that maps to ``page_index``.

    The document is tokenized and the first token whose key page index (as
    reported by ``index.get_key_page_index``) equals ``page_index`` is stored
    as the fifth field of the new ``Document``.

    Raises:
        ValueError: if none of the document's tokens map to ``page_index``.
    """
    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
    for term in tokenized.tokens:
        if index.get_key_page_index(term) != page_index:
            continue
        return Document(document.title, document.url, document.extract, document.score, term)
    raise ValueError("Could not find token in page index")
def add_term_infos(documents: list[Document], index: TinyIndex, page_index: int):
    """Yield each document with its term filled in, dropping those that cannot be matched.

    Documents that already have a term are yielded unchanged. For the others
    ``add_term_info`` is used to find the term; documents for which it raises
    ``ValueError`` are silently skipped.
    """
    for doc in documents:
        if doc.term is None:
            try:
                yield add_term_info(doc, index, page_index)
            except ValueError:
                # No token of this document maps to the page: leave it out.
                pass
        else:
            yield doc

129
mwmbl/views.py Normal file
View file

@ -0,0 +1,129 @@
from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
from urllib.parse import urlparse, parse_qs
import justext
import requests
from django.contrib.auth.decorators import login_required
from django.shortcuts import render
from django_htmx.http import push_url
from mwmbl.format import format_result
from mwmbl.models import UserCuration, MwmblUser
from mwmbl.search_setup import ranker
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
from mwmbl.settings import NUM_EXTRACT_CHARS
from mwmbl.tinysearchengine.indexer import Document
from django.conf import settings
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
                     encoding=None, default_encoding=DEFAULT_ENCODING,
                     enc_errors=DEFAULT_ENC_ERRORS):
    """
    Classify the paragraphs of an HTML page and extract its title.

    Returns a ``(paragraphs, title)`` pair: ``paragraphs`` is the list of
    classified paragraphs produced by the jusText pipeline, and ``title`` is
    the text of the first ``<title>`` element, or ``None`` if there is none.
    """
    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)

    # The title is read before preprocessing the DOM.
    # NOTE(review): presumably the preprocessor strips <head> content - confirm.
    title = None
    title_nodes = dom.xpath("//title")
    if len(title_nodes) > 0:
        title = title_nodes[0].text

    cleaned_dom = preprocessor(dom)
    paragraphs = ParagraphMaker.make_paragraphs(cleaned_dom)
    classify_paragraphs(
        paragraphs, stoplist, length_low, length_high,
        stopwords_low, stopwords_high, max_link_density, no_headings,
    )
    revise_paragraph_classification(paragraphs, max_heading_distance)
    return paragraphs, title
def index(request):
    """Render the full search page: results for ``?q=...`` or the recent activity feed."""
    activity, query, results = _get_results_and_activity(request)
    context = dict(
        results=results,
        query=query,
        user=request.user,
        activity=activity,
        footer_links=settings.FOOTER_LINKS,
    )
    return render(request, "index.html", context)
def home_fragment(request):
    """Render the results/activity fragment that htmx swaps into the page.

    Besides the rendered "home.html" fragment, the response carries an
    ``HX-Replace-Url`` header so the browser's address bar reflects the
    current query (or no query string when the search box is empty).
    """
    from urllib.parse import urlencode

    activity, query, results = _get_results_and_activity(request)
    response = render(request, "home.html", {
        "results": results,
        "query": query,
        "activity": activity,
    })

    # current_url is None when the request did not come from htmx; in that
    # case there is no browser URL to rewrite, so the header is left out.
    current_url = request.htmx.current_url
    if current_url is not None:
        # Drop any existing query string and replace it with the new query.
        base_url = current_url.partition("?")[0]
        # `query` is None when no "q" parameter was sent. It is URL-encoded so
        # characters such as "&", "#" or "+" cannot corrupt the resulting URL.
        query_string = "?" + urlencode({"q": query}) if query else ""
        response["HX-Replace-Url"] = base_url + query_string
    return response
@dataclass
class Activity:
    """One entry of the activity feed: a user's recent curations of one URL."""
    # NOTE(review): _get_results_and_activity passes the username string here,
    # not a MwmblUser instance - confirm which one is intended.
    user: MwmblUser
    # Number of curations in the group.
    num_curations: int
    # Latest timestamp among the grouped curations.
    timestamp: datetime
    # Value of the "q" parameter parsed from `url` ("" if absent).
    query: str
    # URL the curations were made against.
    url: str
def _get_results_and_activity(request):
    """Return ``(activity, query, results)`` for the current request.

    With a non-empty ``q`` parameter, ``results`` holds the ranker's search
    results and ``activity`` is ``None``. Otherwise ``results`` is ``None`` and
    ``activity`` summarises the 100 most recent curations, grouped by
    (username, url) and ordered newest first.
    """
    query = request.GET.get("q")
    if query:
        return None, query, ranker.search(query)

    curations = UserCuration.objects.order_by("-timestamp")[:100]

    # Group with a dict rather than sort + groupby: groupby only merges
    # adjacent items, so sorting by username alone split a user's curations of
    # the same URL into several entries whenever other URLs sat between them.
    grouped = {}
    for curation in curations:
        grouped.setdefault((curation.user.username, curation.url), []).append(curation)

    activity = []
    for (username, url), group in grouped.items():
        url_params = parse_qs(urlparse(url).query)
        activity.append(Activity(
            user=username,
            num_curations=len(group),
            timestamp=max(curation.timestamp for curation in group),
            query=url_params.get("q", [""])[0],
            url=url,
        ))
    activity.sort(key=lambda item: item.timestamp, reverse=True)
    return activity, query, None
def fetch_url(request):
    """Fetch a page and render it as a single search-result fragment.

    Reads the ``url`` and ``query`` GET parameters, downloads the page, builds
    an extract from the paragraphs jusText classifies as "good", and renders
    "result.html" for the resulting document formatted against ``query``.
    """
    url = request.GET["url"]
    query = request.GET["query"]
    # NOTE(review): `url` is caller-supplied and fetched from the server, so
    # this endpoint can be pointed at internal addresses; consider restricting
    # who may call it and which hosts it may reach.
    # The timeout keeps an unresponsive host from tying up the worker forever.
    response = requests.get(url, timeout=10)
    paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
    extract = ' '.join(p.text for p in paragraphs if p.class_type == 'good')
    if len(extract) > NUM_EXTRACT_CHARS:
        # Truncate to one character short of the limit and mark the cut with
        # an ellipsis, so the extract is exactly NUM_EXTRACT_CHARS long.
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'

    result = Document(title=title, url=url, extract=extract, score=0.0)
    return render(request, "result.html", {
        "result": format_result(result, query),
    })

View file

@ -100,17 +100,12 @@ server {
## Static file hosting
location /static/ {
alias /var/lib/dokku/data/storage/mwmbl/;
}
## Root and stats served statically
location = / {
root /var/lib/dokku/data/storage/mwmbl;
try_files /index.html =404;
alias /var/lib/dokku/data/storage/mwmbl-beta/;
}
## Stats served statically
location ~ ^\/stats\/?$ {
root /var/lib/dokku/data/storage/mwmbl;
root /var/lib/dokku/data/storage/mwmbl-beta;
try_files /stats/index.html =404;
}

294
poetry.lock generated
View file

@ -1,10 +1,9 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
[[package]]
name = "anyio"
version = "3.7.1"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -26,7 +25,6 @@ trio = ["trio (<0.22)"]
name = "asgiref"
version = "3.7.2"
description = "ASGI specs, helper code, and adapters"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -44,7 +42,6 @@ tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
name = "async-timeout"
version = "4.0.3"
description = "Timeout context manager for asyncio programs"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -56,7 +53,6 @@ files = [
name = "attrs"
version = "23.1.0"
description = "Classes Without Boilerplate"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -75,7 +71,6 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte
name = "beautifulsoup4"
version = "4.10.0"
description = "Screen-scraping library"
category = "main"
optional = true
python-versions = ">3.0.0"
files = [
@ -94,7 +89,6 @@ lxml = ["lxml"]
name = "blis"
version = "0.7.11"
description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension."
category = "main"
optional = false
python-versions = "*"
files = [
@ -141,7 +135,6 @@ numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""}
name = "boto3"
version = "1.28.62"
description = "The AWS SDK for Python"
category = "main"
optional = false
python-versions = ">= 3.7"
files = [
@ -161,7 +154,6 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
name = "botocore"
version = "1.31.62"
description = "Low-level, data-driven core of boto 3."
category = "main"
optional = false
python-versions = ">= 3.7"
files = [
@ -181,7 +173,6 @@ crt = ["awscrt (==0.16.26)"]
name = "catalogue"
version = "2.0.10"
description = "Super lightweight function registries for your library"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -193,7 +184,6 @@ files = [
name = "cattrs"
version = "23.1.2"
description = "Composable complex class support for attrs and dataclasses."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -219,7 +209,6 @@ ujson = ["ujson (>=5.4.0,<6.0.0)"]
name = "certifi"
version = "2023.7.22"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -231,7 +220,6 @@ files = [
name = "cffi"
version = "1.16.0"
description = "Foreign Function Interface for Python calling C code."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -296,7 +284,6 @@ pycparser = "*"
name = "charset-normalizer"
version = "3.3.0"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.7.0"
files = [
@ -396,7 +383,6 @@ files = [
name = "click"
version = "8.1.7"
description = "Composable command line interface toolkit"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -411,7 +397,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""}
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
@ -419,11 +404,55 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "cryptography"
version = "41.0.4"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false
python-versions = ">=3.7"
files = [
{file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"},
{file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143"},
{file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397"},
{file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860"},
{file = "cryptography-41.0.4-cp37-abi3-win32.whl", hash = "sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd"},
{file = "cryptography-41.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311"},
{file = "cryptography-41.0.4.tar.gz", hash = "sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a"},
]
[package.dependencies]
cffi = ">=1.12"
[package.extras]
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
nox = ["nox"]
pep8test = ["black", "check-sdist", "mypy", "ruff"]
sdist = ["build"]
ssh = ["bcrypt (>=3.1.5)"]
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test-randomorder = ["pytest-randomly"]
[[package]]
name = "cymem"
version = "2.0.8"
description = "Manage calls to calloc/free through Cython"
category = "main"
optional = false
python-versions = "*"
files = [
@ -462,11 +491,36 @@ files = [
{file = "cymem-2.0.8.tar.gz", hash = "sha256:8fb09d222e21dcf1c7e907dc85cf74501d4cea6c4ed4ac6c9e016f98fb59cbbf"},
]
[[package]]
name = "defusedxml"
version = "0.7.1"
description = "XML bomb protection for Python stdlib modules"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"},
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
]
[[package]]
name = "dj-database-url"
version = "2.1.0"
description = "Use Database URLs in your Django Application."
optional = false
python-versions = "*"
files = [
{file = "dj-database-url-2.1.0.tar.gz", hash = "sha256:f2042cefe1086e539c9da39fad5ad7f61173bf79665e69bf7e4de55fa88b135f"},
{file = "dj_database_url-2.1.0-py3-none-any.whl", hash = "sha256:04bc34b248d4c21aaa13e4ab419ae6575ef5f10f3df735ce7da97722caa356e0"},
]
[package.dependencies]
Django = ">=3.2"
typing-extensions = ">=3.10.0.0"
[[package]]
name = "django"
version = "4.2.6"
description = "A high-level Python web framework that encourages rapid development and clean, pragmatic design."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -483,11 +537,45 @@ tzdata = {version = "*", markers = "sys_platform == \"win32\""}
argon2 = ["argon2-cffi (>=19.1.0)"]
bcrypt = ["bcrypt"]
[[package]]
name = "django-allauth"
version = "0.57.0"
description = "Integrated set of Django applications addressing authentication, registration, account management as well as 3rd party (social) account authentication."
optional = false
python-versions = ">=3.7"
files = [
{file = "django-allauth-0.57.0.tar.gz", hash = "sha256:a095ef0db7de305d9175772c78e765ebd5fceb004ae61c1383d7fc1af0f7c5b1"},
]
[package.dependencies]
Django = ">=3.2"
pyjwt = {version = ">=1.7", extras = ["crypto"]}
python3-openid = ">=3.0.8"
requests = ">=2.0.0"
requests-oauthlib = ">=0.3.0"
[package.extras]
mfa = ["qrcode (>=7.0.0)"]
saml = ["python3-saml (>=1.15.0,<2.0.0)"]
[[package]]
name = "django-htmx"
version = "1.17.0"
description = "Extensions for using Django with htmx."
optional = false
python-versions = ">=3.8"
files = [
{file = "django_htmx-1.17.0-py3-none-any.whl", hash = "sha256:070a37092b88a42cd7af26c1b65f63c4529bae276710fd16137dc934938b44f2"},
{file = "django_htmx-1.17.0.tar.gz", hash = "sha256:2ef0d19db41c6152881e782673cd2cd1755a7fd6784f8b4f2279fb18dc03d2c2"},
]
[package.dependencies]
Django = ">=3.2"
[[package]]
name = "django-ninja"
version = "0.22.2"
description = "Django Ninja - Fast Django REST framework"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -504,11 +592,27 @@ dev = ["pre-commit"]
doc = ["markdown-include", "mkdocs", "mkdocs-material", "mkdocstrings"]
test = ["black", "django-stubs", "flake8", "isort", "mypy (==0.931)", "psycopg2-binary", "pytest", "pytest-asyncio", "pytest-cov", "pytest-django"]
[[package]]
name = "django-vite"
version = "2.1.3"
description = "Integration of ViteJS in a Django project."
optional = false
python-versions = "*"
files = [
{file = "django-vite-2.1.3.tar.gz", hash = "sha256:c59b3bbd85501bc1faf63c500df66542abed2951cfa10dfbf8be8ecf229f7652"},
{file = "django_vite-2.1.3-py3-none-any.whl", hash = "sha256:97984ac495910b7b71039228ccddff52d132231fa6612d3d31c6c228c95b0217"},
]
[package.dependencies]
Django = ">=1.11"
[package.extras]
dev = ["black", "flake8"]
[[package]]
name = "exceptiongroup"
version = "1.1.3"
description = "Backport of PEP 654 (exception groups)"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -523,7 +627,6 @@ test = ["pytest (>=6)"]
name = "fastapi"
version = "0.70.1"
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
category = "main"
optional = false
python-versions = ">=3.6.1"
files = [
@ -545,7 +648,6 @@ test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==21.9b0)", "databases[sqlite] (
name = "h11"
version = "0.14.0"
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -557,7 +659,6 @@ files = [
name = "hiredis"
version = "2.2.3"
description = "Python wrapper for hiredis"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -656,7 +757,6 @@ files = [
name = "idna"
version = "3.3"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
files = [
@ -668,7 +768,6 @@ files = [
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -680,7 +779,6 @@ files = [
name = "jinja2"
version = "3.1.2"
description = "A very fast and expressive template engine."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -698,7 +796,6 @@ i18n = ["Babel (>=2.7)"]
name = "jmespath"
version = "1.0.1"
description = "JSON Matching Expressions"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -710,7 +807,6 @@ files = [
name = "joblib"
version = "1.3.2"
description = "Lightweight pipelining with Python functions"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -722,7 +818,6 @@ files = [
name = "justext"
version = "3.0.0"
description = "Heuristic based boilerplate removal tool"
category = "main"
optional = false
python-versions = "*"
files = [
@ -737,7 +832,6 @@ lxml = ">=4.4.2"
name = "langcodes"
version = "3.3.0"
description = "Tools for labeling human languages with IETF language tags"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -752,7 +846,6 @@ data = ["language-data (>=1.1,<2.0)"]
name = "langdetect"
version = "1.0.9"
description = "Language detection library ported from Google's language-detection."
category = "main"
optional = true
python-versions = "*"
files = [
@ -767,7 +860,6 @@ six = "*"
name = "levenshtein"
version = "0.16.0"
description = "Python extension for computing string edit distances and similarities."
category = "main"
optional = true
python-versions = ">=3.5"
files = [
@ -832,7 +924,6 @@ rapidfuzz = ">=1.8.2,<1.9"
name = "lxml"
version = "4.6.4"
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*"
files = [
@ -908,7 +999,6 @@ source = ["Cython (>=0.29.7)"]
name = "markupsafe"
version = "2.1.3"
description = "Safely add untrusted strings to HTML/XML markup."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -978,7 +1068,6 @@ files = [
name = "mmh3"
version = "3.1.0"
description = "Python wrapper for MurmurHash (MurmurHash3), a set of fast and robust hash functions."
category = "main"
optional = false
python-versions = "*"
files = [
@ -1023,7 +1112,6 @@ files = [
name = "murmurhash"
version = "1.0.10"
description = "Cython bindings for MurmurHash"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -1066,7 +1154,6 @@ files = [
name = "numpy"
version = "1.26.0"
description = "Fundamental package for array computing in Python"
category = "main"
optional = false
python-versions = "<3.13,>=3.9"
files = [
@ -1104,11 +1191,26 @@ files = [
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"},
]
[[package]]
name = "oauthlib"
version = "3.2.2"
description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic"
optional = false
python-versions = ">=3.6"
files = [
{file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"},
{file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"},
]
[package.extras]
rsa = ["cryptography (>=3.0.0)"]
signals = ["blinker (>=1.4.0)"]
signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
[[package]]
name = "packaging"
version = "23.2"
description = "Core utilities for Python packages"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1120,7 +1222,6 @@ files = [
name = "pandas"
version = "1.5.3"
description = "Powerful data structures for data analysis, time series, and statistics"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1165,7 +1266,6 @@ test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"]
name = "pathy"
version = "0.10.2"
description = "pathlib.Path subclasses for local and cloud bucket storage"
category = "main"
optional = false
python-versions = ">= 3.6"
files = [
@ -1188,7 +1288,6 @@ test = ["mock", "pytest", "pytest-coverage", "typer-cli"]
name = "platformdirs"
version = "3.11.0"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1204,7 +1303,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co
name = "pluggy"
version = "1.3.0"
description = "plugin and hook calling mechanisms for python"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1220,7 +1318,6 @@ testing = ["pytest", "pytest-benchmark"]
name = "preshed"
version = "3.0.9"
description = "Cython hash table that trusts the keys are pre-hashed"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -1267,7 +1364,6 @@ murmurhash = ">=0.28.0,<1.1.0"
name = "psycopg2-binary"
version = "2.9.9"
description = "psycopg2 - Python-PostgreSQL Database Adapter"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1297,6 +1393,7 @@ files = [
{file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
@ -1305,6 +1402,8 @@ files = [
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
@ -1346,7 +1445,6 @@ files = [
name = "py4j"
version = "0.10.9.2"
description = "Enables Python programs to dynamically access arbitrary Java objects"
category = "main"
optional = true
python-versions = "*"
files = [
@ -1358,7 +1456,6 @@ files = [
name = "pyarrow"
version = "6.0.0"
description = "Python library for Apache Arrow"
category = "main"
optional = true
python-versions = ">=3.6"
files = [
@ -1407,7 +1504,6 @@ numpy = ">=1.16.6"
name = "pycparser"
version = "2.21"
description = "C parser in Python"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
@ -1419,7 +1515,6 @@ files = [
name = "pydantic"
version = "1.8.2"
description = "Data validation and settings management using python 3.6 type hinting"
category = "main"
optional = false
python-versions = ">=3.6.1"
files = [
@ -1454,11 +1549,30 @@ typing-extensions = ">=3.7.4.3"
dotenv = ["python-dotenv (>=0.10.4)"]
email = ["email-validator (>=1.0.3)"]
[[package]]
name = "pyjwt"
version = "2.8.0"
description = "JSON Web Token implementation in Python"
optional = false
python-versions = ">=3.7"
files = [
{file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"},
{file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"},
]
[package.dependencies]
cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"crypto\""}
[package.extras]
crypto = ["cryptography (>=3.4.0)"]
dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"]
docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"]
tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"]
[[package]]
name = "pyspark"
version = "3.2.0"
description = "Apache Spark Python API"
category = "main"
optional = true
python-versions = ">=3.6"
files = [
@ -1478,7 +1592,6 @@ sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]
name = "pytest"
version = "7.4.2"
description = "pytest: simple powerful testing with Python"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1501,7 +1614,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no
name = "pytest-mock"
version = "3.11.1"
description = "Thin-wrapper around the mock package for easier use with pytest"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1519,7 +1631,6 @@ dev = ["pre-commit", "pytest-asyncio", "tox"]
name = "python-dateutil"
version = "2.8.2"
description = "Extensions to the standard Python datetime module"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
files = [
@ -1530,11 +1641,28 @@ files = [
[package.dependencies]
six = ">=1.5"
[[package]]
name = "python3-openid"
version = "3.2.0"
description = "OpenID support for modern servers and consumers."
optional = false
python-versions = "*"
files = [
{file = "python3-openid-3.2.0.tar.gz", hash = "sha256:33fbf6928f401e0b790151ed2b5290b02545e8775f982485205a066f874aaeaf"},
{file = "python3_openid-3.2.0-py3-none-any.whl", hash = "sha256:6626f771e0417486701e0b4daff762e7212e820ca5b29fcc0d05f6f8736dfa6b"},
]
[package.dependencies]
defusedxml = "*"
[package.extras]
mysql = ["mysql-connector-python"]
postgresql = ["psycopg2"]
[[package]]
name = "pytz"
version = "2023.3.post1"
description = "World timezone definitions, modern and historical"
category = "main"
optional = false
python-versions = "*"
files = [
@ -1546,7 +1674,6 @@ files = [
name = "pyyaml"
version = "6.0"
description = "YAML parser and emitter for Python"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -1596,7 +1723,6 @@ files = [
name = "rapidfuzz"
version = "1.8.3"
description = "rapid fuzzy string matching"
category = "main"
optional = true
python-versions = ">=2.7"
files = [
@ -1663,7 +1789,6 @@ full = ["numpy"]
name = "redis"
version = "5.0.1"
description = "Python client for Redis database and key-value store"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1683,7 +1808,6 @@ ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"
name = "requests"
version = "2.31.0"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1705,7 +1829,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
name = "requests-cache"
version = "1.1.0"
description = "A persistent cache for python requests"
category = "main"
optional = false
python-versions = ">=3.7,<4.0"
files = [
@ -1732,11 +1855,28 @@ redis = ["redis (>=3)"]
security = ["itsdangerous (>=2.0)"]
yaml = ["pyyaml (>=5.4)"]
[[package]]
name = "requests-oauthlib"
version = "1.3.1"
description = "OAuthlib authentication support for Requests."
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "requests-oauthlib-1.3.1.tar.gz", hash = "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"},
{file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"},
]
[package.dependencies]
oauthlib = ">=3.0.0"
requests = ">=2.0.0"
[package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
[[package]]
name = "s3transfer"
version = "0.7.0"
description = "An Amazon S3 Transfer Manager"
category = "main"
optional = false
python-versions = ">= 3.7"
files = [
@ -1754,7 +1894,6 @@ crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]
name = "scikit-learn"
version = "1.3.1"
description = "A set of python modules for machine learning and data mining"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1769,6 +1908,11 @@ files = [
{file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f66eddfda9d45dd6cadcd706b65669ce1df84b8549875691b1f403730bdef217"},
{file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6448c37741145b241eeac617028ba6ec2119e1339b1385c9720dae31367f2be"},
{file = "scikit_learn-1.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:c413c2c850241998168bbb3bd1bb59ff03b1195a53864f0b80ab092071af6028"},
{file = "scikit_learn-1.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ef540e09873e31569bc8b02c8a9f745ee04d8e1263255a15c9969f6f5caa627f"},
{file = "scikit_learn-1.3.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9147a3a4df4d401e618713880be023e36109c85d8569b3bf5377e6cd3fecdeac"},
{file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2cd3634695ad192bf71645702b3df498bd1e246fc2d529effdb45a06ab028b4"},
{file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c275a06c5190c5ce00af0acbb61c06374087949f643ef32d355ece12c4db043"},
{file = "scikit_learn-1.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:0e1aa8f206d0de814b81b41d60c1ce31f7f2c7354597af38fae46d9c47c45122"},
{file = "scikit_learn-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:52b77cc08bd555969ec5150788ed50276f5ef83abb72e6f469c5b91a0009bbca"},
{file = "scikit_learn-1.3.1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a683394bc3f80b7c312c27f9b14ebea7766b1f0a34faf1a2e9158d80e860ec26"},
{file = "scikit_learn-1.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15d964d9eb181c79c190d3dbc2fff7338786bf017e9039571418a1d53dab236"},
@ -1797,7 +1941,6 @@ tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (
name = "scipy"
version = "1.11.3"
description = "Fundamental algorithms for scientific computing in Python"
category = "main"
optional = false
python-versions = "<3.13,>=3.9"
files = [
@ -1840,7 +1983,6 @@ test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeo
name = "setuptools"
version = "68.2.2"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1857,7 +1999,6 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar
name = "six"
version = "1.16.0"
description = "Python 2 and 3 compatibility utilities"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
files = [
@ -1869,7 +2010,6 @@ files = [
name = "smart-open"
version = "6.4.0"
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
category = "main"
optional = false
python-versions = ">=3.6,<4.0"
files = [
@ -1891,7 +2031,6 @@ webhdfs = ["requests"]
name = "sniffio"
version = "1.3.0"
description = "Sniff out which async library your code is running under"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1903,7 +2042,6 @@ files = [
name = "soupsieve"
version = "2.5"
description = "A modern CSS selector implementation for Beautiful Soup."
category = "main"
optional = true
python-versions = ">=3.8"
files = [
@ -1915,7 +2053,6 @@ files = [
name = "spacy"
version = "3.2.1"
description = "Industrial-strength Natural Language Processing (NLP) in Python"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -1985,7 +2122,6 @@ transformers = ["spacy-transformers (>=1.1.2,<1.2.0)"]
name = "spacy-legacy"
version = "3.0.12"
description = "Legacy registered functions for spaCy backwards compatibility"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -1997,7 +2133,6 @@ files = [
name = "spacy-loggers"
version = "1.0.5"
description = "Logging utilities for SpaCy"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -2009,7 +2144,6 @@ files = [
name = "sqlparse"
version = "0.4.4"
description = "A non-validating SQL parser."
category = "main"
optional = false
python-versions = ">=3.5"
files = [
@ -2026,7 +2160,6 @@ test = ["pytest", "pytest-cov"]
name = "srsly"
version = "2.4.8"
description = "Modern high-performance serialization utilities for Python"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -2073,7 +2206,6 @@ catalogue = ">=2.0.3,<2.1.0"
name = "starlette"
version = "0.16.0"
description = "The little ASGI library that shines."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -2091,7 +2223,6 @@ full = ["graphene", "itsdangerous", "jinja2", "python-multipart", "pyyaml", "req
name = "thinc"
version = "8.0.17"
description = "A refreshing functional take on deep learning, compatible with your favorite libraries"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -2157,7 +2288,6 @@ torch = ["torch (>=1.6.0)"]
name = "threadpoolctl"
version = "3.2.0"
description = "threadpoolctl"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -2169,7 +2299,6 @@ files = [
name = "tomli"
version = "2.0.1"
description = "A lil' TOML parser"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -2181,7 +2310,6 @@ files = [
name = "tqdm"
version = "4.66.1"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -2202,7 +2330,6 @@ telegram = ["requests"]
name = "typer"
version = "0.4.2"
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -2223,7 +2350,6 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.
name = "typing-extensions"
version = "4.8.0"
description = "Backported and Experimental Type Hints for Python 3.8+"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -2235,7 +2361,6 @@ files = [
name = "tzdata"
version = "2023.3"
description = "Provider of IANA time zone data"
category = "main"
optional = false
python-versions = ">=2"
files = [
@ -2247,7 +2372,6 @@ files = [
name = "ujson"
version = "4.3.0"
description = "Ultra fast JSON encoder and decoder for Python"
category = "main"
optional = true
python-versions = ">=3.6"
files = [
@ -2301,7 +2425,6 @@ files = [
name = "url-normalize"
version = "1.4.3"
description = "URL normalization for Python"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
files = [
@ -2316,7 +2439,6 @@ six = "*"
name = "urllib3"
version = "2.0.6"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -2334,7 +2456,6 @@ zstd = ["zstandard (>=0.18.0)"]
name = "uvicorn"
version = "0.16.0"
description = "The lightning-fast ASGI server."
category = "main"
optional = false
python-versions = "*"
files = [
@ -2354,7 +2475,6 @@ standard = ["PyYAML (>=5.1)", "colorama (>=0.4)", "httptools (>=0.2.0,<0.4.0)",
name = "warcio"
version = "1.7.4"
description = "Streaming WARC (and ARC) IO library"
category = "main"
optional = true
python-versions = "*"
files = [
@ -2369,7 +2489,6 @@ six = "*"
name = "wasabi"
version = "0.10.1"
description = "A lightweight console printing and formatting toolkit"
category = "main"
optional = false
python-versions = "*"
files = [
@ -2381,7 +2500,6 @@ files = [
name = "zstandard"
version = "0.16.0"
description = "Zstandard bindings for Python"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -2438,9 +2556,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
indexer = ["Levenshtein", "beautifulsoup4", "idna", "langdetect", "lxml", "pyarrow", "pyspark", "ujson", "warcio"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.11"
content-hash = "fe5f238c57ec2d09acb6bdf8f46f33c7bbe499f68a7e34ab7bca1336e0ae881c"
content-hash = "4e4233221e9f3bd317c0693584612898b7b736f45983b7f3f5bad4d43e567353"

View file

@ -37,6 +37,10 @@ django = "^4.2.4"
django-ninja = "^0.22.2"
requests-cache = "^1.1.0"
redis = {extras = ["hiredis"], version = "^5.0.1"}
django-allauth = "^0.57.0"
dj-database-url = "^2.1.0"
django-htmx = "^1.17.0"
django-vite = "^2.1.3"
[tool.poetry.extras]
indexer = [