Merge pull request #128 from mwmbl/beta
Allow users to curate search results
This commit is contained in:
commit
a3cc316d15
66 changed files with 1721 additions and 1023 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -17,6 +17,7 @@ __pycache__/
|
|||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
front-end/dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
|
|
|
@ -50,7 +50,7 @@ COPY --from=builder /venv /venv
|
|||
COPY --from=front-end /front-end/dist /front-end-build
|
||||
|
||||
ADD nginx.conf.sigil /app
|
||||
ADD app.json /app
|
||||
# ADD app.json /app
|
||||
|
||||
# Set up a volume where the data will live
|
||||
VOLUME ["/data"]
|
||||
|
|
51
analyse/add_term_info.py
Normal file
51
analyse/add_term_info.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
"""
|
||||
Investigate adding term information to the database.
|
||||
|
||||
How much extra space will it take?
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from random import Random
|
||||
|
||||
import numpy as np
|
||||
from scipy.stats import sem
|
||||
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
|
||||
|
||||
from zstandard import ZstdCompressor
|
||||
|
||||
from mwmbl.utils import add_term_info
|
||||
|
||||
random = Random(1)
|
||||
|
||||
INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
|
||||
|
||||
|
||||
def run():
    """Measure how per-page document counts change once term info is added.

    Samples random pages from the index at ``INDEX_PATH``, passes every
    document on each sampled page through ``add_term_info``, and records the
    item count reported by ``_trim_items_to_page`` for the augmented
    documents. Prints the mean and standard error of the per-page counts
    before ("Old") and after ("New") the change.
    """
    max_sampled_pages = 10000
    compressor = ZstdCompressor()
    with TinyIndex(Document, INDEX_PATH) as index:
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the sample at the number of pages.
        sample_size = min(max_sampled_pages, index.num_pages)
        pages = random.sample(range(index.num_pages), sample_size)

        old_sizes = []
        new_sizes = []

        for i in pages:
            page = index.get_page(i)
            term_documents = [add_term_info(document, index, i) for document in page]
            value_tuples = [astuple(value) for value in term_documents]
            # Only the count is of interest here; the second return value is unused.
            num_fitting, _ = _trim_items_to_page(compressor, index.page_size, value_tuples)

            new_sizes.append(num_fitting)
            old_sizes.append(len(page))

        print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
        print("New sizes mean", np.mean(new_sizes), sem(new_sizes))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
run()
|
|
@ -1,57 +0,0 @@
|
|||
"""
|
||||
Index batches stored locally on the filesystem for the purpose of evaluation.
|
||||
"""
|
||||
import glob
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
import spacy
|
||||
|
||||
from mwmbl.crawler import HashedBatch
|
||||
from mwmbl.crawler.urls import URLDatabase
|
||||
from mwmbl.database import Database
|
||||
from mwmbl.indexer import index_batches
|
||||
from mwmbl.tinysearchengine import TinyIndex, Document
|
||||
|
||||
LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
|
||||
NUM_BATCHES = 10000
|
||||
EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
|
||||
NUM_PAGES = 1_024_000
|
||||
PAGE_SIZE = 4096
|
||||
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
|
||||
|
||||
|
||||
def get_batches():
    """Yield a ``HashedBatch`` for each local batch file.

    Files matching ``LOCAL_BATCHES_PATH`` are visited in sorted order and at
    most ``NUM_BATCHES`` of them are read.
    """
    paths = sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]
    for path in paths:
        # Close each gzip handle as soon as it is parsed so that iterating
        # over thousands of batches does not accumulate open files.
        with gzip.open(path) as batch_file:
            data = json.load(batch_file)
        yield HashedBatch.parse_obj(data)
|
||||
|
||||
|
||||
def run():
    """Rebuild the evaluation index from local batches and print the time taken."""
    # Start from a clean slate; a missing file just means nothing to clear.
    try:
        os.remove(EVALUATE_INDEX_PATH)
    except FileNotFoundError:
        pass

    TinyIndex.create(
        item_factory=Document,
        index_path=EVALUATE_INDEX_PATH,
        num_pages=NUM_PAGES,
        page_size=PAGE_SIZE,
    )
    local_batches = get_batches()

    started_at = datetime.now()
    with Database() as database:
        language_model = spacy.load("en_core_web_sm")
        url_database = URLDatabase(database.connection)
        index_batches(local_batches, EVALUATE_INDEX_PATH, language_model, url_database)
    elapsed = datetime.now() - started_at

    print("total_seconds:", elapsed.total_seconds())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
run()
|
|
@ -1,60 +0,0 @@
|
|||
import logging
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import spacy
|
||||
|
||||
from analyse.index_local import EVALUATE_INDEX_PATH
|
||||
from mwmbl.indexer import tokenize_document
|
||||
from mwmbl.indexer import INDEX_PATH
|
||||
from mwmbl.tinysearchengine import TinyIndex, Document
|
||||
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
||||
|
||||
def store():
    """Tokenize one hard-coded sample document and print the result.

    The index at ``INDEX_PATH`` is opened in 'w' mode, but the tokens are
    only printed; nothing is written to the index.
    """
    sample = Document(
        title='A nation in search of the new black | Theatre | The Guardian',
        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
        score=1.0,
    )
    with TinyIndex(Document, INDEX_PATH, 'w'):
        tokens = tokenize_document(sample.url, sample.title, sample.extract, 1, nlp)
        print("Tokenized", tokens)
|
||||
|
||||
|
||||
def get_items():
    """Print every item the index at ``INDEX_PATH`` retrieves for 'wikipedia'."""
    with TinyIndex(Document, INDEX_PATH) as tiny_index:
        matches = tiny_index.retrieve('wikipedia')
        # A falsy result (nothing retrieved) prints nothing.
        for match in matches or ():
            print("Items", match)
|
||||
|
||||
|
||||
def run(index_path):
    """Print page-size statistics for the index stored at ``index_path``.

    Every page is loaded; non-empty pages have their item count recorded and
    pages holding more than 50 items are printed in full. Finally the
    maximum, the 100 largest, and the mean page sizes are printed.
    """
    with TinyIndex(Document, index_path) as tiny_index:
        sizes = {}
        for i in range(tiny_index.num_pages):
            page = tiny_index.get_page(i)
            if not page:
                continue
            sizes[i] = len(page)
            if len(page) > 50:
                print("Page", len(page), page)

    if not sizes:
        # max() raises ValueError on an empty sequence and np.mean([]) yields
        # nan with a warning, so report the empty index explicitly instead.
        print("No non-empty pages found")
        return

    page_sizes = sorted(sizes.values())
    print("Max", page_sizes[-1])
    print("Top", page_sizes[-100:])
    print("Mean", np.mean(page_sizes))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# store()
|
||||
run(EVALUATE_INDEX_PATH)
|
||||
# get_items()
|
Binary file not shown.
|
@ -21,6 +21,12 @@ body {
|
|||
margin: 25px;
|
||||
}
|
||||
|
||||
@media screen and (max-width: 600px) {
|
||||
.branding {
|
||||
display: none;
|
||||
}
|
||||
}
|
||||
|
||||
.brand-title {
|
||||
text-align: center;
|
||||
font-weight: var(--black-font-weight);
|
||||
|
@ -62,12 +68,9 @@ body {
|
|||
height: 2rem;
|
||||
}
|
||||
|
||||
mwmbl-search-bar {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.search-bar {
|
||||
position: relative;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.search-bar-input {
|
||||
|
@ -104,7 +107,7 @@ mwmbl-search-bar {
|
|||
pointer-events: none;
|
||||
}
|
||||
|
||||
mwmbl-results, footer {
|
||||
.main, footer {
|
||||
display: block;
|
||||
max-width: 800px;
|
||||
width: 100%;
|
||||
|
@ -114,11 +117,14 @@ mwmbl-results, footer {
|
|||
.results {
|
||||
max-width: 100%;
|
||||
list-style-type: none;
|
||||
padding: 10px;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.result a {
|
||||
display: block;
|
||||
.result {
|
||||
min-height: 120px;
|
||||
}
|
||||
|
||||
.result-container {
|
||||
text-decoration: none;
|
||||
color: var(--dark-color);
|
||||
padding: 15px;
|
||||
|
@ -130,11 +136,11 @@ mwmbl-results, footer {
|
|||
outline 100ms ease-in-out;
|
||||
}
|
||||
|
||||
.result:hover a, .result a:focus {
|
||||
.result-container:hover,.result-container:focus {
|
||||
background-color: var(--gray-color);
|
||||
}
|
||||
|
||||
.result a:focus {
|
||||
.result-container:focus {
|
||||
outline: 3px solid var(--primary-color);
|
||||
}
|
||||
|
||||
|
@ -158,7 +164,7 @@ mwmbl-results, footer {
|
|||
font-weight: var(--bold-font-weight);
|
||||
}
|
||||
|
||||
footer {
|
||||
.footer {
|
||||
position: sticky;
|
||||
top: 100vh;
|
||||
margin-bottom: 25px;
|
||||
|
@ -228,5 +234,108 @@ footer {
|
|||
a {
|
||||
font-weight: var(--bold-font-weight);
|
||||
color: var(--primary-color);
|
||||
text-decoration: underline;
|
||||
}
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.curation-buttons {
|
||||
display: grid;
|
||||
grid-auto-flow: column;
|
||||
grid-column-gap: 20px;
|
||||
grid-auto-columns: max-content;
|
||||
}
|
||||
|
||||
.result-container .button {
|
||||
background-color: var(--dark-gray-color);
|
||||
color: white;
|
||||
padding: 5px 10px;
|
||||
margin: 0;
|
||||
font-size: var(--small-font-size);
|
||||
font-weight: var(--bold-font-weight);
|
||||
}
|
||||
|
||||
.validated {
|
||||
background-color: green !important;
|
||||
}
|
||||
|
||||
.modal {
|
||||
/*display: none; !* Hidden by default *!*/
|
||||
position: fixed; /* Stay in place */
|
||||
z-index: 100; /* Sit on top */
|
||||
left: 0;
|
||||
top: 0;
|
||||
width: 100%; /* Full width */
|
||||
height: 100%; /* Full height */
|
||||
overflow: auto; /* Enable scroll if needed */
|
||||
background-color: rgb(0,0,0); /* Fallback color */
|
||||
background-color: rgba(0,0,0,0.4); /* Black w/ opacity */
|
||||
}
|
||||
|
||||
/* Modal Content/Box */
|
||||
.modal-content {
|
||||
background-color: #fefefe;
|
||||
margin: 15% auto; /* 15% from the top and centered */
|
||||
padding: 20px;
|
||||
border: 1px solid #888;
|
||||
max-width: 800px;
|
||||
width: 80%; /* Could be more or less, depending on screen size */
|
||||
}
|
||||
|
||||
/* The Close Button */
|
||||
.close {
|
||||
color: #aaa;
|
||||
float: right;
|
||||
font-size: 28px;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.close:hover,
|
||||
.close:focus {
|
||||
color: black;
|
||||
text-decoration: none;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.button {
|
||||
background-color: var(--primary-color);
|
||||
border: none;
|
||||
color: white;
|
||||
padding: 10px 20px;
|
||||
margin: 10px;
|
||||
text-align: center;
|
||||
text-decoration: none;
|
||||
display: inline-block;
|
||||
font-size: var(--default-font-size);
|
||||
border-radius: 50px;
|
||||
cursor: pointer;
|
||||
flex-shrink: 0;
|
||||
transition: background-color 200ms ease-in-out;
|
||||
}
|
||||
|
||||
@media screen and (max-width: 600px) {
|
||||
.button {
|
||||
padding: 5px 10px;
|
||||
font-size: var(--small-font-size);
|
||||
margin: 5px;
|
||||
}
|
||||
}
|
||||
|
||||
.button:hover {
|
||||
background-color: var(--dark-color);
|
||||
}
|
||||
|
||||
.login-info {
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
/* Sortable styling is not working in HTML 5 yet */
|
||||
/*.sortable-drag {*/
|
||||
/* opacity: 1.0;*/
|
||||
/*}*/
|
||||
|
||||
/*.sortable-ghost {*/
|
||||
/* opacity: 1.0;*/
|
||||
/*}*/
|
||||
|
||||
/*.sortable-chosen {*/
|
||||
/* opacity: 0;*/
|
||||
/*}*/
|
||||
|
|
|
@ -7,9 +7,11 @@
|
|||
--primary-color: #185ADB;
|
||||
--gray-color: #EEEEEE;
|
||||
--light-color: #F8F8F8;
|
||||
--dark-gray-color: #767676;
|
||||
|
||||
/* Fonts: */
|
||||
--regular-font: 'Inter', sans-serif;
|
||||
--small-font-size: 12px;
|
||||
--default-font-size: 16px;
|
||||
--default-font-weight: 400;
|
||||
--bold-font-weight: 700;
|
||||
|
|
|
@ -103,4 +103,20 @@ Phosphor Web Font
|
|||
|
||||
.ph-info-bold::before {
|
||||
content: "\f88f";
|
||||
}
|
||||
}
|
||||
|
||||
.ph-book-bold::before {
|
||||
content: "\f6fb";
|
||||
}
|
||||
|
||||
.ph-browser-bold::before {
|
||||
content: "\f70d";
|
||||
}
|
||||
|
||||
.ph-youtube-logo-bold::before {
|
||||
content: "\fa5d";
|
||||
}
|
||||
|
||||
.ph-chat-circle-text-bold::before {
|
||||
content: "\f74c";
|
||||
}
|
||||
|
|
|
@ -13290,10 +13290,6 @@
|
|||
content: "\f6fa";
|
||||
}
|
||||
|
||||
.ph-book-bold::before {
|
||||
content: "\f6fb";
|
||||
}
|
||||
|
||||
.ph-book-bookmark-bold::before {
|
||||
content: "\f6fc";
|
||||
}
|
||||
|
@ -13362,10 +13358,6 @@
|
|||
content: "\f70c";
|
||||
}
|
||||
|
||||
.ph-browser-bold::before {
|
||||
content: "\f70d";
|
||||
}
|
||||
|
||||
.ph-browsers-bold::before {
|
||||
content: "\f70e";
|
||||
}
|
||||
|
@ -13614,10 +13606,6 @@
|
|||
content: "\f74b";
|
||||
}
|
||||
|
||||
.ph-chat-circle-text-bold::before {
|
||||
content: "\f74c";
|
||||
}
|
||||
|
||||
.ph-chat-dots-bold::before {
|
||||
content: "\f74d";
|
||||
}
|
||||
|
@ -16750,10 +16738,6 @@
|
|||
content: "\fa5c";
|
||||
}
|
||||
|
||||
.ph-youtube-logo-bold::before {
|
||||
content: "\fa5d";
|
||||
}
|
||||
|
||||
.ph-activity-fill::before {
|
||||
content: "\fa5e";
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -8,20 +8,9 @@
|
|||
|
||||
export default {
|
||||
componentPrefix: 'mwmbl',
|
||||
publicApiURL: 'https://api.mwmbl.org/',
|
||||
publicApiURL: '/api/v1/',
|
||||
// publicApiURL: 'http://localhost:5000/',
|
||||
searchQueryParam: 'q',
|
||||
footerLinks: [
|
||||
{
|
||||
name: 'Github',
|
||||
icon: 'ph-github-logo-bold',
|
||||
href: 'https://github.com/mwmbl/mwmbl'
|
||||
},
|
||||
{
|
||||
name: 'Wiki',
|
||||
icon: 'ph-info-bold',
|
||||
href: 'https://github.com/mwmbl/mwmbl/wiki'
|
||||
}
|
||||
],
|
||||
commands: {
|
||||
'go: ': 'https://',
|
||||
'search: google.com ': 'https://www.google.com/search?q=',
|
||||
|
|
63
front-end/package-lock.json
generated
63
front-end/package-lock.json
generated
|
@ -6,7 +6,8 @@
|
|||
"": {
|
||||
"name": "front-end",
|
||||
"dependencies": {
|
||||
"chart.js": "^4.4.0"
|
||||
"chart.js": "^4.4.0",
|
||||
"sortablejs": "^1.15.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@vitejs/plugin-legacy": "^2.3.1",
|
||||
|
@ -598,16 +599,10 @@
|
|||
}
|
||||
},
|
||||
"node_modules/nanoid": {
|
||||
"version": "3.3.6",
|
||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
|
||||
"integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==",
|
||||
"version": "3.3.4",
|
||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.4.tgz",
|
||||
"integrity": "sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/ai"
|
||||
}
|
||||
],
|
||||
"bin": {
|
||||
"nanoid": "bin/nanoid.cjs"
|
||||
},
|
||||
|
@ -628,9 +623,9 @@
|
|||
"dev": true
|
||||
},
|
||||
"node_modules/postcss": {
|
||||
"version": "8.4.31",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz",
|
||||
"integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==",
|
||||
"version": "8.4.19",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.19.tgz",
|
||||
"integrity": "sha512-h+pbPsyhlYj6N2ozBmHhHrs9DzGmbaarbLvWipMRO7RLS+v4onj26MPFXA5OBYFxyqYhUJK456SwDcY9H2/zsA==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
{
|
||||
|
@ -640,14 +635,10 @@
|
|||
{
|
||||
"type": "tidelift",
|
||||
"url": "https://tidelift.com/funding/github/npm/postcss"
|
||||
},
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/ai"
|
||||
}
|
||||
],
|
||||
"dependencies": {
|
||||
"nanoid": "^3.3.6",
|
||||
"nanoid": "^3.3.4",
|
||||
"picocolors": "^1.0.0",
|
||||
"source-map-js": "^1.0.2"
|
||||
},
|
||||
|
@ -693,6 +684,11 @@
|
|||
"fsevents": "~2.3.2"
|
||||
}
|
||||
},
|
||||
"node_modules/sortablejs": {
|
||||
"version": "1.15.0",
|
||||
"resolved": "https://registry.npmjs.org/sortablejs/-/sortablejs-1.15.0.tgz",
|
||||
"integrity": "sha512-bv9qgVMjUMf89wAvM6AxVvS/4MX3sPeN0+agqShejLU5z5GX4C75ow1O2e5k4L6XItUyAK3gH6AxSbXrOM5e8w=="
|
||||
},
|
||||
"node_modules/source-map": {
|
||||
"version": "0.6.1",
|
||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
||||
|
@ -765,9 +761,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/vite": {
|
||||
"version": "3.2.7",
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-3.2.7.tgz",
|
||||
"integrity": "sha512-29pdXjk49xAP0QBr0xXqu2s5jiQIXNvE/xwd0vUizYT2Hzqe4BksNNoWllFVXJf4eLZ+UlVQmXfB4lWrc+t18g==",
|
||||
"version": "3.2.5",
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-3.2.5.tgz",
|
||||
"integrity": "sha512-4mVEpXpSOgrssFZAOmGIr85wPHKvaDAcXqxVxVRZhljkJOMZi1ibLibzjLHzJvcok8BMguLc7g1W6W/GqZbLdQ==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"esbuild": "^0.15.9",
|
||||
|
@ -1145,9 +1141,9 @@
|
|||
}
|
||||
},
|
||||
"nanoid": {
|
||||
"version": "3.3.6",
|
||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
|
||||
"integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==",
|
||||
"version": "3.3.4",
|
||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.4.tgz",
|
||||
"integrity": "sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw==",
|
||||
"dev": true
|
||||
},
|
||||
"path-parse": {
|
||||
|
@ -1163,12 +1159,12 @@
|
|||
"dev": true
|
||||
},
|
||||
"postcss": {
|
||||
"version": "8.4.31",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz",
|
||||
"integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==",
|
||||
"version": "8.4.19",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.19.tgz",
|
||||
"integrity": "sha512-h+pbPsyhlYj6N2ozBmHhHrs9DzGmbaarbLvWipMRO7RLS+v4onj26MPFXA5OBYFxyqYhUJK456SwDcY9H2/zsA==",
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"nanoid": "^3.3.6",
|
||||
"nanoid": "^3.3.4",
|
||||
"picocolors": "^1.0.0",
|
||||
"source-map-js": "^1.0.2"
|
||||
}
|
||||
|
@ -1199,6 +1195,11 @@
|
|||
"fsevents": "~2.3.2"
|
||||
}
|
||||
},
|
||||
"sortablejs": {
|
||||
"version": "1.15.0",
|
||||
"resolved": "https://registry.npmjs.org/sortablejs/-/sortablejs-1.15.0.tgz",
|
||||
"integrity": "sha512-bv9qgVMjUMf89wAvM6AxVvS/4MX3sPeN0+agqShejLU5z5GX4C75ow1O2e5k4L6XItUyAK3gH6AxSbXrOM5e8w=="
|
||||
},
|
||||
"source-map": {
|
||||
"version": "0.6.1",
|
||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
||||
|
@ -1252,9 +1253,9 @@
|
|||
}
|
||||
},
|
||||
"vite": {
|
||||
"version": "3.2.7",
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-3.2.7.tgz",
|
||||
"integrity": "sha512-29pdXjk49xAP0QBr0xXqu2s5jiQIXNvE/xwd0vUizYT2Hzqe4BksNNoWllFVXJf4eLZ+UlVQmXfB4lWrc+t18g==",
|
||||
"version": "3.2.5",
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-3.2.5.tgz",
|
||||
"integrity": "sha512-4mVEpXpSOgrssFZAOmGIr85wPHKvaDAcXqxVxVRZhljkJOMZi1ibLibzjLHzJvcok8BMguLc7g1W6W/GqZbLdQ==",
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"esbuild": "^0.15.9",
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
"vite": "^3.2.3"
|
||||
},
|
||||
"dependencies": {
|
||||
"chart.js": "^4.4.0"
|
||||
"chart.js": "^4.4.0",
|
||||
"sortablejs": "^1.15.0"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,26 +0,0 @@
|
|||
import define from '../utils/define.js';
|
||||
|
||||
const template = () => /*html*/`
|
||||
<header class="search-menu">
|
||||
<div class="branding">
|
||||
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
|
||||
<span class="brand-title">MWMBL</span>
|
||||
</div>
|
||||
<mwmbl-search-bar></mwmbl-search-bar>
|
||||
</header>
|
||||
<main>
|
||||
<mwmbl-results></mwmbl-results>
|
||||
</main>
|
||||
<footer is="mwmbl-footer"></footer>
|
||||
`;
|
||||
|
||||
/**
 * Root 'app' component: fills itself with the page template markup
 * as soon as it is constructed.
 */
export default define('app', class extends HTMLElement {
  constructor() {
    super();
    this.__setup();
  }

  /** Render the template into this element. */
  __setup() {
    const markup = template();
    this.innerHTML = markup;
  }
});
|
21
front-end/src/components/molecules/add-button.js
Normal file
21
front-end/src/components/molecules/add-button.js
Normal file
|
@ -0,0 +1,21 @@
|
|||
import define from "../../utils/define.js";
|
||||
|
||||
|
||||
/**
 * Button that opens the '.modal' element and focuses its input.
 */
export default define('add-button', class extends HTMLButtonElement {
  constructor() {
    super();
    this.__setup();
  }

  __setup() {
    this.__events();
  }

  /** Register the click handler that reveals the modal. */
  __events() {
    this.addEventListener('click', () => {
      console.log("Add button");

      const modal = document.querySelector('.modal');
      // querySelector returns null when nothing matches; without this guard
      // a page that has no modal would throw a TypeError on every click.
      if (!modal) {
        return;
      }
      modal.style.display = 'block';

      const input = document.querySelector('.modal input');
      if (input) {
        input.focus();
      }
    })
  }
}, { extends: 'button' });
|
69
front-end/src/components/molecules/add-result.js
Normal file
69
front-end/src/components/molecules/add-result.js
Normal file
|
@ -0,0 +1,69 @@
|
|||
import define from '../../utils/define.js';
|
||||
import config from "../../../config.js";
|
||||
import {globalBus} from "../../utils/events.js";
|
||||
|
||||
|
||||
const FETCH_URL = '/app/fetch?'
|
||||
|
||||
|
||||
const template = () => /*html*/`
|
||||
<form class="modal-content">
|
||||
<span class="close">×</span>
|
||||
<input class="add-result" placeholder="Enter a URL...">
|
||||
<button>Save</button>
|
||||
</form>
|
||||
`;
|
||||
|
||||
/**
 * Modal dialog that asks for a URL, fetches its rendered result from
 * FETCH_URL and announces it on the global bus as 'curate-add-result'.
 * The modal starts hidden.
 */
export default define('add-result', class extends HTMLDivElement {
  constructor() {
    super();
    this.classList.add('modal');
    this.__setup();
  }

  __setup() {
    this.innerHTML = template();
    this.__events();
    this.style.display = 'none';
  }

  /** Wire up closing behaviour and form submission. */
  __events() {
    // The close button sits inside the form, whose click handler stops
    // propagation, so it has to hide the modal itself. (Comparing
    // e.target with the modal element here could never be true.)
    this.querySelector('.close').addEventListener('click', () => {
      this.style.display = 'none';
    });

    // Clicking the backdrop (anywhere on the modal outside the form) closes it.
    this.addEventListener('click', () => {
      this.style.display = 'none';
    });

    this.querySelector('form').addEventListener('click', e => {
      // Clicking on the form shouldn't close it
      e.stopPropagation();
    });

    this.addEventListener('submit', this.__urlSubmitted.bind(this));
  }

  /**
   * Handle the form submission: fetch the result markup for the entered URL
   * together with the current search query, then dispatch it on the bus.
   * @param {Event} e - the submit event; its default action is cancelled.
   */
  async __urlSubmitted(e) {
    e.preventDefault();
    const value = this.querySelector('input').value;
    console.log("Input value", value);

    const query = document.querySelector('.search-bar input').value;

    const url = `${FETCH_URL}url=${encodeURIComponent(value)}&query=${encodeURIComponent(query)}`;

    let response;
    try {
      response = await fetch(url);
    } catch (error) {
      // fetch rejects on network failure; report it rather than leaving an
      // unhandled promise rejection.
      console.log("Fetch failed", error);
      return;
    }

    if (response.status === 200) {
      const data = await response.text();
      console.log("Data", data);

      const addResultEvent = new CustomEvent('curate-add-result', {detail: data});
      globalBus.dispatch(addResultEvent);
    } else {
      console.log("Bad response", response);
      // TODO
    }
  }
}, { extends: 'div' });
|
35
front-end/src/components/molecules/delete-button.js
Normal file
35
front-end/src/components/molecules/delete-button.js
Normal file
|
@ -0,0 +1,35 @@
|
|||
import define from "../../utils/define.js";
|
||||
import {globalBus} from "../../utils/events.js";
|
||||
|
||||
|
||||
/**
 * Button that asks for its enclosing '.result' to be deleted by
 * dispatching 'curate-delete-result' with that result's index.
 */
export default define('delete-button', class extends HTMLButtonElement {
  constructor() {
    super();
    this.__setup();
  }

  __setup() {
    this.__events();
  }

  /** Register the click handler that dispatches the delete request. */
  __events() {
    this.addEventListener('click', () => {
      console.log("Delete button");

      // Position of this button's result among its parent's '.result' elements.
      const resultItem = this.closest('.result');
      const siblings = resultItem.parentNode.getElementsByClassName('result');
      const position = Array.from(siblings).indexOf(resultItem);
      console.log("Delete index", position);

      const deleteEvent = new CustomEvent('curate-delete-result', {
        detail: {data: {delete_index: position}}
      });
      globalBus.dispatch(deleteEvent);
    })
  }
}, { extends: 'button' });
|
|
@ -1,17 +0,0 @@
|
|||
import define from '../../utils/define.js';
|
||||
|
||||
const template = () => /*html*/`
|
||||
<p>We could not find anything for your search...</p>
|
||||
`;
|
||||
|
||||
/**
 * List item carrying the 'empty-result' class whose content is the
 * template markup.
 */
export default define('empty-result', class extends HTMLLIElement {
  constructor() {
    super();
    this.classList.add('empty-result');
    this.__setup();
  }

  /** Render the template into this element. */
  __setup() {
    const markup = template();
    this.innerHTML = markup;
  }
}, { extends: 'li' });
|
|
@ -2,13 +2,6 @@ import define from '../../utils/define.js';
|
|||
import escapeString from '../../utils/escapeString.js';
|
||||
import { globalBus } from '../../utils/events.js';
|
||||
|
||||
const template = ({ data }) => /*html*/`
|
||||
<a href='${data.url}'>
|
||||
<p class='link'>${data.url}</p>
|
||||
<p class='title'>${data.title}</p>
|
||||
<p class='extract'>${data.extract}</p>
|
||||
</a>
|
||||
`;
|
||||
|
||||
export default define('result', class extends HTMLLIElement {
|
||||
constructor() {
|
||||
|
@ -18,11 +11,6 @@ export default define('result', class extends HTMLLIElement {
|
|||
}
|
||||
|
||||
__setup() {
|
||||
this.innerHTML = template({ data: {
|
||||
url: this.dataset.url,
|
||||
title: this.__handleBold(JSON.parse(this.dataset.title)),
|
||||
extract: this.__handleBold(JSON.parse(this.dataset.extract))
|
||||
}});
|
||||
this.__events();
|
||||
}
|
||||
|
||||
|
|
53
front-end/src/components/molecules/validate-button.js
Normal file
53
front-end/src/components/molecules/validate-button.js
Normal file
|
@ -0,0 +1,53 @@
|
|||
import define from "../../utils/define.js";
|
||||
import {globalBus} from "../../utils/events.js";
|
||||
|
||||
|
||||
const VALIDATED_CLASS = "validated";
|
||||
|
||||
/**
 * Button that asks for its enclosing '.result' to be validated by
 * dispatching 'curate-validate-result' with that result's index. The
 * validated state is tracked with the VALIDATED_CLASS CSS class.
 */
export default define('validate-button', class extends HTMLButtonElement {
  constructor() {
    super();
    this.__setup();
  }

  __setup() {
    this.__events();
  }

  /** Register the click handler that dispatches the validate request. */
  __events() {
    this.addEventListener('click', () => {
      console.log("Validate button");

      // Position of this button's result among its parent's '.result' elements.
      const resultItem = this.closest('.result');
      const siblings = resultItem.parentNode.getElementsByClassName('result');
      const position = Array.from(siblings).indexOf(resultItem);
      console.log("Validate index", position);

      const validateEvent = new CustomEvent('curate-validate-result', {
        detail: {data: {validate_index: position}}
      });
      globalBus.dispatch(validateEvent);
    })
  }

  /** @returns {boolean} whether the button carries the validated class. */
  isValidated() {
    const classes = this.classList;
    return classes.contains(VALIDATED_CLASS);
  }

  /** Mark the button as validated. */
  validate() {
    const classes = this.classList;
    classes.add(VALIDATED_CLASS);
  }

  /** Clear the validated mark. */
  unvalidate() {
    const classes = this.classList;
    classes.remove(VALIDATED_CLASS);
  }

  /** Flip the validated mark. */
  toggleValidate() {
    const classes = this.classList;
    classes.toggle(VALIDATED_CLASS);
  }
}, { extends: 'button' });
|
|
@ -1,36 +0,0 @@
|
|||
import define from '../../utils/define.js';
|
||||
import config from '../../../config.js';
|
||||
|
||||
const template = ({ data }) => /*html*/`
|
||||
<p class="footer-text">Find more on</p>
|
||||
<ul class="footer-list">
|
||||
${data.links.map(link => /*html*/`
|
||||
<li class="footer-item">
|
||||
<a href="${link.href}" class="footer-link" target="_blank">
|
||||
<i class="${link.icon}"></i>
|
||||
<span>${link.name}</span>
|
||||
</a>
|
||||
</li>
|
||||
`).join('')}
|
||||
</ul>
|
||||
`;
|
||||
|
||||
/**
 * Footer component: renders the template with the links taken from
 * config.footerLinks.
 */
export default define('footer', class extends HTMLElement {
  constructor() {
    super();
    this.__setup();
  }

  /** Render the footer markup, then attach event handlers. */
  __setup() {
    const links = config.footerLinks;
    this.innerHTML = template({data: {links}});
    this.__events();
  }

  /** No event handlers are registered for the footer. */
  __events() {
  }
}, { extends: 'footer' });
|
|
@ -1,22 +0,0 @@
|
|||
import define from '../../utils/define.js';
|
||||
|
||||
const template = () => /*html*/`
|
||||
<h1>
|
||||
Welcome to mwmbl, the free, open-source and non-profit search engine.
|
||||
</h1>
|
||||
<p>
|
||||
You can start searching by using the search bar above!
|
||||
</p>
|
||||
`;
|
||||
|
||||
/**
 * List item carrying the 'home' class whose content is the template markup.
 */
export default define('home', class extends HTMLLIElement {
  constructor() {
    super();
    this.classList.add('home');
    this.__setup();
  }

  /** Render the template into this element. */
  __setup() {
    const markup = template();
    this.innerHTML = markup;
  }
}, { extends: 'li' });
|
|
@ -1,75 +1,191 @@
|
|||
import define from '../../utils/define.js';
|
||||
import { globalBus } from '../../utils/events.js';
|
||||
import {globalBus} from '../../utils/events.js';
|
||||
import Sortable from 'sortablejs';
|
||||
|
||||
// Components
|
||||
import result from '../molecules/result.js';
|
||||
import emptyResult from '../molecules/empty-result.js';
|
||||
import home from './home.js';
|
||||
import escapeString from '../../utils/escapeString.js';
|
||||
|
||||
const template = () => /*html*/`
|
||||
<ul class='results'>
|
||||
<li is='${home}'></li>
|
||||
</ul>
|
||||
`;
|
||||
|
||||
export default define('results', class extends HTMLElement {
|
||||
class ResultsHandler {
|
||||
constructor() {
|
||||
super();
|
||||
this.results = null;
|
||||
this.oldIndex = null;
|
||||
this.curating = false;
|
||||
this.__setup();
|
||||
}
|
||||
|
||||
__setup() {
|
||||
this.innerHTML = template();
|
||||
this.results = this.querySelector('.results');
|
||||
this.__events();
|
||||
this.__initializeResults();
|
||||
}
|
||||
|
||||
__events() {
|
||||
globalBus.on('search', (e) => {
|
||||
this.results.innerHTML = '';
|
||||
let resultsHTML = '';
|
||||
if (!e.detail.error) {
|
||||
// If there is no details the input is empty
|
||||
if (!e.detail.results) {
|
||||
resultsHTML = /*html*/`
|
||||
<li is='${home}'></li>
|
||||
`;
|
||||
}
|
||||
// If the details array has results display them
|
||||
else if (e.detail.results.length > 0) {
|
||||
for(const resultData of e.detail.results) {
|
||||
resultsHTML += /*html*/`
|
||||
<li
|
||||
is='${result}'
|
||||
data-url='${escapeString(resultData.url)}'
|
||||
data-title='${escapeString(JSON.stringify(resultData.title))}'
|
||||
data-extract='${escapeString(JSON.stringify(resultData.extract))}'
|
||||
></li>
|
||||
`;
|
||||
}
|
||||
}
|
||||
// If the details array is empty there is no result
|
||||
else {
|
||||
resultsHTML = /*html*/`
|
||||
<li is='${emptyResult}'></li>
|
||||
`;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// If there is an error display an empty result
|
||||
resultsHTML = /*html*/`
|
||||
<li is='${emptyResult}'></li>
|
||||
`;
|
||||
}
|
||||
// Bind HTML to the DOM
|
||||
this.results.innerHTML = resultsHTML;
|
||||
document.body.addEventListener('htmx:load', e => {
|
||||
this.__initializeResults();
|
||||
});
|
||||
|
||||
// Focus first element when coming from the search bar
|
||||
globalBus.on('focus-result', () => {
|
||||
this.results.firstElementChild.firstElementChild.focus();
|
||||
})
|
||||
});
|
||||
|
||||
globalBus.on('curate-delete-result', (e) => {
|
||||
console.log("Curate delete result event", e);
|
||||
this.__beginCurating.bind(this)();
|
||||
|
||||
const children = this.results.getElementsByClassName('result');
|
||||
let deleteIndex = e.detail.data.delete_index;
|
||||
const child = children[deleteIndex];
|
||||
this.results.removeChild(child);
|
||||
const newResults = this.__getResults();
|
||||
|
||||
const curationSaveEvent = new CustomEvent('save-curation', {
|
||||
detail: {
|
||||
type: 'delete',
|
||||
data: {
|
||||
timestamp: Date.now(),
|
||||
url: document.location.href,
|
||||
results: newResults,
|
||||
curation: {
|
||||
delete_index: deleteIndex
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
globalBus.dispatch(curationSaveEvent);
|
||||
});
|
||||
|
||||
globalBus.on('curate-validate-result', (e) => {
|
||||
console.log("Curate validate result event", e);
|
||||
this.__beginCurating.bind(this)();
|
||||
|
||||
const children = this.results.getElementsByClassName('result');
|
||||
const validateChild = children[e.detail.data.validate_index];
|
||||
validateChild.querySelector('.curate-approve').toggleValidate();
|
||||
|
||||
const newResults = this.__getResults();
|
||||
|
||||
const curationStartEvent = new CustomEvent('save-curation', {
|
||||
detail: {
|
||||
type: 'validate',
|
||||
data: {
|
||||
timestamp: Date.now(),
|
||||
url: document.location.href,
|
||||
results: newResults,
|
||||
curation: e.detail.data
|
||||
}
|
||||
}
|
||||
});
|
||||
globalBus.dispatch(curationStartEvent);
|
||||
});
|
||||
|
||||
globalBus.on('begin-curating-results', (e) => {
|
||||
// We might not be online, or logged in, so save the curation in local storage in case:
|
||||
console.log("Begin curation event", e);
|
||||
this.__beginCurating.bind(this)();
|
||||
});
|
||||
|
||||
globalBus.on('curate-add-result', (e) => {
|
||||
console.log("Add result", e);
|
||||
this.__beginCurating();
|
||||
const resultData = e.detail;
|
||||
this.results.insertAdjacentHTML('afterbegin', resultData);
|
||||
|
||||
const newResults = this.__getResults();
|
||||
const url = newResults[0].url;
|
||||
|
||||
let detail = {
|
||||
type: 'add',
|
||||
data: {
|
||||
timestamp: Date.now(),
|
||||
url: document.location.href,
|
||||
results: newResults,
|
||||
curation: {
|
||||
insert_index: 0,
|
||||
url: url
|
||||
}
|
||||
}
|
||||
};
|
||||
console.log("Detail", detail);
|
||||
const curationSaveEvent = new CustomEvent('save-curation', {
|
||||
detail: detail
|
||||
});
|
||||
globalBus.dispatch(curationSaveEvent);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
__initializeResults() {
|
||||
this.results = document.querySelector('.results');
|
||||
|
||||
if (this.results) {
|
||||
const sortable = new Sortable(this.results, {
|
||||
"onStart": this.__sortableActivate.bind(this),
|
||||
"onEnd": this.__sortableDeactivate.bind(this),
|
||||
"handle": ".handle",
|
||||
});
|
||||
}
|
||||
|
||||
this.curating = false;
|
||||
}
|
||||
|
||||
__sortableActivate(event) {
|
||||
console.log("Sortable activate", event);
|
||||
this.__beginCurating();
|
||||
this.oldIndex = event.oldIndex;
|
||||
}
|
||||
|
||||
__beginCurating() {
|
||||
if (!this.curating) {
|
||||
const results = this.__getResults();
|
||||
const curationStartEvent = new CustomEvent('save-curation', {
|
||||
detail: {
|
||||
type: 'begin',
|
||||
data: {
|
||||
timestamp: Date.now(),
|
||||
url: document.location.href,
|
||||
results: results,
|
||||
curation: {}
|
||||
}
|
||||
}
|
||||
});
|
||||
globalBus.dispatch(curationStartEvent);
|
||||
this.curating = true;
|
||||
}
|
||||
}
|
||||
|
||||
__getResults() {
|
||||
const resultsElements = document.querySelectorAll('.results .result:not(.ui-sortable-placeholder)');
|
||||
const results = [];
|
||||
for (let resultElement of resultsElements) {
|
||||
const result = {
|
||||
url: resultElement.querySelector('a').href,
|
||||
title: resultElement.querySelector('.title').innerText,
|
||||
extract: resultElement.querySelector('.extract').innerText,
|
||||
curated: resultElement.querySelector('.curate-approve').isValidated()
|
||||
}
|
||||
results.push(result);
|
||||
}
|
||||
console.log("Results", results);
|
||||
return results;
|
||||
}
|
||||
|
||||
__sortableDeactivate(event) {
|
||||
const newIndex = event.newIndex;
|
||||
console.log('Sortable deactivate', this.oldIndex, newIndex);
|
||||
|
||||
const newResults = this.__getResults();
|
||||
|
||||
const curationMoveEvent = new CustomEvent('save-curation', {
|
||||
detail: {
|
||||
type: 'move',
|
||||
data: {
|
||||
timestamp: Date.now(),
|
||||
url: document.location.href,
|
||||
results: newResults,
|
||||
curation: {
|
||||
old_index: this.oldIndex,
|
||||
new_index: newIndex,
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
globalBus.dispatch(curationMoveEvent);
|
||||
}
|
||||
}
|
||||
|
||||
const resultsHandler = new ResultsHandler();
|
||||
|
|
112
front-end/src/components/organisms/save.js
Normal file
112
front-end/src/components/organisms/save.js
Normal file
|
@ -0,0 +1,112 @@
|
|||
import define from '../../utils/define.js';
|
||||
import {globalBus} from "../../utils/events.js";
|
||||
import config from "../../../config.js";
|
||||
|
||||
|
||||
const CURATION_KEY_PREFIX = "curation-";
|
||||
const CURATION_URL = config.publicApiURL + "curation/";
|
||||
|
||||
|
||||
const template = () => /*html*/`
|
||||
<span></span>
|
||||
`;
|
||||
|
||||
|
||||
/**
 * `save` component.
 *
 * Listens for `save-curation` events on the global bus, queues each one in
 * localStorage under `CURATION_KEY_PREFIX + <id>`, and posts the queued
 * curations, oldest first, to `CURATION_URL + <type>`.
 */
export default define('save', class extends HTMLDivElement {
  constructor() {
    super();
    this.currentCurationId = null;
    this.classList.add('save');
    // Id of the most recently queued curation created by this instance.
    this.sendId = 0;
    // True while a __sendToApi() call is in flight; prevents overlapping sends.
    this.sending = false;
    this.__setup();
  }

  /** Render the template and register event handlers. */
  __setup() {
    this.innerHTML = template();
    this.__events();
    // TODO: figure out when to call __sendToApi()
    // setInterval(this.__sendToApi.bind(this), 1000);
  }

  /** Queue every `save-curation` event and try to send the queue. */
  __events() {
    globalBus.on('save-curation', (e) => {
      // We might not be online, or logged in, so save the curation in local storage in case:
      console.log("Curation event", e);
      this.__setCuration(e.detail);
      this.__sendToApi();
    });
  }

  /**
   * Store a curation in localStorage under the next free id.
   *
   * @param {object} curation The `detail` of a `save-curation` event.
   */
  __setCuration(curation) {
    // `sendId` restarts at 0 on every page load, while unsent curations from
    // an earlier load may still be queued. Continue numbering after the
    // highest stored id so those entries are never overwritten.
    const newestStoredId = this.__getStoredCurations()
      .reduce((newest, entry) => Math.max(newest, entry.id), 0);
    this.sendId = Math.max(this.sendId, newestStoredId) + 1;

    const key = CURATION_KEY_PREFIX + this.sendId;
    localStorage.setItem(key, JSON.stringify(curation));
  }

  /**
   * List the curations currently queued in localStorage.
   *
   * @return {Array<{id: number, key: string}>} One entry per stored key that
   *   starts with the curation prefix and has a numeric suffix.
   */
  __getStoredCurations() {
    const entries = [];
    for (let i = 0; i < localStorage.length; ++i) {
      const key = localStorage.key(i);
      if (key.startsWith(CURATION_KEY_PREFIX)) {
        const id = parseInt(key.substring(CURATION_KEY_PREFIX.length), 10);
        if (!Number.isNaN(id)) {
          entries.push({ id, key });
        }
      }
    }
    return entries;
  }

  /**
   * Find the queued curation with the smallest id.
   *
   * @return {string|null} Its localStorage key, or null when nothing is queued.
   */
  __getOldestCurationKey() {
    let oldestId = Number.MAX_SAFE_INTEGER;
    let oldestKey = null;
    for (const entry of this.__getStoredCurations()) {
      if (entry.id < oldestId) {
        oldestKey = entry.key;
        oldestId = entry.id;
      }
    }
    return oldestKey;
  }

  /**
   * Post the oldest queued curation to the API.
   *
   * On a 200 response the entry is removed from localStorage and another
   * send is scheduled one second later. On any other outcome the entry is
   * left in place so a later call can retry it.
   */
  async __sendToApi() {
    if (this.sending) {
      return;
    }
    this.sending = true;

    // Every exit path below must clear `sending`, otherwise a single failure
    // (no CSRF cookie, bad response, network error) would block all future
    // sends. The `finally` clause guarantees that.
    try {
      const csrftoken = document.cookie
        .split('; ')
        .find((row) => row.startsWith('csrftoken='))
        ?.split('=')[1];

      if (!csrftoken) {
        console.log("No auth");
        return;
      }

      const key = this.__getOldestCurationKey();
      if (key === null) {
        return;
      }

      const value = JSON.parse(localStorage.getItem(key));
      console.log("Value", value);
      const url = CURATION_URL + value['type'];

      const data = value['data'];
      console.log("Data", data);
      const response = await fetch(url, {
        method: 'POST',
        cache: 'no-cache',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrftoken},
        credentials: "same-origin",
        mode: "same-origin",
        body: JSON.stringify(data),
      });

      console.log("Save curation API response", response);

      if (response.status !== 200) {
        console.log("Bad response, skipping");
        return;
      }
      localStorage.removeItem(key);

      const responseData = await response.json();
      console.log("Response data", responseData);
      // There may be more to send, wait a second and see
      setTimeout(this.__sendToApi.bind(this), 1000);
    } catch (error) {
      // Best effort: the curation stays in localStorage for a later retry.
      console.log("Failed to send curation", error);
    } finally {
      this.sending = false;
    }
  }
}, { extends: 'div' });
|
||||
|
|
@ -1,180 +0,0 @@
|
|||
import define from '../../utils/define.js';
|
||||
import config from '../../../config.js';
|
||||
import { globalBus } from '../../utils/events.js';
|
||||
import debounce from '../../utils/debounce.js'
|
||||
|
||||
const prefersReducedMotion = window.matchMedia('(prefers-reduced-motion)').matches;
|
||||
|
||||
const template = () => /*html*/`
|
||||
<form class="search-bar">
|
||||
<i class="ph-magnifying-glass-bold"></i>
|
||||
<input
|
||||
type='search'
|
||||
class='search-bar-input'
|
||||
placeholder='Search on mwmbl...'
|
||||
title='Use "CTRL+K" or "/" to focus.'
|
||||
autocomplete='off'
|
||||
>
|
||||
</form>
|
||||
`;
|
||||
|
||||
/**
 * `search-bar` component.
 *
 * Renders the search form, queries the search API as the user types or
 * submits, keeps the page title and URL query string in sync with the
 * input, and publishes results or errors as `search` events on the global bus.
 */
export default define('search-bar', class extends HTMLElement {
  constructor() {
    super();
    this.searchInput = null;
    this.searchForm = null;
    // Replaced on every search so the previous in-flight request can be aborted.
    this.abortController = new AbortController();
    this.__setup();
  }

  /** Render the template, cache element references and register handlers. */
  __setup() {
    this.innerHTML = template();
    this.searchInput = this.querySelector('input');
    this.searchForm = this.querySelector('form');
    this.__events();
  }

  /**
   * Publish a `search` event on the global bus.
   *
   * @param {{results?: any, error?: any}} payload Both fields default to null.
   */
  __dispatchSearch({ results = null, error = null }) {
    const searchEvent = new CustomEvent('search', {
      detail: {
        results,
        error,
      },
    });
    globalBus.dispatch(searchEvent)
  }

  /**
   * Updates the overall layout of the page.
   *
   * `home` centers the search bar on the page.
   * `compact` raises it to the top and makes room for displaying results.
   *
   * @param {'compact' | 'home'} mode
   * @return {void}
   */
  __setDisplayMode(mode) {
    switch (mode) {
      case 'compact': {
        document.body.style.paddingTop = '25px';
        document.querySelector('.search-menu').classList.add('compact');
        break;
      }
      case 'home': {
        document.body.style.paddingTop = '30vh';
        document.querySelector('.search-menu').classList.remove('compact');
        break;
      }
    }
  }

  /**
   * Abort any in-flight search and query the API for the current input value.
   *
   * @return {Promise<any>} The parsed JSON body of the search response.
   */
  async __executeSearch() {
    this.abortController.abort();
    this.abortController = new AbortController();
    // Get response from API
    const response = await fetch(`${config.publicApiURL}search?s=${encodeURIComponent(this.searchInput.value)}`, {
      signal: this.abortController.signal
    });
    // Getting results from API
    const search = await (response).json();
    return search;
  }

  /**
   * Sync the title and URL with the input, then run the search (or reset to
   * the home layout when the input is empty) and dispatch the outcome.
   */
  __handleSearch = async () => {
    // Update page title
    document.title = `MWMBL - ${this.searchInput.value || "Search"}`;

    // Update query params
    const queryParams = new URLSearchParams(document.location.search);
    // Sets query param if search value is not empty
    if (this.searchInput.value) queryParams.set(config.searchQueryParam, this.searchInput.value);
    else queryParams.delete(config.searchQueryParam);
    // New URL with query params
    const newURL =
      document.location.protocol
      + "//"
      + document.location.host
      + document.location.pathname
      + (this.searchInput.value ? '?' : '')
      + queryParams.toString();
    // Replace history state
    window.history.replaceState({ path: newURL }, '', newURL);

    if (this.searchInput.value) {
      this.__setDisplayMode('compact')

      try {
        const search = await this.__executeSearch()
        // This is a guess at an explanation
        // Check the searchInput.value before setting the results to prevent
        // race condition where the user has cleared the search input after
        // submitting an original search but before the search results have
        // come back from the API
        this.__dispatchSearch({ results: this.searchInput.value ? search : null });
      }
      catch(error) {
        // A request aborted by __executeSearch() was superseded by a newer
        // search; reporting it would flash an error before the new results.
        if (error && error.name === 'AbortError') {
          return;
        }
        this.__dispatchSearch({ error })
      }
    }
    else {
      this.__setDisplayMode('home')
      this.__dispatchSearch({ results: null });
    }
  }

  /** Register form, keyboard and bus event handlers. */
  __events() {
    /**
     * Always add the submit event, it makes things feel faster if
     * someone does not prefer reduced motion and reflexively hits
     * return once they've finished typing.
     */
    this.searchForm.addEventListener('submit', (e) => {
      e.preventDefault();
      this.__handleSearch(e);
    });

    /**
     * Only add the "real time" search behavior when the client does
     * not prefer reduced motion; this prevents the page from changing
     * while the user is still typing their query.
     */
    if (!prefersReducedMotion) {
      this.searchInput.addEventListener('input', debounce(this.__handleSearch, 500))
    }

    // Focus search bar when pressing `ctrl + k` or `/`
    document.addEventListener('keydown', (e) => {
      const isFocusShortcut = (e.key === 'k' && e.ctrlKey) || e.key === 'Escape';
      // While the input is focused, "/" is ordinary query text and must not
      // be swallowed by preventDefault().
      const isSlashShortcut = e.key === '/' && document.activeElement !== this.searchInput;
      if (isFocusShortcut || isSlashShortcut) {
        e.preventDefault();
        this.searchInput.focus();
      }
    });

    // Focus first result when pressing down arrow
    this.addEventListener('keydown', (e) => {
      if (e.key === 'ArrowDown' && this.searchInput.value) {
        e.preventDefault();
        const focusResultEvent = new CustomEvent('focus-result');
        globalBus.dispatch(focusResultEvent);
      }
    });

    globalBus.on('focus-search', (e) => {
      this.searchInput.focus();
    });
  }

  /** Focus the input and run the search named in the URL query string, if any. */
  connectedCallback() {
    // Focus search input when component is connected
    this.searchInput.focus();

    const searchQuery = new URLSearchParams(document.location.search).get(config.searchQueryParam);
    this.searchInput.value = searchQuery;
    /**
     * Trigger search handling to coordinate the value pulled from the query string
     * across the rest of the UI and to actually retrieve the results if the search
     * value is now non-empty.
     */
    this.__handleSearch();
  }
});
|
|
@ -1,63 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<!-- Metas -->
|
||||
<meta charset="UTF-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<!-- Page title -->
|
||||
<title>MWMBL - Search</title>
|
||||
<meta name="description" content="The free, open-source and non-profit search engine.">
|
||||
|
||||
<!-- Favicons -->
|
||||
<link rel="icon" href="/images/favicon.svg" type="image/svg+xml">
|
||||
|
||||
<!-- Fonts import -->
|
||||
<link rel="preload" href="/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
|
||||
<noscript>
|
||||
<link rel="stylesheet" href="/fonts/inter/inter.css">
|
||||
</noscript>
|
||||
|
||||
<!-- CSS Stylesheets (this is critical CSS) -->
|
||||
<link rel="stylesheet" type="text/css" href="/css/reset.css">
|
||||
<link rel="stylesheet" type="text/css" href="/css/theme.css">
|
||||
<link rel="stylesheet" type="text/css" href="/css/global.css">
|
||||
|
||||
<!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
|
||||
<link rel="preload" href="/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
|
||||
<noscript>
|
||||
<link rel="stylesheet" href="/fonts/phosphor/icons.css">
|
||||
</noscript>
|
||||
|
||||
<!-- Custom Element Polyfill for Safari -->
|
||||
<script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>
|
||||
|
||||
<!-- OpenSearch -->
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="MWMBL Search">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<mwmbl-app></mwmbl-app>
|
||||
<noscript>
|
||||
<main class="noscript">
|
||||
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
|
||||
<h1>
|
||||
Welcome to mwmbl, the free, open-source and non-profit search engine.
|
||||
</h1>
|
||||
<p>This website requires you to support/enable scripts.</p>
|
||||
<p>
|
||||
More information on
|
||||
<a href="https://github.com/mwmbl/mwmbl" target="_blank">
|
||||
Github
|
||||
</a>
|
||||
.
|
||||
</p>
|
||||
</main>
|
||||
</noscript>
|
||||
<!-- JavaScript entrypoint -->
|
||||
<script src="./index.js" type="module"></script>
|
||||
</body>
|
||||
|
||||
</html>
|
|
@ -5,6 +5,7 @@
|
|||
* Please do not pollute this file if you can make
|
||||
* util or component files instead.
|
||||
*/
|
||||
import 'vite/modulepreload-polyfill';
|
||||
|
||||
// Waiting for top-level await to be better supported.
|
||||
(async () => {
|
||||
|
@ -14,9 +15,12 @@
|
|||
|
||||
if (!redirected) {
|
||||
// Load components only after redirects are checked.
|
||||
import('./components/app.js');
|
||||
import("./components/organisms/search-bar.js");
|
||||
import("./components/organisms/results.js");
|
||||
import("./components/organisms/footer.js");
|
||||
import("./components/organisms/save.js");
|
||||
import("./components/molecules/add-button.js");
|
||||
import("./components/molecules/add-result.js");
|
||||
import("./components/molecules/delete-button.js");
|
||||
import("./components/molecules/result.js");
|
||||
import("./components/molecules/validate-button.js");
|
||||
}
|
||||
})();
|
||||
|
|
|
@ -5,18 +5,18 @@
|
|||
<title>Mwmbl Stats</title>
|
||||
|
||||
<!-- Favicons -->
|
||||
<link rel="icon" href="/images/favicon.svg" type="image/svg+xml">
|
||||
<link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">
|
||||
|
||||
<!-- Fonts import -->
|
||||
<link rel="preload" href="/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
|
||||
<link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
|
||||
<noscript>
|
||||
<link rel="stylesheet" href="/fonts/inter/inter.css">
|
||||
<link rel="stylesheet" href="/static/fonts/inter/inter.css">
|
||||
</noscript>
|
||||
|
||||
<!-- CSS Stylesheets (this is critical CSS) -->
|
||||
<link rel="stylesheet" type="text/css" href="/css/reset.css">
|
||||
<link rel="stylesheet" type="text/css" href="/css/theme.css">
|
||||
<link rel="stylesheet" type="text/css" href="/css/global.css">
|
||||
<link rel="stylesheet" type="text/css" href="/static/css/reset.css">
|
||||
<link rel="stylesheet" type="text/css" href="/static/css/theme.css">
|
||||
<link rel="stylesheet" type="text/css" href="/static/css/global.css">
|
||||
<link rel="stylesheet" type="text/css" href="stats.css">
|
||||
</head>
|
||||
<body>
|
||||
|
|
|
@ -7,12 +7,14 @@ export default {
|
|||
publicDir: '../assets',
|
||||
build: {
|
||||
outDir: '../dist',
|
||||
manifest: true,
|
||||
rollupOptions: {
|
||||
input: {
|
||||
main: resolve(__dirname, 'src/index.html'),
|
||||
index: resolve(__dirname, 'src/index.js'),
|
||||
stats: resolve(__dirname, 'src/stats/index.html'),
|
||||
},
|
||||
},
|
||||
minify: false,
|
||||
},
|
||||
plugins: [
|
||||
legacy({
|
||||
|
|
8
mwmbl/admin.py
Normal file
8
mwmbl/admin.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
from django.contrib.admin import ModelAdmin
|
||||
from django.contrib.auth.admin import UserAdmin
|
||||
from django.contrib import admin
|
||||
|
||||
from mwmbl.models import MwmblUser, UserCuration
|
||||
|
||||
admin.site.register(MwmblUser, UserAdmin)
|
||||
admin.site.register(UserCuration, ModelAdmin)
|
28
mwmbl/api.py
28
mwmbl/api.py
|
@ -1,36 +1,24 @@
|
|||
from multiprocessing import Queue
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from ninja import NinjaAPI
|
||||
from ninja.security import django_auth
|
||||
|
||||
import mwmbl.crawler.app as crawler
|
||||
from mwmbl.indexer.batch_cache import BatchCache
|
||||
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
|
||||
from mwmbl.platform import curate
|
||||
from mwmbl.search_setup import queued_batches, index_path, ranker, batch_cache
|
||||
from mwmbl.tinysearchengine import search
|
||||
from mwmbl.tinysearchengine.completer import Completer
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
|
||||
from mwmbl.tinysearchengine.rank import HeuristicRanker
|
||||
|
||||
|
||||
queued_batches = Queue()
|
||||
completer = Completer()
|
||||
|
||||
index_path = Path(settings.DATA_PATH) / INDEX_NAME
|
||||
tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
|
||||
tiny_index.__enter__()
|
||||
ranker = HeuristicRanker(tiny_index, completer)
|
||||
batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
|
||||
|
||||
|
||||
def create_api(version):
|
||||
api = NinjaAPI(version=version)
|
||||
# Set csrf to True to allow cookie-based authentication
|
||||
api = NinjaAPI(version=version, csrf=True)
|
||||
|
||||
search_router = search.create_router(ranker)
|
||||
api.add_router("/search/", search_router)
|
||||
|
||||
crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
|
||||
api.add_router("/crawler/", crawler_router)
|
||||
|
||||
curation_router = curate.create_router(index_path)
|
||||
api.add_router("/curation/", curation_router, auth=django_auth)
|
||||
return api
|
||||
|
||||
|
||||
|
|
|
@ -6,12 +6,9 @@ from pathlib import Path
|
|||
from django.apps import AppConfig
|
||||
from django.conf import settings
|
||||
|
||||
from mwmbl.api import queued_batches
|
||||
from mwmbl import background
|
||||
from mwmbl.indexer.paths import INDEX_NAME
|
||||
from mwmbl.indexer.update_urls import update_urls_continuously
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
|
||||
from mwmbl.url_queue import update_queue_continuously
|
||||
from mwmbl.crawler.urls import URLDatabase
|
||||
from mwmbl.database import Database
|
||||
from mwmbl.indexer.indexdb import IndexDatabase
|
||||
|
||||
|
||||
class MwmblConfig(AppConfig):
|
||||
|
@ -19,6 +16,14 @@ class MwmblConfig(AppConfig):
|
|||
verbose_name = "Mwmbl Application"
|
||||
|
||||
def ready(self):
|
||||
# Imports here to avoid AppRegistryNotReady exception
|
||||
from mwmbl.search_setup import queued_batches
|
||||
from mwmbl import background
|
||||
from mwmbl.indexer.paths import INDEX_NAME
|
||||
from mwmbl.indexer.update_urls import update_urls_continuously
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
|
||||
from mwmbl.url_queue import update_queue_continuously
|
||||
|
||||
index_path = Path(settings.DATA_PATH) / INDEX_NAME
|
||||
try:
|
||||
existing_index = TinyIndex(item_factory=Document, index_path=index_path)
|
||||
|
@ -30,6 +35,12 @@ class MwmblConfig(AppConfig):
|
|||
TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
|
||||
page_size=PAGE_SIZE)
|
||||
|
||||
with Database() as db:
|
||||
url_db = URLDatabase(db.connection)
|
||||
url_db.create_tables()
|
||||
index_db = IndexDatabase(db.connection)
|
||||
index_db.create_tables()
|
||||
|
||||
if settings.RUN_BACKGROUND_PROCESSES:
|
||||
new_item_queue = Queue()
|
||||
Process(target=background.run, args=(settings.DATA_PATH,)).start()
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
"""
|
||||
Script that updates data in a background process.
|
||||
"""
|
||||
from logging import getLogger
|
||||
import logging
|
||||
import sys
|
||||
from logging import getLogger, basicConfig
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
|
||||
|
@ -11,6 +13,8 @@ from mwmbl.indexer import index_batches, historical
|
|||
from mwmbl.indexer.batch_cache import BatchCache
|
||||
from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME
|
||||
|
||||
|
||||
basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
|
|
|
@ -8,12 +8,8 @@ from typing import Union
|
|||
from uuid import uuid4
|
||||
|
||||
import boto3
|
||||
import justext
|
||||
import requests
|
||||
from fastapi import HTTPException
|
||||
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
|
||||
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
|
||||
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
|
||||
from ninja import Router
|
||||
from redis import Redis
|
||||
|
||||
|
@ -21,7 +17,6 @@ from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
|
|||
from mwmbl.crawler.stats import MwmblStats, StatsManager
|
||||
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
|
||||
from mwmbl.database import Database
|
||||
from mwmbl.format import format_result
|
||||
from mwmbl.indexer.batch_cache import BatchCache
|
||||
from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
|
||||
from mwmbl.settings import (
|
||||
|
@ -35,9 +30,7 @@ from mwmbl.settings import (
|
|||
PUBLIC_URL_PREFIX,
|
||||
PUBLIC_USER_ID_LENGTH,
|
||||
FILE_NAME_SUFFIX,
|
||||
DATE_REGEX, NUM_EXTRACT_CHARS)
|
||||
from mwmbl.tinysearchengine.indexer import Document
|
||||
|
||||
DATE_REGEX)
|
||||
|
||||
stats_manager = StatsManager(Redis.from_url(os.environ.get("REDIS_URL")))
|
||||
|
||||
|
@ -57,32 +50,6 @@ def upload(data: bytes, name: str):
|
|||
last_batch = None
|
||||
|
||||
|
||||
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
|
||||
length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
|
||||
stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
|
||||
max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
|
||||
encoding=None, default_encoding=DEFAULT_ENCODING,
|
||||
enc_errors=DEFAULT_ENC_ERRORS):
|
||||
"""
|
||||
Converts an HTML page into a list of classified paragraphs. Each paragraph
|
||||
is represented as an instance of class ``justext.paragraph.Paragraph``.
|
||||
"""
|
||||
dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
|
||||
|
||||
titles = dom.xpath("//title")
|
||||
title = titles[0].text if len(titles) > 0 else None
|
||||
|
||||
dom = preprocessor(dom)
|
||||
|
||||
paragraphs = ParagraphMaker.make_paragraphs(dom)
|
||||
|
||||
classify_paragraphs(paragraphs, stoplist, length_low, length_high,
|
||||
stopwords_low, stopwords_high, max_link_density, no_headings)
|
||||
revise_paragraph_classification(paragraphs, max_heading_distance)
|
||||
|
||||
return paragraphs, title
|
||||
|
||||
|
||||
def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
|
||||
router = Router(tags=["crawler"])
|
||||
|
||||
|
@ -90,19 +57,6 @@ def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
|
|||
# #
|
||||
# # url_db.create_tables()
|
||||
|
||||
@router.get('/fetch')
|
||||
def fetch_url(request, url: str, query: str):
|
||||
response = requests.get(url)
|
||||
paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
|
||||
good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
|
||||
|
||||
extract = ' '.join([p.text for p in good_paragraphs])
|
||||
if len(extract) > NUM_EXTRACT_CHARS:
|
||||
extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
|
||||
|
||||
result = Document(title=title, url=url, extract=extract, score=0.0)
|
||||
return format_result(result, query)
|
||||
|
||||
@router.post('/batches/')
|
||||
def post_batch(request, batch: Batch):
|
||||
if len(batch.items) > MAX_BATCH_SIZE:
|
||||
|
|
|
@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
|
|||
def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
|
||||
for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
|
||||
score = link_counts.get(url, DEFAULT_SCORE)
|
||||
yield tokenize_document(url, title_cleaned, extract, score, nlp)
|
||||
yield tokenize_document(url, title_cleaned, extract, score)
|
||||
|
||||
if i % 1000 == 0:
|
||||
print("Processed", i)
|
||||
|
@ -61,7 +61,7 @@ def get_index_tokens(tokens):
|
|||
return set(first_tokens + bigrams)
|
||||
|
||||
|
||||
def tokenize_document(url, title_cleaned, extract, score, nlp):
|
||||
def tokenize_document(url, title_cleaned, extract, score):
|
||||
title_tokens = tokenize(title_cleaned)
|
||||
prepared_url = prepare_url_for_tokenizing(unquote(url))
|
||||
url_tokens = tokenize(prepared_url)
|
||||
|
|
|
@ -16,6 +16,7 @@ from mwmbl.indexer.batch_cache import BatchCache
|
|||
from mwmbl.indexer.index import tokenize_document
|
||||
from mwmbl.indexer.indexdb import BatchStatus
|
||||
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
|
||||
from mwmbl.utils import add_term_info, add_term_infos
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
@ -31,22 +32,20 @@ def run(batch_cache: BatchCache, index_path: str):
|
|||
|
||||
def process(batches: Collection[HashedBatch]):
|
||||
with Database() as db:
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
url_db = URLDatabase(db.connection)
|
||||
index_batches(batches, index_path, nlp, url_db)
|
||||
index_batches(batches, index_path, url_db)
|
||||
logger.info("Indexed pages")
|
||||
|
||||
process_batch.run(batch_cache, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED, process)
|
||||
|
||||
|
||||
def index_batches(batch_data: Collection[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
|
||||
def index_batches(batch_data: Collection[HashedBatch], index_path: str, url_db: URLDatabase):
|
||||
document_tuples = list(get_documents_from_batches(batch_data))
|
||||
urls = [url for title, url, extract in document_tuples]
|
||||
logger.info(f"Got {len(urls)} document tuples")
|
||||
url_scores = url_db.get_url_scores(urls)
|
||||
logger.info(f"Got {len(url_scores)} scores")
|
||||
logger.info(f"Indexing {len(urls)} document tuples and {len(url_scores)} URL scores")
|
||||
documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
|
||||
page_documents = preprocess_documents(documents, index_path, nlp)
|
||||
page_documents = preprocess_documents(documents, index_path)
|
||||
index_pages(index_path, page_documents)
|
||||
|
||||
|
||||
|
@ -58,24 +57,27 @@ def index_pages(index_path, page_documents):
|
|||
seen_urls = set()
|
||||
seen_titles = set()
|
||||
sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
|
||||
for document in sorted_documents:
|
||||
# TODO: for now we add the term here, until all the documents in the index have terms
|
||||
sorted_documents_with_terms = add_term_infos(sorted_documents, indexer, page)
|
||||
for document in sorted_documents_with_terms:
|
||||
if document.title in seen_titles or document.url in seen_urls:
|
||||
continue
|
||||
new_documents.append(document)
|
||||
seen_urls.add(document.url)
|
||||
seen_titles.add(document.title)
|
||||
logger.info(f"Storing {len(new_documents)} documents for page {page}, originally {len(existing_documents)}")
|
||||
indexer.store_in_page(page, new_documents)
|
||||
|
||||
|
||||
def preprocess_documents(documents, index_path, nlp):
|
||||
def preprocess_documents(documents, index_path):
|
||||
page_documents = defaultdict(list)
|
||||
with TinyIndex(Document, index_path, 'w') as indexer:
|
||||
for document in documents:
|
||||
tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
|
||||
# logger.debug(f"Tokenized: {tokenized}")
|
||||
page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
|
||||
for page in page_indexes:
|
||||
page_documents[page].append(document)
|
||||
tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
|
||||
for token in tokenized.tokens:
|
||||
page = indexer.get_key_page_index(token)
|
||||
term_document = Document(document.title, document.url, document.extract, document.score, token)
|
||||
page_documents[page].append(term_document)
|
||||
print(f"Preprocessed for {len(page_documents)} pages")
|
||||
return page_documents
|
||||
|
||||
|
|
|
@ -86,7 +86,7 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
|
|||
def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
|
||||
parsed_link = urlparse(link)
|
||||
if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
|
||||
logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
|
||||
logger.debug(f"Excluding link for blacklisted domain: {parsed_link}")
|
||||
return
|
||||
|
||||
extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
|
||||
|
|
|
@ -1,7 +1,12 @@
|
|||
import django
|
||||
import uvicorn
|
||||
from django.core.management import call_command
|
||||
|
||||
|
||||
def run():
|
||||
django.setup()
|
||||
call_command("collectstatic", "--clear", "--noinput")
|
||||
call_command("migrate")
|
||||
uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=5000)
|
||||
|
||||
|
||||
|
|
58
mwmbl/migrations/0001_initial.py
Normal file
58
mwmbl/migrations/0001_initial.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
# Generated by Django 4.2.6 on 2023-10-25 11:55
|
||||
|
||||
from django.conf import settings
|
||||
import django.contrib.auth.models
|
||||
import django.contrib.auth.validators
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial schema: the custom ``MwmblUser`` model and the ``UserCuration`` table."""

    initial = True

    dependencies = [
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        # Custom user model; the field list matches the one on MwmblUser's
        # AbstractUser base class.
        migrations.CreateModel(
            name='MwmblUser',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('password', models.CharField(max_length=128, verbose_name='password')),
                ('last_login', models.DateTimeField(blank=True, null=True, verbose_name='last login')),
                ('is_superuser', models.BooleanField(default=False, help_text='Designates that this user has all permissions without explicitly assigning them.', verbose_name='superuser status')),
                ('username', models.CharField(error_messages={'unique': 'A user with that username already exists.'}, help_text='Required. 150 characters or fewer. Letters, digits and @/./+/-/_ only.', max_length=150, unique=True, validators=[django.contrib.auth.validators.UnicodeUsernameValidator()], verbose_name='username')),
                ('first_name', models.CharField(blank=True, max_length=150, verbose_name='first name')),
                ('last_name', models.CharField(blank=True, max_length=150, verbose_name='last name')),
                ('email', models.EmailField(blank=True, max_length=254, verbose_name='email address')),
                ('is_staff', models.BooleanField(default=False, help_text='Designates whether the user can log into this admin site.', verbose_name='staff status')),
                ('is_active', models.BooleanField(default=True, help_text='Designates whether this user should be treated as active. Unselect this instead of deleting accounts.', verbose_name='active')),
                ('date_joined', models.DateTimeField(default=django.utils.timezone.now, verbose_name='date joined')),
                ('groups', models.ManyToManyField(blank=True, help_text='The groups this user belongs to. A user will get all permissions granted to each of their groups.', related_name='user_set', related_query_name='user', to='auth.group', verbose_name='groups')),
                ('user_permissions', models.ManyToManyField(blank=True, help_text='Specific permissions for this user.', related_name='user_set', related_query_name='user', to='auth.permission', verbose_name='user permissions')),
            ],
            options={
                'verbose_name': 'user',
                'verbose_name_plural': 'users',
                'abstract': False,
            },
            managers=[
                ('objects', django.contrib.auth.models.UserManager()),
            ],
        ),
        # One row per curation action; rows are removed with their user (CASCADE).
        migrations.CreateModel(
            name='UserCuration',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('timestamp', models.DateTimeField()),
                ('url', models.CharField(max_length=300)),
                ('results', models.JSONField()),
                ('curation_type', models.CharField(max_length=20)),
                ('curation', models.JSONField()),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]
|
0
mwmbl/migrations/__init__.py
Normal file
0
mwmbl/migrations/__init__.py
Normal file
15
mwmbl/models.py
Normal file
15
mwmbl/models.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
from django.db import models
|
||||
from django.contrib.auth.models import AbstractUser
|
||||
|
||||
|
||||
class MwmblUser(AbstractUser):
    """Project user model; adds nothing to Django's ``AbstractUser`` yet."""
    pass
|
||||
|
||||
|
||||
class UserCuration(models.Model):
    """A single curation action performed by a user on a page of search results."""

    # Deleting the user deletes their curations as well.
    user = models.ForeignKey(MwmblUser, on_delete=models.CASCADE)
    timestamp = models.DateTimeField()
    # URL of the curated results page (capped at 300 characters).
    url = models.CharField(max_length=300)
    # The result list submitted together with the action, stored as JSON.
    results = models.JSONField()
    # Short label for the kind of action, e.g. "curate_move".
    curation_type = models.CharField(max_length=20)
    # Action-specific payload, stored as JSON.
    curation = models.JSONField()
|
89
mwmbl/platform/curate.py
Normal file
89
mwmbl/platform/curate.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
from logging import getLogger
|
||||
from typing import Any
|
||||
from urllib.parse import parse_qs
|
||||
|
||||
from ninja import Router
|
||||
|
||||
from mwmbl.indexer.update_urls import get_datetime_from_timestamp
|
||||
from mwmbl.models import UserCuration
|
||||
from mwmbl.platform.data import CurateBegin, CurateMove, CurateDelete, CurateAdd, CurateValidate, \
|
||||
make_curation_type
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
|
||||
from mwmbl.tokenizer import tokenize
|
||||
from mwmbl.utils import add_term_info, add_term_infos
|
||||
|
||||
RESULT_URL = "https://mwmbl.org/?q="
|
||||
MAX_CURATED_SCORE = 1_111_111.0
|
||||
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
def create_router(index_path: str) -> Router:
    """Build the router that exposes the curation endpoints.

    Every endpoint records the submitted action as a ``UserCuration`` row and
    then rewrites the documents stored for the curated query in the index.

    Args:
        index_path: path of the TinyIndex file, opened for writing on each request.

    Returns:
        A ``Router`` with the POST routes ``/begin``, ``/move``, ``/delete``,
        ``/add`` and ``/validate``.
    """
    router = Router(tags=["user"])

    @router.post("/begin")
    def user_begin_curate(request, curate_begin: make_curation_type(CurateBegin)):
        return _curate(request, "curate_begin", curate_begin)

    @router.post("/move")
    def user_move_result(request, curate_move: make_curation_type(CurateMove)):
        return _curate(request, "curate_move", curate_move)

    @router.post("/delete")
    def user_delete_result(request, curate_delete: make_curation_type(CurateDelete)):
        return _curate(request, "curate_delete", curate_delete)

    @router.post("/add")
    def user_add_result(request, curate_add: make_curation_type(CurateAdd)):
        return _curate(request, "curate_add", curate_add)

    # Named distinctly from the "/add" handler: the two previously shared the
    # name ``user_add_result``, which also gave both routes the same
    # automatically derived operation id.
    @router.post("/validate")
    def user_validate_result(request, curate_validate: make_curation_type(CurateValidate)):
        return _curate(request, "curate_validate", curate_validate)

    def _curate(request, curation_type: str, curation: Any):
        """Persist one curation action and store the curated results in the index.

        Args:
            request: the incoming request; ``request.user`` is stored as the author.
            curation_type: label saved with the action, e.g. ``"curate_move"``.
            curation: a schema instance produced by ``make_curation_type``.

        Returns:
            ``{"curation": "ok"}`` once the index page has been rewritten.

        Raises:
            ValueError: if ``curation.url`` does not carry exactly one query
                parameter with exactly one non-blank value.
        """
        user_curation = UserCuration(
            user=request.user,
            # The client timestamp is divided by 1000 before conversion.
            timestamp=get_datetime_from_timestamp(curation.timestamp / 1000.0),
            url=curation.url,
            results=curation.dict()["results"],
            curation_type=curation_type,
            curation=curation.curation.dict(),
        )
        user_curation.save()

        with TinyIndex(Document, index_path, 'w') as indexer:
            query_string = parse_qs(curation.url)
            if not query_string:
                # parse_qs drops blank values, so a URL such as ".../?q=" ends up
                # here; fail with the same exception type as the checks below
                # rather than leaking a StopIteration from next().
                raise ValueError(f"Should be one query string in the URL: {curation.url}")
            if len(query_string) > 1:
                raise ValueError(f"Should be one query string in the URL: {curation.url}")

            queries = next(iter(query_string.values()))
            if len(queries) > 1:
                raise ValueError(f"Should be one query value in the URL: {curation.url}")

            query = queries[0]
            tokens = tokenize(query)
            term = " ".join(tokens)

            # Scores decrease with position so the submitted order is preserved.
            documents = [
                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
                for i, result in enumerate(curation.results)
            ]

            page_index = indexer.get_key_page_index(term)
            existing_documents_no_terms = indexer.get_page(page_index)
            existing_documents = add_term_infos(existing_documents_no_terms, indexer, page_index)
            # Keep what the page holds for other terms; replace everything for this term.
            other_documents = [doc for doc in existing_documents if doc.term != term]
            logger.info(f"Found {len(other_documents)} other documents for term {term} at page {page_index} "
                        f"with terms { {doc.term for doc in other_documents} }")

            all_documents = documents + other_documents
            logger.info(f"Storing {len(all_documents)} documents at page {page_index}")
            indexer.store_in_page(page_index, all_documents)

        return {"curation": "ok"}

    return router
|
||||
|
||||
|
46
mwmbl/platform/data.py
Normal file
46
mwmbl/platform/data.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
from datetime import datetime
|
||||
from typing import TypeVar, Generic
|
||||
|
||||
from ninja import Schema
|
||||
|
||||
|
||||
class Result(Schema):
    """One search result as submitted in a curation request."""
    url: str
    title: str
    extract: str
    # Passed through to the stored Document by the curate endpoint.
    curated: bool
|
||||
|
||||
|
||||
class CurateBegin(Schema):
    """Payload of the "begin" action; it carries no fields of its own."""
    pass
|
||||
|
||||
|
||||
class CurateMove(Schema):
    """Payload of the "move" action."""
    # NOTE(review): presumably positions in the submitted results list — confirm with the front end.
    old_index: int
    new_index: int
|
||||
|
||||
|
||||
class CurateDelete(Schema):
    """Payload of the "delete" action."""
    # NOTE(review): presumably the position of the removed result — confirm with the front end.
    delete_index: int
|
||||
|
||||
|
||||
class CurateAdd(Schema):
    """Payload of the "add" action."""
    # NOTE(review): presumably the position at which the new result was inserted — confirm.
    insert_index: int
    url: str
|
||||
|
||||
|
||||
class CurateValidate(Schema):
    """Payload of the "validate" action."""
    # NOTE(review): presumably the position of the result being (un)validated — confirm.
    validate_index: int
    is_validated: bool
|
||||
|
||||
|
||||
T = TypeVar('T', CurateBegin, CurateAdd, CurateDelete, CurateMove, CurateValidate)
|
||||
|
||||
|
||||
def make_curation_type(t):
    """Return a request schema whose ``curation`` field is typed as ``t``.

    A fresh ``Curation`` class is created on every call, so each endpoint
    gets its own schema wrapping the action-specific payload type.
    """
    class Curation(Schema):
        # NOTE(review): the curate endpoint divides this by 1000 before converting
        # it to a datetime, so presumably epoch milliseconds — confirm.
        timestamp: int
        url: str
        results: list[Result]
        curation: t
    return Curation
|
|
@ -1,190 +0,0 @@
|
|||
import json
|
||||
import os
|
||||
from typing import TypeVar, Generic
|
||||
from urllib.parse import urljoin, parse_qs
|
||||
|
||||
import requests
|
||||
from fastapi import APIRouter, Response
|
||||
from pydantic import BaseModel
|
||||
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
|
||||
from mwmbl.tokenizer import tokenize
|
||||
|
||||
|
||||
LEMMY_URL = os.environ["LEMMY_URL"]
|
||||
RESULT_URL = "https://mwmbl.org/?q="
|
||||
MAX_CURATED_SCORE = 1_111_111.0
|
||||
|
||||
|
||||
class Register(BaseModel):
|
||||
username: str
|
||||
email: str
|
||||
password: str
|
||||
password_verify: str
|
||||
|
||||
|
||||
class Login(BaseModel):
|
||||
username_or_email: str
|
||||
password: str
|
||||
|
||||
|
||||
class Result(BaseModel):
|
||||
url: str
|
||||
title: str
|
||||
extract: str
|
||||
curated: bool
|
||||
|
||||
|
||||
class BeginCurate(BaseModel):
|
||||
auth: str
|
||||
url: str
|
||||
results: list[Result]
|
||||
|
||||
|
||||
class CurateMove(BaseModel):
|
||||
old_index: int
|
||||
new_index: int
|
||||
|
||||
|
||||
class CurateDelete(BaseModel):
|
||||
delete_index: int
|
||||
|
||||
|
||||
class CurateAdd(BaseModel):
|
||||
insert_index: int
|
||||
url: str
|
||||
|
||||
|
||||
class CurateValidate(BaseModel):
|
||||
validate_index: int
|
||||
is_validated: bool
|
||||
|
||||
|
||||
T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate)
|
||||
|
||||
|
||||
class Curation(BaseModel, Generic[T]):
|
||||
auth: str
|
||||
curation_id: int
|
||||
url: str
|
||||
results: list[Result]
|
||||
curation: T
|
||||
|
||||
|
||||
def create_router(index_path: str) -> APIRouter:
|
||||
router = APIRouter(prefix="/user", tags=["user"])
|
||||
|
||||
# TODO: reinstate
|
||||
# community_id = get_community_id()
|
||||
community_id = 0
|
||||
|
||||
@router.post("/register")
|
||||
def user_register(register: Register) -> Response:
|
||||
lemmy_register = {
|
||||
"username": register.username,
|
||||
"email": register.email,
|
||||
"password": register.password,
|
||||
"password_verify": register.password_verify,
|
||||
"answer": "not applicable",
|
||||
"captcha_answer": None,
|
||||
"captcha_uuid": None,
|
||||
"honeypot": None,
|
||||
"show_nsfw": False,
|
||||
}
|
||||
request = requests.post(urljoin(LEMMY_URL, "api/v3/user/register"), json=lemmy_register)
|
||||
if request.status_code != 200:
|
||||
return Response(content=request.content, status_code=request.status_code, media_type="text/json")
|
||||
|
||||
@router.post("/login")
|
||||
def user_login(login: Login) -> Response:
|
||||
request = requests.post(urljoin(LEMMY_URL, "api/v3/user/login"), json=login.dict())
|
||||
return Response(content=request.content, status_code=request.status_code, media_type="text/json")
|
||||
|
||||
@router.post("/curation/begin")
|
||||
def user_begin_curate(begin_curate: BeginCurate):
|
||||
results = begin_curate.dict()["results"]
|
||||
body = json.dumps({"original_results": results}, indent=2)
|
||||
create_post = {
|
||||
"auth": begin_curate.auth,
|
||||
"body": body,
|
||||
"community_id": community_id,
|
||||
"honeypot": None,
|
||||
"language_id": None,
|
||||
"name": begin_curate.url,
|
||||
"nsfw": None,
|
||||
"url": begin_curate.url,
|
||||
}
|
||||
request = requests.post(urljoin(LEMMY_URL, "api/v3/post"), json=create_post)
|
||||
if request.status_code != 200:
|
||||
return Response(content=request.content, status_code=request.status_code, media_type="text/json")
|
||||
data = request.json()
|
||||
curation_id = data["post_view"]["post"]["id"]
|
||||
return {"curation_id": curation_id}
|
||||
|
||||
@router.post("/curation/move")
|
||||
def user_move_result(curate_move: Curation[CurateMove]):
|
||||
return _curate("curate_move", curate_move)
|
||||
|
||||
@router.post("/curation/delete")
|
||||
def user_delete_result(curate_delete: Curation[CurateDelete]):
|
||||
return _curate("curate_delete", curate_delete)
|
||||
|
||||
@router.post("/curation/add")
|
||||
def user_add_result(curate_add: Curation[CurateAdd]):
|
||||
return _curate("curate_add", curate_add)
|
||||
|
||||
@router.post("/curation/validate")
|
||||
def user_add_result(curate_validate: Curation[CurateValidate]):
|
||||
return _curate("curate_validate", curate_validate)
|
||||
|
||||
def _curate(curation_type: str, curation: Curation):
|
||||
content = json.dumps({
|
||||
"curation_type": curation_type,
|
||||
"curation": curation.curation.dict(),
|
||||
}, indent=2)
|
||||
create_comment = {
|
||||
"auth": curation.auth,
|
||||
"content": json.dumps(content, indent=2),
|
||||
"form_id": None,
|
||||
"language_id": None,
|
||||
"parent_id": None,
|
||||
"post_id": curation.curation_id,
|
||||
}
|
||||
request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
|
||||
|
||||
with TinyIndex(Document, index_path, 'w') as indexer:
|
||||
query_string = parse_qs(curation.url)
|
||||
if len(query_string) > 1:
|
||||
raise ValueError(f"Should be one query string in the URL: {curation.url}")
|
||||
|
||||
queries = next(iter(query_string.values()))
|
||||
if len(queries) > 1:
|
||||
raise ValueError(f"Should be one query value in the URL: {curation.url}")
|
||||
|
||||
query = queries[0]
|
||||
print("Query", query)
|
||||
tokens = tokenize(query)
|
||||
print("Tokens", tokens)
|
||||
term = " ".join(tokens)
|
||||
print("Key", term)
|
||||
|
||||
documents = [
|
||||
Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
|
||||
for i, result in enumerate(curation.results)
|
||||
]
|
||||
page_index = indexer.get_key_page_index(term)
|
||||
print("Page index", page_index)
|
||||
print("Storing documents", documents)
|
||||
indexer.store_in_page(page_index, documents)
|
||||
|
||||
return Response(content=request.content, status_code=request.status_code, media_type="text/json")
|
||||
|
||||
return router
|
||||
|
||||
|
||||
def get_community_id() -> str:
|
||||
request = requests.get(urljoin(LEMMY_URL, "api/v3/community?name=main"))
|
||||
community = request.json()
|
||||
return community["community_view"]["community"]["id"]
|
||||
|
||||
|
19
mwmbl/search_setup.py
Normal file
19
mwmbl/search_setup.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
from multiprocessing import Queue
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from mwmbl.indexer.batch_cache import BatchCache
|
||||
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
|
||||
from mwmbl.tinysearchengine.completer import Completer
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
|
||||
from mwmbl.tinysearchengine.rank import HeuristicRanker
|
||||
|
||||
# Module-level search state, created once when this module is imported.
queued_batches = Queue()
completer = Completer()
index_path = Path(settings.DATA_PATH) / INDEX_NAME
tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
# Entered by hand instead of with a ``with`` block; no matching __exit__ call
# appears in this module.
tiny_index.__enter__()

ranker = HeuristicRanker(tiny_index, completer)
batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
|
|
@ -5,4 +5,4 @@ ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org"]
|
|||
|
||||
DATA_PATH = "/app/storage"
|
||||
RUN_BACKGROUND_PROCESSES = True
|
||||
NUM_PAGES = 10240000
|
||||
NUM_PAGES = 10240000
|
|
@ -19,9 +19,6 @@ BASE_DIR = Path(__file__).resolve().parent.parent
|
|||
# Quick-start development settings - unsuitable for production
|
||||
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
|
||||
|
||||
# SECURITY WARNING: keep the secret key used in production secret!
|
||||
SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
|
||||
|
||||
|
||||
# Application definition
|
||||
|
||||
|
@ -32,7 +29,13 @@ INSTALLED_APPS = [
|
|||
'django.contrib.sessions',
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
'django.contrib.humanize',
|
||||
'mwmbl',
|
||||
'django_htmx',
|
||||
'django_vite',
|
||||
'allauth',
|
||||
'allauth.account',
|
||||
'allauth.socialaccount',
|
||||
]
|
||||
|
||||
MIDDLEWARE = [
|
||||
|
@ -43,6 +46,9 @@ MIDDLEWARE = [
|
|||
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
||||
'django.contrib.messages.middleware.MessageMiddleware',
|
||||
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
||||
|
||||
"django_htmx.middleware.HtmxMiddleware",
|
||||
"allauth.account.middleware.AccountMiddleware",
|
||||
]
|
||||
|
||||
ROOT_URLCONF = 'mwmbl.urls'
|
||||
|
@ -66,17 +72,6 @@ TEMPLATES = [
|
|||
WSGI_APPLICATION = 'mwmbl.wsgi.application'
|
||||
|
||||
|
||||
# Database
|
||||
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
|
||||
|
||||
DATABASES = {
|
||||
'default': {
|
||||
'ENGINE': 'django.db.backends.sqlite3',
|
||||
'NAME': BASE_DIR / 'db.sqlite3',
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Password validation
|
||||
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
|
||||
|
||||
|
@ -112,11 +107,60 @@ USE_TZ = True
|
|||
# https://docs.djangoproject.com/en/4.2/howto/static-files/
|
||||
|
||||
STATIC_URL = 'static/'
|
||||
STATICFILES_DIRS = [str(Path(__file__).parent.parent / "front-end" / "dist")]
|
||||
print("Static files", STATICFILES_DIRS)
|
||||
|
||||
DJANGO_VITE_DEV_MODE = False
|
||||
|
||||
|
||||
# Default primary key field type
|
||||
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
|
||||
|
||||
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||
|
||||
AUTHENTICATION_BACKENDS = [
|
||||
# Needed to login by username in Django admin, regardless of `allauth`
|
||||
'django.contrib.auth.backends.ModelBackend',
|
||||
|
||||
# `allauth` specific authentication methods, such as login by email
|
||||
'allauth.account.auth_backends.AuthenticationBackend',
|
||||
]
|
||||
|
||||
|
||||
AUTH_USER_MODEL = "mwmbl.MwmblUser"
|
||||
|
||||
|
||||
ACCOUNT_EMAIL_REQUIRED = True
|
||||
ACCOUNT_EMAIL_VERIFICATION = "mandatory"
|
||||
|
||||
DEFAULT_FROM_EMAIL = "admin@mwmbl.org"
|
||||
|
||||
LOGIN_REDIRECT_URL = "/"
|
||||
|
||||
FOOTER_LINKS = [
|
||||
{
|
||||
"name": "Matrix",
|
||||
"icon": "ph-chat-circle-text-bold",
|
||||
"href": "https://matrix.to/#/#mwmbl:matrix.org",
|
||||
},
|
||||
{
|
||||
"name": "Book",
|
||||
"icon": "ph-book-bold",
|
||||
"href": "https://book.mwmbl.org",
|
||||
},
|
||||
{
|
||||
"name": "Blog",
|
||||
"icon": "ph-browser-bold",
|
||||
"href": "https://blog.mwmbl.org",
|
||||
},
|
||||
{
|
||||
"name": "GitHub",
|
||||
"icon": "ph-github-logo-bold",
|
||||
"href": "https://github.com/mwmbl/mwmbl",
|
||||
},
|
||||
{
|
||||
"name": "YouTube",
|
||||
"icon": "ph-youtube-logo-bold",
|
||||
"href": "https://www.youtube.com/channel/UCFLbqrH63-icAHxQ1eFfAvA",
|
||||
},
|
||||
|
||||
|
||||
]
|
||||
|
|
|
@ -1,9 +1,31 @@
|
|||
from mwmbl.settings_common import *
|
||||
|
||||
|
||||
# SECURITY WARNING: keep the secret key used in production secret!
|
||||
SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
|
||||
|
||||
|
||||
DATABASES = {
|
||||
'default': {
|
||||
'ENGINE': 'django.db.backends.sqlite3',
|
||||
'NAME': BASE_DIR / 'db.sqlite3',
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
STATIC_ROOT = ""
|
||||
DJANGO_VITE_ASSETS_PATH = Path(__file__).parent.parent / "front-end" / "dist"
|
||||
DJANGO_VITE_MANIFEST_PATH = DJANGO_VITE_ASSETS_PATH / "manifest.json"
|
||||
|
||||
STATICFILES_DIRS = [str(DJANGO_VITE_ASSETS_PATH)]
|
||||
|
||||
|
||||
DEBUG = True
|
||||
ALLOWED_HOSTS = ["localhost", "127.0.0.1"]
|
||||
|
||||
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
|
||||
|
||||
DATA_PATH = "./devdata"
|
||||
RUN_BACKGROUND_PROCESSES = True
|
||||
RUN_BACKGROUND_PROCESSES = False
|
||||
NUM_PAGES = 2560
|
||||
|
||||
|
|
|
@ -1,7 +1,33 @@
|
|||
import os
|
||||
|
||||
import dj_database_url
|
||||
|
||||
from mwmbl.settings_common import *
|
||||
|
||||
DEBUG = False
|
||||
ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org"]
|
||||
|
||||
SECRET_KEY = os.environ["DJANGO_SECRET_KEY"]
|
||||
|
||||
|
||||
STATIC_ROOT = "/app/static/"
|
||||
|
||||
DJANGO_VITE_ASSETS_PATH = "/front-end-build/"
|
||||
DJANGO_VITE_MANIFEST_PATH = Path(DJANGO_VITE_ASSETS_PATH) / "manifest.json"
|
||||
STATICFILES_DIRS = [DJANGO_VITE_ASSETS_PATH]
|
||||
|
||||
DATABASES = {'default': dj_database_url.config(default=os.environ["DATABASE_URL"])}
|
||||
|
||||
DEBUG = True # TODO set back to False
|
||||
ALLOWED_HOSTS = ["api.mwmbl.org", "mwmbl.org", "beta.mwmbl.org"]
|
||||
CSRF_TRUSTED_ORIGINS = [f"https://{domain}" for domain in ALLOWED_HOSTS]
|
||||
|
||||
|
||||
# Sendgrid email settings
|
||||
EMAIL_HOST = 'smtp.sendgrid.net'
|
||||
EMAIL_HOST_USER = 'apikey'
|
||||
EMAIL_HOST_PASSWORD = os.getenv('EMAIL_HOST_PASSWORD')
|
||||
EMAIL_PORT = 587
|
||||
EMAIL_USE_TLS = True
|
||||
|
||||
|
||||
DATA_PATH = "/app/storage"
|
||||
RUN_BACKGROUND_PROCESSES = False
|
||||
|
|
22
mwmbl/templates/base.html
Normal file
22
mwmbl/templates/base.html
Normal file
|
@ -0,0 +1,22 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>{% block title %}Simple is Better Than Complex{% endblock %}</title>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<h1>My Site</h1>
|
||||
{% if user.is_authenticated %}
|
||||
<a href="{% url 'account_logout' %}">logout</a>
|
||||
{% else %}
|
||||
<a href="{% url 'account_login' %}">login</a> / <a href="{% url 'account_signup' %}">signup</a>
|
||||
{% endif %}
|
||||
<hr>
|
||||
</header>
|
||||
<main>
|
||||
{% block content %}
|
||||
{% endblock %}
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
32
mwmbl/templates/home.html
Normal file
32
mwmbl/templates/home.html
Normal file
|
@ -0,0 +1,32 @@
|
|||
{% load humanize %}
|
||||
{% include "title.html" %}
|
||||
<div class="main">
|
||||
{% if query %}
|
||||
<button class="button curate-add" is="mwmbl-add-button">+ Add new</button>
|
||||
{% if results %}
|
||||
<ul class='results'>
|
||||
{% for result in results %}
|
||||
{% include "result.html" %}
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% else %}
|
||||
<ul>
|
||||
<li class="home">
|
||||
<h1>
|
||||
No results found for "{{query}}".
|
||||
</h1>
|
||||
</li>
|
||||
</ul>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
{% for item in activity %}
|
||||
<ul>
|
||||
<li class="activity">
|
||||
<h1>
|
||||
{{ item.user }} made {{ item.num_curations | apnumber }} changes to <a href="{{ item.url }}">{{ item.query }}</a> {{ item.timestamp | naturaltime }}.
|
||||
</h1>
|
||||
</li>
|
||||
</ul>
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</div>
|
97
mwmbl/templates/index.html
Normal file
97
mwmbl/templates/index.html
Normal file
|
@ -0,0 +1,97 @@
|
|||
{% load django_vite %}
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<!-- Metas -->
|
||||
<meta charset="UTF-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
{% include "title.html" %}
|
||||
<meta name="description" content="The free, open-source and non-profit search engine.">
|
||||
|
||||
<!-- Favicons -->
|
||||
<link rel="icon" href="/static/images/favicon.svg" type="image/svg+xml">
|
||||
|
||||
<!-- Fonts import -->
|
||||
<link rel="preload" href="/static/fonts/inter/inter.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
|
||||
<noscript>
|
||||
<link rel="stylesheet" href="/static/fonts/inter/inter.css">
|
||||
</noscript>
|
||||
|
||||
<!-- CSS Stylesheets (this is critical CSS) -->
|
||||
<link rel="stylesheet" type="text/css" href="/static/css/reset.css">
|
||||
<link rel="stylesheet" type="text/css" href="/static/css/theme.css">
|
||||
<link rel="stylesheet" type="text/css" href="/static/css/global.css">
|
||||
|
||||
<!-- Phosphor Icons (https://github.com/phosphor-icons/phosphor-home) -->
|
||||
<link rel="preload" href="/static/fonts/phosphor/icons.css" as="style" onload="this.onload=null;this.rel='stylesheet'">
|
||||
<noscript>
|
||||
<link rel="stylesheet" href="/static/fonts/phosphor/icons.css">
|
||||
</noscript>
|
||||
|
||||
<!-- Custom Element Polyfill for Safari -->
|
||||
<script src="https://unpkg.com/@ungap/custom-elements" type="module"></script>
|
||||
|
||||
<!-- OpenSearch -->
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="/static/assets/opensearch.xml" title="Mwmbl Search">
|
||||
|
||||
<script src="https://unpkg.com/htmx.org@1.9.6"></script>
|
||||
|
||||
{% vite_hmr_client %}
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<mwmbl-app></mwmbl-app>
|
||||
<header class="search-menu compact">
|
||||
<a href="/" class="branding">
|
||||
<img class="brand-icon" src="/static/images/logo.svg" width="40" height="40" alt="mwmbl logo">
|
||||
<span class="brand-title">Mwmbl</span>
|
||||
</a>
|
||||
<form class="search-bar">
|
||||
<i class="ph-magnifying-glass-bold"></i>
|
||||
<input
|
||||
type='search'
|
||||
name='q'
|
||||
class='search-bar-input'
|
||||
placeholder='Search on Mwmbl...'
|
||||
title='Use "CTRL+K" or "/" to focus.'
|
||||
autocomplete='off'
|
||||
value='{{ query|default_if_none:"" }}'
|
||||
hx-get="/app/home/"
|
||||
hx-trigger="keyup changed delay:100ms"
|
||||
hx-target=".main"
|
||||
>
|
||||
</form>
|
||||
<div is="mwmbl-save"></div>
|
||||
{% if user.is_authenticated %}
|
||||
<p class="login-info">Logged in as {{ user.username }}</p>
|
||||
<a class="button" href="/accounts/logout/">Log out</a>
|
||||
{% else %}
|
||||
<a class="button" href="/accounts/login/">Login</a>
|
||||
<a class="button" href="/accounts/signup/">Sign up</a>
|
||||
{% endif %}
|
||||
</header>
|
||||
<main>
|
||||
{% include "home.html" %}
|
||||
</main>
|
||||
<div is="mwmbl-add-result"></div>
|
||||
<div class="footer">
|
||||
<ul class="footer-list">
|
||||
{% for link in footer_links %}
|
||||
<li class="footer-item">
|
||||
<a href="{{ link.href }}" class="footer-link" target="_blank">
|
||||
<i class="{{ link.icon }}"></i>
|
||||
<span>{{ link.name }}</span>
|
||||
</a>
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
{% vite_asset 'index.js' %}
|
||||
{% vite_legacy_polyfills %}
|
||||
{% vite_legacy_asset 'index-legacy.js' %}
|
||||
</body>
|
||||
|
||||
</html>
|
26
mwmbl/templates/registration/login.html
Normal file
26
mwmbl/templates/registration/login.html
Normal file
|
@ -0,0 +1,26 @@
|
|||
{% extends 'base.html' %}
|
||||
|
||||
{% block content %}
|
||||
<h2>Log in to My Site</h2>
|
||||
{% if form.errors %}
|
||||
<p style="color: red">Your username and password didn't match. Please try again.</p>
|
||||
{% endif %}
|
||||
<form method="post">
|
||||
{% csrf_token %}
|
||||
<input type="hidden" name="next" value="{{ next }}" />
|
||||
{% for field in form %}
|
||||
<p>
|
||||
{{ field.label_tag }}<br>
|
||||
{{ field }}<br>
|
||||
{% for error in field.errors %}
|
||||
<p style="color: red">{{ error }}</p>
|
||||
{% endfor %}
|
||||
{% if field.help_text %}
|
||||
<p><small style="color: grey">{{ field.help_text }}</small></p>
|
||||
{% endif %}
|
||||
</p>
|
||||
{% endfor %}
|
||||
<button type="submit">Log in</button>
|
||||
<a href="{% url 'account_signup' %}">New to My Site? Sign up</a>
|
||||
</form>
|
||||
{% endblock %}
|
17
mwmbl/templates/result.html
Normal file
17
mwmbl/templates/result.html
Normal file
|
@ -0,0 +1,17 @@
|
|||
{% load result_filters %}
|
||||
<li class="result" is="mwmbl-result">
|
||||
<div class="result-container">
|
||||
<div class="result-link">
|
||||
<a href="{{result.url}}">
|
||||
<p class='link'>{{result.url}}</p>
|
||||
<p class='title'>{{result.title|strengthen}}</p>
|
||||
</a>
|
||||
<p class='extract'>{{result.extract|strengthen}}</p>
|
||||
</div>
|
||||
<div class="curation-buttons">
|
||||
<span class="button handle">↕ Move</span>
|
||||
<button class="button curate-delete" is="mwmbl-delete-button">✕ Delete</button>
|
||||
<button class="button curate-approve" is="mwmbl-validate-button">✓ Looks good</button>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
10
mwmbl/templates/signup.html
Normal file
10
mwmbl/templates/signup.html
Normal file
|
@ -0,0 +1,10 @@
|
|||
{% extends 'base.html' %}
|
||||
|
||||
{% block content %}
|
||||
<h2>Sign up</h2>
|
||||
<form method="post">
|
||||
{% csrf_token %}
|
||||
{{ form.as_p }}
|
||||
<button type="submit">Sign up</button>
|
||||
</form>
|
||||
{% endblock %}
|
6
mwmbl/templates/title.html
Normal file
6
mwmbl/templates/title.html
Normal file
|
@ -0,0 +1,6 @@
|
|||
<!-- Page title -->
|
||||
{% if query %}
|
||||
<title>Mwmbl - {{ query }}</title>
|
||||
{% else %}
|
||||
<title>Mwmbl - Search</title>
|
||||
{% endif %}
|
0
mwmbl/templatetags/__init__.py
Normal file
0
mwmbl/templatetags/__init__.py
Normal file
18
mwmbl/templatetags/result_filters.py
Normal file
18
mwmbl/templatetags/result_filters.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from django.template import Library
|
||||
from django.utils.html import conditional_escape
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
register = Library()
|
||||
|
||||
|
||||
@register.filter(needs_autoescape=True)
def strengthen(spans, autoescape=True):
    """Join text spans into HTML, wrapping the bold ones in ``<strong>``.

    Each span is a mapping with a ``"value"`` and an ``"is_bold"`` entry.
    Values are escaped only when autoescaping is active; the joined result
    is marked safe so the template does not escape the tags again.
    """
    def render(span):
        text = span["value"]
        if autoescape:
            text = conditional_escape(text)
        return f"<strong>{text}</strong>" if span["is_bold"] else text

    return mark_safe("".join([render(span) for span in spans]))
|
|
@ -79,6 +79,7 @@ class TinyIndexMetadata:
|
|||
values = json.loads(data[constant_length:].decode('utf8'))
|
||||
return TinyIndexMetadata(**values)
|
||||
|
||||
|
||||
# Find the optimal amount of data that fits onto a page
|
||||
# We do this by leveraging binary search to quickly find the index where:
|
||||
# - index+1 cannot fit onto a page
|
||||
|
@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
|
|||
# No better match, use our index
|
||||
return mid, compressed_data
|
||||
|
||||
|
||||
def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
    """Search the whole range ``0..len(items)`` for the number of items that fit on one page.

    Returns whatever ``_binary_search_fitting_size`` returns for that range.
    """
    lower_bound = 0
    upper_bound = len(items)
    return _binary_search_fitting_size(compressor, page_size, items, lower_bound, upper_bound)
|
||||
|
||||
|
||||
def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
|
||||
num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
|
||||
|
||||
|
@ -186,7 +189,6 @@ class TinyIndex(Generic[T]):
|
|||
except ZstdError:
|
||||
logger.exception(f"Error decompressing page data, content: {page_data}")
|
||||
return []
|
||||
# logger.debug(f"Decompressed data: {decompressed_data}")
|
||||
return json.loads(decompressed_data.decode('utf8'))
|
||||
|
||||
def store_in_page(self, page_index: int, values: list[T]):
|
||||
|
|
|
@ -15,12 +15,17 @@ Including another URLconf
|
|||
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
||||
"""
|
||||
from django.contrib import admin
|
||||
from django.urls import path
|
||||
from django.urls import path, include
|
||||
|
||||
from mwmbl.api import api_original as api, api_v1
|
||||
from mwmbl.api import api_v1
|
||||
from mwmbl.views import home_fragment, fetch_url, index
|
||||
|
||||
urlpatterns = [
|
||||
path('admin/', admin.site.urls),
|
||||
path('', api.urls),
|
||||
path('api/v1/', api_v1.urls)
|
||||
path('api/v1/', api_v1.urls),
|
||||
path('accounts/', include('allauth.urls')),
|
||||
|
||||
path('', index, name="home"),
|
||||
path('app/home/', home_fragment, name="home"),
|
||||
path('app/fetch/', fetch_url, name="fetch_url")
|
||||
]
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
import re
|
||||
|
||||
from mwmbl.indexer.index import tokenize_document
|
||||
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
|
||||
|
||||
DOMAIN_REGEX = re.compile(r".*://([^/]*)")
|
||||
|
||||
|
||||
|
@ -17,3 +20,23 @@ def get_domain(url):
|
|||
if results is None or len(results.groups()) == 0:
|
||||
raise ValueError(f"Unable to parse domain from URL {url}")
|
||||
return results.group(1)
|
||||
|
||||
|
||||
def add_term_info(document: Document, index: TinyIndex, page_index: int):
    """Rebuild ``document`` with the token that maps it to ``page_index``.

    The document is tokenized from its url, title, extract and score. The
    first token for which ``index.get_key_page_index`` equals ``page_index``
    is passed as the fifth positional argument of a new ``Document``.

    Raises:
        ValueError: if no token of the document maps to ``page_index``.
    """
    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
    for candidate in tokenized.tokens:
        # Skip tokens that belong to a different page.
        if index.get_key_page_index(candidate) != page_index:
            continue
        return Document(document.title, document.url, document.extract, document.score, candidate)
    raise ValueError("Could not find token in page index")
|
||||
|
||||
|
||||
def add_term_infos(documents: list[Document], index: TinyIndex, page_index: int):
    """Lazily yield every document with its ``term`` filled in.

    Documents whose ``term`` is already set are yielded unchanged. The others
    go through ``add_term_info``; a document for which that raises
    ``ValueError`` is dropped from the output.
    """
    for doc in documents:
        already_tagged = doc.term is not None
        if already_tagged:
            yield doc
        else:
            try:
                yield add_term_info(doc, index, page_index)
            except ValueError:
                # No token of this document maps to the page: leave it out.
                pass
|
||||
|
|
129
mwmbl/views.py
Normal file
129
mwmbl/views.py
Normal file
|
@ -0,0 +1,129 @@
|
|||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from itertools import groupby
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
import justext
|
||||
import requests
|
||||
from django.contrib.auth.decorators import login_required
|
||||
from django.shortcuts import render
|
||||
from django_htmx.http import push_url
|
||||
|
||||
from mwmbl.format import format_result
|
||||
from mwmbl.models import UserCuration, MwmblUser
|
||||
from mwmbl.search_setup import ranker
|
||||
|
||||
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
|
||||
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
|
||||
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
|
||||
|
||||
from mwmbl.settings import NUM_EXTRACT_CHARS
|
||||
from mwmbl.tinysearchengine.indexer import Document
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
                     encoding=None, default_encoding=DEFAULT_ENCODING,
                     enc_errors=DEFAULT_ENC_ERRORS):
    """
    Converts an HTML page into classified paragraphs and extracts its title.

    Returns a ``(paragraphs, title)`` tuple. ``paragraphs`` is the list built
    by ``ParagraphMaker.make_paragraphs`` after classification (instances of
    ``justext.paragraph.Paragraph``). ``title`` is the text of the first
    ``<title>`` element, or ``None`` when the page has no such element.
    """
    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)

    # The title is read from the DOM before the preprocessor runs on it.
    title_nodes = dom.xpath("//title")
    if len(title_nodes) > 0:
        title = title_nodes[0].text
    else:
        title = None

    paragraphs = ParagraphMaker.make_paragraphs(preprocessor(dom))

    classify_paragraphs(
        paragraphs, stoplist, length_low, length_high,
        stopwords_low, stopwords_high, max_link_density, no_headings,
    )
    revise_paragraph_classification(paragraphs, max_heading_distance)

    return paragraphs, title
|
||||
|
||||
|
||||
def index(request):
    """Render ``index.html`` with the results or activity for this request.

    The template also receives the requesting user and the footer links from
    the Django settings.
    """
    activity, query, results = _get_results_and_activity(request)
    context = {
        "results": results,
        "query": query,
        "user": request.user,
        "activity": activity,
        "footer_links": settings.FOOTER_LINKS,
    }
    return render(request, "index.html", context)
|
||||
|
||||
|
||||
def home_fragment(request):
    """Render the ``home.html`` fragment and set the URL htmx should display.

    The response carries an ``HX-Replace-Url`` header built from the URL the
    htmx request came from, with its query string replaced by the current
    search query (or removed when there is no query).

    When the request has no current URL (``request.htmx.current_url`` is
    ``None``), the fragment is returned without the header.
    """
    # Local import: the file's module-level imports do not include urlencode.
    from urllib.parse import urlencode

    activity, query, results = _get_results_and_activity(request)
    response = render(request, "home.html", {
        "results": results,
        "query": query,
        "activity": activity,
    })

    current_url = request.htmx.current_url
    if current_url is None:
        # Nothing to rewrite; previously this crashed on `"?" in None`.
        return response

    # Drop any existing query string from the browser URL.
    stripped_url = current_url.partition("?")[0]

    # `query` is None when the request has no "q" parameter, so test its
    # truthiness rather than its length. Encode it so that spaces, "&", "#"
    # and non-ASCII characters produce a valid URL in the header.
    query_string = "?" + urlencode({"q": query}) if query else ""

    # Set the htmx replace header
    response["HX-Replace-Url"] = stripped_url + query_string
    return response
|
||||
|
||||
|
||||
@dataclass
class Activity:
    """Summary of one user's curations of a single search URL."""

    # Username of the curating user. The visible construction site passes
    # `x.user.username` (a string), not a user model instance.
    user: str
    # Number of curations collected into this entry.
    num_curations: int
    # Latest timestamp among those curations.
    timestamp: datetime
    # Value of the "q" parameter parsed from `url` ("" when absent).
    query: str
    # The curated URL the entry was grouped on.
    url: str
|
||||
|
||||
|
||||
def _get_results_and_activity(request):
    """Return ``(activity, query, results)`` for a search or home request.

    With a non-empty ``q`` parameter the ranker is searched and ``activity``
    is ``None``. Otherwise ``results`` is ``None`` and ``activity`` is a list
    of ``Activity`` entries built from the 100 most recent curations, one per
    (username, url) pair, newest first.

    ``query`` is returned exactly as read from the request, so it is ``None``
    when the parameter is missing.
    """
    query = request.GET.get("q")
    if query:
        return None, query, ranker.search(query)

    curations = UserCuration.objects.order_by("-timestamp")[:100]

    # Collect curations per (username, url). A dict is used instead of
    # sort + itertools.groupby because groupby only merges adjacent items:
    # sorting by username alone left one user's curations of the same URL
    # split into several entries when other URLs fell in between.
    grouped = {}
    for curation in curations:
        key = (curation.user.username, curation.url)
        grouped.setdefault(key, []).append(curation)

    unsorted_activity = []
    for (user, url), group in grouped.items():
        parsed_url_query = parse_qs(urlparse(url).query)
        activity_query = parsed_url_query.get("q", [""])[0]
        unsorted_activity.append(Activity(
            user=user,
            num_curations=len(group),
            timestamp=max(item.timestamp for item in group),
            query=activity_query,
            url=url,
        ))

    activity = sorted(unsorted_activity, key=lambda a: a.timestamp, reverse=True)
    return activity, query, None
|
||||
|
||||
|
||||
def fetch_url(request):
    """Fetch a page and render it as a single search result.

    Reads the ``url`` and ``query`` GET parameters, downloads the URL,
    extracts the paragraphs that jusText classifies as ``good`` and renders
    ``result.html`` with the formatted result. The extract is cut to
    ``NUM_EXTRACT_CHARS`` characters, ending in an ellipsis when truncated.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            exceeds the timeout (not handled here).
    """
    # Without a timeout, requests waits indefinitely on a slow or silent
    # host and ties up the worker handling this view.
    timeout_seconds = 10

    url = request.GET["url"]
    query = request.GET["query"]

    # NOTE(review): `url` comes straight from the client, so this is a
    # server-side request to an arbitrary address. Consider restricting the
    # allowed schemes/hosts and who may call this view.
    response = requests.get(url, timeout=timeout_seconds)

    paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
    good_paragraphs = [p for p in paragraphs if p.class_type == 'good']

    extract = ' '.join(p.text for p in good_paragraphs)
    if len(extract) > NUM_EXTRACT_CHARS:
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'

    result = Document(title=title, url=url, extract=extract, score=0.0)
    return render(request, "result.html", {
        "result": format_result(result, query),
    })
|
|
@ -100,17 +100,12 @@ server {
|
|||
|
||||
## Static file hosting
|
||||
location /static/ {
|
||||
alias /var/lib/dokku/data/storage/mwmbl/;
|
||||
}
|
||||
|
||||
## Root and stats served statically
|
||||
location = / {
|
||||
root /var/lib/dokku/data/storage/mwmbl;
|
||||
try_files /index.html =404;
|
||||
alias /var/lib/dokku/data/storage/mwmbl-beta/;
|
||||
}
|
||||
|
||||
## Stats served statically
|
||||
location ~ ^\/stats\/?$ {
|
||||
root /var/lib/dokku/data/storage/mwmbl;
|
||||
root /var/lib/dokku/data/storage/mwmbl-beta;
|
||||
try_files /stats/index.html =404;
|
||||
}
|
||||
|
||||
|
|
294
poetry.lock
generated
294
poetry.lock
generated
|
@ -1,10 +1,9 @@
|
|||
# This file is automatically @generated by Poetry and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "anyio"
|
||||
version = "3.7.1"
|
||||
description = "High level compatibility layer for multiple asynchronous event loop implementations"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -26,7 +25,6 @@ trio = ["trio (<0.22)"]
|
|||
name = "asgiref"
|
||||
version = "3.7.2"
|
||||
description = "ASGI specs, helper code, and adapters"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -44,7 +42,6 @@ tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
|
|||
name = "async-timeout"
|
||||
version = "4.0.3"
|
||||
description = "Timeout context manager for asyncio programs"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -56,7 +53,6 @@ files = [
|
|||
name = "attrs"
|
||||
version = "23.1.0"
|
||||
description = "Classes Without Boilerplate"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -75,7 +71,6 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte
|
|||
name = "beautifulsoup4"
|
||||
version = "4.10.0"
|
||||
description = "Screen-scraping library"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">3.0.0"
|
||||
files = [
|
||||
|
@ -94,7 +89,6 @@ lxml = ["lxml"]
|
|||
name = "blis"
|
||||
version = "0.7.11"
|
||||
description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -141,7 +135,6 @@ numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""}
|
|||
name = "boto3"
|
||||
version = "1.28.62"
|
||||
description = "The AWS SDK for Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">= 3.7"
|
||||
files = [
|
||||
|
@ -161,7 +154,6 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
|
|||
name = "botocore"
|
||||
version = "1.31.62"
|
||||
description = "Low-level, data-driven core of boto 3."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">= 3.7"
|
||||
files = [
|
||||
|
@ -181,7 +173,6 @@ crt = ["awscrt (==0.16.26)"]
|
|||
name = "catalogue"
|
||||
version = "2.0.10"
|
||||
description = "Super lightweight function registries for your library"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -193,7 +184,6 @@ files = [
|
|||
name = "cattrs"
|
||||
version = "23.1.2"
|
||||
description = "Composable complex class support for attrs and dataclasses."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -219,7 +209,6 @@ ujson = ["ujson (>=5.4.0,<6.0.0)"]
|
|||
name = "certifi"
|
||||
version = "2023.7.22"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -231,7 +220,6 @@ files = [
|
|||
name = "cffi"
|
||||
version = "1.16.0"
|
||||
description = "Foreign Function Interface for Python calling C code."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -296,7 +284,6 @@ pycparser = "*"
|
|||
name = "charset-normalizer"
|
||||
version = "3.3.0"
|
||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
files = [
|
||||
|
@ -396,7 +383,6 @@ files = [
|
|||
name = "click"
|
||||
version = "8.1.7"
|
||||
description = "Composable command line interface toolkit"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -411,7 +397,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
|||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
description = "Cross-platform colored terminal text."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
files = [
|
||||
|
@ -419,11 +404,55 @@ files = [
|
|||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "41.0.4"
|
||||
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-win32.whl", hash = "sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd"},
|
||||
{file = "cryptography-41.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d"},
|
||||
{file = "cryptography-41.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67"},
|
||||
{file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e"},
|
||||
{file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829"},
|
||||
{file = "cryptography-41.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca"},
|
||||
{file = "cryptography-41.0.4-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d"},
|
||||
{file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac"},
|
||||
{file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9"},
|
||||
{file = "cryptography-41.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f"},
|
||||
{file = "cryptography-41.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91"},
|
||||
{file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8"},
|
||||
{file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6"},
|
||||
{file = "cryptography-41.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311"},
|
||||
{file = "cryptography-41.0.4.tar.gz", hash = "sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cffi = ">=1.12"
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
|
||||
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
|
||||
nox = ["nox"]
|
||||
pep8test = ["black", "check-sdist", "mypy", "ruff"]
|
||||
sdist = ["build"]
|
||||
ssh = ["bcrypt (>=3.1.5)"]
|
||||
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
|
||||
test-randomorder = ["pytest-randomly"]
|
||||
|
||||
[[package]]
|
||||
name = "cymem"
|
||||
version = "2.0.8"
|
||||
description = "Manage calls to calloc/free through Cython"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -462,11 +491,36 @@ files = [
|
|||
{file = "cymem-2.0.8.tar.gz", hash = "sha256:8fb09d222e21dcf1c7e907dc85cf74501d4cea6c4ed4ac6c9e016f98fb59cbbf"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "defusedxml"
|
||||
version = "0.7.1"
|
||||
description = "XML bomb protection for Python stdlib modules"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
files = [
|
||||
{file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"},
|
||||
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dj-database-url"
|
||||
version = "2.1.0"
|
||||
description = "Use Database URLs in your Django Application."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "dj-database-url-2.1.0.tar.gz", hash = "sha256:f2042cefe1086e539c9da39fad5ad7f61173bf79665e69bf7e4de55fa88b135f"},
|
||||
{file = "dj_database_url-2.1.0-py3-none-any.whl", hash = "sha256:04bc34b248d4c21aaa13e4ab419ae6575ef5f10f3df735ce7da97722caa356e0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Django = ">=3.2"
|
||||
typing-extensions = ">=3.10.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "django"
|
||||
version = "4.2.6"
|
||||
description = "A high-level Python web framework that encourages rapid development and clean, pragmatic design."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -483,11 +537,45 @@ tzdata = {version = "*", markers = "sys_platform == \"win32\""}
|
|||
argon2 = ["argon2-cffi (>=19.1.0)"]
|
||||
bcrypt = ["bcrypt"]
|
||||
|
||||
[[package]]
|
||||
name = "django-allauth"
|
||||
version = "0.57.0"
|
||||
description = "Integrated set of Django applications addressing authentication, registration, account management as well as 3rd party (social) account authentication."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "django-allauth-0.57.0.tar.gz", hash = "sha256:a095ef0db7de305d9175772c78e765ebd5fceb004ae61c1383d7fc1af0f7c5b1"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Django = ">=3.2"
|
||||
pyjwt = {version = ">=1.7", extras = ["crypto"]}
|
||||
python3-openid = ">=3.0.8"
|
||||
requests = ">=2.0.0"
|
||||
requests-oauthlib = ">=0.3.0"
|
||||
|
||||
[package.extras]
|
||||
mfa = ["qrcode (>=7.0.0)"]
|
||||
saml = ["python3-saml (>=1.15.0,<2.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "django-htmx"
|
||||
version = "1.17.0"
|
||||
description = "Extensions for using Django with htmx."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "django_htmx-1.17.0-py3-none-any.whl", hash = "sha256:070a37092b88a42cd7af26c1b65f63c4529bae276710fd16137dc934938b44f2"},
|
||||
{file = "django_htmx-1.17.0.tar.gz", hash = "sha256:2ef0d19db41c6152881e782673cd2cd1755a7fd6784f8b4f2279fb18dc03d2c2"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Django = ">=3.2"
|
||||
|
||||
[[package]]
|
||||
name = "django-ninja"
|
||||
version = "0.22.2"
|
||||
description = "Django Ninja - Fast Django REST framework"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -504,11 +592,27 @@ dev = ["pre-commit"]
|
|||
doc = ["markdown-include", "mkdocs", "mkdocs-material", "mkdocstrings"]
|
||||
test = ["black", "django-stubs", "flake8", "isort", "mypy (==0.931)", "psycopg2-binary", "pytest", "pytest-asyncio", "pytest-cov", "pytest-django"]
|
||||
|
||||
[[package]]
|
||||
name = "django-vite"
|
||||
version = "2.1.3"
|
||||
description = "Integration of ViteJS in a Django project."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "django-vite-2.1.3.tar.gz", hash = "sha256:c59b3bbd85501bc1faf63c500df66542abed2951cfa10dfbf8be8ecf229f7652"},
|
||||
{file = "django_vite-2.1.3-py3-none-any.whl", hash = "sha256:97984ac495910b7b71039228ccddff52d132231fa6612d3d31c6c228c95b0217"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Django = ">=1.11"
|
||||
|
||||
[package.extras]
|
||||
dev = ["black", "flake8"]
|
||||
|
||||
[[package]]
|
||||
name = "exceptiongroup"
|
||||
version = "1.1.3"
|
||||
description = "Backport of PEP 654 (exception groups)"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -523,7 +627,6 @@ test = ["pytest (>=6)"]
|
|||
name = "fastapi"
|
||||
version = "0.70.1"
|
||||
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6.1"
|
||||
files = [
|
||||
|
@ -545,7 +648,6 @@ test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==21.9b0)", "databases[sqlite] (
|
|||
name = "h11"
|
||||
version = "0.14.0"
|
||||
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -557,7 +659,6 @@ files = [
|
|||
name = "hiredis"
|
||||
version = "2.2.3"
|
||||
description = "Python wrapper for hiredis"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -656,7 +757,6 @@ files = [
|
|||
name = "idna"
|
||||
version = "3.3"
|
||||
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
|
@ -668,7 +768,6 @@ files = [
|
|||
name = "iniconfig"
|
||||
version = "2.0.0"
|
||||
description = "brain-dead simple config-ini parsing"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -680,7 +779,6 @@ files = [
|
|||
name = "jinja2"
|
||||
version = "3.1.2"
|
||||
description = "A very fast and expressive template engine."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -698,7 +796,6 @@ i18n = ["Babel (>=2.7)"]
|
|||
name = "jmespath"
|
||||
version = "1.0.1"
|
||||
description = "JSON Matching Expressions"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -710,7 +807,6 @@ files = [
|
|||
name = "joblib"
|
||||
version = "1.3.2"
|
||||
description = "Lightweight pipelining with Python functions"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -722,7 +818,6 @@ files = [
|
|||
name = "justext"
|
||||
version = "3.0.0"
|
||||
description = "Heuristic based boilerplate removal tool"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -737,7 +832,6 @@ lxml = ">=4.4.2"
|
|||
name = "langcodes"
|
||||
version = "3.3.0"
|
||||
description = "Tools for labeling human languages with IETF language tags"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -752,7 +846,6 @@ data = ["language-data (>=1.1,<2.0)"]
|
|||
name = "langdetect"
|
||||
version = "1.0.9"
|
||||
description = "Language detection library ported from Google's language-detection."
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -767,7 +860,6 @@ six = "*"
|
|||
name = "levenshtein"
|
||||
version = "0.16.0"
|
||||
description = "Python extension for computing string edit distances and similarities."
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
|
@ -832,7 +924,6 @@ rapidfuzz = ">=1.8.2,<1.9"
|
|||
name = "lxml"
|
||||
version = "4.6.4"
|
||||
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*"
|
||||
files = [
|
||||
|
@ -908,7 +999,6 @@ source = ["Cython (>=0.29.7)"]
|
|||
name = "markupsafe"
|
||||
version = "2.1.3"
|
||||
description = "Safely add untrusted strings to HTML/XML markup."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -978,7 +1068,6 @@ files = [
|
|||
name = "mmh3"
|
||||
version = "3.1.0"
|
||||
description = "Python wrapper for MurmurHash (MurmurHash3), a set of fast and robust hash functions."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -1023,7 +1112,6 @@ files = [
|
|||
name = "murmurhash"
|
||||
version = "1.0.10"
|
||||
description = "Cython bindings for MurmurHash"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -1066,7 +1154,6 @@ files = [
|
|||
name = "numpy"
|
||||
version = "1.26.0"
|
||||
description = "Fundamental package for array computing in Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "<3.13,>=3.9"
|
||||
files = [
|
||||
|
@ -1104,11 +1191,26 @@ files = [
|
|||
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "oauthlib"
|
||||
version = "3.2.2"
|
||||
description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"},
|
||||
{file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
rsa = ["cryptography (>=3.0.0)"]
|
||||
signals = ["blinker (>=1.4.0)"]
|
||||
signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "23.2"
|
||||
description = "Core utilities for Python packages"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -1120,7 +1222,6 @@ files = [
|
|||
name = "pandas"
|
||||
version = "1.5.3"
|
||||
description = "Powerful data structures for data analysis, time series, and statistics"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -1165,7 +1266,6 @@ test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"]
|
|||
name = "pathy"
|
||||
version = "0.10.2"
|
||||
description = "pathlib.Path subclasses for local and cloud bucket storage"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">= 3.6"
|
||||
files = [
|
||||
|
@ -1188,7 +1288,6 @@ test = ["mock", "pytest", "pytest-coverage", "typer-cli"]
|
|||
name = "platformdirs"
|
||||
version = "3.11.0"
|
||||
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -1204,7 +1303,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co
|
|||
name = "pluggy"
|
||||
version = "1.3.0"
|
||||
description = "plugin and hook calling mechanisms for python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -1220,7 +1318,6 @@ testing = ["pytest", "pytest-benchmark"]
|
|||
name = "preshed"
|
||||
version = "3.0.9"
|
||||
description = "Cython hash table that trusts the keys are pre-hashed"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -1267,7 +1364,6 @@ murmurhash = ">=0.28.0,<1.1.0"
|
|||
name = "psycopg2-binary"
|
||||
version = "2.9.9"
|
||||
description = "psycopg2 - Python-PostgreSQL Database Adapter"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -1297,6 +1393,7 @@ files = [
|
|||
{file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
|
||||
{file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
|
||||
|
@ -1305,6 +1402,8 @@ files = [
|
|||
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"},
|
||||
{file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
|
||||
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
|
||||
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
|
||||
|
@ -1346,7 +1445,6 @@ files = [
|
|||
name = "py4j"
|
||||
version = "0.10.9.2"
|
||||
description = "Enables Python programs to dynamically access arbitrary Java objects"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -1358,7 +1456,6 @@ files = [
|
|||
name = "pyarrow"
|
||||
version = "6.0.0"
|
||||
description = "Python library for Apache Arrow"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -1407,7 +1504,6 @@ numpy = ">=1.16.6"
|
|||
name = "pycparser"
|
||||
version = "2.21"
|
||||
description = "C parser in Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
files = [
|
||||
|
@ -1419,7 +1515,6 @@ files = [
|
|||
name = "pydantic"
|
||||
version = "1.8.2"
|
||||
description = "Data validation and settings management using python 3.6 type hinting"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6.1"
|
||||
files = [
|
||||
|
@ -1454,11 +1549,30 @@ typing-extensions = ">=3.7.4.3"
|
|||
dotenv = ["python-dotenv (>=0.10.4)"]
|
||||
email = ["email-validator (>=1.0.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "pyjwt"
|
||||
version = "2.8.0"
|
||||
description = "JSON Web Token implementation in Python"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"},
|
||||
{file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"crypto\""}
|
||||
|
||||
[package.extras]
|
||||
crypto = ["cryptography (>=3.4.0)"]
|
||||
dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"]
|
||||
docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"]
|
||||
tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pyspark"
|
||||
version = "3.2.0"
|
||||
description = "Apache Spark Python API"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -1478,7 +1592,6 @@ sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]
|
|||
name = "pytest"
|
||||
version = "7.4.2"
|
||||
description = "pytest: simple powerful testing with Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -1501,7 +1614,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no
|
|||
name = "pytest-mock"
|
||||
version = "3.11.1"
|
||||
description = "Thin-wrapper around the mock package for easier use with pytest"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -1519,7 +1631,6 @@ dev = ["pre-commit", "pytest-asyncio", "tox"]
|
|||
name = "python-dateutil"
|
||||
version = "2.8.2"
|
||||
description = "Extensions to the standard Python datetime module"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
|
||||
files = [
|
||||
|
@ -1530,11 +1641,28 @@ files = [
|
|||
[package.dependencies]
|
||||
six = ">=1.5"
|
||||
|
||||
[[package]]
|
||||
name = "python3-openid"
|
||||
version = "3.2.0"
|
||||
description = "OpenID support for modern servers and consumers."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "python3-openid-3.2.0.tar.gz", hash = "sha256:33fbf6928f401e0b790151ed2b5290b02545e8775f982485205a066f874aaeaf"},
|
||||
{file = "python3_openid-3.2.0-py3-none-any.whl", hash = "sha256:6626f771e0417486701e0b4daff762e7212e820ca5b29fcc0d05f6f8736dfa6b"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
defusedxml = "*"
|
||||
|
||||
[package.extras]
|
||||
mysql = ["mysql-connector-python"]
|
||||
postgresql = ["psycopg2"]
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2023.3.post1"
|
||||
description = "World timezone definitions, modern and historical"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -1546,7 +1674,6 @@ files = [
|
|||
name = "pyyaml"
|
||||
version = "6.0"
|
||||
description = "YAML parser and emitter for Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -1596,7 +1723,6 @@ files = [
|
|||
name = "rapidfuzz"
|
||||
version = "1.8.3"
|
||||
description = "rapid fuzzy string matching"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=2.7"
|
||||
files = [
|
||||
|
@ -1663,7 +1789,6 @@ full = ["numpy"]
|
|||
name = "redis"
|
||||
version = "5.0.1"
|
||||
description = "Python client for Redis database and key-value store"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -1683,7 +1808,6 @@ ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"
|
|||
name = "requests"
|
||||
version = "2.31.0"
|
||||
description = "Python HTTP for Humans."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -1705,7 +1829,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
|||
name = "requests-cache"
|
||||
version = "1.1.0"
|
||||
description = "A persistent cache for python requests"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7,<4.0"
|
||||
files = [
|
||||
|
@ -1732,11 +1855,28 @@ redis = ["redis (>=3)"]
|
|||
security = ["itsdangerous (>=2.0)"]
|
||||
yaml = ["pyyaml (>=5.4)"]
|
||||
|
||||
[[package]]
|
||||
name = "requests-oauthlib"
|
||||
version = "1.3.1"
|
||||
description = "OAuthlib authentication support for Requests."
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
files = [
|
||||
{file = "requests-oauthlib-1.3.1.tar.gz", hash = "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"},
|
||||
{file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
oauthlib = ">=3.0.0"
|
||||
requests = ">=2.0.0"
|
||||
|
||||
[package.extras]
|
||||
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "s3transfer"
|
||||
version = "0.7.0"
|
||||
description = "An Amazon S3 Transfer Manager"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">= 3.7"
|
||||
files = [
|
||||
|
@ -1754,7 +1894,6 @@ crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]
|
|||
name = "scikit-learn"
|
||||
version = "1.3.1"
|
||||
description = "A set of python modules for machine learning and data mining"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -1769,6 +1908,11 @@ files = [
|
|||
{file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f66eddfda9d45dd6cadcd706b65669ce1df84b8549875691b1f403730bdef217"},
|
||||
{file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6448c37741145b241eeac617028ba6ec2119e1339b1385c9720dae31367f2be"},
|
||||
{file = "scikit_learn-1.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:c413c2c850241998168bbb3bd1bb59ff03b1195a53864f0b80ab092071af6028"},
|
||||
{file = "scikit_learn-1.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ef540e09873e31569bc8b02c8a9f745ee04d8e1263255a15c9969f6f5caa627f"},
|
||||
{file = "scikit_learn-1.3.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9147a3a4df4d401e618713880be023e36109c85d8569b3bf5377e6cd3fecdeac"},
|
||||
{file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2cd3634695ad192bf71645702b3df498bd1e246fc2d529effdb45a06ab028b4"},
|
||||
{file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c275a06c5190c5ce00af0acbb61c06374087949f643ef32d355ece12c4db043"},
|
||||
{file = "scikit_learn-1.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:0e1aa8f206d0de814b81b41d60c1ce31f7f2c7354597af38fae46d9c47c45122"},
|
||||
{file = "scikit_learn-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:52b77cc08bd555969ec5150788ed50276f5ef83abb72e6f469c5b91a0009bbca"},
|
||||
{file = "scikit_learn-1.3.1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a683394bc3f80b7c312c27f9b14ebea7766b1f0a34faf1a2e9158d80e860ec26"},
|
||||
{file = "scikit_learn-1.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15d964d9eb181c79c190d3dbc2fff7338786bf017e9039571418a1d53dab236"},
|
||||
|
@ -1797,7 +1941,6 @@ tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (
|
|||
name = "scipy"
|
||||
version = "1.11.3"
|
||||
description = "Fundamental algorithms for scientific computing in Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "<3.13,>=3.9"
|
||||
files = [
|
||||
|
@ -1840,7 +1983,6 @@ test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeo
|
|||
name = "setuptools"
|
||||
version = "68.2.2"
|
||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -1857,7 +1999,6 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar
|
|||
name = "six"
|
||||
version = "1.16.0"
|
||||
description = "Python 2 and 3 compatibility utilities"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||
files = [
|
||||
|
@ -1869,7 +2010,6 @@ files = [
|
|||
name = "smart-open"
|
||||
version = "6.4.0"
|
||||
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6,<4.0"
|
||||
files = [
|
||||
|
@ -1891,7 +2031,6 @@ webhdfs = ["requests"]
|
|||
name = "sniffio"
|
||||
version = "1.3.0"
|
||||
description = "Sniff out which async library your code is running under"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -1903,7 +2042,6 @@ files = [
|
|||
name = "soupsieve"
|
||||
version = "2.5"
|
||||
description = "A modern CSS selector implementation for Beautiful Soup."
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -1915,7 +2053,6 @@ files = [
|
|||
name = "spacy"
|
||||
version = "3.2.1"
|
||||
description = "Industrial-strength Natural Language Processing (NLP) in Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -1985,7 +2122,6 @@ transformers = ["spacy-transformers (>=1.1.2,<1.2.0)"]
|
|||
name = "spacy-legacy"
|
||||
version = "3.0.12"
|
||||
description = "Legacy registered functions for spaCy backwards compatibility"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -1997,7 +2133,6 @@ files = [
|
|||
name = "spacy-loggers"
|
||||
version = "1.0.5"
|
||||
description = "Logging utilities for SpaCy"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -2009,7 +2144,6 @@ files = [
|
|||
name = "sqlparse"
|
||||
version = "0.4.4"
|
||||
description = "A non-validating SQL parser."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
|
@ -2026,7 +2160,6 @@ test = ["pytest", "pytest-cov"]
|
|||
name = "srsly"
|
||||
version = "2.4.8"
|
||||
description = "Modern high-performance serialization utilities for Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -2073,7 +2206,6 @@ catalogue = ">=2.0.3,<2.1.0"
|
|||
name = "starlette"
|
||||
version = "0.16.0"
|
||||
description = "The little ASGI library that shines."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -2091,7 +2223,6 @@ full = ["graphene", "itsdangerous", "jinja2", "python-multipart", "pyyaml", "req
|
|||
name = "thinc"
|
||||
version = "8.0.17"
|
||||
description = "A refreshing functional take on deep learning, compatible with your favorite libraries"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -2157,7 +2288,6 @@ torch = ["torch (>=1.6.0)"]
|
|||
name = "threadpoolctl"
|
||||
version = "3.2.0"
|
||||
description = "threadpoolctl"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -2169,7 +2299,6 @@ files = [
|
|||
name = "tomli"
|
||||
version = "2.0.1"
|
||||
description = "A lil' TOML parser"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -2181,7 +2310,6 @@ files = [
|
|||
name = "tqdm"
|
||||
version = "4.66.1"
|
||||
description = "Fast, Extensible Progress Meter"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -2202,7 +2330,6 @@ telegram = ["requests"]
|
|||
name = "typer"
|
||||
version = "0.4.2"
|
||||
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -2223,7 +2350,6 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.
|
|||
name = "typing-extensions"
|
||||
version = "4.8.0"
|
||||
description = "Backported and Experimental Type Hints for Python 3.8+"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
|
@ -2235,7 +2361,6 @@ files = [
|
|||
name = "tzdata"
|
||||
version = "2023.3"
|
||||
description = "Provider of IANA time zone data"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2"
|
||||
files = [
|
||||
|
@ -2247,7 +2372,6 @@ files = [
|
|||
name = "ujson"
|
||||
version = "4.3.0"
|
||||
description = "Ultra fast JSON encoder and decoder for Python"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -2301,7 +2425,6 @@ files = [
|
|||
name = "url-normalize"
|
||||
version = "1.4.3"
|
||||
description = "URL normalization for Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
||||
files = [
|
||||
|
@ -2316,7 +2439,6 @@ six = "*"
|
|||
name = "urllib3"
|
||||
version = "2.0.6"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
|
@ -2334,7 +2456,6 @@ zstd = ["zstandard (>=0.18.0)"]
|
|||
name = "uvicorn"
|
||||
version = "0.16.0"
|
||||
description = "The lightning-fast ASGI server."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -2354,7 +2475,6 @@ standard = ["PyYAML (>=5.1)", "colorama (>=0.4)", "httptools (>=0.2.0,<0.4.0)",
|
|||
name = "warcio"
|
||||
version = "1.7.4"
|
||||
description = "Streaming WARC (and ARC) IO library"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -2369,7 +2489,6 @@ six = "*"
|
|||
name = "wasabi"
|
||||
version = "0.10.1"
|
||||
description = "A lightweight console printing and formatting toolkit"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
|
@ -2381,7 +2500,6 @@ files = [
|
|||
name = "zstandard"
|
||||
version = "0.16.0"
|
||||
description = "Zstandard bindings for Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
|
@ -2438,9 +2556,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
|
|||
cffi = ["cffi (>=1.11)"]
|
||||
|
||||
[extras]
|
||||
indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
|
||||
indexer = ["Levenshtein", "beautifulsoup4", "idna", "langdetect", "lxml", "pyarrow", "pyspark", "ujson", "warcio"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.10,<3.11"
|
||||
content-hash = "fe5f238c57ec2d09acb6bdf8f46f33c7bbe499f68a7e34ab7bca1336e0ae881c"
|
||||
content-hash = "4e4233221e9f3bd317c0693584612898b7b736f45983b7f3f5bad4d43e567353"
|
||||
|
|
|
@ -37,6 +37,10 @@ django = "^4.2.4"
|
|||
django-ninja = "^0.22.2"
|
||||
requests-cache = "^1.1.0"
|
||||
redis = {extras = ["hiredis"], version = "^5.0.1"}
|
||||
django-allauth = "^0.57.0"
|
||||
dj-database-url = "^2.1.0"
|
||||
django-htmx = "^1.17.0"
|
||||
django-vite = "^2.1.3"
|
||||
|
||||
[tool.poetry.extras]
|
||||
indexer = [
|
||||
|
|
Loading…
Reference in a new issue