Merge pull request #115 from mwmbl/django-rewrite

Django rewrite
This commit is contained in:
Daoud Clarke 2023-10-10 16:25:36 +01:00 committed by GitHub
commit 213bdaa365
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
35 changed files with 346 additions and 167 deletions

View file

@ -46,5 +46,8 @@ VOLUME ["/data"]
EXPOSE 5000
# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev
# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"]
CMD ["/venv/bin/mwmbl-tinysearchengine"]

View file

@ -7,8 +7,8 @@ import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
from mwmbl.crawler.batch import HashedBatch
from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
from mwmbl.crawler import HashedBatch
from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR
# TODO: remove this line - temporary override

View file

@ -1,6 +1,6 @@
import json
from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
from mwmbl.hn_top_domains_filtered import DOMAINS

View file

@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
"""
import sqlite3
from mwmbl.indexer.paths import URLS_PATH
from mwmbl.indexer import URLS_PATH
from mwmbl.app import get_config_and_index

View file

@ -7,16 +7,15 @@ import json
import logging
import os
import sys
from pathlib import Path
from datetime import datetime
import spacy
from mwmbl.crawler.batch import HashedBatch
from mwmbl.crawler import HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.index_batches import index_batches
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.indexer import index_batches
from mwmbl.tinysearchengine import TinyIndex, Document
LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
NUM_BATCHES = 10000

View file

@ -1,7 +1,7 @@
"""
Count unique URLs in the index.
"""
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine import TinyIndex, Document
def run():

View file

@ -5,9 +5,9 @@ import numpy as np
import spacy
from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.indexer import tokenize_document
from mwmbl.indexer import INDEX_PATH
from mwmbl.tinysearchengine import TinyIndex, Document
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

View file

@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
import requests
from mwmbl.indexer.paths import CRAWL_GLOB
from mwmbl.indexer import CRAWL_GLOB
API_ENDPOINT = "http://95.216.215.29/batches/historical"

View file

@ -2,9 +2,9 @@ import logging
import sys
from itertools import islice
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.indexer import INDEX_PATH
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine import TinyIndex, Document
from mwmbl.tinysearchengine.rank import HeuristicRanker
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

View file

@ -3,7 +3,7 @@ Send a batch to a running instance.
"""
import requests
from mwmbl.crawler.batch import Batch, Item, ItemContent
from mwmbl.crawler import Batch, Item, ItemContent
URL = 'http://localhost:5000/crawler/batches/'

View file

@ -4,7 +4,7 @@ from datetime import datetime
from pathlib import Path
from queue import Queue
from mwmbl.indexer.update_urls import record_urls_in_database
from mwmbl.indexer import record_urls_in_database
def run_update_urls_on_fixed_batches():

22
manage.py Executable file
View file

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
    """Run administrative tasks.

    Defaults DJANGO_SETTINGS_MODULE to the project's development settings
    (``mwmbl.settings_dev``) so ``./manage.py`` works out of the box;
    production deployments override the variable in the environment.
    """
    # Fix: the previous default pointed at 'app.settings', which does not
    # exist in this project -- the settings modules live under the `mwmbl`
    # package (mwmbl/asgi.py, mwmbl/wsgi.py and the Dockerfile all use
    # mwmbl.settings_dev as the default).
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()

31
mwmbl/api.py Normal file
View file

@ -0,0 +1,31 @@
# API wiring for the django-ninja application.
#
# This module is imported for its side effects: it opens the on-disk search
# index, builds the ranking pipeline, and mounts the search and crawler
# routers on a single NinjaAPI instance.  `api` is consumed by mwmbl/urls.py
# and `queued_batches` is shared with the background processes started in
# mwmbl/apps.py.
from multiprocessing import Queue
from pathlib import Path

from django.conf import settings
from ninja import NinjaAPI

import mwmbl.crawler.app as crawler
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine.rank import HeuristicRanker

api = NinjaAPI(version="1.0.0")

# Open the search index once at import time.
index_path = Path(settings.DATA_PATH) / INDEX_NAME
tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
# NOTE(review): the context manager is entered manually and never exited; the
# index stays open for the lifetime of the process -- confirm this is
# intentional (there is no matching __exit__ / close anywhere in this module).
tiny_index.__enter__()

completer = Completer()
ranker = HeuristicRanker(tiny_index, completer)

search_router = search.create_router(ranker)
api.add_router("/search/", search_router)

batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)

# Queue of URL batches handed out to crawler clients; also fed by the
# update_queue_continuously worker spawned in mwmbl/apps.py.
queued_batches = Queue()
crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
api.add_router("/crawler/", crawler_router)

35
mwmbl/apps.py Normal file
View file

@ -0,0 +1,35 @@
from multiprocessing import Process, Queue
from pathlib import Path
from django.apps import AppConfig
from django.conf import settings
from mwmbl.api import queued_batches
from mwmbl import background
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.url_queue import update_queue_continuously
class MwmblConfig(AppConfig):
    """Django app config that bootstraps the search index and background jobs."""

    name = "mwmbl"
    verbose_name = "Mwmbl Application"

    def ready(self):
        """Validate (or create) the on-disk index, then start workers.

        Called by Django once the app registry is fully populated.  When
        ``settings.RUN_BACKGROUND_PROCESSES`` is enabled, spawns the three
        long-running worker processes.

        NOTE(review): Django can invoke ready() more than once per process
        (autoreloader) and it also runs for management commands -- confirm
        the Process spawning is guarded appropriately in deployment.
        """
        index_path = Path(settings.DATA_PATH) / INDEX_NAME
        try:
            # Open the existing index only to validate its geometry against
            # the configured page size and page count.
            existing_index = TinyIndex(item_factory=Document, index_path=index_path)
            if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != settings.NUM_PAGES:
                raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
                                 f"({existing_index.num_pages}) do not match")
        except FileNotFoundError:
            # No index yet: create a fresh one with the configured geometry.
            print("Creating a new index")
            TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
                             page_size=PAGE_SIZE)

        if settings.RUN_BACKGROUND_PROCESSES:
            new_item_queue = Queue()
            # `queued_batches` is the shared queue defined in mwmbl.api; the
            # crawler API pops batches from it while this worker fills it.
            Process(target=background.run, args=(settings.DATA_PATH,)).start()
            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()

16
mwmbl/asgi.py Normal file
View file

@ -0,0 +1,16 @@
"""
ASGI config for app project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
application = get_asgi_application()

View file

@ -10,10 +10,11 @@ from uuid import uuid4
import boto3
import justext
import requests
from fastapi import HTTPException, APIRouter
from fastapi import HTTPException
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
from ninja import Router
from redis import Redis
from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
return paragraphs, title
def get_router(batch_cache: BatchCache, queued_batches: Queue):
router = APIRouter(prefix="/crawler", tags=["crawler"])
def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
router = Router(tags=["crawler"])
@router.on_event("startup")
async def on_startup():
with Database() as db:
url_db = URLDatabase(db.connection)
return url_db.create_tables()
# TODO: # ensure tables are created before crawler code is used:
# #
# # url_db.create_tables()
@router.get('/fetch')
def fetch_url(url: str, query: str):
def fetch_url(request, url: str, query: str):
response = requests.get(url)
paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
return format_result(result, query)
@router.post('/batches/')
def post_batch(batch: Batch):
def post_batch(request, batch: Batch):
if len(batch.items) > MAX_BATCH_SIZE:
raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
}
@router.post('/batches/new')
def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
user_id_hash = _get_user_id_hash(batch_request)
try:
urls = queued_batches.get(block=False)
@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
return urls
@router.get('/batches/{date_str}/users/{public_user_id}')
def get_batches_for_date_and_user(date_str, public_user_id):
def get_batches_for_date_and_user(request, date_str, public_user_id):
check_date_str(date_str)
check_public_user_id(public_user_id)
prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
return get_batch_ids_for_prefix(prefix)
@router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
def get_batch_from_id(date_str, public_user_id, batch_id):
def get_batch_from_id(request, date_str, public_user_id, batch_id):
url = get_batch_url(batch_id, date_str, public_user_id)
data = json.loads(gzip.decompress(requests.get(url).content))
return {
@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
'batch': data,
}
@router.get('/latest-batch', response_model=list[HashedBatch])
def get_latest_batch():
@router.get('/latest-batch')
def get_latest_batch(request) -> list[HashedBatch]:
return [] if last_batch is None else [last_batch]
@router.get('/batches/{date_str}/users')
def get_user_id_hashes_for_date(date_str: str):
def get_user_id_hashes_for_date(request, date_str: str):
check_date_str(date_str)
prefix = f'1/{VERSION}/{date_str}/1/'
return get_subfolders(prefix)
@router.get('/stats')
def get_stats() -> MwmblStats:
def get_stats(request) -> MwmblStats:
return stats_manager.get_stats()
@router.get('/')
def status():
def status(request):
return {
'status': 'ok'
}

View file

@ -1,21 +1,21 @@
from typing import Optional
from pydantic import BaseModel
from ninja import Schema
class ItemContent(BaseModel):
class ItemContent(Schema):
title: str
extract: str
links: list[str]
extra_links: Optional[list[str]]
class ItemError(BaseModel):
class ItemError(Schema):
name: str
message: Optional[str]
class Item(BaseModel):
class Item(Schema):
url: str
status: Optional[int]
timestamp: int
@ -23,16 +23,16 @@ class Item(BaseModel):
error: Optional[ItemError]
class Batch(BaseModel):
class Batch(Schema):
user_id: str
items: list[Item]
class NewBatchRequest(BaseModel):
class NewBatchRequest(Schema):
user_id: str
class HashedBatch(BaseModel):
class HashedBatch(Schema):
user_id_hash: str
timestamp: int
items: list[Item]

View file

@ -1,16 +1,13 @@
"""
Database storing info on URLs
"""
import random
from dataclasses import dataclass
from datetime import datetime, timedelta
from datetime import datetime
from enum import Enum
from logging import getLogger
from psycopg2.extras import execute_values
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.settings import CORE_DOMAINS
# Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
from mwmbl.utils import batch

View file

@ -9,7 +9,6 @@ import os
from logging import getLogger
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse
from pydantic import ValidationError

View file

@ -1,13 +1,10 @@
"""
Create a search index
"""
from collections import Counter
from typing import Iterable
from urllib.parse import unquote
import pandas as pd
from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
from mwmbl.tinysearchengine.indexer import TokenizedDocument
from mwmbl.tokenizer import tokenize, get_bigrams
DEFAULT_SCORE = 0

View file

@ -1,13 +1,10 @@
import os
import pickle
import re
from collections import defaultdict
from datetime import datetime, timezone, timedelta
from logging import getLogger
from multiprocessing import Queue
from pathlib import Path
from time import sleep
from typing import Iterable, Collection
from typing import Collection
from urllib.parse import urlparse
from requests_cache import CachedSession

View file

@ -1,96 +1,8 @@
import argparse
import logging
import sys
from multiprocessing import Process, Queue
from pathlib import Path
import uvicorn
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware
from mwmbl import background
from mwmbl.crawler import app as crawler
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.platform import user
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.tinysearchengine.rank import HeuristicRanker
from mwmbl.url_queue import update_queue_continuously
FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
def setup_args():
parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
parser.add_argument("--background", help="Enable running the background tasks to process batches",
action='store_true')
args = parser.parse_args()
return args
def run():
args = setup_args()
index_path = Path(args.data) / INDEX_NAME
try:
existing_index = TinyIndex(item_factory=Document, index_path=index_path)
if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
f"({existing_index.num_pages}) do not match")
except FileNotFoundError:
print("Creating a new index")
TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
new_item_queue = Queue()
queued_batches = Queue()
# curation_queue = Queue()
if args.background:
Process(target=background.run, args=(args.data,)).start()
Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()
completer = Completer()
with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
ranker = HeuristicRanker(tiny_index, completer)
# model = pickle.load(open(MODEL_PATH, 'rb'))
# ranker = LTRRanker(model, tiny_index, completer)
# Initialize FastApi instance
app = FastAPI()
# Try disabling since this is handled by nginx
# app.add_middleware(
# CORSMiddleware,
# allow_origins=["*"],
# allow_credentials=True,
# allow_methods=["*"],
# allow_headers=["*"],
# )
search_router = search.create_router(ranker)
app.include_router(search_router)
batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
crawler_router = crawler.get_router(batch_cache, queued_batches)
app.include_router(crawler_router)
user_router = user.create_router(index_path)
app.include_router(user_router)
# Initialize uvicorn server using global app instance and server config params
uvicorn.run(app, host="0.0.0.0", port=args.port)
uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)
if __name__ == "__main__":

View file

@ -7,7 +7,7 @@ import requests
from fastapi import APIRouter, Response
from pydantic import BaseModel
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize

125
mwmbl/settings_common.py Normal file
View file

@ -0,0 +1,125 @@
"""
Django settings for mwmbl project.
Generated by 'django-admin startproject' using Django 4.2.4.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = []
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'mwmbl',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'mwmbl.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'mwmbl.wsgi.application'
# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/
STATIC_URL = 'static/'
# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

5
mwmbl/settings_dev.py Normal file
View file

@ -0,0 +1,5 @@
# Development settings: pull in the shared base configuration and override
# only the environment-specific values.
from mwmbl.settings_common import *

# Local directory where the index and cached crawler batches are stored.
DATA_PATH = "./devdata"
# Keep the background indexing/crawling processes off for local development.
RUN_BACKGROUND_PROCESSES = False
# Small index size (in 4096-byte pages) suitable for development.
NUM_PAGES = 2560

5
mwmbl/settings_prod.py Normal file
View file

@ -0,0 +1,5 @@
# Production settings: pull in the shared base configuration and override
# only the environment-specific values.
from mwmbl.settings_common import *

# Mounted storage volume used in the production container (see Dockerfile).
DATA_PATH = "/app/storage"
# Run the background indexing/crawling processes alongside the web server.
RUN_BACKGROUND_PROCESSES = True
# Full-size index (in 4096-byte pages); matches the previous
# "--num-pages 10240000" Dockerfile argument.
NUM_PAGES = 10240000

View file

@ -6,7 +6,6 @@ from operator import itemgetter
from urllib.parse import urlparse
from mwmbl.format import format_result_with_pattern, get_query_regex
from mwmbl.platform.user import MAX_CURATED_SCORE
from mwmbl.tokenizer import tokenize, get_bigrams
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.hn_top_domains_filtered import DOMAINS

View file

@ -1,6 +1,6 @@
from logging import getLogger
from fastapi import APIRouter
from ninja import Router
from mwmbl.tinysearchengine.rank import HeuristicRanker
@ -10,15 +10,15 @@ logger = getLogger(__name__)
SCORE_THRESHOLD = 0.25
def create_router(ranker: HeuristicRanker) -> APIRouter:
router = APIRouter(prefix="/search", tags=["search"])
def create_router(ranker: HeuristicRanker) -> Router:
router = Router(tags=["search"])
@router.get("")
def search(s: str):
def search(request, s: str):
return ranker.search(s)
@router.get("/complete")
def complete(q: str):
def complete(request, q: str):
return ranker.complete(q)
return router

View file

@ -1,6 +1,5 @@
import time
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
from logging import getLogger
from multiprocessing import Queue

25
mwmbl/urls.py Normal file
View file

@ -0,0 +1,25 @@
"""
URL configuration for app project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path
from mwmbl.api import api
urlpatterns = [
path('admin/', admin.site.urls),
path('', api.urls)
]

16
mwmbl/wsgi.py Normal file
View file

@ -0,0 +1,16 @@
"""
WSGI config for app project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
application = get_wsgi_application()

View file

@ -33,6 +33,8 @@ langdetect = {version= "==1.0.9", optional = true}
pyarrow = {version= "==6.0.0", optional = true}
pyspark = {version= "==3.2.0", optional = true}
Levenshtein = {version= "==0.16.0", optional = true}
django = "^4.2.4"
django-ninja = "^0.22.2"
requests-cache = "^1.1.0"
redis = {extras = ["hiredis"], version = "^5.0.1"}

View file

@ -1,5 +1,3 @@
import mwmbl.tinysearchengine.completer
import pytest
import pandas as pd
def mockCompleterData(mocker, data):
@ -16,7 +14,7 @@ def test_correctCompletions(mocker):
[3, 'buildings', 1]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
completion = completer.complete('build')
assert ['build', 'builder', 'buildings'] == completion
@ -29,7 +27,7 @@ def test_correctSortOrder(mocker):
[3, 'buildings', 3]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
completion = completer.complete('build')
assert ['build', 'buildings', 'builder'] == completion
@ -42,7 +40,7 @@ def test_noCompletions(mocker):
[3, 'buildings', 1]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
completion = completer.complete('test')
assert [] == completion
@ -55,7 +53,7 @@ def test_singleCompletions(mocker):
[3, 'buildings', 1]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
completion = completer.complete('announce')
assert ['announce'] == completion
@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker):
[3, 'buildings', 1]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
for i in range(3):
print(f"iteration: {i}")
completion = completer.complete('build')

View file

@ -1,9 +1,9 @@
from pathlib import Path
from tempfile import TemporaryDirectory
from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
import json
from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
from zstandard import ZstdCompressor
def test_create_index():
num_pages = 10

View file

@ -1,4 +1,4 @@
from mwmbl.indexer.update_urls import process_link
from mwmbl.indexer import process_link
def test_process_link_normal():