commit 213bdaa365
35 changed files with 346 additions and 167 deletions
@@ -46,5 +46,8 @@ VOLUME ["/data"]
 
 EXPOSE 5000
 
-# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
+ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev
+# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
+# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"]
+CMD ["/venv/bin/mwmbl-tinysearchengine"]
+
@@ -7,8 +7,8 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
-from mwmbl.crawler.batch import HashedBatch
-from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+from mwmbl.crawler import HashedBatch
+from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR
 
 
 # TODO: remove this line - temporary override
@@ -1,6 +1,6 @@
 import json
 
-from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
+from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
 from mwmbl.hn_top_domains_filtered import DOMAINS
 
 
@@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
 """
 import sqlite3
 
-from mwmbl.indexer.paths import URLS_PATH
+from mwmbl.indexer import URLS_PATH
 from mwmbl.app import get_config_and_index
 
 
@@ -7,16 +7,15 @@ import json
 import logging
 import os
 import sys
-from pathlib import Path
 from datetime import datetime
 
 import spacy
 
-from mwmbl.crawler.batch import HashedBatch
+from mwmbl.crawler import HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
-from mwmbl.indexer.index_batches import index_batches
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import index_batches
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
 NUM_BATCHES = 10000
@@ -1,7 +1,7 @@
 """
 Count unique URLs in the index.
 """
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 
 def run():
@@ -5,9 +5,9 @@ import numpy as np
 import spacy
 
 from analyse.index_local import EVALUATE_INDEX_PATH
-from mwmbl.indexer.index import tokenize_document
-from mwmbl.indexer.paths import INDEX_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import tokenize_document
+from mwmbl.indexer import INDEX_PATH
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
-from collections import defaultdict, Counter
-from urllib.parse import urlparse
 
 import requests
 
-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.indexer import CRAWL_GLOB
 
 
 API_ENDPOINT = "http://95.216.215.29/batches/historical"
@@ -2,9 +2,9 @@ import logging
 import sys
 from itertools import islice
 
-from mwmbl.indexer.paths import INDEX_PATH
+from mwmbl.indexer import INDEX_PATH
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -3,7 +3,7 @@ Send a batch to a running instance.
 """
 import requests
 
-from mwmbl.crawler.batch import Batch, Item, ItemContent
+from mwmbl.crawler import Batch, Item, ItemContent
 
 
 URL = 'http://localhost:5000/crawler/batches/'
@@ -4,7 +4,7 @@ from datetime import datetime
 from pathlib import Path
 from queue import Queue
 
-from mwmbl.indexer.update_urls import record_urls_in_database
+from mwmbl.indexer import record_urls_in_database
 
 
 def run_update_urls_on_fixed_batches():
manage.py (new executable file, 22 lines)
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+"""Django's command-line utility for administrative tasks."""
+import os
+import sys
+
+
+def main():
+    """Run administrative tasks."""
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
+    try:
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)
+
+
+if __name__ == '__main__':
+    main()
mwmbl/api.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+from multiprocessing import Queue
+from pathlib import Path
+
+from django.conf import settings
+from ninja import NinjaAPI
+
+import mwmbl.crawler.app as crawler
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
+from mwmbl.tinysearchengine import search
+from mwmbl.tinysearchengine.completer import Completer
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.rank import HeuristicRanker
+
+api = NinjaAPI(version="1.0.0")
+
+index_path = Path(settings.DATA_PATH) / INDEX_NAME
+tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
+tiny_index.__enter__()
+
+completer = Completer()
+ranker = HeuristicRanker(tiny_index, completer)
+
+search_router = search.create_router(ranker)
+api.add_router("/search/", search_router)
+
+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
+
+queued_batches = Queue()
+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
+api.add_router("/crawler/", crawler_router)
mwmbl/apps.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+from multiprocessing import Process, Queue
+from pathlib import Path
+
+from django.apps import AppConfig
+from django.conf import settings
+
+from mwmbl.api import queued_batches
+from mwmbl import background
+from mwmbl.indexer.paths import INDEX_NAME
+from mwmbl.indexer.update_urls import update_urls_continuously
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
+from mwmbl.url_queue import update_queue_continuously
+
+
+class MwmblConfig(AppConfig):
+    name = "mwmbl"
+    verbose_name = "Mwmbl Application"
+
+    def ready(self):
+        index_path = Path(settings.DATA_PATH) / INDEX_NAME
+        try:
+            existing_index = TinyIndex(item_factory=Document, index_path=index_path)
+            if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != settings.NUM_PAGES:
+                raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
+                                 f"({existing_index.num_pages}) do not match")
+        except FileNotFoundError:
+            print("Creating a new index")
+            TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
+                             page_size=PAGE_SIZE)
+
+        if settings.RUN_BACKGROUND_PROCESSES:
+            new_item_queue = Queue()
+            Process(target=background.run, args=(settings.DATA_PATH,)).start()
+            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
+            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()
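Note: Django picks up MwmblConfig automatically because mwmbl/apps.py defines a single AppConfig subclass and 'mwmbl' is listed in INSTALLED_APPS; ready() runs once the app registry is fully loaded, which is why the index setup and background processes moved here from main.py. A minimal sketch of the hook (hypothetical app name, not part of this commit):

    from django.apps import AppConfig

    class ExampleConfig(AppConfig):
        name = "example"

        def ready(self):
            # Runs once per process after all apps are loaded: a natural
            # place for one-off startup work. Beware that the development
            # server's autoreloader can import apps in more than one
            # process, so guard anything that must not start twice.
            pass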
mwmbl/asgi.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+"""
+ASGI config for app project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+
+application = get_asgi_application()
@@ -10,10 +10,11 @@ from uuid import uuid4
 import boto3
 import justext
 import requests
-from fastapi import HTTPException, APIRouter
+from fastapi import HTTPException
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
     return paragraphs, title
 
 
-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])
 
-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: # ensure tables are created before crawler code is used:
+    # #
+    # # url_db.create_tables()
 
     @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return format_result(result, query)
 
     @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
 
@@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         }
 
     @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
             urls = queued_batches.get(block=False)
@@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return urls
 
     @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
         check_date_str(date_str)
         check_public_user_id(public_user_id)
         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
         return get_batch_ids_for_prefix(prefix)
 
     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
         url = get_batch_url(batch_id, date_str, public_user_id)
         data = json.loads(gzip.decompress(requests.get(url).content))
         return {
@@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
             'batch': data,
         }
 
-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
         return [] if last_batch is None else [last_batch]
 
     @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
         check_date_str(date_str)
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)
 
     @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
         return stats_manager.get_stats()
 
     @router.get('/')
-    def status():
+    def status(request):
         return {
             'status': 'ok'
         }
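Note: every handler above gained a leading `request` argument because django-ninja, unlike FastAPI, passes the Django HttpRequest explicitly as the first parameter of each operation. A minimal sketch of the convention (hypothetical router and path, not from this commit):

    from ninja import Router

    router = Router(tags=["example"])

    @router.get("/ping")
    def ping(request):
        # django-ninja injects django.http.HttpRequest here; the remaining
        # parameters are parsed from the path, query string or body.
        return {"status": "ok"}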
@@ -1,21 +1,21 @@
 from typing import Optional
 
-from pydantic import BaseModel
+from ninja import Schema
 
 
-class ItemContent(BaseModel):
+class ItemContent(Schema):
     title: str
     extract: str
     links: list[str]
     extra_links: Optional[list[str]]
 
 
-class ItemError(BaseModel):
+class ItemError(Schema):
     name: str
     message: Optional[str]
 
 
-class Item(BaseModel):
+class Item(Schema):
     url: str
     status: Optional[int]
     timestamp: int
@@ -23,16 +23,16 @@ class Item(BaseModel):
     error: Optional[ItemError]
 
 
-class Batch(BaseModel):
+class Batch(Schema):
     user_id: str
     items: list[Item]
 
 
-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
     user_id: str
 
 
-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
     user_id_hash: str
     timestamp: int
     items: list[Item]
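Note: swapping pydantic's BaseModel for ninja's Schema is close to a drop-in change: ninja.Schema is itself built on a pydantic model, so the annotated-field declarations above keep working, and the schemas now plug into django-ninja's request parsing and OpenAPI docs. A sketch of the pattern (hypothetical schema, not from this commit):

    from typing import Optional

    from ninja import Schema


    class ExampleItem(Schema):
        # Fields are declared exactly as with pydantic.
        url: str
        status: Optional[int] = None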
@@ -1,16 +1,13 @@
 """
 Database storing info on URLs
 """
-import random
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime
 from enum import Enum
 from logging import getLogger
 
 from psycopg2.extras import execute_values
 
-from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch
 
@@ -9,7 +9,6 @@ import os
 from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from urllib.parse import urlparse
 
 from pydantic import ValidationError
@@ -1,13 +1,10 @@
 """
 Create a search index
 """
-from collections import Counter
 from typing import Iterable
 from urllib.parse import unquote
 
-import pandas as pd
-
-from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
+from mwmbl.tinysearchengine.indexer import TokenizedDocument
 from mwmbl.tokenizer import tokenize, get_bigrams
 
 DEFAULT_SCORE = 0
@@ -1,13 +1,10 @@
-import os
-import pickle
-import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from pathlib import Path
 from time import sleep
-from typing import Iterable, Collection
+from typing import Collection
 from urllib.parse import urlparse
 
 from requests_cache import CachedSession
@@ -1,96 +1,8 @@
-import argparse
-import logging
-import sys
-from multiprocessing import Process, Queue
-from pathlib import Path
-
 import uvicorn
-from fastapi import FastAPI
-from starlette.middleware.cors import CORSMiddleware
-
-from mwmbl import background
-from mwmbl.crawler import app as crawler
-from mwmbl.indexer.batch_cache import BatchCache
-from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
-from mwmbl.platform import user
-from mwmbl.indexer.update_urls import update_urls_continuously
-from mwmbl.tinysearchengine import search
-from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
-from mwmbl.tinysearchengine.rank import HeuristicRanker
-from mwmbl.url_queue import update_queue_continuously
-
-FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
-
-
-MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
-
-
-def setup_args():
-    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
-    parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
-    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
-    parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
-    parser.add_argument("--background", help="Enable running the background tasks to process batches",
-                        action='store_true')
-    args = parser.parse_args()
-    return args
 
 
 def run():
-    args = setup_args()
-
-    index_path = Path(args.data) / INDEX_NAME
-    try:
-        existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
-            raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
-                             f"({existing_index.num_pages}) do not match")
-    except FileNotFoundError:
-        print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
-
-    new_item_queue = Queue()
-    queued_batches = Queue()
-    # curation_queue = Queue()
-
-    if args.background:
-        Process(target=background.run, args=(args.data,)).start()
-        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
-        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()
-
-    completer = Completer()
-
-    with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
-        ranker = HeuristicRanker(tiny_index, completer)
-        # model = pickle.load(open(MODEL_PATH, 'rb'))
-        # ranker = LTRRanker(model, tiny_index, completer)
-
-        # Initialize FastApi instance
-        app = FastAPI()
-
-        # Try disabling since this is handled by nginx
-        # app.add_middleware(
-        #     CORSMiddleware,
-        #     allow_origins=["*"],
-        #     allow_credentials=True,
-        #     allow_methods=["*"],
-        #     allow_headers=["*"],
-        # )
-
-        search_router = search.create_router(ranker)
-        app.include_router(search_router)
-
-        batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-        crawler_router = crawler.get_router(batch_cache, queued_batches)
-        app.include_router(crawler_router)
-
-        user_router = user.create_router(index_path)
-        app.include_router(user_router)
-
-        # Initialize uvicorn server using global app instance and server config params
-        uvicorn.run(app, host="0.0.0.0", port=args.port)
+    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)
 
 
 if __name__ == "__main__":
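Note: the entry point is now a thin wrapper. run() just starts uvicorn against the Django ASGI application, while the work that used to happen here (index creation, background processes, router wiring) moved into MwmblConfig.ready() in mwmbl/apps.py and into mwmbl/api.py. A minimal sketch of what it does, assuming DJANGO_SETTINGS_MODULE is set (mwmbl/asgi.py defaults it to mwmbl.settings_dev):

    import uvicorn

    # Passing the application as an import string lets uvicorn import it in
    # the server process, after Django settings have been configured.
    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)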
@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize
 
 
mwmbl/settings_common.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+"""
+Django settings for mwmbl project.
+
+Generated by 'django-admin startproject' using Django 4.2.4.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/4.2/ref/settings/
+"""
+
+from pathlib import Path
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'mwmbl',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'mwmbl.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'mwmbl.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': BASE_DIR / 'db.sqlite3',
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/4.2/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/4.2/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
mwmbl/settings_dev.py (new file, 5 lines)
@@ -0,0 +1,5 @@
+from mwmbl.settings_common import *
+
+DATA_PATH = "./devdata"
+RUN_BACKGROUND_PROCESSES = False
+NUM_PAGES = 2560
mwmbl/settings_prod.py (new file, 5 lines)
@@ -0,0 +1,5 @@
+from mwmbl.settings_common import *
+
+DATA_PATH = "/app/storage"
+RUN_BACKGROUND_PROCESSES = True
+NUM_PAGES = 10240000
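Note: the flags the Dockerfile used to pass on the command line now live in the settings modules: --data /app/storage becomes DATA_PATH, --background becomes RUN_BACKGROUND_PROCESSES, and --num-pages 10240000 becomes NUM_PAGES, with DJANGO_SETTINGS_MODULE selecting mwmbl.settings_dev or mwmbl.settings_prod. A sketch of how code reads them (standard Django settings access; the values mirror settings_prod above):

    # Assumes DJANGO_SETTINGS_MODULE=mwmbl.settings_prod in the environment.
    from django.conf import settings

    assert settings.DATA_PATH == "/app/storage"
    assert settings.RUN_BACKGROUND_PROCESSES is True
    assert settings.NUM_PAGES == 10240000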
@@ -6,7 +6,6 @@ from operator import itemgetter
 from urllib.parse import urlparse
 
 from mwmbl.format import format_result_with_pattern, get_query_regex
-from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
@@ -1,6 +1,6 @@
 from logging import getLogger
 
-from fastapi import APIRouter
+from ninja import Router
 
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
@@ -10,15 +10,15 @@ logger = getLogger(__name__)
 SCORE_THRESHOLD = 0.25
 
 
-def create_router(ranker: HeuristicRanker) -> APIRouter:
-    router = APIRouter(prefix="/search", tags=["search"])
+def create_router(ranker: HeuristicRanker) -> Router:
+    router = Router(tags=["search"])
 
     @router.get("")
-    def search(s: str):
+    def search(request, s: str):
         return ranker.search(s)
 
     @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
         return ranker.complete(q)
 
     return router
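Note: with django-ninja the mount point comes from the call to api.add_router rather than from the router itself, so APIRouter(prefix="/search") becomes a plain Router that mwmbl/api.py mounts at "/search/". A sketch of the pattern (hypothetical endpoint, not from this commit):

    from ninja import NinjaAPI, Router

    api = NinjaAPI()
    router = Router()

    @router.get("")
    def root(request):
        return {"ok": True}

    api.add_router("/search/", router)  # the router now serves GET /search/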
@@ -1,6 +1,5 @@
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
mwmbl/urls.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+"""
+URL configuration for app project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/4.2/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+from mwmbl.api import api
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('', api.urls)
+]
mwmbl/wsgi.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+"""
+WSGI config for app project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+
+application = get_wsgi_application()
@@ -33,6 +33,8 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+django = "^4.2.4"
+django-ninja = "^0.22.2"
 requests-cache = "^1.1.0"
 redis = {extras = ["hiredis"], version = "^5.0.1"}
 
@@ -1,5 +1,3 @@
-import mwmbl.tinysearchengine.completer
-import pytest
 import pandas as pd
 
 def mockCompleterData(mocker, data):
@@ -16,7 +14,7 @@ def test_correctCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'builder', 'buildings'] == completion
 
@@ -29,7 +27,7 @@ def test_correctSortOrder(mocker):
                 [3, 'buildings', 3]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'buildings', 'builder'] == completion
 
@@ -42,7 +40,7 @@ def test_noCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('test')
     assert [] == completion
 
@@ -55,7 +53,7 @@ def test_singleCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('announce')
     assert ['announce'] == completion
 
@@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     for i in range(3):
         print(f"iteration: {i}")
         completion = completer.complete('build')
@@ -1,9 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
-import json
+from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
+from zstandard import ZstdCompressor
 
 
 def test_create_index():
     num_pages = 10
@@ -1,4 +1,4 @@
-from mwmbl.indexer.update_urls import process_link
+from mwmbl.indexer import process_link
 
 
 def test_process_link_normal():