diff --git a/Dockerfile b/Dockerfile
index f57bce7..4283a14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -46,5 +46,8 @@ VOLUME ["/data"]
 EXPOSE 5000
 
-# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
+ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_prod
+
+# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
+# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"]
+CMD ["/venv/bin/mwmbl-tinysearchengine"]
diff --git a/analyse/index_local.py b/analyse/index_local.py
index 24628a5..334868d 100644
--- a/analyse/index_local.py
+++ b/analyse/index_local.py
@@ -7,16 +7,15 @@ import json
 import logging
 import os
 import sys
-from pathlib import Path
 from datetime import datetime
 
 import spacy
 
 from mwmbl.crawler.batch import HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
 from mwmbl.indexer.index_batches import index_batches
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 
 LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
 NUM_BATCHES = 10000
diff --git a/analyse/record_historical_batches.py b/analyse/record_historical_batches.py
index 4d8ccd3..c482e49 100644
--- a/analyse/record_historical_batches.py
+++ b/analyse/record_historical_batches.py
@@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
-from collections import defaultdict, Counter
-from urllib.parse import urlparse
 
 import requests
 
 from mwmbl.indexer.paths import CRAWL_GLOB
 
 API_ENDPOINT = "http://95.216.215.29/batches/historical"
diff --git a/manage.py b/manage.py
new file mode 100755
index 0000000..4931389
--- /dev/null
+++ b/manage.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+"""Django's command-line utility for administrative tasks."""
+import os
+import sys
+
+
+def main():
+    """Run administrative tasks."""
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+    try:
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mwmbl/api.py b/mwmbl/api.py
new file mode 100644
index 0000000..713bfc3
--- /dev/null
+++ b/mwmbl/api.py
@@ -0,0 +1,31 @@
+from multiprocessing import Queue
+from pathlib import Path
+
+from django.conf import settings
+from ninja import NinjaAPI
+
+import mwmbl.crawler.app as crawler
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
+from mwmbl.tinysearchengine import search
+from mwmbl.tinysearchengine.completer import Completer
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.rank import HeuristicRanker
+
+api = NinjaAPI(version="1.0.0")
+
+index_path = Path(settings.DATA_PATH) / INDEX_NAME
+tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
+tiny_index.__enter__()
+
+completer = Completer()
+ranker = HeuristicRanker(tiny_index, completer)
+
+search_router = search.create_router(ranker)
+api.add_router("/search/", search_router)
+
+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
+
+queued_batches = Queue()
+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
+api.add_router("/crawler/", crawler_router)
diff --git a/mwmbl/apps.py b/mwmbl/apps.py
new file mode 100644
index 0000000..bfc21a5
--- /dev/null
+++ b/mwmbl/apps.py
@@ -0,0 +1,35 @@
+from multiprocessing import Process, Queue
+from pathlib import Path
+
+from django.apps import AppConfig
+from django.conf import settings
+
+from mwmbl.api import queued_batches
+from mwmbl import background
+from mwmbl.indexer.paths import INDEX_NAME
+from mwmbl.indexer.update_urls import update_urls_continuously
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
+from mwmbl.url_queue import update_queue_continuously
+
+
+class MwmblConfig(AppConfig):
+    name = "mwmbl"
+    verbose_name = "Mwmbl Application"
+
+    def ready(self):
+        index_path = Path(settings.DATA_PATH) / INDEX_NAME
+        try:
+            existing_index = TinyIndex(item_factory=Document, index_path=index_path)
+            if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != settings.NUM_PAGES:
+                raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
+                                 f"({existing_index.num_pages}) do not match")
+        except FileNotFoundError:
+            print("Creating a new index")
+            TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
+                             page_size=PAGE_SIZE)
+
+        if settings.RUN_BACKGROUND_PROCESSES:
+            new_item_queue = Queue()
+            Process(target=background.run, args=(settings.DATA_PATH,)).start()
+            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
+            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()
diff --git a/mwmbl/asgi.py b/mwmbl/asgi.py
new file mode 100644
index 0000000..73088a9
--- /dev/null
+++ b/mwmbl/asgi.py
@@ -0,0 +1,16 @@
+"""
+ASGI config for app project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+
+application = get_asgi_application()
diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py
index dbfac92..a4f0524 100644
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@@ -10,10 +10,11 @@ from uuid import uuid4
 import boto3
 import justext
 import requests
-from fastapi import HTTPException, APIRouter
+from fastapi import HTTPException
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
     return paragraphs, title
 
 
-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])
 
-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: ensure tables are created before crawler code is used:
+    #     with Database() as db:
+    #         URLDatabase(db.connection).create_tables()
 
     @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return format_result(result, query)
 
     @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
 
@@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         }
 
     @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
             urls = queued_batches.get(block=False)
@@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return urls
 
     @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
         check_date_str(date_str)
         check_public_user_id(public_user_id)
         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
         return get_batch_ids_for_prefix(prefix)
 
     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
         url = get_batch_url(batch_id, date_str, public_user_id)
         data = json.loads(gzip.decompress(requests.get(url).content))
         return {
@@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
             'batch': data,
         }
 
-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
         return [] if last_batch is None else [last_batch]
 
     @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
         check_date_str(date_str)
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)
 
     @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
         return stats_manager.get_stats()
 
     @router.get('/')
-    def status():
+    def status(request):
         return {
             'status': 'ok'
         }
diff --git a/mwmbl/crawler/batch.py b/mwmbl/crawler/batch.py
index b6b3a35..7d7f064 100644
--- a/mwmbl/crawler/batch.py
+++ b/mwmbl/crawler/batch.py
@@ -1,21 +1,21 @@
 from typing import Optional
 
-from pydantic import BaseModel
+from ninja import Schema
 
 
-class ItemContent(BaseModel):
+class ItemContent(Schema):
     title: str
     extract: str
     links: list[str]
     extra_links: Optional[list[str]]
 
 
-class ItemError(BaseModel):
+class ItemError(Schema):
     name: str
     message: Optional[str]
 
 
-class Item(BaseModel):
+class Item(Schema):
     url: str
     status: Optional[int]
     timestamp: int
@@ -23,16 +23,16 @@
     error: Optional[ItemError]
 
 
-class Batch(BaseModel):
+class Batch(Schema):
     user_id: str
     items: list[Item]
 
 
-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
     user_id: str
 
 
-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
     user_id_hash: str
     timestamp: int
     items: list[Item]
diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py
index cefe19e..7c83edf 100644
--- a/mwmbl/crawler/urls.py
+++ b/mwmbl/crawler/urls.py
@@ -1,16 +1,13 @@
 """
 Database storing info on URLs
 """
-import random
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime
 from enum import Enum
 from logging import getLogger
 
 from psycopg2.extras import execute_values
 
-from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch
diff --git a/mwmbl/indexer/batch_cache.py b/mwmbl/indexer/batch_cache.py
index e7af6db..01d8cc9 100644
--- a/mwmbl/indexer/batch_cache.py
+++ b/mwmbl/indexer/batch_cache.py
@@ -9,7 +9,6 @@ import os
 from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from urllib.parse import urlparse
 
 from pydantic import ValidationError
diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py
index 4edcb8a..fb61405 100644
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@@ -1,13 +1,10 @@
 """
 Create a search index
 """
-from collections import Counter
 from typing import Iterable
 from urllib.parse import unquote
 
-import pandas as pd
-
-from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
+from mwmbl.tinysearchengine.indexer import TokenizedDocument
 from mwmbl.tokenizer import tokenize, get_bigrams
 
 DEFAULT_SCORE = 0
diff --git a/mwmbl/indexer/update_urls.py b/mwmbl/indexer/update_urls.py
index 3819777..8a1b973 100644
--- a/mwmbl/indexer/update_urls.py
+++ b/mwmbl/indexer/update_urls.py
@@ -1,13 +1,10 @@
-import os
-import pickle
-import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from pathlib import Path
 from time import sleep
-from typing import Iterable, Collection
+from typing import Collection
 from urllib.parse import urlparse
 
 from requests_cache import CachedSession
diff --git a/mwmbl/main.py b/mwmbl/main.py
index 3c25209..0281edc 100644
--- a/mwmbl/main.py
+++ b/mwmbl/main.py
@@ -1,96 +1,8 @@
-import argparse
-import logging
-import sys
-from multiprocessing import Process, Queue
-from pathlib import Path
-
 import uvicorn
-from fastapi import FastAPI
-from starlette.middleware.cors import CORSMiddleware
-
-from mwmbl import background
-from mwmbl.crawler import app as crawler
-from mwmbl.indexer.batch_cache import BatchCache
-from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
-from mwmbl.platform import user
-from mwmbl.indexer.update_urls import update_urls_continuously
-from mwmbl.tinysearchengine import search
-from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
-from mwmbl.tinysearchengine.rank import HeuristicRanker
-from mwmbl.url_queue import update_queue_continuously
-
-FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
-
-
-MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
-
-
-def setup_args():
-    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
-    parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
-    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
-    parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
-    parser.add_argument("--background", help="Enable running the background tasks to process batches",
-                        action='store_true')
-    args = parser.parse_args()
-    return args
 
 
 def run():
-    args = setup_args()
-
-    index_path = Path(args.data) / INDEX_NAME
-    try:
-        existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
-            raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
-                             f"({existing_index.num_pages}) do not match")
-    except FileNotFoundError:
-        print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
-
-    new_item_queue = Queue()
-    queued_batches = Queue()
-    # curation_queue = Queue()
-
-    if args.background:
-        Process(target=background.run, args=(args.data,)).start()
-        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
-        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()
-
-    completer = Completer()
-
-    with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
-        ranker = HeuristicRanker(tiny_index, completer)
-        # model = pickle.load(open(MODEL_PATH, 'rb'))
-        # ranker = LTRRanker(model, tiny_index, completer)
-
-        # Initialize FastApi instance
-        app = FastAPI()
-
-        # Try disabling since this is handled by nginx
-        # app.add_middleware(
-        #     CORSMiddleware,
-        #     allow_origins=["*"],
-        #     allow_credentials=True,
-        #     allow_methods=["*"],
-        #     allow_headers=["*"],
-        # )
-
-        search_router = search.create_router(ranker)
-        app.include_router(search_router)
-
-        batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-        crawler_router = crawler.get_router(batch_cache, queued_batches)
-        app.include_router(crawler_router)
-
-        user_router = user.create_router(index_path)
-        app.include_router(user_router)
-
-        # Initialize uvicorn server using global app instance and server config params
-        uvicorn.run(app, host="0.0.0.0", port=args.port)
+    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=5000)
 
 
 if __name__ == "__main__":
diff --git a/mwmbl/platform/user.py b/mwmbl/platform/user.py
index a3006c4..bbdcb0e 100644
--- a/mwmbl/platform/user.py
+++ b/mwmbl/platform/user.py
@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize
diff --git a/mwmbl/settings_common.py b/mwmbl/settings_common.py
new file mode 100644
index 0000000..b08b62c
--- /dev/null
+++ b/mwmbl/settings_common.py
@@ -0,0 +1,125 @@
+"""
+Django settings for mwmbl project.
+
+Generated by 'django-admin startproject' using Django 4.2.4.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/4.2/ref/settings/
+"""
+
+from pathlib import Path
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'mwmbl',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'mwmbl.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'mwmbl.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': BASE_DIR / 'db.sqlite3',
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/4.2/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/4.2/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
diff --git a/mwmbl/settings_dev.py b/mwmbl/settings_dev.py
new file mode 100644
index 0000000..fe07890
--- /dev/null
+++ b/mwmbl/settings_dev.py
@@ -0,0 +1,5 @@
+from mwmbl.settings_common import *
+
+DATA_PATH = "./devdata"
+RUN_BACKGROUND_PROCESSES = False
+NUM_PAGES = 2560
diff --git a/mwmbl/settings_prod.py b/mwmbl/settings_prod.py
new file mode 100644
index 0000000..f7c50ee
--- /dev/null
+++ b/mwmbl/settings_prod.py
@@ -0,0 +1,5 @@
+from mwmbl.settings_common import *
+
+DATA_PATH = "/app/storage"
+RUN_BACKGROUND_PROCESSES = True
+NUM_PAGES = 10240000
diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py
index 81109fd..7f331b8 100644
--- a/mwmbl/tinysearchengine/rank.py
+++ b/mwmbl/tinysearchengine/rank.py
@@ -6,7 +6,6 @@ from operator import itemgetter
 from urllib.parse import urlparse
 
 from mwmbl.format import format_result_with_pattern, get_query_regex
-from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
diff --git a/mwmbl/tinysearchengine/search.py b/mwmbl/tinysearchengine/search.py
index f9ecace..8dae294 100644
--- a/mwmbl/tinysearchengine/search.py
+++ b/mwmbl/tinysearchengine/search.py
@@ -1,6 +1,6 @@
 from logging import getLogger
 
-from fastapi import APIRouter
+from ninja import Router
 
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
@@ -10,15 +10,15 @@ logger = getLogger(__name__)
 SCORE_THRESHOLD = 0.25
 
 
-def create_router(ranker: HeuristicRanker) -> APIRouter:
-    router = APIRouter(prefix="/search", tags=["search"])
+def create_router(ranker: HeuristicRanker) -> Router:
+    router = Router(tags=["search"])
 
     @router.get("")
-    def search(s: str):
+    def search(request, s: str):
         return ranker.search(s)
 
     @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
         return ranker.complete(q)
 
     return router
diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py
index ab0f1bc..8151550 100644
--- a/mwmbl/url_queue.py
+++ b/mwmbl/url_queue.py
@@ -1,6 +1,5 @@
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
diff --git a/mwmbl/urls.py b/mwmbl/urls.py
new file mode 100644
index 0000000..ff67f2d
--- /dev/null
+++ b/mwmbl/urls.py
@@ -0,0 +1,25 @@
+"""
+URL configuration for app project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/4.2/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+from mwmbl.api import api
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('', api.urls)
+]
diff --git a/mwmbl/wsgi.py b/mwmbl/wsgi.py
new file mode 100644
index 0000000..ebdf0ff
--- /dev/null
+++ b/mwmbl/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for app project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+
+application = get_wsgi_application()
diff --git a/pyproject.toml b/pyproject.toml
index d435427..4a4a725 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,8 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+django = "^4.2.4"
+django-ninja = "^0.22.2"
 requests-cache = "^1.1.0"
 redis = {extras = ["hiredis"], version = "^5.0.1"}
diff --git a/test/test_completer.py b/test/test_completer.py
index b1fb49e..8867f26 100644
--- a/test/test_completer.py
+++ b/test/test_completer.py
@@ -1,5 +1,4 @@
 import mwmbl.tinysearchengine.completer
-import pytest
 import pandas as pd
 
 def mockCompleterData(mocker, data):
diff --git a/test/test_indexer.py b/test/test_indexer.py
index dd25b18..cf714c0 100644
--- a/test/test_indexer.py
+++ b/test/test_indexer.py
@@ -1,9 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
 from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
-import json
+from zstandard import ZstdCompressor
+
 
 def test_create_index():