diff --git a/app/apps.py b/app/apps.py
new file mode 100644
index 0000000..d4ad331
--- /dev/null
+++ b/app/apps.py
@@ -0,0 +1,22 @@
+import os
+from multiprocessing import Process, Queue
+
+from django.apps import AppConfig
+
+from app import settings
+from app.api import queued_batches
+from mwmbl import background
+from mwmbl.indexer.update_urls import update_urls_continuously
+from mwmbl.url_queue import update_queue_continuously
+
+
+class MwmblConfig(AppConfig):
+    name = "app"
+    verbose_name = "Mwmbl Application"
+
+    def ready(self):
+        if os.environ.get('RUN_MAIN') and settings.RUN_BACKGROUND_PROCESSES:
+            new_item_queue = Queue()
+            Process(target=background.run, args=(settings.DATA_PATH,)).start()
+            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
+            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()
diff --git a/app/settings.py b/app/settings.py
index f483d35..67f0d50 100644
--- a/app/settings.py
+++ b/app/settings.py
@@ -37,6 +37,7 @@ INSTALLED_APPS = [
     'django.contrib.sessions',
     'django.contrib.messages',
     'django.contrib.staticfiles',
+    'app',
 ]
 
 MIDDLEWARE = [
@@ -125,5 +126,4 @@ DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
 
 # ===================== Custom Settings =========================
 DATA_PATH = "./devdata"
-
-
+RUN_BACKGROUND_PROCESSES = True
diff --git a/mwmbl/main.py b/mwmbl/main.py
deleted file mode 100644
index 08518e3..0000000
--- a/mwmbl/main.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import argparse
-import logging
-import sys
-from multiprocessing import Process, Queue
-from pathlib import Path
-
-import uvicorn
-from fastapi import FastAPI
-from starlette.middleware.cors import CORSMiddleware
-
-from mwmbl import background
-from mwmbl.crawler import app as crawler
-from mwmbl.indexer.batch_cache import BatchCache
-from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
-from mwmbl.platform import user
-from mwmbl.indexer.update_urls import update_urls_continuously
-from mwmbl.tinysearchengine import search
-from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
-from mwmbl.tinysearchengine.rank import HeuristicRanker
-from mwmbl.url_queue import update_queue_continuously
-
-FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
-
-
-MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
-
-
-def setup_args():
-    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
-    parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
-    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
-    parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
-    parser.add_argument("--background", help="Enable running the background tasks to process batches",
-                        action='store_true')
-    args = parser.parse_args()
-    return args
-
-
-def run():
-    args = setup_args()
-
-    index_path = Path(args.data) / INDEX_NAME
-    try:
-        existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
-            raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
-                             f"({existing_index.num_pages}) do not match")
-    except FileNotFoundError:
-        print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
-
-    new_item_queue = Queue()
-    queued_batches = Queue()
-    # curation_queue = Queue()
-
-    if args.background:
-        Process(target=background.run, args=(args.data,)).start()
-        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
-        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()
-
-    completer = Completer()
-
-    with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
-        ranker = HeuristicRanker(tiny_index, completer)
-        # model = pickle.load(open(MODEL_PATH, 'rb'))
-        # ranker = LTRRanker(model, tiny_index, completer)
-
-        # Initialize FastApi instance
-        app = FastAPI()
-
-        # Try disabling since this is handled by nginx
-        # app.add_middleware(
-        #     CORSMiddleware,
-        #     allow_origins=["*"],
-        #     allow_credentials=True,
-        #     allow_methods=["*"],
-        #     allow_headers=["*"],
-        # )
-
-        search_router = search.create_router(ranker)
-        app.include_router(search_router)
-
-        batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-        crawler_router = crawler.create_router(batch_cache, queued_batches)
-        app.include_router(crawler_router)
-
-        user_router = user.create_router(index_path)
-        app.include_router(user_router)
-
-        # Initialize uvicorn server using global app instance and server config params
-        uvicorn.run(app, host="0.0.0.0", port=args.port)
-
-
-if __name__ == "__main__":
-    run()
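
A note on the `RUN_MAIN` check in `MwmblConfig.ready()`: Django's auto-reloader sets `RUN_MAIN=true` only in the child process that actually serves requests, so the check stops the background workers from being forked twice under `manage.py runserver`. The variable is never set under `runserver --noreload` or production servers such as gunicorn, so with this guard the workers would not start there at all. A minimal sketch of a guard covering both cases, assuming a hypothetical `MWMBL_BACKGROUND` opt-in variable that is not part of this change:

```python
import os

def should_start_workers() -> bool:
    # Under `manage.py runserver`, Django's auto-reloader sets RUN_MAIN=true
    # in the serving child only; keying on it avoids forking the workers twice.
    if os.environ.get("RUN_MAIN") == "true":
        return True
    # Production servers (gunicorn, uwsgi) never set RUN_MAIN, so fall back to
    # an explicit opt-in flag (hypothetical name: MWMBL_BACKGROUND).
    return os.environ.get("MWMBL_BACKGROUND") == "1"
```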
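
The worker processes receive `new_item_queue` and the module-level `queued_batches` from `app.api` as `Process` arguments. This works because `multiprocessing.Queue` handles passed as process arguments are shared with the parent, so the Django process and the workers communicate over the same queues. A self-contained sketch of that mechanism:

```python
from multiprocessing import Process, Queue

def worker(q: Queue) -> None:
    # The queue handle passed as a Process argument is shared with the parent.
    print(q.get())

if __name__ == "__main__":
    q = Queue()
    q.put("hello from the parent")
    Process(target=worker, args=(q,)).start()
```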
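
`RUN_BACKGROUND_PROCESSES` is hard-coded to `True` in `app/settings.py`, so every environment that loads these settings will attempt to spawn the workers. One way to keep that default while letting deployments opt out (a sketch only, not something this diff does) is to derive the flag from the environment:

```python
# app/settings.py (sketch; the diff hard-codes the flag)
import os

DATA_PATH = "./devdata"
RUN_BACKGROUND_PROCESSES = os.environ.get("RUN_BACKGROUND_PROCESSES", "true").lower() in ("1", "true", "yes")
```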