diff --git a/app/api.py b/app/api.py
index 2c1f4e5..af959e9 100644
--- a/app/api.py
+++ b/app/api.py
@@ -1,9 +1,12 @@
+from multiprocessing import Queue
 from pathlib import Path
 
 from ninja import NinjaAPI
 
 from app import settings
-from mwmbl.indexer.paths import INDEX_NAME
+import mwmbl.crawler.app as crawler
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -19,10 +22,10 @@
 completer = Completer()
 ranker = HeuristicRanker(tiny_index, completer)
 
 search_router = search.create_router(ranker)
-
 api.add_router("/search/", search_router)
+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
 
-@api.get("/hello")
-def hello(request):
-    return {"response": "Hello world"}
+queued_batches = Queue()
+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
+api.add_router("/crawler/", crawler_router)
diff --git a/app/urls.py b/app/urls.py
index 6338358..440a2f4 100644
--- a/app/urls.py
+++ b/app/urls.py
@@ -21,5 +21,5 @@ from app.api import api
 
 urlpatterns = [
     path('admin/', admin.site.urls),
-    path('api/v1/', api.urls)
+    path('', api.urls)
 ]
diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py
index dbfac92..bda2dc8 100644
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@@ -14,6 +14,7 @@ from fastapi import HTTPException, APIRouter
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
     return paragraphs, title
 
 
-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])
 
-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: # ensure tables are created before crawler code is used:
+    # #
+    # #     url_db.create_tables()
 
     @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return format_result(result, query)
 
     @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
 
@@ -159,7 +158,7 @@
         }
 
     @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
             urls = queued_batches.get(block=False)
@@ -174,14 +173,14 @@
         return urls
 
     @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
         check_date_str(date_str)
         check_public_user_id(public_user_id)
         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
         return get_batch_ids_for_prefix(prefix)
 
     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
         url = get_batch_url(batch_id, date_str, public_user_id)
         data = json.loads(gzip.decompress(requests.get(url).content))
         return {
@@ -189,22 +188,22 @@
             'batch': data,
         }
 
-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
         return [] if last_batch is None else [last_batch]
 
     @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
         check_date_str(date_str)
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)
 
     @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
         return stats_manager.get_stats()
 
     @router.get('/')
-    def status():
+    def status(request):
         return {
             'status': 'ok'
         }
diff --git a/mwmbl/crawler/batch.py b/mwmbl/crawler/batch.py
index b6b3a35..7d7f064 100644
--- a/mwmbl/crawler/batch.py
+++ b/mwmbl/crawler/batch.py
@@ -1,21 +1,21 @@
 from typing import Optional
 
-from pydantic import BaseModel
+from ninja import Schema
 
 
-class ItemContent(BaseModel):
+class ItemContent(Schema):
     title: str
     extract: str
     links: list[str]
     extra_links: Optional[list[str]]
 
 
-class ItemError(BaseModel):
+class ItemError(Schema):
     name: str
     message: Optional[str]
 
 
-class Item(BaseModel):
+class Item(Schema):
     url: str
     status: Optional[int]
     timestamp: int
@@ -23,16 +23,16 @@ class Item(BaseModel):
     error: Optional[ItemError]
 
 
-class Batch(BaseModel):
+class Batch(Schema):
     user_id: str
     items: list[Item]
 
 
-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
     user_id: str
 
 
-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
     user_id_hash: str
     timestamp: int
     items: list[Item]
diff --git a/mwmbl/main.py b/mwmbl/main.py
index 3c25209..08518e3 100644
--- a/mwmbl/main.py
+++ b/mwmbl/main.py
@@ -83,7 +83,7 @@ def run():
     app.include_router(search_router)
 
     batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-    crawler_router = crawler.get_router(batch_cache, queued_batches)
+    crawler_router = crawler.create_router(batch_cache, queued_batches)
     app.include_router(crawler_router)
 
     user_router = user.create_router(index_path)
diff --git a/mwmbl/tinysearchengine/search.py b/mwmbl/tinysearchengine/search.py
index bd8e54a..8dae294 100644
--- a/mwmbl/tinysearchengine/search.py
+++ b/mwmbl/tinysearchengine/search.py
@@ -14,11 +14,11 @@ def create_router(ranker: HeuristicRanker) -> Router:
     router = Router(tags=["search"])
 
     @router.get("")
-    def search(s: str):
+    def search(request, s: str):
         return ranker.search(s)
 
     @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
         return ranker.complete(q)
 
     return router
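
The mechanical pattern behind most of the hunks above: Django Ninja groups operations on a `Router` much like FastAPI's `APIRouter`, but every operation receives Django's `HttpRequest` as an explicit first argument, and request/response models subclass `ninja.Schema` rather than `pydantic.BaseModel`. A minimal self-contained sketch of the convention (the names `DemoPayload`, `create_demo_router`, and the `/echo` path are hypothetical, not part of this diff):

```python
from ninja import Router, Schema


class DemoPayload(Schema):
    # ninja.Schema plays the role pydantic.BaseModel played under FastAPI.
    user_id: str


def create_demo_router() -> Router:
    # Unlike FastAPI's APIRouter, ninja.Router takes no `prefix` argument;
    # the mount point is chosen at registration, e.g. api.add_router("/demo/", ...).
    router = Router(tags=["demo"])

    @router.post("/echo")
    def echo(request, payload: DemoPayload):
        # `request` is a django.http.HttpRequest and is a required first
        # parameter of every Ninja operation; FastAPI had no such argument.
        return {"user_id": payload.user_id, "method": request.method}

    return router
```

This is also why `APIRouter(prefix="/crawler", tags=["crawler"])` becomes `Router(tags=["crawler"])` in the diff, with the prefix moving to the `api.add_router("/crawler/", crawler_router)` call site.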
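With `path('', api.urls)` replacing `path('api/v1/', api.urls)`, the `NinjaAPI` instance is mounted at the site root, so the routers registered in `app/api.py` resolve to `/search/` and `/crawler/`. A quick smoke test against a local dev server (assumes the server is running on `localhost:8000`; the query string is arbitrary):

```python
import requests

BASE = "http://localhost:8000"

# Search router: @router.get("") registered via api.add_router("/search/", ...)
print(requests.get(f"{BASE}/search/", params={"s": "mwmbl"}).json())

# Crawler router: status() handles GET "/" under the "/crawler/" prefix
print(requests.get(f"{BASE}/crawler/").json())  # expected: {"status": "ok"}
```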