Add crawler router
parent ed64ca6c91
commit b6fd27352b
6 changed files with 35 additions and 33 deletions

app/api.py (13 changes)
@@ -1,9 +1,12 @@
 from multiprocessing import Queue
+from pathlib import Path
 
 from ninja import NinjaAPI
 
 from app import settings
-from mwmbl.indexer.paths import INDEX_NAME
+import mwmbl.crawler.app as crawler
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -19,10 +22,10 @@ completer = Completer()
 ranker = HeuristicRanker(tiny_index, completer)
 
 search_router = search.create_router(ranker)
 api.add_router("/search/", search_router)
 
+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
 
-@api.get("/hello")
-def hello(request):
-    return {"response": "Hello world"}
+queued_batches = Queue()
+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
+api.add_router("/crawler/", crawler_router)
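For readers coming from the FastAPI version: in Django Ninja, routers are built by factory functions and mounted on a single NinjaAPI instance, and every operation receives the Django HttpRequest as its first argument. A minimal, self-contained sketch of the pattern adopted above (the router name and endpoint are illustrative, not part of this commit):

```python
from ninja import NinjaAPI, Router


def create_example_router() -> Router:
    """Hypothetical factory mirroring search.create_router / crawler.create_router."""
    router = Router(tags=["example"])

    @router.get("/ping")
    def ping(request):
        # Ninja passes the HttpRequest first; any further parameters are
        # parsed from the path or query string.
        return {"response": "pong"}

    return router


api = NinjaAPI()
api.add_router("/example/", create_example_router())  # served under /example/
```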

app/urls.py
@@ -21,5 +21,5 @@ from app.api import api
 
 urlpatterns = [
     path('admin/', admin.site.urls),
-    path('api/v1/', api.urls)
+    path('', api.urls)
 ]
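On a NinjaAPI instance, `api.urls` returns a URLconf that Django's `path()` accepts directly, so after this change the API is served from the site root rather than under `/api/v1/`. A usage sketch against a local development server (host and port are assumptions, not from this commit):

```python
import requests

# Router prefixes come from app/api.py above.
print(requests.get("http://localhost:8000/search/", params={"s": "mwmbl"}).json())
print(requests.get("http://localhost:8000/crawler/").json())  # {'status': 'ok'}
```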

mwmbl/crawler/app.py
@@ -14,6 +14,7 @@ from fastapi import HTTPException, APIRouter
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
     return paragraphs, title
 
 
-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])
 
-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: ensure tables are created before crawler code is used:
+    #
+    #     url_db.create_tables()
 
     @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return format_result(result, query)
 
     @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
@@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         }
 
     @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
             urls = queued_batches.get(block=False)
@@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return urls
 
     @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
         check_date_str(date_str)
         check_public_user_id(public_user_id)
         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
         return get_batch_ids_for_prefix(prefix)
 
     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
         url = get_batch_url(batch_id, date_str, public_user_id)
         data = json.loads(gzip.decompress(requests.get(url).content))
         return {
@@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
             'batch': data,
         }
 
-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
         return [] if last_batch is None else [last_batch]
 
     @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
         check_date_str(date_str)
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)
 
     @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
         return stats_manager.get_stats()
 
     @router.get('/')
-    def status():
+    def status(request):
         return {
             'status': 'ok'
         }
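The `/latest-batch` change shows the response-schema idiom moving away from FastAPI's `response_model=` decorator argument; in the new code the return annotation documents the shape, and Django Ninja's own mechanism for validated responses is a `response=` argument. A hedged sketch with a hypothetical stand-in schema:

```python
from typing import List

from ninja import Router, Schema


class ExampleBatch(Schema):
    # Illustrative stand-in for HashedBatch, not the real model.
    user_id_hash: str
    timestamp: int


router = Router(tags=["example"])


# response= is Ninja's counterpart to FastAPI's response_model=; the plain
# return annotation used in the diff above is informational.
@router.get('/latest-batch', response=List[ExampleBatch])
def get_latest_batch(request):
    return []
```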

mwmbl/crawler/batch.py
@@ -1,21 +1,21 @@
 from typing import Optional
 
-from pydantic import BaseModel
+from ninja import Schema
 
 
-class ItemContent(BaseModel):
+class ItemContent(Schema):
     title: str
     extract: str
     links: list[str]
     extra_links: Optional[list[str]]
 
 
-class ItemError(BaseModel):
+class ItemError(Schema):
     name: str
     message: Optional[str]
 
 
-class Item(BaseModel):
+class Item(Schema):
     url: str
     status: Optional[int]
     timestamp: int
@@ -23,16 +23,16 @@ class Item(BaseModel):
     error: Optional[ItemError]
 
 
-class Batch(BaseModel):
+class Batch(Schema):
     user_id: str
     items: list[Item]
 
 
-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
     user_id: str
 
 
-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
     user_id_hash: str
     timestamp: int
     items: list[Item]
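The model bodies survive the base-class swap because `ninja.Schema` is itself a subclass of Pydantic's `BaseModel` (with extra resolver machinery for Django objects), so field declarations and validation carry over unchanged. A quick sketch of the equivalence (Pydantic v1 API, as Django Ninja bundled at the time):

```python
from typing import Optional

from ninja import Schema
from pydantic import BaseModel

# Schema is a thin layer over BaseModel, so the classes above only needed
# their base class changed.
assert issubclass(Schema, BaseModel)


class ItemError(Schema):
    # Same shape as the model in the diff above.
    name: str
    message: Optional[str]


err = ItemError(name="HTTPError", message=None)
print(err.dict())  # {'name': 'HTTPError', 'message': None}
```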

mwmbl/main.py
@@ -83,7 +83,7 @@ def run():
     app.include_router(search_router)
 
     batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-    crawler_router = crawler.get_router(batch_cache, queued_batches)
+    crawler_router = crawler.create_router(batch_cache, queued_batches)
     app.include_router(crawler_router)
 
     user_router = user.create_router(index_path)

mwmbl/tinysearchengine/search.py
@@ -14,11 +14,11 @@ def create_router(ranker: HeuristicRanker) -> Router:
     router = Router(tags=["search"])
 
     @router.get("")
-    def search(s: str):
+    def search(request, s: str):
         return ranker.search(s)
 
     @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
         return ranker.complete(q)
 
     return router
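In Django Ninja, every handler parameter after `request` that does not appear in the path is treated as a query parameter, which is why `s` and `q` keep their meaning here with only the extra `request` argument added. A small sketch of that convention (router and echo endpoint are illustrative only):

```python
from ninja import Router

router = Router(tags=["example"])


@router.get("/echo")
def echo(request, s: str, limit: int = 10):
    # `s` is a required query parameter; `limit` is optional with a default.
    # Ninja parses and type-converts both from the query string.
    return {"s": s, "limit": limit}

# GET /echo?s=hello          -> {"s": "hello", "limit": 10}
# GET /echo?s=hello&limit=3  -> {"s": "hello", "limit": 3}
```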