Daoud Clarke 1 year ago
parent
commit
b6fd27352b

+ 8 - 5
app/api.py

@@ -1,9 +1,12 @@
+from multiprocessing import Queue
 from pathlib import Path
 
 from ninja import NinjaAPI
 
 from app import settings
-from mwmbl.indexer.paths import INDEX_NAME
+import mwmbl.crawler.app as crawler
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -19,10 +22,10 @@ completer = Completer()
 ranker = HeuristicRanker(tiny_index, completer)
 
 search_router = search.create_router(ranker)
-
 api.add_router("/search/", search_router)
 
+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
 
-@api.get("/hello")
-def hello(request):
-    return {"response": "Hello world"}
+queued_batches = Queue()
+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
+api.add_router("/crawler/", crawler_router)
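For context, this is the shape app/api.py ends up with: one NinjaAPI instance, with each feature packaged as a ninja Router and mounted under its own prefix. A minimal self-contained sketch of that pattern (a stub handler inside a configured Django project, not mwmbl's real search code):

from ninja import NinjaAPI, Router

api = NinjaAPI()

search_router = Router(tags=["search"])

@search_router.get("")
def search(request, s: str):
    # Django Ninja always passes the HttpRequest first; `s` is parsed
    # from the query string, e.g. GET /search/?s=hello
    return {"query": s}

api.add_router("/search/", search_router)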

+ 1 - 1
app/urls.py

@@ -21,5 +21,5 @@ from app.api import api
 
 urlpatterns = [
     path('admin/', admin.site.urls),
-    path('api/v1/', api.urls)
+    path('', api.urls)
 ]
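With the mount point changed from 'api/v1/' to '', the API's routes now resolve at the site root. A quick smoke test, assuming Django's test client is run against the routes this commit defines:

from django.test import Client

client = Client()

# The crawler status endpoint from mwmbl/crawler/app.py now lives at
# /crawler/ rather than /api/v1/crawler/.
assert client.get("/crawler/").json() == {"status": "ok"}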

+ 16 - 17
mwmbl/crawler/app.py

@@ -14,6 +14,7 @@ from fastapi import HTTPException, APIRouter
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
     return paragraphs, title
 
 
-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])
 
-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: ensure tables are created before crawler code is used:
+    #
+    #     url_db.create_tables()
 
     @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return format_result(result, query)
 
     @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
 
@@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         }
 
     @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
             urls = queued_batches.get(block=False)
@@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return urls
 
     @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
         check_date_str(date_str)
         check_public_user_id(public_user_id)
         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
         return get_batch_ids_for_prefix(prefix)
 
     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
         url = get_batch_url(batch_id, date_str, public_user_id)
         data = json.loads(gzip.decompress(requests.get(url).content))
         return {
@@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
             'batch': data,
         }
 
-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
         return [] if last_batch is None else [last_batch]
 
     @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
         check_date_str(date_str)
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)
 
     @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
         return stats_manager.get_stats()
 
     @router.get('/')
-    def status():
+    def status(request):
         return {
             'status': 'ok'
         }
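Ninja's Router has no counterpart to FastAPI's @router.on_event("startup") hook, which is why the table-creation step above turns into a TODO. One possible replacement, sketched here as an assumption rather than what the commit does, is to run the setup eagerly when the router is built, reusing the Database and URLDatabase calls from the removed hook:

def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
    router = Router(tags=["crawler"])

    # Hypothetical stand-in for the removed startup event: create the
    # URL tables once, when the router is constructed at import time.
    with Database() as db:
        URLDatabase(db.connection).create_tables()

    ...  # endpoint definitions as in the diff above
    return router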

+ 7 - 7
mwmbl/crawler/batch.py

@@ -1,21 +1,21 @@
 from typing import Optional
 
-from pydantic import BaseModel
+from ninja import Schema
 
 
-class ItemContent(BaseModel):
+class ItemContent(Schema):
     title: str
     extract: str
     links: list[str]
     extra_links: Optional[list[str]]
 
 
-class ItemError(BaseModel):
+class ItemError(Schema):
     name: str
     message: Optional[str]
 
 
-class Item(BaseModel):
+class Item(Schema):
     url: str
     status: Optional[int]
     timestamp: int
@@ -23,16 +23,16 @@ class Item(BaseModel):
     error: Optional[ItemError]
 
 
-class Batch(BaseModel):
+class Batch(Schema):
     user_id: str
     items: list[Item]
 
 
-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
     user_id: str
 
 
-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
     user_id_hash: str
     timestamp: int
     items: list[Item]
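ninja.Schema is a thin subclass of pydantic's BaseModel, so these models keep their existing validation behaviour while gaining Ninja's request/response integration. A quick check, using the pydantic v1 style API that django-ninja exposed around this time:

from typing import Optional
from ninja import Schema

class ItemError(Schema):
    name: str
    message: Optional[str]

err = ItemError(name="timeout", message=None)
print(err.dict())  # {'name': 'timeout', 'message': None}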

+ 1 - 1
mwmbl/main.py

@@ -83,7 +83,7 @@ def run():
         app.include_router(search_router)
 
         batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-        crawler_router = crawler.get_router(batch_cache, queued_batches)
+        crawler_router = crawler.create_router(batch_cache, queued_batches)
         app.include_router(crawler_router)
 
         user_router = user.create_router(index_path)

+ 2 - 2
mwmbl/tinysearchengine/search.py

@@ -14,11 +14,11 @@ def create_router(ranker: HeuristicRanker) -> Router:
     router = Router(tags=["search"])
 
     @router.get("")
-    def search(s: str):
+    def search(request, s: str):
         return ranker.search(s)
 
     @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
         return ranker.complete(q)
 
     return router
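The only change here is the now-mandatory `request` parameter, which Ninja fills with Django's HttpRequest. These handlers ignore it, but it makes Django-level data available where needed; a hypothetical endpoint (not part of this commit) illustrating that:

from ninja import Router

router = Router(tags=["search"])

@router.get("/header")
def header(request, name: str):
    # request is a django.http.HttpRequest, so headers, cookies and the
    # authenticated user are all available inside handlers.
    return {name: request.headers.get(name)}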