1 year ago · b6fd27352b
--- a/app/api.py
+++ b/app/api.py
@@ -1,9 +1,12 @@
 
				+from multiprocessing import Queue
			
 
				 from pathlib import Path
			
 
				 
			
 
				 from ninja import NinjaAPI
			
 
				 
			
 
				 from app import settings
			
 
				-from mwmbl.indexer.paths import INDEX_NAME
			
 
				+import mwmbl.crawler.app as crawler
			
 
				+from mwmbl.indexer.batch_cache import BatchCache
			
 
				+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
			
 
				 from mwmbl.tinysearchengine import search
			
 
				 from mwmbl.tinysearchengine.completer import Completer
			
 
				 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
			
@@ -19,10 +22,10 @@ completer = Completer()
 
				 ranker = HeuristicRanker(tiny_index, completer)
			
 
				 
			
 
				 search_router = search.create_router(ranker)
			
 
				-
			
 
				 api.add_router("/search/", search_router)
			
 
				 
			
 
				+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
			
 
				 
			
 
				-@api.get("/hello")
			
 
				-def hello(request):
			
 
				-    return {"response": "Hello world"}
			
 
				+queued_batches = Queue()
			
 
				+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
			
 
				+api.add_router("/crawler/", crawler_router)
			
--- a/app/urls.py
+++ b/app/urls.py
@@ -21,5 +21,5 @@ from app.api import api
 
				 
			
 
				 urlpatterns = [
			
 
				     path('admin/', admin.site.urls),
			
 
				-    path('api/v1/', api.urls)
			
 
				+    path('', api.urls)
			
 
				 ]
			
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@@ -14,6 +14,7 @@ from fastapi import HTTPException, APIRouter
 
				 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
			
 
				     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
			
 
				     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
			
 
				+from ninja import Router
			
 
				 from redis import Redis
			
 
				 
			
 
				 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
			
@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
 
				     return paragraphs, title
			
 
				 
			
 
				 
			
 
				-def get_router(batch_cache: BatchCache, queued_batches: Queue):
			
 
				-    router = APIRouter(prefix="/crawler", tags=["crawler"])
			
 
				+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
			
 
				+    router = Router(tags=["crawler"])
			
 
				 
			
 
				-    @router.on_event("startup")
			
 
				-    async def on_startup():
			
 
				-        with Database() as db:
			
 
				-            url_db = URLDatabase(db.connection)
			
 
				-            return url_db.create_tables()
			
 
				+    # TODO: ensure tables are created before crawler code is used,
			
 
				+    #       e.g. with Database() as db:
			
 
				+    #                URLDatabase(db.connection).create_tables()
			
 
				 
			
 
				     @router.get('/fetch')
			
 
				-    def fetch_url(url: str, query: str):
			
 
				+    def fetch_url(request, url: str, query: str):
			
 
				         response = requests.get(url)
			
 
				         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
			
 
				         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
			
@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
 
				         return format_result(result, query)
			
 
				 
			
 
				     @router.post('/batches/')
			
 
				-    def post_batch(batch: Batch):
			
 
				+    def post_batch(request, batch: Batch):
			
 
				         if len(batch.items) > MAX_BATCH_SIZE:
			
 
				             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
			
 
				 
			
@@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
 
				         }
			
 
				 
			
 
				     @router.post('/batches/new')
			
 
				-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
			
 
				+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
			
 
				         user_id_hash = _get_user_id_hash(batch_request)
			
 
				         try:
			
 
				             urls = queued_batches.get(block=False)
			
@@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
 
				         return urls
			
 
				 
			
 
				     @router.get('/batches/{date_str}/users/{public_user_id}')
			
 
				-    def get_batches_for_date_and_user(date_str, public_user_id):
			
 
				+    def get_batches_for_date_and_user(request, date_str, public_user_id):
			
 
				         check_date_str(date_str)
			
 
				         check_public_user_id(public_user_id)
			
 
				         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
			
 
				         return get_batch_ids_for_prefix(prefix)
			
 
				 
			
 
				     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
			
 
				-    def get_batch_from_id(date_str, public_user_id, batch_id):
			
 
				+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
			
 
				         url = get_batch_url(batch_id, date_str, public_user_id)
			
 
				         data = json.loads(gzip.decompress(requests.get(url).content))
			
 
				         return {
			
@@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
 
				             'batch': data,
			
 
				         }
			
 
				 
			
 
				-    @router.get('/latest-batch', response_model=list[HashedBatch])
			
 
				-    def get_latest_batch():
			
 
				+    @router.get('/latest-batch', response=list[HashedBatch])
			
 
				+    def get_latest_batch(request):  # schema is declared via `response=`; ninja does not read return annotations
			
 
				         return [] if last_batch is None else [last_batch]
			
 
				 
			
 
				     @router.get('/batches/{date_str}/users')
			
 
				-    def get_user_id_hashes_for_date(date_str: str):
			
 
				+    def get_user_id_hashes_for_date(request, date_str: str):
			
 
				         check_date_str(date_str)
			
 
				         prefix = f'1/{VERSION}/{date_str}/1/'
			
 
				         return get_subfolders(prefix)
			
 
				 
			
 
				     @router.get('/stats')
			
 
				-    def get_stats() -> MwmblStats:
			
 
				+    def get_stats(request) -> MwmblStats:
			
 
				         return stats_manager.get_stats()
			
 
				 
			
 
				     @router.get('/')
			
 
				-    def status():
			
 
				+    def status(request):
			
 
				         return {
			
 
				             'status': 'ok'
			
 
				         }
			
--- a/mwmbl/crawler/batch.py
+++ b/mwmbl/crawler/batch.py
@@ -1,21 +1,21 @@
 
				 from typing import Optional
			
 
				 
			
 
				-from pydantic import BaseModel
			
 
				+from ninja import Schema
			
 
				 
			
 
				 
			
 
				-class ItemContent(BaseModel):
			
 
				+class ItemContent(Schema):
			
 
				     title: str
			
 
				     extract: str
			
 
				     links: list[str]
			
 
				     extra_links: Optional[list[str]]
			
 
				 
			
 
				 
			
 
				-class ItemError(BaseModel):
			
 
				+class ItemError(Schema):
			
 
				     name: str
			
 
				     message: Optional[str]
			
 
				 
			
 
				 
			
 
				-class Item(BaseModel):
			
 
				+class Item(Schema):
			
 
				     url: str
			
 
				     status: Optional[int]
			
 
				     timestamp: int
			
@@ -23,16 +23,16 @@ class Item(BaseModel):
 
				     error: Optional[ItemError]
			
 
				 
			
 
				 
			
 
				-class Batch(BaseModel):
			
 
				+class Batch(Schema):
			
 
				     user_id: str
			
 
				     items: list[Item]
			
 
				 
			
 
				 
			
 
				-class NewBatchRequest(BaseModel):
			
 
				+class NewBatchRequest(Schema):
			
 
				     user_id: str
			
 
				 
			
 
				 
			
 
				-class HashedBatch(BaseModel):
			
 
				+class HashedBatch(Schema):
			
 
				     user_id_hash: str
			
 
				     timestamp: int
			
 
				     items: list[Item]
			
--- a/mwmbl/main.py
+++ b/mwmbl/main.py
@@ -83,7 +83,7 @@ def run():
 
				         app.include_router(search_router)
			
 
				 
			
 
				         batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
			
 
				-        crawler_router = crawler.get_router(batch_cache, queued_batches)
			
 
				+        crawler_router = crawler.create_router(batch_cache, queued_batches)
			
 
				         app.include_router(crawler_router)
			
 
				 
			
 
				         user_router = user.create_router(index_path)
			
--- a/mwmbl/tinysearchengine/search.py
+++ b/mwmbl/tinysearchengine/search.py
@@ -14,11 +14,11 @@ def create_router(ranker: HeuristicRanker) -> Router:
 
				     router = Router(tags=["search"])
			
 
				 
			
 
				     @router.get("")
			
 
				-    def search(s: str):
			
 
				+    def search(request, s: str):
			
 
				         return ranker.search(s)
			
 
				 
			
 
				     @router.get("/complete")
			
 
				-    def complete(q: str):
			
 
				+    def complete(request, q: str):
			
 
				         return ranker.complete(q)
			
 
				 
			
 
				     return router