Add crawler router
parent ed64ca6c91
commit b6fd27352b
6 changed files with 35 additions and 33 deletions

app/api.py (13 changes)
@@ -1,9 +1,12 @@
 from multiprocessing import Queue
+from pathlib import Path
 
 from ninja import NinjaAPI
 
 from app import settings
-from mwmbl.indexer.paths import INDEX_NAME
+import mwmbl.crawler.app as crawler
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -19,10 +22,10 @@ completer = Completer()
 ranker = HeuristicRanker(tiny_index, completer)
 
 search_router = search.create_router(ranker)
 api.add_router("/search/", search_router)
 
+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
 
-@api.get("/hello")
-def hello(request):
-    return {"response": "Hello world"}
+queued_batches = Queue()
+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
+api.add_router("/crawler/", crawler_router)
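For readers coming from the FastAPI version: in Django Ninja, routers are built by factory functions and mounted on a single NinjaAPI instance, and every operation receives the Django HttpRequest as its first argument. A minimal, self-contained sketch of the pattern adopted above (the router name and endpoint are illustrative, not part of this commit):

```python
from ninja import NinjaAPI, Router


def create_example_router() -> Router:
    """Hypothetical factory mirroring search.create_router / crawler.create_router."""
    router = Router(tags=["example"])

    @router.get("/ping")
    def ping(request):
        # Ninja passes the HttpRequest first; any further parameters are
        # parsed from the path or query string.
        return {"response": "pong"}

    return router


api = NinjaAPI()
api.add_router("/example/", create_example_router())  # served under /example/
```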

app/urls.py
@@ -21,5 +21,5 @@ from app.api import api
 
 urlpatterns = [
     path('admin/', admin.site.urls),
-    path('api/v1/', api.urls)
+    path('', api.urls)
 ]
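On a NinjaAPI instance, `api.urls` returns a URLconf that Django's `path()` accepts directly, so after this change the API is served from the site root rather than under `/api/v1/`. A usage sketch against a local development server (host and port are assumptions, not from this commit):

```python
import requests

# Router prefixes come from app/api.py above.
print(requests.get("http://localhost:8000/search/", params={"s": "mwmbl"}).json())
print(requests.get("http://localhost:8000/crawler/").json())  # {'status': 'ok'}
```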

mwmbl/crawler/app.py
@@ -14,6 +14,7 @@ from fastapi import HTTPException, APIRouter
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
     return paragraphs, title
 
 
-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])
 
-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: ensure tables are created before crawler code is used:
+    #
+    #     url_db.create_tables()
 
     @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return format_result(result, query)
 
     @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
@@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         }
 
     @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
             urls = queued_batches.get(block=False)
@@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return urls
 
     @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
         check_date_str(date_str)
         check_public_user_id(public_user_id)
         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
         return get_batch_ids_for_prefix(prefix)
 
     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
         url = get_batch_url(batch_id, date_str, public_user_id)
         data = json.loads(gzip.decompress(requests.get(url).content))
         return {
@@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
             'batch': data,
         }
 
-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
         return [] if last_batch is None else [last_batch]
 
     @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
         check_date_str(date_str)
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)
 
     @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
         return stats_manager.get_stats()
 
     @router.get('/')
-    def status():
+    def status(request):
         return {
             'status': 'ok'
         }
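The `/latest-batch` change shows the response-schema idiom moving away from FastAPI's `response_model=` decorator argument; in the new code the return annotation documents the shape, and Django Ninja's own mechanism for validated responses is a `response=` argument. A hedged sketch with a hypothetical stand-in schema:

```python
from typing import List

from ninja import Router, Schema


class ExampleBatch(Schema):
    # Illustrative stand-in for HashedBatch, not the real model.
    user_id_hash: str
    timestamp: int


router = Router(tags=["example"])


# response= is Ninja's counterpart to FastAPI's response_model=; the plain
# return annotation used in the diff above is informational.
@router.get('/latest-batch', response=List[ExampleBatch])
def get_latest_batch(request):
    return []
```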

mwmbl/crawler/batch.py
@@ -1,21 +1,21 @@
 from typing import Optional
 
-from pydantic import BaseModel
+from ninja import Schema
 
 
-class ItemContent(BaseModel):
+class ItemContent(Schema):
     title: str
     extract: str
     links: list[str]
     extra_links: Optional[list[str]]
 
 
-class ItemError(BaseModel):
+class ItemError(Schema):
     name: str
     message: Optional[str]
 
 
-class Item(BaseModel):
+class Item(Schema):
     url: str
     status: Optional[int]
     timestamp: int
@@ -23,16 +23,16 @@ class Item(BaseModel):
     error: Optional[ItemError]
 
 
-class Batch(BaseModel):
+class Batch(Schema):
     user_id: str
     items: list[Item]
 
 
-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
     user_id: str
 
 
-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
     user_id_hash: str
     timestamp: int
     items: list[Item]
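The model bodies survive the base-class swap because `ninja.Schema` is itself a subclass of Pydantic's `BaseModel` (with extra resolver machinery for Django objects), so field declarations and validation carry over unchanged. A quick sketch of the equivalence (Pydantic v1 API, as Django Ninja bundled at the time):

```python
from typing import Optional

from ninja import Schema
from pydantic import BaseModel

# Schema is a thin layer over BaseModel, so the classes above only needed
# their base class changed.
assert issubclass(Schema, BaseModel)


class ItemError(Schema):
    # Same shape as the model in the diff above.
    name: str
    message: Optional[str]


err = ItemError(name="HTTPError", message=None)
print(err.dict())  # {'name': 'HTTPError', 'message': None}
```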

mwmbl/main.py
@@ -83,7 +83,7 @@ def run():
     app.include_router(search_router)
 
     batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-    crawler_router = crawler.get_router(batch_cache, queued_batches)
+    crawler_router = crawler.create_router(batch_cache, queued_batches)
     app.include_router(crawler_router)
 
     user_router = user.create_router(index_path)

mwmbl/tinysearchengine/search.py
@@ -14,11 +14,11 @@ def create_router(ranker: HeuristicRanker) -> Router:
     router = Router(tags=["search"])
 
     @router.get("")
-    def search(s: str):
+    def search(request, s: str):
         return ranker.search(s)
 
     @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
         return ranker.complete(q)
 
     return router
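In Django Ninja, every handler parameter after `request` that does not appear in the path is treated as a query parameter, which is why `s` and `q` keep their meaning here with only the extra `request` argument added. A small sketch of that convention (router and echo endpoint are illustrative only):

```python
from ninja import Router

router = Router(tags=["example"])


@router.get("/echo")
def echo(request, s: str, limit: int = 10):
    # `s` is a required query parameter; `limit` is optional with a default.
    # Ninja parses and type-converts both from the query string.
    return {"s": s, "limit": limit}

# GET /echo?s=hello          -> {"s": "hello", "limit": 10}
# GET /echo?s=hello&limit=3  -> {"s": "hello", "limit": 3}
```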