Get stats
This commit is contained in:
parent
a55a027107
commit
e1bf423e69
2 changed files with 51 additions and 5 deletions
|
@ -13,8 +13,10 @@ from fastapi import HTTPException, APIRouter
|
|||
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
|
||||
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
|
||||
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
|
||||
from redis import Redis
|
||||
|
||||
from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
|
||||
from mwmbl.crawler.stats import MwmblStats, StatsManager
|
||||
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
|
||||
from mwmbl.database import Database
|
||||
from mwmbl.format import format_result
|
||||
|
@ -31,9 +33,11 @@ from mwmbl.settings import (
|
|||
PUBLIC_URL_PREFIX,
|
||||
PUBLIC_USER_ID_LENGTH,
|
||||
FILE_NAME_SUFFIX,
|
||||
DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
|
||||
DATE_REGEX, NUM_EXTRACT_CHARS)
|
||||
from mwmbl.tinysearchengine.indexer import Document
|
||||
from mwmbl.url_queue import URLQueue
|
||||
|
||||
|
||||
redis = Redis(host='localhost', port=6379, decode_responses=True)
|
||||
|
||||
|
||||
def get_bucket(name):
|
||||
|
@ -191,6 +195,13 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
|
|||
prefix = f'1/{VERSION}/{date_str}/1/'
|
||||
return get_subfolders(prefix)
|
||||
|
||||
@router.get('/stats')
|
||||
def get_stats() -> MwmblStats:
|
||||
stats = StatsManager(redis)
|
||||
stats = stats.get_stats()
|
||||
print("Stats", stats)
|
||||
return stats
|
||||
|
||||
@router.get('/')
|
||||
def status():
|
||||
return {
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import gzip
|
||||
import json
|
||||
from datetime import datetime
|
||||
from glob import glob
|
||||
from itertools import islice
|
||||
from logging import getLogger
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pydantic import BaseModel
|
||||
from redis import Redis
|
||||
|
||||
from mwmbl.crawler.batch import HashedBatch
|
||||
|
@ -20,6 +20,13 @@ HOST_COUNT_KEY = "host-count-{date}"
|
|||
EXPIRE_SECONDS = 60*60*24
|
||||
|
||||
|
||||
class MwmblStats(BaseModel):
    """Snapshot of crawl statistics returned by the /stats API endpoint."""

    # Total number of URLs crawled so far today.
    urls_crawled_today: int
    # Per-hour crawl counts for today; index 0 is the midnight hour and the
    # list runs up to the current hour (see StatsManager.get_stats).
    urls_crawled_hourly: list[int]
    # Top contributors mapped to their crawl counts — presumably keyed by
    # user ID; confirm against the code that populates USER_COUNT_KEY.
    top_users: dict[str, int]
    # Most-crawled hosts mapped to their crawl counts.
    top_domains: dict[str, int]
|
||||
|
||||
|
||||
class StatsManager:
    """Records crawl events into Redis counters and aggregates them back
    into a MwmblStats snapshot."""

    def __init__(self, redis: Redis):
        """Store the Redis connection used for all counter reads/writes."""
        self.redis = redis
|
||||
|
@ -51,8 +58,36 @@ class StatsManager:
|
|||
self.redis.zincrby(host_key, 1, host)
|
||||
self.redis.expire(host_key, EXPIRE_SECONDS)
|
||||
|
||||
def get_stats(self):
|
||||
pass
|
||||
def get_stats(self) -> MwmblStats:
|
||||
date_time = datetime.now()
|
||||
date = date_time.date()
|
||||
url_count_key = URL_DATE_COUNT_KEY.format(date=date)
|
||||
url_count = self.redis.get(url_count_key)
|
||||
|
||||
if url_count is None:
|
||||
url_count = 0
|
||||
|
||||
hour_counts = []
|
||||
for i in range(date_time.hour + 1):
|
||||
hour = datetime(date_time.year, date_time.month, date_time.day, i)
|
||||
hour_key = URL_HOUR_COUNT_KEY.format(hour=hour)
|
||||
hour_count = self.redis.get(hour_key)
|
||||
if hour_count is None:
|
||||
hour_count = 0
|
||||
hour_counts.append(hour_count)
|
||||
|
||||
user_count_key = USER_COUNT_KEY.format(date=date_time.date)
|
||||
user_counts = self.redis.zrevrange(user_count_key, 0, 100, withscores=True)
|
||||
|
||||
host_key = HOST_COUNT_KEY.format(date=date_time.date)
|
||||
host_counts = self.redis.zrevrange(host_key, 0, 100, withscores=True)
|
||||
|
||||
return MwmblStats(
|
||||
urls_crawled_today=url_count,
|
||||
urls_crawled_hourly=hour_counts,
|
||||
top_users=user_counts,
|
||||
top_domains=host_counts,
|
||||
)
|
||||
|
||||
|
||||
def get_test_batches():
|
||||
|
|
Loading…
Reference in a new issue