|
@@ -1,11 +1,11 @@
|
|
import gzip
|
|
import gzip
|
|
-import json
|
|
|
|
from datetime import datetime
|
|
from datetime import datetime
|
|
from glob import glob
|
|
from glob import glob
|
|
from itertools import islice
|
|
from itertools import islice
|
|
from logging import getLogger
|
|
from logging import getLogger
|
|
from urllib.parse import urlparse
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
|
|
+from pydantic import BaseModel
|
|
from redis import Redis
|
|
from redis import Redis
|
|
|
|
|
|
from mwmbl.crawler.batch import HashedBatch
|
|
from mwmbl.crawler.batch import HashedBatch
|
|
@@ -20,6 +20,13 @@ HOST_COUNT_KEY = "host-count-{date}"
|
|
EXPIRE_SECONDS = 60*60*24
|
|
EXPIRE_SECONDS = 60*60*24
|
|
|
|
|
|
|
|
|
|
|
|
class MwmblStats(BaseModel):
    """Crawl statistics returned by ``StatsManager.get_stats``."""

    # Count read from the per-date URL counter for the current day.
    urls_crawled_today: int

    # One entry per hour of the current day, starting at hour 0 and ending
    # with the current hour.
    urls_crawled_hourly: list[int]

    # Highest-scoring members of the per-date user sorted set, mapped to
    # their scores.
    top_users: dict[str, int]

    # Highest-scoring members of the per-date host sorted set, mapped to
    # their scores.
    top_domains: dict[str, int]
|
|
|
|
+
|
|
|
|
+
|
|
class StatsManager:
|
|
class StatsManager:
|
|
def __init__(self, redis: Redis):
|
|
def __init__(self, redis: Redis):
|
|
self.redis = redis
|
|
self.redis = redis
|
|
@@ -51,8 +58,36 @@ class StatsManager:
|
|
self.redis.zincrby(host_key, 1, host)
|
|
self.redis.zincrby(host_key, 1, host)
|
|
self.redis.expire(host_key, EXPIRE_SECONDS)
|
|
self.redis.expire(host_key, EXPIRE_SECONDS)
|
|
|
|
|
|
- def get_stats(self):
|
|
|
|
- pass
|
|
|
|
|
|
def get_stats(self) -> MwmblStats:
    """Read today's crawl statistics from Redis.

    Looks up the per-date URL counter, the per-hour URL counters from hour 0
    up to and including the current hour, and the highest-scoring entries of
    the per-date user and host sorted sets.

    Returns:
        A ``MwmblStats`` instance. Counters that are missing in Redis are
        reported as 0, and missing sorted sets as empty dicts.
    """
    date_time = datetime.now()
    date = date_time.date()

    def read_count(key: str) -> int:
        # ``get`` yields None for a missing key; otherwise the stored
        # counter comes back as bytes or str, both of which ``int`` parses.
        raw = self.redis.get(key)
        return 0 if raw is None else int(raw)

    def read_ranking(key: str) -> dict[str, int]:
        # With ``withscores=True`` the client returns (member, score) pairs.
        # The end index is inclusive, so this yields up to 101 entries.
        pairs = self.redis.zrevrange(key, 0, 100, withscores=True)
        return {
            (member.decode() if isinstance(member, bytes) else member): int(score)
            for member, score in pairs
        }

    url_count = read_count(URL_DATE_COUNT_KEY.format(date=date))

    hour_counts = [
        read_count(
            URL_HOUR_COUNT_KEY.format(
                hour=datetime(date_time.year, date_time.month, date_time.day, i)
            )
        )
        for i in range(date_time.hour + 1)
    ]

    # Use the computed ``date`` value here. Formatting the key with the
    # uncalled ``date_time.date`` method embeds the method's repr in the key,
    # which can never match a key built from an actual date.
    user_counts = read_ranking(USER_COUNT_KEY.format(date=date))
    host_counts = read_ranking(HOST_COUNT_KEY.format(date=date))

    return MwmblStats(
        urls_crawled_today=url_count,
        urls_crawled_hourly=hour_counts,
        top_users=user_counts,
        top_domains=host_counts,
    )
|
|
|
|
|
|
|
|
|
|
def get_test_batches():
|
|
def get_test_batches():
|