From e1bf423e69cd5228a39d62c948f9dd6bc977b15d Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Fri, 29 Sep 2023 13:58:26 +0100 Subject: [PATCH] Get stats --- mwmbl/crawler/app.py | 15 +++++++++++++-- mwmbl/crawler/stats.py | 41 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py index cad5705..b18e904 100644 --- a/mwmbl/crawler/app.py +++ b/mwmbl/crawler/app.py @@ -13,8 +13,10 @@ from fastapi import HTTPException, APIRouter from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \ LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \ STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor +from redis import Redis from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch +from mwmbl.crawler.stats import MwmblStats, StatsManager from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus from mwmbl.database import Database from mwmbl.format import format_result @@ -31,9 +33,11 @@ from mwmbl.settings import ( PUBLIC_URL_PREFIX, PUBLIC_USER_ID_LENGTH, FILE_NAME_SUFFIX, - DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS) + DATE_REGEX, NUM_EXTRACT_CHARS) from mwmbl.tinysearchengine.indexer import Document -from mwmbl.url_queue import URLQueue + + +redis = Redis(host='localhost', port=6379, decode_responses=True) def get_bucket(name): @@ -191,6 +195,13 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue): prefix = f'1/{VERSION}/{date_str}/1/' return get_subfolders(prefix) + @router.get('/stats') + def get_stats() -> MwmblStats: + stats = StatsManager(redis) + stats = stats.get_stats() + print("Stats", stats) + return stats + @router.get('/') def status(): return { diff --git a/mwmbl/crawler/stats.py b/mwmbl/crawler/stats.py index c71973c..eb0efca 100644 --- a/mwmbl/crawler/stats.py +++ 
b/mwmbl/crawler/stats.py @@ -1,11 +1,11 @@ import gzip -import json from datetime import datetime from glob import glob from itertools import islice from logging import getLogger from urllib.parse import urlparse +from pydantic import BaseModel from redis import Redis from mwmbl.crawler.batch import HashedBatch @@ -20,6 +20,13 @@ HOST_COUNT_KEY = "host-count-{date}" EXPIRE_SECONDS = 60*60*24 +class MwmblStats(BaseModel): + urls_crawled_today: int + urls_crawled_hourly: list[int] + top_users: dict[str, int] + top_domains: dict[str, int] + + class StatsManager: def __init__(self, redis: Redis): self.redis = redis @@ -51,8 +58,36 @@ class StatsManager: self.redis.zincrby(host_key, 1, host) self.redis.expire(host_key, EXPIRE_SECONDS) - def get_stats(self): - pass + def get_stats(self) -> MwmblStats: + date_time = datetime.now() + date = date_time.date() + url_count_key = URL_DATE_COUNT_KEY.format(date=date) + url_count = self.redis.get(url_count_key) + + if url_count is None: + url_count = 0 + + hour_counts = [] + for i in range(date_time.hour + 1): + hour = datetime(date_time.year, date_time.month, date_time.day, i) + hour_key = URL_HOUR_COUNT_KEY.format(hour=hour) + hour_count = self.redis.get(hour_key) + if hour_count is None: + hour_count = 0 + hour_counts.append(hour_count) + + user_count_key = USER_COUNT_KEY.format(date=date_time.date()) + user_counts = self.redis.zrevrange(user_count_key, 0, 100, withscores=True) + + host_key = HOST_COUNT_KEY.format(date=date_time.date()) + host_counts = self.redis.zrevrange(host_key, 0, 100, withscores=True) + + return MwmblStats( + urls_crawled_today=url_count, + urls_crawled_hourly=hour_counts, + top_users=user_counts, + top_domains=host_counts, + ) def get_test_batches():