Get stats

Daoud Clarke 2023-09-29 13:58:26 +01:00
parent a55a027107
commit e1bf423e69
2 changed files with 51 additions and 5 deletions


@@ -13,8 +13,10 @@ from fastapi import HTTPException, APIRouter
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
from redis import Redis
from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.stats import MwmblStats, StatsManager
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
from mwmbl.database import Database
from mwmbl.format import format_result
@@ -31,9 +33,11 @@ from mwmbl.settings import (
    PUBLIC_URL_PREFIX,
    PUBLIC_USER_ID_LENGTH,
    FILE_NAME_SUFFIX,
    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
    DATE_REGEX, NUM_EXTRACT_CHARS)
from mwmbl.tinysearchengine.indexer import Document
from mwmbl.url_queue import URLQueue
redis = Redis(host='localhost', port=6379, decode_responses=True)
def get_bucket(name):
@@ -191,6 +195,13 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
        prefix = f'1/{VERSION}/{date_str}/1/'
        return get_subfolders(prefix)

    @router.get('/stats')
    def get_stats() -> MwmblStats:
        stats = StatsManager(redis)
        stats = stats.get_stats()
        print("Stats", stats)
        return stats

    @router.get('/')
    def status():
        return {
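Once the app is running, the new endpoint can be sanity-checked with a plain HTTP client. The base URL and the /crawler mount prefix below are assumptions for illustration, not part of this diff:

import requests

# Assumed base URL and router prefix; adjust to wherever the router returned
# by get_router() is actually mounted.
response = requests.get("http://localhost:5000/crawler/stats")
response.raise_for_status()

stats = response.json()  # serialised MwmblStats
print(stats["urls_crawled_today"], stats["urls_crawled_hourly"])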


@@ -1,11 +1,11 @@
import gzip
import json
from datetime import datetime
from glob import glob
from itertools import islice
from logging import getLogger
from urllib.parse import urlparse
from pydantic import BaseModel
from redis import Redis
from mwmbl.crawler.batch import HashedBatch
@@ -20,6 +20,13 @@ HOST_COUNT_KEY = "host-count-{date}"
EXPIRE_SECONDS = 60*60*24
class MwmblStats(BaseModel):
    urls_crawled_today: int
    urls_crawled_hourly: list[int]
    top_users: dict[str, int]
    top_domains: dict[str, int]


class StatsManager:
    def __init__(self, redis: Redis):
        self.redis = redis
@@ -51,8 +58,36 @@
        self.redis.zincrby(host_key, 1, host)
        self.redis.expire(host_key, EXPIRE_SECONDS)
    def get_stats(self):
        pass

    def get_stats(self) -> MwmblStats:
        date_time = datetime.now()
        date = date_time.date()

        # Total URLs crawled today.
        url_count_key = URL_DATE_COUNT_KEY.format(date=date)
        url_count = self.redis.get(url_count_key)
        if url_count is None:
            url_count = 0

        # Hourly counts from midnight up to and including the current hour.
        hour_counts = []
        for i in range(date_time.hour + 1):
            hour = datetime(date_time.year, date_time.month, date_time.day, i)
            hour_key = URL_HOUR_COUNT_KEY.format(hour=hour)
            hour_count = self.redis.get(hour_key)
            if hour_count is None:
                hour_count = 0
            hour_counts.append(hour_count)

        # Highest-scoring users and domains by crawl count for today.
        user_count_key = USER_COUNT_KEY.format(date=date)
        user_counts = self.redis.zrevrange(user_count_key, 0, 100, withscores=True)

        host_key = HOST_COUNT_KEY.format(date=date)
        host_counts = self.redis.zrevrange(host_key, 0, 100, withscores=True)

        return MwmblStats(
            urls_crawled_today=url_count,
            urls_crawled_hourly=hour_counts,
            top_users=user_counts,
            top_domains=host_counts,
        )
def get_test_batches():
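
For reference, a minimal sketch of exercising get_stats() against an in-memory Redis. It assumes fakeredis is available as a test dependency and that the key-format constants referenced above are defined at module level in mwmbl.crawler.stats; the seeded values are illustrative only.

from datetime import datetime

import fakeredis

from mwmbl.crawler.stats import (
    StatsManager, URL_DATE_COUNT_KEY, URL_HOUR_COUNT_KEY, USER_COUNT_KEY, HOST_COUNT_KEY)

# In-memory stand-in for the Redis instance used by the API.
redis = fakeredis.FakeRedis(decode_responses=True)

now = datetime.now()
today = now.date()

# Seed the keys that get_stats() reads; values are made up for illustration.
redis.set(URL_DATE_COUNT_KEY.format(date=today), 123)
redis.set(URL_HOUR_COUNT_KEY.format(hour=datetime(now.year, now.month, now.day, now.hour)), 7)
redis.zincrby(USER_COUNT_KEY.format(date=today), 100, "some-user-id")
redis.zincrby(HOST_COUNT_KEY.format(date=today), 50, "example.com")

stats = StatsManager(redis).get_stats()
print(stats.urls_crawled_today, stats.top_domains)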