Daoud Clarke 1 ano atrás
pai
commit
e1bf423e69
2 arquivos alterados com 51 adições e 5 exclusões
  1. 13 2
      mwmbl/crawler/app.py
  2. 38 3
      mwmbl/crawler/stats.py

+ 13 - 2
mwmbl/crawler/app.py

@@ -13,8 +13,10 @@ from fastapi import HTTPException, APIRouter
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from redis import Redis
 
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
+from mwmbl.crawler.stats import MwmblStats, StatsManager
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
 from mwmbl.database import Database
 from mwmbl.database import Database
 from mwmbl.format import format_result
 from mwmbl.format import format_result
@@ -31,9 +33,11 @@ from mwmbl.settings import (
     PUBLIC_URL_PREFIX,
     PUBLIC_URL_PREFIX,
     PUBLIC_USER_ID_LENGTH,
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
     FILE_NAME_SUFFIX,
-    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
+    DATE_REGEX, NUM_EXTRACT_CHARS)
 from mwmbl.tinysearchengine.indexer import Document
 from mwmbl.tinysearchengine.indexer import Document
-from mwmbl.url_queue import URLQueue
+
+
+redis = Redis(host='localhost', port=6379, decode_responses=True)
 
 
 
 
 def get_bucket(name):
 def get_bucket(name):
@@ -191,6 +195,13 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         prefix = f'1/{VERSION}/{date_str}/1/'
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)
         return get_subfolders(prefix)
 
 
+    @router.get('/stats')
+    def get_stats() -> MwmblStats:
+        stats = StatsManager(redis)
+        stats = stats.get_stats()
+        print("Stats", stats)
+        return stats
+
     @router.get('/')
     @router.get('/')
     def status():
     def status():
         return {
         return {

+ 38 - 3
mwmbl/crawler/stats.py

@@ -1,11 +1,11 @@
 import gzip
 import gzip
-import json
 from datetime import datetime
 from datetime import datetime
 from glob import glob
 from glob import glob
 from itertools import islice
 from itertools import islice
 from logging import getLogger
 from logging import getLogger
 from urllib.parse import urlparse
 from urllib.parse import urlparse
 
 
+from pydantic import BaseModel
 from redis import Redis
 from redis import Redis
 
 
 from mwmbl.crawler.batch import HashedBatch
 from mwmbl.crawler.batch import HashedBatch
@@ -20,6 +20,13 @@ HOST_COUNT_KEY = "host-count-{date}"
 EXPIRE_SECONDS = 60*60*24
 EXPIRE_SECONDS = 60*60*24
 
 
 
 
class MwmblStats(BaseModel):
    """Crawl statistics payload returned by the /stats endpoint."""

    # Total number of URLs crawled so far today.
    urls_crawled_today: int
    # URL counts per hour from midnight up to the current hour.
    urls_crawled_hourly: list[int]
    # Top users by number of URLs crawled today (user id -> count).
    top_users: dict[str, int]
    # Top domains by number of URLs crawled today (host -> count).
    top_domains: dict[str, int]
+
 class StatsManager:
 class StatsManager:
     def __init__(self, redis: Redis):
     def __init__(self, redis: Redis):
         self.redis = redis
         self.redis = redis
@@ -51,8 +58,36 @@ class StatsManager:
             self.redis.zincrby(host_key, 1, host)
             self.redis.zincrby(host_key, 1, host)
         self.redis.expire(host_key, EXPIRE_SECONDS)
         self.redis.expire(host_key, EXPIRE_SECONDS)
 
 
-    def get_stats(self):
-        pass
+    def get_stats(self) -> MwmblStats:
+        date_time = datetime.now()
+        date = date_time.date()
+        url_count_key = URL_DATE_COUNT_KEY.format(date=date)
+        url_count = self.redis.get(url_count_key)
+
+        if url_count is None:
+            url_count = 0
+
+        hour_counts = []
+        for i in range(date_time.hour + 1):
+            hour = datetime(date_time.year, date_time.month, date_time.day, i)
+            hour_key = URL_HOUR_COUNT_KEY.format(hour=hour)
+            hour_count = self.redis.get(hour_key)
+            if hour_count is None:
+                hour_count = 0
+            hour_counts.append(hour_count)
+
+        user_count_key = USER_COUNT_KEY.format(date=date_time.date)
+        user_counts = self.redis.zrevrange(user_count_key, 0, 100, withscores=True)
+
+        host_key = HOST_COUNT_KEY.format(date=date_time.date)
+        host_counts = self.redis.zrevrange(host_key, 0, 100, withscores=True)
+
+        return MwmblStats(
+            urls_crawled_today=url_count,
+            urls_crawled_hourly=hour_counts,
+            top_users=user_counts,
+            top_domains=host_counts,
+        )
 
 
 
 
 def get_test_batches():
 def get_test_batches():