From 988f3fd2a9b6320d18f490809221c3a5630e3703 Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Mon, 2 Oct 2023 22:19:02 +0100
Subject: [PATCH] Add daily URL and user stats over a 30-day window

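Track per-day crawled URL counts and the set of distinct users seen
each day, keeping both series in Redis for 30 days, and expose them
through MwmblStats as urls_crawled_daily and users_crawled_daily. The
hourly counts and the per-day top-user/top-domain stats keep their
24-hour expiry.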
---
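Notes (won't go into the commit message): the distinct-user counting
below relies on one Redis set per day. A minimal sketch of the pattern,
using fakeredis as a stand-in for a real server (the key literal and
client construction here are illustrative only):

    import fakeredis

    r = fakeredis.FakeStrictRedis(decode_responses=True)
    # One set per day: SADD on record_batch, SCARD on get_stats;
    # re-adding the same user hash leaves the cardinality unchanged
    r.sadd("users-2023-10-02", "user-a", "user-b", "user-a")
    assert r.scard("users-2023-10-02") == 2
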
 mwmbl/crawler/stats.py | 48 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/mwmbl/crawler/stats.py b/mwmbl/crawler/stats.py
index 69e7a99..df20e76 100644
--- a/mwmbl/crawler/stats.py
+++ b/mwmbl/crawler/stats.py
@@ -1,5 +1,5 @@
 import gzip
-from datetime import datetime
+from datetime import datetime, timedelta
 from glob import glob
 from itertools import islice
 from logging import getLogger
@@ -15,14 +15,19 @@ logger = getLogger(__name__)
 
 URL_DATE_COUNT_KEY = "url-count-{date}"
 URL_HOUR_COUNT_KEY = "url-count-hour-{hour}"
+USERS_KEY = "users-{date}"
 USER_COUNT_KEY = "user-count-{date}"
 HOST_COUNT_KEY = "host-count-{date}"
-EXPIRE_SECONDS = 60*60*24
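+# Hourly and per-day top-N stats expire after a day; daily series keep 30 days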
+SHORT_EXPIRE_SECONDS = 60 * 60 * 24
+LONG_EXPIRE_SECONDS = 60 * 60 * 24 * 30
 
 
 class MwmblStats(BaseModel):
     urls_crawled_today: int
+    urls_crawled_daily: dict[str, int]
     urls_crawled_hourly: list[int]
+    users_crawled_daily: dict[str, int]
     top_users: dict[str, int]
     top_domains: dict[str, int]
 
@@ -38,16 +43,22 @@ class StatsManager:
 
         url_count_key = URL_DATE_COUNT_KEY.format(date=date_time.date())
         self.redis.incrby(url_count_key, num_crawled_urls)
-        self.redis.expire(url_count_key, EXPIRE_SECONDS)
+        self.redis.expire(url_count_key, LONG_EXPIRE_SECONDS)
 
+        logger.debug("Recording batch at %s", date_time)
         hour = datetime(date_time.year, date_time.month, date_time.day, date_time.hour)
         hour_key = URL_HOUR_COUNT_KEY.format(hour=hour)
         self.redis.incrby(hour_key, num_crawled_urls)
-        self.redis.expire(hour_key, EXPIRE_SECONDS)
+        self.redis.expire(hour_key, SHORT_EXPIRE_SECONDS)
+
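+        # Track each day's distinct users in a set; read back with SCARD in get_stats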
+        users_key = USERS_KEY.format(date=date_time.date())
+        self.redis.sadd(users_key, hashed_batch.user_id_hash)
+        self.redis.expire(users_key, LONG_EXPIRE_SECONDS)
 
         user_count_key = USER_COUNT_KEY.format(date=date_time.date())
         self.redis.zincrby(user_count_key, num_crawled_urls, hashed_batch.user_id_hash)
-        self.redis.expire(user_count_key, EXPIRE_SECONDS)
+        self.redis.expire(user_count_key, SHORT_EXPIRE_SECONDS)
 
         host_key = HOST_COUNT_KEY.format(date=date_time.date())
         for item in hashed_batch.items:
@@ -56,16 +67,25 @@ class StatsManager:
 
             host = urlparse(item.url).netloc
             self.redis.zincrby(host_key, 1, host)
-        self.redis.expire(host_key, EXPIRE_SECONDS)
+        self.redis.expire(host_key, SHORT_EXPIRE_SECONDS)
 
     def get_stats(self) -> MwmblStats:
         date_time = datetime.now()
         date = date_time.date()
-        url_count_key = URL_DATE_COUNT_KEY.format(date=date)
-        url_count = self.redis.get(url_count_key)
 
-        if url_count is None:
-            url_count = 0
+        urls_crawled_daily = {}
+        users_crawled_daily = {}
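+        # Walk the last 30 days oldest first so insertion order is chronological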
+        for i in range(29, -1, -1):
+            date_i = date - timedelta(days=i)
+            url_count_key = URL_DATE_COUNT_KEY.format(date=date_i)
+            url_count = self.redis.get(url_count_key)
+            # Redis returns a string, or None if the key is missing; coerce to int
+            urls_crawled_daily[str(date_i)] = int(url_count) if url_count else 0
+
+            user_day_count_key = USERS_KEY.format(date=date_i)
+            user_day_count = self.redis.scard(user_day_count_key)
+            users_crawled_daily[str(date_i)] = user_day_count
 
         hour_counts = []
         for i in range(date_time.hour + 1):
@@ -82,9 +102,13 @@
         host_key = HOST_COUNT_KEY.format(date=date_time.date())
         host_counts = self.redis.zrevrange(host_key, 0, 100, withscores=True)
 
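+        # The daily series is ordered oldest to newest, so the last entry is today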
+        urls_crawled_today = list(urls_crawled_daily.values())[-1]
         return MwmblStats(
-            urls_crawled_today=url_count,
+            urls_crawled_today=urls_crawled_today,
+            urls_crawled_daily=urls_crawled_daily,
             urls_crawled_hourly=hour_counts,
+            users_crawled_daily=users_crawled_daily,
             top_users=user_counts,
             top_domains=host_counts,
         )
@@ -103,7 +127,7 @@ if __name__ == '__main__':
     batches = get_test_batches()
     start = datetime.now()
     processed = 0
-    for batch in islice(batches, 100):
+    for batch in islice(batches, 10000):
         stats.record_batch(batch)
         processed += 1
     total_time = (datetime.now() - start).total_seconds()