
Store stats in redis

Daoud Clarke 1 year ago
parent
commit
a55a027107
3 changed files with 557 additions and 346 deletions
  1. mwmbl/crawler/stats.py (+77 -0)
  2. poetry.lock (+479 -346)
  3. pyproject.toml (+1 -0)

mwmbl/crawler/stats.py (+77 -0)

@@ -0,0 +1,77 @@
+import gzip
+import json
+from datetime import datetime
+from glob import glob
+from itertools import islice
+from logging import getLogger
+from urllib.parse import urlparse
+
+from redis import Redis
+
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.indexer.update_urls import get_datetime_from_timestamp
+
+logger = getLogger(__name__)
+
+URL_DATE_COUNT_KEY = "url-count-{date}"
+URL_HOUR_COUNT_KEY = "url-count-hour-{hour}"
+USER_COUNT_KEY = "user-count-{date}"
+HOST_COUNT_KEY = "host-count-{date}"
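+# Keys expire one day after their last update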
+EXPIRE_SECONDS = 60*60*24
+
+
+class StatsManager:
+    def __init__(self, redis: Redis):
+        self.redis = redis
+
+    def record_batch(self, hashed_batch: HashedBatch):
+        date_time = get_datetime_from_timestamp(hashed_batch.timestamp)
+
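+        # Count only items that returned content; failed fetches don't count as crawled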
+        num_crawled_urls = sum(1 for item in hashed_batch.items if item.content is not None)
+
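+        # Daily URL total: a plain counter keyed by date, refreshed to expire a day later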
+        url_count_key = URL_DATE_COUNT_KEY.format(date=date_time.date())
+        self.redis.incrby(url_count_key, num_crawled_urls)
+        self.redis.expire(url_count_key, EXPIRE_SECONDS)
+
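+        # Hourly total: the key embeds the datetime truncated to the hour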
+        hour = datetime(date_time.year, date_time.month, date_time.day, date_time.hour)
+        hour_key = URL_HOUR_COUNT_KEY.format(hour=hour)
+        self.redis.incrby(hour_key, num_crawled_urls)
+        self.redis.expire(hour_key, EXPIRE_SECONDS)
+
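+        # Per-user totals in a sorted set keyed by hashed user id, so contributors can be ranked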
+        user_count_key = USER_COUNT_KEY.format(date=date_time.date())
+        self.redis.zincrby(user_count_key, num_crawled_urls, hashed_batch.user_id_hash)
+        self.redis.expire(user_count_key, EXPIRE_SECONDS)
+
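+        # Per-host counts: one sorted-set increment for each successfully crawled URL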
+        host_key = HOST_COUNT_KEY.format(date=date_time.date())
+        for item in hashed_batch.items:
+            if item.content is None:
+                continue
+
+            host = urlparse(item.url).netloc
+            self.redis.zincrby(host_key, 1, host)
+        self.redis.expire(host_key, EXPIRE_SECONDS)
+
+    def get_stats(self):
+        pass
+
+
+def get_test_batches():
+    for path in glob("./devdata/batches/**/*.json.gz", recursive=True):
+        print("Processing path", path)
+        with gzip.open(path) as gzip_file:
+            yield HashedBatch.parse_raw(gzip_file.read())
+
+
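+# Smoke test: replay local dev batches through the stats recorder and time the writes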
+if __name__ == '__main__':
+    redis = Redis(host='localhost', port=6379, decode_responses=True)
+    stats = StatsManager(redis)
+    batches = get_test_batches()
+    start = datetime.now()
+    processed = 0
+    for batch in islice(batches, 100):
+        stats.record_batch(batch)
+        processed += 1
+    total_time = (datetime.now() - start).total_seconds()
+    print("Processed", processed)
+    print("Total time", total_time)
+    print("Time per batch", total_time/processed)

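The `get_stats` method is left as a stub in this commit. As a rough sketch of how the recorded counters might later be read back, assuming the key formats above (the function name, the top-ten cutoff, and the return shape are hypothetical, not part of this commit):

    from datetime import date, datetime

    from redis import Redis


    def get_stats_for_date(redis: Redis, day: date) -> dict:
        # Rebuild the same keys that record_batch writes
        hour = datetime(day.year, day.month, day.day, datetime.now().hour)
        url_count = redis.get(f"url-count-{day}")
        hour_count = redis.get(f"url-count-hour-{hour}")
        # ZREVRANGE returns the highest-scoring members first
        top_users = redis.zrevrange(f"user-count-{day}", 0, 9, withscores=True)
        top_hosts = redis.zrevrange(f"host-count-{day}", 0, 9, withscores=True)
        return {
            "urls_crawled": int(url_count or 0),
            "urls_crawled_this_hour": int(hour_count or 0),
            "top_users": top_users,
            "top_hosts": top_hosts,
        }

With decode_responses=True, as in the test harness above, the counter values come back as strings, hence the int() conversions.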
poetry.lock (+479 -346)

File diff suppressed because it is too large

pyproject.toml (+1 -0)

@@ -34,6 +34,7 @@ pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
 requests-cache = "^1.1.0"
+redis = {extras = ["hiredis"], version = "^5.0.1"}
 
 
 [tool.poetry.extras]
 indexer = [
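The hiredis extra installs redis-py's optional C response parser, which the client picks up automatically when present; it mainly speeds up parsing of large replies such as the sorted-set reads sketched above.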

Some files were not shown because too many files changed in this diff