From b7984684c9ee6e2641a5292a0b249b6928c1ff83 Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Sun, 1 Jan 2023 01:14:05 +0000
Subject: [PATCH] Tidy, improve logging

---
 mwmbl/crawler/urls.py | 19 +++++++++++--------
 mwmbl/url_queue.py    |  3 +--
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py
index fb02d6e..0be20e2 100644
--- a/mwmbl/crawler/urls.py
+++ b/mwmbl/crawler/urls.py
@@ -129,7 +129,7 @@ class URLDatabase:
 
                 execute_values(cursor, insert_sql, data)
 
-    def get_urls_for_crawling(self, num_urls: int):
+    def get_urls_for_crawling(self):
         start = datetime.utcnow()
         logger.info("Getting URLs for crawling")
 
@@ -138,24 +138,26 @@ class URLDatabase:
         select_sql = f"""
             SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
             WHERE host IN %(domains)s
-               AND status IN ({URLStatus.NEW.value}) OR (
-                   status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
-               )
+                AND status IN ({URLStatus.NEW.value}) OR (
+                    status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
+                )
             GROUP BY host
         """
 
         others_sql = f"""
             SELECT DISTINCT ON (host) url FROM (
                 SELECT * FROM url_and_hosts
-                WHERE status=0
+                WHERE status IN ({URLStatus.NEW.value}) OR (
+                    status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
+                )
                 ORDER BY score DESC LIMIT {MAX_OTHER_DOMAINS}) u
             ORDER BY host
         """
 
         update_sql = f"""
-         UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
-         WHERE url IN %(urls)s
-         """
+            UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
+            WHERE url IN %(urls)s
+        """
 
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
@@ -166,6 +168,7 @@ class URLDatabase:
             cursor.execute(select_sql,
                            {'min_updated_date': min_updated_date, 'domains': domains})
             agg_results = cursor.fetchall()
+            logger.info(f"Agg results: {agg_results}")
 
         results = []
         for result in agg_results:
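
Note: the status condition that others_sql now shares with select_sql reads, in plain terms: a URL is eligible for crawling if it is still NEW, or if it was ASSIGNED but has not been updated within the last REASSIGN_MIN_HOURS. A minimal Python sketch of that predicate follows (illustrative only, not part of this patch; the enum values, the REASSIGN_MIN_HOURS figure and the helper name below are assumptions, the real definitions live in urls.py):

    from datetime import datetime, timedelta
    from enum import Enum

    REASSIGN_MIN_HOURS = 5          # assumed figure; the real constant is defined in urls.py

    class URLStatus(Enum):
        NEW = 0                     # matches the old "WHERE status=0"; ASSIGNED's value is assumed
        ASSIGNED = 2

    def is_eligible_for_crawling(status: URLStatus, updated: datetime, now: datetime) -> bool:
        # Python reading of the SQL condition: NEW, or ASSIGNED but stale beyond the window.
        min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
        return status == URLStatus.NEW or (
            status == URLStatus.ASSIGNED and updated < min_updated_date
        )
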
diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py
index da3f75d..e4124ec 100644
--- a/mwmbl/url_queue.py
+++ b/mwmbl/url_queue.py
@@ -20,10 +20,9 @@ def update_url_queue(url_queue: Queue):
         logger.info(f"Skipping queue update, current size {current_size}")
         return
 
-    num_urls_to_fetch = (MAX_QUEUE_SIZE - current_size) * BATCH_SIZE
     with Database() as db:
         url_db = URLDatabase(db.connection)
-        urls = url_db.get_urls_for_crawling(num_urls_to_fetch)
+        urls = url_db.get_urls_for_crawling()
         queue_batches(url_queue, urls)
         logger.info(f"Queued {len(urls)} urls, current queue size: {url_queue.qsize()}")