From b7984684c9ee6e2641a5292a0b249b6928c1ff83 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sun, 1 Jan 2023 01:14:05 +0000
Subject: [PATCH] Tidy, improve logging

---
 mwmbl/crawler/urls.py | 19 +++++++++++--------
 mwmbl/url_queue.py    |  3 +--
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py
index fb02d6e..0be20e2 100644
--- a/mwmbl/crawler/urls.py
+++ b/mwmbl/crawler/urls.py
@@ -129,7 +129,7 @@ class URLDatabase:
 
             execute_values(cursor, insert_sql, data)
 
-    def get_urls_for_crawling(self, num_urls: int):
+    def get_urls_for_crawling(self):
         start = datetime.utcnow()
         logger.info("Getting URLs for crawling")
 
@@ -138,24 +138,26 @@ class URLDatabase:
         select_sql = f"""
             SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
             WHERE host IN %(domains)s
-                AND status IN ({URLStatus.NEW.value}) OR (
-                    status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
-                )
+              AND status IN ({URLStatus.NEW.value}) OR (
+                status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
+              )
             GROUP BY host
         """
 
         others_sql = f"""
             SELECT DISTINCT ON (host) url FROM (
                 SELECT * FROM url_and_hosts
-                WHERE status=0
+                WHERE status IN ({URLStatus.NEW.value}) OR (
+                    status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
+                )
                 ORDER BY score DESC
                 LIMIT {MAX_OTHER_DOMAINS}) u
             ORDER BY host
         """
 
         update_sql = f"""
-        UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
-        WHERE url IN %(urls)s
-        """
+            UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
+            WHERE url IN %(urls)s
+        """
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
@@ -166,6 +168,7 @@ class URLDatabase:
             cursor.execute(select_sql,
                            {'min_updated_date': min_updated_date, 'domains': domains})
             agg_results = cursor.fetchall()
+            logger.info(f"Agg results: {agg_results}")
 
         results = []
         for result in agg_results:
diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py
index da3f75d..e4124ec 100644
--- a/mwmbl/url_queue.py
+++ b/mwmbl/url_queue.py
@@ -20,10 +20,9 @@ def update_url_queue(url_queue: Queue):
         logger.info(f"Skipping queue update, current size {current_size}")
         return
 
-    num_urls_to_fetch = (MAX_QUEUE_SIZE - current_size) * BATCH_SIZE
     with Database() as db:
         url_db = URLDatabase(db.connection)
-        urls = url_db.get_urls_for_crawling(num_urls_to_fetch)
+        urls = url_db.get_urls_for_crawling()
         queue_batches(url_queue, urls)
         logger.info(f"Queued {len(urls)} urls, current queue size: {url_queue.qsize()}")
 