From 77f08d8f0a91e360f66bd17a0cd82553dfcbc2c3 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sat, 31 Dec 2022 22:25:05 +0000 Subject: [PATCH] Update URL status --- mwmbl/crawler/urls.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py index 6c16e35..f3d9388 100644 --- a/mwmbl/crawler/urls.py +++ b/mwmbl/crawler/urls.py @@ -133,33 +133,40 @@ class URLDatabase: work_mem = "SET work_mem = '512MB'" - sql = f""" - UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s - WHERE url IN ( - SELECT url FROM ( - SELECT url, host, score, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos - FROM url_and_hosts - WHERE host IN %(domains)s + select_sql = f""" + SELECT (array_agg(url order by score desc))[:100] FROM url_and_hosts + WHERE host IN %(domains)s AND status IN ({URLStatus.NEW.value}) OR ( status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s ) - ) u - WHERE pos < {MAX_TOP_DOMAIN_URLS} - ) - RETURNING url + GROUP BY host + """ + + update_sql = f""" + UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s + WHERE url IN %(urls)s """ now = datetime.utcnow() min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS) - domains = tuple(random.sample(DOMAINS.keys(), 100)) + domains = tuple(random.sample(DOMAINS.keys(), 500)) + # domains = tuple(DOMAINS.keys()) logger.info(f"Getting URLs for domains {domains}") with self.connection.cursor() as cursor: cursor.execute(work_mem) - cursor.execute(sql, - {'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls, 'domains': domains}) - results = cursor.fetchall() + cursor.execute(select_sql, + {'min_updated_date': min_updated_date, 'domains': domains}) + agg_results = cursor.fetchall() + + results = [] + for result in agg_results: + results += result[0] + + with self.connection.cursor() as cursor: + cursor.execute(update_sql, + {'now': now, 'urls': tuple(results)}) + print("Results", agg_results) - results = [result[0] for result in results] total_time_seconds = (datetime.now() - start).total_seconds() logger.info(f"Got {len(results)} in {total_time_seconds} seconds")