From d9cd3c585b8d3e192a39980293778692011c0f79 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 31 Dec 2022 22:51:00 +0000
Subject: [PATCH] Get results from other domains

---
 mwmbl/crawler/urls.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py
index f3d9388..fb02d6e 100644
--- a/mwmbl/crawler/urls.py
+++ b/mwmbl/crawler/urls.py
@@ -15,7 +15,9 @@ from mwmbl.utils import batch
 
 REASSIGN_MIN_HOURS = 5
 BATCH_SIZE = 100
-MAX_TOP_DOMAIN_URLS = 10
+MAX_URLS_PER_TOP_DOMAIN = 100
+MAX_TOP_DOMAINS = 500
+MAX_OTHER_DOMAINS = 50000
 
 
 logger = getLogger(__name__)
@@ -134,7 +136,7 @@ class URLDatabase:
         work_mem = "SET work_mem = '512MB'"
 
         select_sql = f"""
-            SELECT (array_agg(url order by score desc))[:100] FROM url_and_hosts
+            SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
             WHERE host IN %(domains)s
             AND status IN ({URLStatus.NEW.value}) OR (
                 status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
@@ -142,6 +144,14 @@ class URLDatabase:
             GROUP BY host
         """
 
+        others_sql = f"""
+            SELECT DISTINCT ON (host) url FROM (
+                SELECT * FROM url_and_hosts
+                WHERE status=0
+                ORDER BY score DESC LIMIT {MAX_OTHER_DOMAINS}) u
+            ORDER BY host
+        """
+
         update_sql = f"""
             UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
             WHERE url IN %(urls)s
@@ -149,8 +159,7 @@ class URLDatabase:
 
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
-        domains = tuple(random.sample(DOMAINS.keys(), 500))
-        # domains = tuple(DOMAINS.keys())
+        domains = tuple(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
         logger.info(f"Getting URLs for domains {domains}")
         with self.connection.cursor() as cursor:
             cursor.execute(work_mem)
@@ -161,11 +170,18 @@ class URLDatabase:
         results = []
         for result in agg_results:
            results += result[0]
+        logger.info(f"Got {len(results)} top domain results")
+
+        with self.connection.cursor() as cursor:
+            cursor.execute(others_sql)
+            other_results = cursor.fetchall()
+            other_results_list = [result[0] for result in other_results]
+            logger.info(f"Got {len(other_results_list)} results from all domains")
+            results += other_results_list
 
         with self.connection.cursor() as cursor:
             cursor.execute(update_sql, {'now': now, 'urls': tuple(results)})
 
-        print("Results", agg_results)
         total_time_seconds = (datetime.now() - start).total_seconds()
         logger.info(f"Got {len(results)} in {total_time_seconds} seconds")
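
Note on the new others_sql query (reviewer sketch, not part of the patch): SELECT DISTINCT ON (host) keeps the first row of each host group according to the outer ORDER BY. Because that ORDER BY is only on host, Postgres does not guarantee that the URL kept for a given host is its highest-scoring one; it is simply some row drawn from the score-ordered, LIMITed subquery. The standalone Python below mirrors the intended behaviour under that reading. The function name, the (url, score) tuples, and deriving the host with urlparse are all illustrative and do not appear in mwmbl/crawler/urls.py, where url_and_hosts already has a host column and status 0 corresponds to URLStatus.NEW.

# Illustrative only: approximates what others_sql does, in plain Python.
from urllib.parse import urlparse

MAX_OTHER_DOMAINS = 50000  # same constant the patch introduces


def pick_one_url_per_host(rows, limit=MAX_OTHER_DOMAINS):
    """rows: iterable of (url, score), already filtered to status 0.
    Take the top `limit` rows by score, then keep one URL per host,
    mirroring SELECT DISTINCT ON (host) over the LIMITed subquery."""
    top = sorted(rows, key=lambda r: r[1], reverse=True)[:limit]  # ORDER BY score DESC LIMIT ...
    chosen = {}
    for url, _score in top:
        host = urlparse(url).netloc      # the real table stores host in its own column
        chosen.setdefault(host, url)     # DISTINCT ON (host): first row seen for a host wins
    return list(chosen.values())


if __name__ == "__main__":
    sample = [
        ("https://example.com/a", 0.9),
        ("https://example.com/b", 0.5),
        ("https://other.org/x", 0.7),
    ]
    print(pick_one_url_per_host(sample))  # one URL per host

Because the sketch walks the score-sorted list, it always keeps the best URL per host; the SQL as written leaves that per-host choice unspecified. The difference is minor in practice, since every candidate already comes from the top MAX_OTHER_DOMAINS rows by score.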