From ea16e7b5cd0207816b62e9b0f2f11b6bf884d374 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 31 Dec 2022 13:37:40 +0000
Subject: [PATCH] WIP: improve method of getting URLs for crawling

Instead of ranking URLs across every domain in one UPDATE with a rank()
window function (which needed work_mem raised to 512MB), run a simple
ORDER BY score DESC ... LIMIT query per domain in a loop over
DOMAINS.keys(). This drops the work_mem override and the unused
num_urls entry from the query parameters.
---
 mwmbl/crawler/urls.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py
index d5ae18b..9236ca8 100644
--- a/mwmbl/crawler/urls.py
+++ b/mwmbl/crawler/urls.py
@@ -130,20 +130,16 @@ class URLDatabase:
         start = datetime.utcnow()
         logger.info("Getting URLs for crawling")
 
-        work_mem = "SET work_mem = '512MB'"
-
         sql = f"""
         UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
         WHERE url IN (
-            SELECT url FROM (
-                SELECT url, host, score, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos
-                FROM url_and_hosts
-                WHERE host IN %(domains)s
+            SELECT url FROM url_and_hosts
+            WHERE host = %(domain)s
                 AND status IN ({URLStatus.NEW.value}) OR (
                     status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
                 )
-            ) u
-            WHERE pos < {MAX_TOP_DOMAIN_URLS}
+            ORDER BY score DESC
+            LIMIT {MAX_TOP_DOMAIN_URLS}
         )
         RETURNING url
         """
@@ -151,13 +147,15 @@ class URLDatabase:
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
         domains = tuple(DOMAINS.keys())
-        with self.connection.cursor() as cursor:
-            cursor.execute(work_mem)
-            cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls, 'domains': domains})
-            results = cursor.fetchall()
+
+        results = []
+        for domain in domains:
+            with self.connection.cursor() as cursor:
+                cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'domain': domain})
+                domain_results = cursor.fetchall()
+                results += [result[0] for result in domain_results]
 
         total_time_seconds = (datetime.now() - start).total_seconds()
-        results = [result[0] for result in results]
         logger.info(f"Got {len(results)} in {total_time_seconds} seconds")
         return results
 
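
For reference, here is roughly how the patched method reads once both
hunks apply: a sketch assembled from the hunks and their context lines.
The method name get_urls_for_crawling and the num_urls parameter are
assumptions from context (the hunk headers only show class URLDatabase),
the module-level names it uses (datetime, timedelta, logger, URLStatus,
DOMAINS, MAX_TOP_DOMAIN_URLS, REASSIGN_MIN_HOURS) come from the rest of
mwmbl/crawler/urls.py, and self.connection is assumed to be an open
psycopg2 connection.

    def get_urls_for_crawling(self, num_urls: int):
        start = datetime.utcnow()
        logger.info("Getting URLs for crawling")

        # One UPDATE per domain: pick that host's top-scoring URLs and mark
        # them QUEUED, returning the queued URLs. Note the unparenthesised
        # AND/OR (the clause is unchanged by this patch): AND binds tighter
        # in SQL, so stale ASSIGNED URLs match for any host, not only
        # %(domain)s.
        sql = f"""
        UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
        WHERE url IN (
            SELECT url FROM url_and_hosts
            WHERE host = %(domain)s
            AND status IN ({URLStatus.NEW.value}) OR (
                status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
            )
            ORDER BY score DESC
            LIMIT {MAX_TOP_DOMAIN_URLS}
        )
        RETURNING url
        """

        now = datetime.utcnow()
        min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
        domains = tuple(DOMAINS.keys())

        # Run the query once per domain, collecting the URLs it queued.
        results = []
        for domain in domains:
            with self.connection.cursor() as cursor:
                cursor.execute(sql, {'min_updated_date': min_updated_date,
                                     'now': now, 'domain': domain})
                domain_results = cursor.fetchall()
                results += [result[0] for result in domain_results]

        # Pre-existing quirk: start is taken with utcnow() but the elapsed
        # time uses now(), so off UTC the logged duration is skewed by the
        # local UTC offset.
        total_time_seconds = (datetime.now() - start).total_seconds()
        logger.info(f"Got {len(results)} in {total_time_seconds} seconds")
        return results

Each per-domain query only has to sort a single host's rows (or walk an
index on (host, score) if one exists), which is presumably why the 512MB
work_mem override for the old cross-domain window-function sort can be
dropped.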
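
A hypothetical caller, for illustration only; the DSN, the URLDatabase
constructor shape, and the batch size below are placeholders rather than
anything this patch specifies:

    import psycopg2

    connection = psycopg2.connect("dbname=mwmbl")  # placeholder DSN
    db = URLDatabase(connection)  # constructor shape assumed

    # num_urls is still accepted but, after this patch, the effective batch
    # size is MAX_TOP_DOMAIN_URLS per domain rather than num_urls overall.
    urls = db.get_urls_for_crawling(num_urls=1000)
    print(f"Queued {len(urls)} URLs for crawling")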