WIP: improve method of getting URLs for crawling

Daoud Clarke 2022-12-31 13:37:40 +00:00
parent 7dae39b780
commit ea16e7b5cd

@@ -130,20 +130,17 @@ class URLDatabase:
         start = datetime.utcnow()
         logger.info("Getting URLs for crawling")
         work_mem = "SET work_mem = '512MB'"
         sql = f"""
         UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
         WHERE url IN (
-            SELECT url FROM (
-                SELECT url, host, score, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos
-                FROM url_and_hosts
-                WHERE host IN %(domains)s
+            SELECT url FROM url_and_hosts
+            WHERE host = %(domain)s
             AND status IN ({URLStatus.NEW.value}) OR (
                 status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
             )
-            ) u
-            WHERE pos < {MAX_TOP_DOMAIN_URLS}
-        )
+            ORDER BY score DESC
+            LIMIT {MAX_TOP_DOMAIN_URLS}
+        )
         RETURNING url
         """
@@ -151,13 +148,15 @@ class URLDatabase:
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
         domains = tuple(DOMAINS.keys())
-        with self.connection.cursor() as cursor:
-            cursor.execute(work_mem)
-            cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls, 'domains': domains})
-            results = cursor.fetchall()
+        results = []
+        for domain in domains:
+            with self.connection.cursor() as cursor:
+                cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'domain': domain})
+                domain_results = cursor.fetchall()
+                results += [result[0] for result in domain_results]
         total_time_seconds = (datetime.now() - start).total_seconds()
-        results = [result[0] for result in results]
         logger.info(f"Got {len(results)} in {total_time_seconds} seconds")
         return results
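
The executor change above runs one statement per domain instead of a single statement covering every domain, and along the way stops executing the SET work_mem statement and drops the now-unused num_urls parameter. Assuming the driver is psycopg2, which the %(name)s placeholder style suggests, cursor.mogrify shows how the two binding styles render; the DSN and host names below are hypothetical:

    import psycopg2

    # Hypothetical DSN; point this at a real database to run the sketch.
    connection = psycopg2.connect("dbname=mwmbl user=mwmbl")

    with connection.cursor() as cursor:
        # Old style: psycopg2 adapts a Python tuple to a SQL list, so a
        # single statement can cover every domain via IN.
        print(cursor.mogrify(
            "SELECT url FROM url_and_hosts WHERE host IN %(domains)s",
            {'domains': ("en.wikipedia.org", "github.com")},
        ).decode())
        # SELECT url FROM url_and_hosts WHERE host IN ('en.wikipedia.org', 'github.com')

        # New style: bind one host per execution and run the statement in a loop.
        print(cursor.mogrify(
            "SELECT url FROM url_and_hosts WHERE host = %(domain)s",
            {'domain': "en.wikipedia.org"},
        ).decode())
        # SELECT url FROM url_and_hosts WHERE host = 'en.wikipedia.org'

Per-host statements keep each query's working set to a single host partition, at the cost of one round trip per domain, which is presumably the motivation for this WIP change.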