Check the domain is correct, potential bug in psql

This commit is contained in:
Daoud Clarke 2023-01-01 01:30:44 +00:00
parent b7984684c9
commit 6edf48693b

View file

@ -136,7 +136,7 @@ class URLDatabase:
work_mem = "SET work_mem = '512MB'"
select_sql = f"""
SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
SELECT host, (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
WHERE host IN %(domains)s
AND status IN ({URLStatus.NEW.value}) OR (
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
@ -161,7 +161,8 @@ class URLDatabase:
now = datetime.utcnow()
min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
domains = tuple(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
domains = tuple(domain_sample)
logger.info(f"Getting URLs for domains {domains}")
with self.connection.cursor() as cursor:
cursor.execute(work_mem)
@ -171,12 +172,14 @@ class URLDatabase:
logger.info(f"Agg results: {agg_results}")
results = []
for result in agg_results:
results += result[0]
for host, urls in agg_results:
# There seems to be a bug in psql where we can get things we didn't ask for...
if host in domain_sample:
results += urls
logger.info(f"Got {len(results)} top domain results")
with self.connection.cursor() as cursor:
cursor.execute(others_sql)
cursor.execute(others_sql, {'min_updated_date': min_updated_date})
other_results = cursor.fetchall()
other_results_list = [result[0] for result in other_results]
logger.info(f"Got {len(other_results_list)} results from all domains")