Check the domain is correct, potential bug in psql
This commit is contained in:
parent
b7984684c9
commit
6edf48693b
1 changed files with 8 additions and 5 deletions
|
@ -136,7 +136,7 @@ class URLDatabase:
|
|||
work_mem = "SET work_mem = '512MB'"
|
||||
|
||||
select_sql = f"""
|
||||
SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
|
||||
SELECT host, (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
|
||||
WHERE host IN %(domains)s
|
||||
AND status IN ({URLStatus.NEW.value}) OR (
|
||||
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
|
||||
|
@ -161,7 +161,8 @@ class URLDatabase:
|
|||
|
||||
now = datetime.utcnow()
|
||||
min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
|
||||
domains = tuple(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
|
||||
domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
|
||||
domains = tuple(domain_sample)
|
||||
logger.info(f"Getting URLs for domains {domains}")
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(work_mem)
|
||||
|
@ -171,12 +172,14 @@ class URLDatabase:
|
|||
logger.info(f"Agg results: {agg_results}")
|
||||
|
||||
results = []
|
||||
for result in agg_results:
|
||||
results += result[0]
|
||||
for host, urls in agg_results:
|
||||
# There seems to be a bug in psql where we can get things we didn't ask for...
|
||||
if host in domain_sample:
|
||||
results += urls
|
||||
logger.info(f"Got {len(results)} top domain results")
|
||||
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(others_sql)
|
||||
cursor.execute(others_sql, {'min_updated_date': min_updated_date})
|
||||
other_results = cursor.fetchall()
|
||||
other_results_list = [result[0] for result in other_results]
|
||||
logger.info(f"Got {len(other_results_list)} results from all domains")
|
||||
|
|
Loading…
Reference in a new issue