Update URL status
This commit is contained in:
parent
36af579f7c
commit
77f08d8f0a
1 changed files with 23 additions and 16 deletions
|
@ -133,33 +133,40 @@ class URLDatabase:
|
|||
|
||||
work_mem = "SET work_mem = '512MB'"
|
||||
|
||||
sql = f"""
|
||||
UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
|
||||
WHERE url IN (
|
||||
SELECT url FROM (
|
||||
SELECT url, host, score, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos
|
||||
FROM url_and_hosts
|
||||
WHERE host IN %(domains)s
|
||||
select_sql = f"""
|
||||
SELECT (array_agg(url order by score desc))[:100] FROM url_and_hosts
|
||||
WHERE host IN %(domains)s
|
||||
AND status IN ({URLStatus.NEW.value}) OR (
|
||||
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
|
||||
)
|
||||
) u
|
||||
WHERE pos < {MAX_TOP_DOMAIN_URLS}
|
||||
)
|
||||
RETURNING url
|
||||
GROUP BY host
|
||||
"""
|
||||
|
||||
update_sql = f"""
|
||||
UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
|
||||
WHERE url IN %(urls)s
|
||||
"""
|
||||
|
||||
now = datetime.utcnow()
|
||||
min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
|
||||
domains = tuple(random.sample(DOMAINS.keys(), 100))
|
||||
domains = tuple(random.sample(DOMAINS.keys(), 500))
|
||||
# domains = tuple(DOMAINS.keys())
|
||||
logger.info(f"Getting URLs for domains {domains}")
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(work_mem)
|
||||
cursor.execute(sql,
|
||||
{'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls, 'domains': domains})
|
||||
results = cursor.fetchall()
|
||||
cursor.execute(select_sql,
|
||||
{'min_updated_date': min_updated_date, 'domains': domains})
|
||||
agg_results = cursor.fetchall()
|
||||
|
||||
results = []
|
||||
for result in agg_results:
|
||||
results += result[0]
|
||||
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(update_sql,
|
||||
{'now': now, 'urls': tuple(results)})
|
||||
print("Results", agg_results)
|
||||
|
||||
results = [result[0] for result in results]
|
||||
total_time_seconds = (datetime.now() - start).total_seconds()
|
||||
logger.info(f"Got {len(results)} in {total_time_seconds} seconds")
|
||||
|
||||
|
|
Loading…
Reference in a new issue