Tidy, improve logging

This commit is contained in:
Daoud Clarke 2023-01-01 01:14:05 +00:00
parent 7c14cd99f8
commit b7984684c9
2 changed files with 12 additions and 10 deletions

View file

@@ -129,7 +129,7 @@ class URLDatabase:
execute_values(cursor, insert_sql, data)
def get_urls_for_crawling(self, num_urls: int):
def get_urls_for_crawling(self):
start = datetime.utcnow()
logger.info("Getting URLs for crawling")
@@ -138,24 +138,26 @@ class URLDatabase:
select_sql = f"""
SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
WHERE host IN %(domains)s
AND status IN ({URLStatus.NEW.value}) OR (
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
)
AND status IN ({URLStatus.NEW.value}) OR (
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
)
GROUP BY host
"""
others_sql = f"""
SELECT DISTINCT ON (host) url FROM (
SELECT * FROM url_and_hosts
WHERE status=0
WHERE status IN ({URLStatus.NEW.value}) OR (
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
)
ORDER BY score DESC LIMIT {MAX_OTHER_DOMAINS}) u
ORDER BY host
"""
update_sql = f"""
UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
WHERE url IN %(urls)s
"""
UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
WHERE url IN %(urls)s
"""
now = datetime.utcnow()
min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
@@ -166,6 +168,7 @@ class URLDatabase:
cursor.execute(select_sql,
{'min_updated_date': min_updated_date, 'domains': domains})
agg_results = cursor.fetchall()
logger.info(f"Agg results: {agg_results}")
results = []
for result in agg_results:

View file

@@ -20,10 +20,9 @@ def update_url_queue(url_queue: Queue):
logger.info(f"Skipping queue update, current size {current_size}")
return
num_urls_to_fetch = (MAX_QUEUE_SIZE - current_size) * BATCH_SIZE
with Database() as db:
url_db = URLDatabase(db.connection)
urls = url_db.get_urls_for_crawling(num_urls_to_fetch)
urls = url_db.get_urls_for_crawling()
queue_batches(url_queue, urls)
logger.info(f"Queued {len(urls)} urls, current queue size: {url_queue.qsize()}")