Tidy, improve logging

parent 7c14cd99f8
commit b7984684c9

2 changed files with 12 additions and 10 deletions
@@ -129,7 +129,7 @@ class URLDatabase:
         execute_values(cursor, insert_sql, data)
 
-    def get_urls_for_crawling(self, num_urls: int):
+    def get_urls_for_crawling(self):
         start = datetime.utcnow()
         logger.info("Getting URLs for crawling")
 
 
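For context, execute_values in the hunk above is psycopg2's batched-insert helper: it expands a single VALUES %s placeholder into pages of row tuples. The actual insert_sql and data live outside this hunk; a minimal sketch of the pattern, with illustrative column values, might look like:

    from datetime import datetime
    from psycopg2.extras import execute_values

    # Illustrative sketch only: the real insert_sql/data are not shown in the
    # hunk, and `cursor` is assumed to be an open psycopg2 cursor. The column
    # names follow the urls table referenced later in this diff.
    insert_sql = """
        INSERT INTO urls (url, status, updated) VALUES %s
    """
    data = [("https://example.com/", 0, datetime.utcnow())]

    # Expands %s into row tuples and sends them in batches of page_size rows.
    execute_values(cursor, insert_sql, data)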
@@ -138,24 +138,26 @@ class URLDatabase:
         select_sql = f"""
         SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
         WHERE host IN %(domains)s
-          AND status IN ({URLStatus.NEW.value}) OR (
-              status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
-          )
+        AND status IN ({URLStatus.NEW.value}) OR (
+            status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
+        )
         GROUP BY host
         """
 
         others_sql = f"""
         SELECT DISTINCT ON (host) url FROM (
         SELECT * FROM url_and_hosts
-        WHERE status=0
+        WHERE status IN ({URLStatus.NEW.value}) OR (
+            status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
+        )
         ORDER BY score DESC LIMIT {MAX_OTHER_DOMAINS}) u
         ORDER BY host
         """
 
         update_sql = f"""
-          UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
-          WHERE url IN %(urls)s
-          """
+        UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
+        WHERE url IN %(urls)s
+        """
 
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
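One thing worth noting about the new status filter: in SQL, AND binds more tightly than OR, so WHERE host IN %(domains)s AND status IN (...) OR (...) parses as (host IN ... AND status IN (...)) OR (...), which means the ASSIGNED-and-stale branch is not restricted to the selected domains. If the host filter is meant to cover both branches, an explicitly parenthesized variant (a sketch, not the committed code) could look like:

    # Sketch only: explicit parentheses so the host filter applies to both
    # status branches. URLStatus, MAX_URLS_PER_TOP_DOMAIN etc. are the names
    # from the surrounding module.
    status_filter = f"""(
        status IN ({URLStatus.NEW.value}) OR (
            status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
        )
    )"""
    select_sql = f"""
        SELECT (array_agg(url ORDER BY score DESC))[:{MAX_URLS_PER_TOP_DOMAIN}]
        FROM url_and_hosts
        WHERE host IN %(domains)s AND {status_filter}
        GROUP BY host
    """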
@@ -166,6 +168,7 @@ class URLDatabase:
         cursor.execute(select_sql,
                        {'min_updated_date': min_updated_date, 'domains': domains})
         agg_results = cursor.fetchall()
+        logger.info(f"Agg results: {agg_results}")
 
         results = []
         for result in agg_results:
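The body of the loop over agg_results is outside the hunk. Since select_sql returns one row per host, each holding the array_agg(...) slice of URLs for that host, the flattening step presumably looks something like this sketch:

    # Assumption: each row from select_sql is a one-element tuple whose only
    # field is the per-host array of URLs produced by array_agg(...).
    results = []
    for result in agg_results:
        results.extend(result[0])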

@@ -20,10 +20,9 @@ def update_url_queue(url_queue: Queue):
         logger.info(f"Skipping queue update, current size {current_size}")
         return
 
-    num_urls_to_fetch = (MAX_QUEUE_SIZE - current_size) * BATCH_SIZE
     with Database() as db:
         url_db = URLDatabase(db.connection)
-        urls = url_db.get_urls_for_crawling(num_urls_to_fetch)
+        urls = url_db.get_urls_for_crawling()
         queue_batches(url_queue, urls)
         logger.info(f"Queued {len(urls)} urls, current queue size: {url_queue.qsize()}")
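With the num_urls argument gone, the fetch size is no longer derived from queue headroom ((MAX_QUEUE_SIZE - current_size) * BATCH_SIZE); it is instead bounded inside the SQL by MAX_URLS_PER_TOP_DOMAIN and MAX_OTHER_DOMAINS. queue_batches is defined elsewhere in the codebase; assuming it chunks the URL list into BATCH_SIZE-sized groups and enqueues each group, a sketch would be:

    from multiprocessing import Queue

    # Hypothetical sketch of queue_batches, which is not shown in this diff.
    # BATCH_SIZE is assumed to come from the surrounding module.
    def queue_batches(url_queue: Queue, urls: list[str]) -> None:
        # Put one BATCH_SIZE-sized chunk of URLs on the queue at a time.
        for i in range(0, len(urls), BATCH_SIZE):
            url_queue.put(urls[i:i + BATCH_SIZE])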