Sample domains

This commit is contained in:
Daoud Clarke 2022-12-31 17:04:38 +00:00
parent ea16e7b5cd
commit 36af579f7c

View file

@@ -1,6 +1,7 @@
""" """
Database storing info on URLs Database storing info on URLs
""" """
import random
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timedelta from datetime import datetime, timedelta
from enum import Enum from enum import Enum
@@ -130,35 +131,39 @@ class URLDatabase:
start = datetime.utcnow() start = datetime.utcnow()
logger.info("Getting URLs for crawling") logger.info("Getting URLs for crawling")
work_mem = "SET work_mem = '512MB'"
sql = f""" sql = f"""
UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
WHERE url IN ( WHERE url IN (
SELECT url FROM url_and_hosts SELECT url FROM (
WHERE host = %(domain)s SELECT url, host, score, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos
AND status IN ({URLStatus.NEW.value}) OR ( FROM url_and_hosts
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s WHERE host IN %(domains)s
) AND status IN ({URLStatus.NEW.value}) OR (
ORDER BY score DESC status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
LIMIT {MAX_TOP_DOMAIN_URLS} )
) ) u
) WHERE pos < {MAX_TOP_DOMAIN_URLS}
RETURNING url )
""" RETURNING url
"""
now = datetime.utcnow() now = datetime.utcnow()
min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS) min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
domains = tuple(DOMAINS.keys()) domains = tuple(random.sample(DOMAINS.keys(), 100))
logger.info(f"Getting URLs for domains {domains}")
results = [] with self.connection.cursor() as cursor:
for domain in domains: cursor.execute(work_mem)
with self.connection.cursor() as cursor: cursor.execute(sql,
cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'domain': domain}) {'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls, 'domains': domains})
domain_results = cursor.fetchall() results = cursor.fetchall()
results += [result[0] for result in domain_results]
results = [result[0] for result in results]
total_time_seconds = (datetime.now() - start).total_seconds() total_time_seconds = (datetime.now() - start).total_seconds()
logger.info(f"Got {len(results)} in {total_time_seconds} seconds") logger.info(f"Got {len(results)} in {total_time_seconds} seconds")
random.shuffle(results)
return results return results
def get_urls(self, status: URLStatus, num_urls: int): def get_urls(self, status: URLStatus, num_urls: int):