Sample domains
parent ea16e7b5cd
commit 36af579f7c
1 changed file with 26 additions and 21 deletions

@@ -1,6 +1,7 @@
 """
 Database storing info on URLs
 """
+import random
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from enum import Enum
@@ -130,35 +131,39 @@ class URLDatabase:
         start = datetime.utcnow()
         logger.info("Getting URLs for crawling")

+        work_mem = "SET work_mem = '512MB'"
+
         sql = f"""
         UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
         WHERE url IN (
-            SELECT url FROM url_and_hosts
-            WHERE host = %(domain)s
-            AND status IN ({URLStatus.NEW.value}) OR (
-                status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
-            )
-            ORDER BY score DESC
-            LIMIT {MAX_TOP_DOMAIN_URLS}
-        )
-        )
+            SELECT url FROM (
+                SELECT url, host, score, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos
+                FROM url_and_hosts
+                WHERE host IN %(domains)s
+                AND status IN ({URLStatus.NEW.value}) OR (
+                    status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
+                )
+            ) u
+            WHERE pos < {MAX_TOP_DOMAIN_URLS}
+        )
         RETURNING url
         """

         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
-        domains = tuple(DOMAINS.keys())
-
-        results = []
-        for domain in domains:
-            with self.connection.cursor() as cursor:
-                cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'domain': domain})
-                domain_results = cursor.fetchall()
-                results += [result[0] for result in domain_results]
+        domains = tuple(random.sample(DOMAINS.keys(), 100))
+        logger.info(f"Getting URLs for domains {domains}")
+        with self.connection.cursor() as cursor:
+            cursor.execute(work_mem)
+            cursor.execute(sql,
+                           {'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls, 'domains': domains})
+            results = cursor.fetchall()

+        results = [result[0] for result in results]
         total_time_seconds = (datetime.now() - start).total_seconds()
         logger.info(f"Got {len(results)} in {total_time_seconds} seconds")

+        random.shuffle(results)
         return results

     def get_urls(self, status: URLStatus, num_urls: int):
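
The substantive change in the second hunk is the move from one UPDATE per domain to a single statement that ranks URLs per host with a window function and keeps only the top entries for each host. A minimal, self-contained sketch of that ranking pattern, run against SQLite (3.25+) purely for illustration; the real code targets Postgres, and the table contents below are invented:

import sqlite3

# Toy data standing in for the url_and_hosts table from the diff.
conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE url_and_hosts (url TEXT, host TEXT, score REAL);
    INSERT INTO url_and_hosts VALUES
        ('https://a.com/1', 'a.com', 0.9),
        ('https://a.com/2', 'a.com', 0.5),
        ('https://a.com/3', 'a.com', 0.1),
        ('https://b.com/1', 'b.com', 0.7);
""")

# Same shape as the new query: rank URLs within each host by score, then keep
# only those ranked below a per-host cutoff (3 here, standing in for
# MAX_TOP_DOMAIN_URLS).
rows = conn.execute("""
    SELECT url FROM (
        SELECT url, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos
        FROM url_and_hosts
    ) u
    WHERE pos < 3
""").fetchall()

print(sorted(r[0] for r in rows))
# ['https://a.com/1', 'https://a.com/2', 'https://b.com/1']

In the Postgres path the sampled hosts are bound as a single Python tuple, which the driver (presumably psycopg2, given the %(name)s placeholders) expands into a parenthesised list for the IN clause, so one execute() covers every sampled domain; the SET work_mem = '512MB' issued just before gives the session more memory for the wider sort and rank.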
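
The commit's name comes from the other half of the change: instead of iterating over every entry in DOMAINS, each call now draws a random sample of 100 domains and shuffles the collected URLs before returning them. A toy sketch of that selection logic, with a made-up DOMAINS mapping:

import random

# Hypothetical stand-in for the DOMAINS mapping used in the real module.
DOMAINS = {f"site{i}.example": 100 - i for i in range(500)}

def pick_domains(k=100):
    # Mirrors domains = tuple(random.sample(DOMAINS.keys(), 100)) from the
    # diff; wrapping the keys in list() keeps this working on Python 3.11+,
    # where sampling from a set-like view is no longer supported.
    return tuple(random.sample(list(DOMAINS.keys()), k))

# One fake URL per sampled host, shuffled the way the new code path shuffles
# the rows returned from the database.
urls = [f"https://{host}/some-page" for host in pick_domains()]
random.shuffle(urls)
print(len(urls), urls[:3])

Sampling a fresh subset on every call presumably spreads crawling across the domain list over time, and the final shuffle keeps the returned batch from being grouped host by host.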