Initialize the URL queue by processing stored NEW URLs the same way as all other found URLs, to avoid bias

This commit is contained in:
Daoud Clarke 2023-02-25 13:45:03 +00:00
parent 6bb8bdf0c2
commit a24156ce5c
2 changed files with 8 additions and 6 deletions

View file

@ -128,9 +128,9 @@ class URLDatabase:
updated = [FoundURL(*result) for result in results]
return updated
def get_urls(self, status: URLStatus, num_urls: int):
def get_urls(self, status: URLStatus, num_urls: int) -> list[FoundURL]:
sql = f"""
SELECT url FROM urls
SELECT url, status, user_id_hash, score, updated FROM urls
WHERE status = %(status)s
ORDER BY score DESC
LIMIT %(num_urls)s
@ -140,7 +140,7 @@ class URLDatabase:
cursor.execute(sql, {'status': status.value, 'num_urls': num_urls})
results = cursor.fetchall()
return [result[0] for result in results]
return [FoundURL(url, user_id_hash, score, status, updated) for url, status, user_id_hash, score, updated in results]
def get_url_scores(self, urls: list[str]) -> dict[str, float]:
sql = f"""

View file

@ -25,6 +25,7 @@ MAX_URLS_PER_CORE_DOMAIN = 1000
MAX_URLS_PER_TOP_DOMAIN = 100
MAX_URLS_PER_OTHER_DOMAIN = 5
MAX_OTHER_DOMAINS = 10000
INITIALIZE_URLS = 10000
random = Random(1)
@ -43,11 +44,12 @@ class URLQueue:
self._min_top_domains = min_top_domains
def initialize(self):
logger.info(f"Initializing URL queue")
with Database() as db:
url_db = URLDatabase(db.connection)
urls = url_db.get_urls(URLStatus.NEW, MAX_QUEUE_SIZE * BATCH_SIZE)
self._queue_urls(urls)
logger.info(f"Initialized URL queue with {len(urls)} urls, current queue size: {self.num_queued_batches}")
found_urls = url_db.get_urls(URLStatus.NEW, INITIALIZE_URLS)
self._process_found_urls(found_urls)
logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")
def update(self):
num_processed = 0