Initialize URLs by processing them like all other URLs to avoid bias
This commit is contained in:
parent
6bb8bdf0c2
commit
a24156ce5c
2 changed files with 8 additions and 6 deletions
|
@@ -128,9 +128,9 @@ class URLDatabase:
|
|||
updated = [FoundURL(*result) for result in results]
|
||||
return updated
|
||||
|
||||
def get_urls(self, status: URLStatus, num_urls: int):
|
||||
def get_urls(self, status: URLStatus, num_urls: int) -> list[FoundURL]:
|
||||
sql = f"""
|
||||
SELECT url FROM urls
|
||||
SELECT url, status, user_id_hash, score, updated FROM urls
|
||||
WHERE status = %(status)s
|
||||
ORDER BY score DESC
|
||||
LIMIT %(num_urls)s
|
||||
|
@@ -140,7 +140,7 @@ class URLDatabase:
|
|||
cursor.execute(sql, {'status': status.value, 'num_urls': num_urls})
|
||||
results = cursor.fetchall()
|
||||
|
||||
return [result[0] for result in results]
|
||||
return [FoundURL(url, user_id_hash, score, status, updated) for url, status, user_id_hash, score, updated in results]
|
||||
|
||||
def get_url_scores(self, urls: list[str]) -> dict[str, float]:
|
||||
sql = f"""
|
||||
|
|
|
@@ -25,6 +25,7 @@ MAX_URLS_PER_CORE_DOMAIN = 1000
|
|||
MAX_URLS_PER_TOP_DOMAIN = 100
|
||||
MAX_URLS_PER_OTHER_DOMAIN = 5
|
||||
MAX_OTHER_DOMAINS = 10000
|
||||
INITIALIZE_URLS = 10000
|
||||
|
||||
|
||||
random = Random(1)
|
||||
|
@@ -43,11 +44,12 @@ class URLQueue:
|
|||
self._min_top_domains = min_top_domains
|
||||
|
||||
def initialize(self):
|
||||
logger.info(f"Initializing URL queue")
|
||||
with Database() as db:
|
||||
url_db = URLDatabase(db.connection)
|
||||
urls = url_db.get_urls(URLStatus.NEW, MAX_QUEUE_SIZE * BATCH_SIZE)
|
||||
self._queue_urls(urls)
|
||||
logger.info(f"Initialized URL queue with {len(urls)} urls, current queue size: {self.num_queued_batches}")
|
||||
found_urls = url_db.get_urls(URLStatus.NEW, INITIALIZE_URLS)
|
||||
self._process_found_urls(found_urls)
|
||||
logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")
|
||||
|
||||
def update(self):
|
||||
num_processed = 0
|
||||
|
|
Loading…
Add table
Reference in a new issue