Initialize URLs by processing them like all other URLs to avoid bias

Daoud Clarke, 2 years ago
parent commit a24156ce5c
2 changed files with 8 additions and 6 deletions
  1. mwmbl/crawler/urls.py (+3 -3)
  2. mwmbl/url_queue.py (+5 -3)

mwmbl/crawler/urls.py (+3 -3)

@@ -128,9 +128,9 @@ class URLDatabase:
                 updated = [FoundURL(*result) for result in results]
                 return updated
 
-    def get_urls(self, status: URLStatus, num_urls: int):
+    def get_urls(self, status: URLStatus, num_urls: int) -> list[FoundURL]:
         sql = f"""
-        SELECT url FROM urls
+        SELECT url, status, user_id_hash, score, updated FROM urls
         WHERE status = %(status)s
         ORDER BY score DESC
         LIMIT %(num_urls)s
@@ -140,7 +140,7 @@ class URLDatabase:
             cursor.execute(sql, {'status': status.value, 'num_urls': num_urls})
             results = cursor.fetchall()
 
-        return [result[0] for result in results]
+        return [FoundURL(url, user_id_hash, score, status, updated) for url, status, user_id_hash, score, updated in results]
 
     def get_url_scores(self, urls: list[str]) -> dict[str, float]:
         sql = f"""
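
For reference, a minimal sketch of the FoundURL record that the new return type presumes. The field order is inferred from the FoundURL(url, user_id_hash, score, status, updated) call above and the SELECT column list; it is an assumption, not the project's actual definition:

    from dataclasses import dataclass
    from datetime import datetime
    from enum import Enum

    class URLStatus(Enum):
        # Hypothetical values; only NEW appears in this commit.
        NEW = 0

    @dataclass
    class FoundURL:
        # Field order assumed from the constructor call in get_urls;
        # names follow the SELECT column list in this diff.
        url: str
        user_id_hash: str
        score: float
        status: URLStatus
        updated: datetime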

mwmbl/url_queue.py (+5 -3)

@@ -25,6 +25,7 @@ MAX_URLS_PER_CORE_DOMAIN = 1000
 MAX_URLS_PER_TOP_DOMAIN = 100
 MAX_URLS_PER_OTHER_DOMAIN = 5
 MAX_OTHER_DOMAINS = 10000
+INITIALIZE_URLS = 10000
 
 
 random = Random(1)
@@ -43,11 +44,12 @@ class URLQueue:
         self._min_top_domains = min_top_domains
 
     def initialize(self):
+        logger.info(f"Initializing URL queue")
         with Database() as db:
             url_db = URLDatabase(db.connection)
-            urls = url_db.get_urls(URLStatus.NEW, MAX_QUEUE_SIZE * BATCH_SIZE)
-            self._queue_urls(urls)
-            logger.info(f"Initialized URL queue with {len(urls)} urls, current queue size: {self.num_queued_batches}")
+            found_urls = url_db.get_urls(URLStatus.NEW, INITIALIZE_URLS)
+            self._process_found_urls(found_urls)
+        logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")
 
     def update(self):
         num_processed = 0
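
The substance of the change: initialize() previously pushed bare URL strings straight onto the queue via _queue_urls, bypassing whatever filtering _process_found_urls applies during normal updates, hence the bias the commit message mentions. The internals of _process_found_urls are not part of this diff; a hypothetical sketch of the kind of per-domain capping the constants above suggest, assuming it ultimately delegates to _queue_urls:

    from collections import defaultdict
    from urllib.parse import urlparse

    def _process_found_urls(self, found_urls: list[FoundURL]):
        # Hypothetical sketch only: cap each domain's contribution so a
        # single site cannot dominate the queue, then enqueue the rest.
        per_domain: dict[str, list[str]] = defaultdict(list)
        for found_url in found_urls:
            domain = urlparse(found_url.url).netloc
            if len(per_domain[domain]) < MAX_URLS_PER_OTHER_DOMAIN:
                per_domain[domain].append(found_url.url)
        self._queue_urls([url for urls in per_domain.values() for url in urls])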