Explorar o código

Shuffle URLs before batching

Daoud Clarke hai 2 anos
pai
achega
5c94dfa669
Modificouse 1 ficheiro con 5 adicións e 0 borrados
  1. 5 0
      mwmbl/url_queue.py

+ 5 - 0
mwmbl/url_queue.py

@@ -5,6 +5,7 @@ from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from queue import Empty
+from random import Random
 from typing import KeysView, Union
 
 from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS
@@ -26,6 +27,9 @@ MAX_URLS_PER_OTHER_DOMAIN = 5
 MAX_OTHER_DOMAINS = 10000
 
 
+random = Random(1)
+
+
 class URLQueue:
     def __init__(self, new_item_queue: Queue, queued_batches: Queue, min_top_domains: int = 5):
         """
@@ -105,6 +109,7 @@ class URLQueue:
         self._queue_urls(urls)
 
     def _queue_urls(self, valid_urls: list[str]):
+        random.shuffle(valid_urls)
         for url_batch in batch(valid_urls, BATCH_SIZE):
             self._queued_batches.put(url_batch, block=False)