Browse Source

Shuffle URLs before batching

Daoud Clarke 2 years ago
parent
commit
5c94dfa669
1 changed file with 5 additions and 0 deletions
  1. 5 0
      mwmbl/url_queue.py

+ 5 - 0
mwmbl/url_queue.py

@@ -5,6 +5,7 @@ from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from queue import Empty
+from random import Random
 from typing import KeysView, Union
 
 from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS
@@ -26,6 +27,9 @@ MAX_URLS_PER_OTHER_DOMAIN = 5
 MAX_OTHER_DOMAINS = 10000
 
 
+random = Random(1)
+
+
 class URLQueue:
     def __init__(self, new_item_queue: Queue, queued_batches: Queue, min_top_domains: int = 5):
         """
@@ -105,6 +109,7 @@ class URLQueue:
         self._queue_urls(urls)
 
     def _queue_urls(self, valid_urls: list[str]):
+        random.shuffle(valid_urls)
         for url_batch in batch(valid_urls, BATCH_SIZE):
             self._queued_batches.put(url_batch, block=False)