From 5c94dfa669799c65ba9ad4594d3b30af9f633b6c Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sat, 25 Feb 2023 10:35:10 +0000 Subject: [PATCH] Shuffle URLs before batching --- mwmbl/url_queue.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py index dd850e5..4cf0983 100644 --- a/mwmbl/url_queue.py +++ b/mwmbl/url_queue.py @@ -5,6 +5,7 @@ from datetime import datetime, timedelta from logging import getLogger from multiprocessing import Queue from queue import Empty +from random import Random from typing import KeysView, Union from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS @@ -26,6 +27,9 @@ MAX_URLS_PER_OTHER_DOMAIN = 5 MAX_OTHER_DOMAINS = 10000 +random = Random(1) + + class URLQueue: def __init__(self, new_item_queue: Queue, queued_batches: Queue, min_top_domains: int = 5): """ @@ -105,6 +109,7 @@ class URLQueue: self._queue_urls(urls) def _queue_urls(self, valid_urls: list[str]): + random.shuffle(valid_urls) for url_batch in batch(valid_urls, BATCH_SIZE): self._queued_batches.put(url_batch, block=False)