From 6ff62fb119b3c5c0dbca989334ef61607c56ef70 Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Sat, 25 Feb 2023 10:34:09 +0000
Subject: [PATCH 1/2] Ensure URLs in queue are unique

---
 mwmbl/url_queue.py | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py
index 6cfdab8..dd850e5 100644
--- a/mwmbl/url_queue.py
+++ b/mwmbl/url_queue.py
@@ -25,11 +25,6 @@ MAX_URLS_PER_TOP_DOMAIN = 100
 MAX_URLS_PER_OTHER_DOMAIN = 5
 MAX_OTHER_DOMAINS = 10000
 
-@dataclass
-class URLScore:
-    url: str
-    score: float
-
 
 class URLQueue:
     def __init__(self, new_item_queue: Queue, queued_batches: Queue, min_top_domains: int = 5):
@@ -36,9 +31,9 @@
         """
         """
         self._new_item_queue = new_item_queue
         self._queued_batches = queued_batches
-        self._other_urls = defaultdict(list)
-        self._top_urls = defaultdict(list)
+        self._other_urls = defaultdict(dict)
+        self._top_urls = defaultdict(dict)
         self._min_top_domains = min_top_domains
 
     def initialize(self):
@@ -88,7 +83,7 @@
             except ValueError:
                 continue
             url_store = self._top_urls if domain in TOP_DOMAINS else self._other_urls
-            url_store[domain].append(URLScore(found_url.url, found_url.score))
+            url_store[domain][found_url.url] = found_url.score
 
         logger.info(f"URL store updated: {len(self._top_urls)} top domains, {len(self._other_urls)} other domains")
 
@@ -122,18 +117,17 @@
         return len(self._top_urls)
 
 
-def _sort_and_limit_urls(domain_urls: dict[str, list[str]], max_urls: int):
+def _sort_and_limit_urls(domain_urls: dict[str, dict[str, float]], max_urls: int):
     for domain, urls in domain_urls.items():
-        domain_urls[domain] = sorted(urls, key=lambda url_score: url_score.score, reverse=True)[:max_urls]
+        domain_urls[domain] = dict(sorted(urls.items(), key=lambda url_score: url_score[1], reverse=True)[:max_urls])
 
 
-def _add_urls(domains: Union[set[str], KeysView], domain_urls: dict[str, list[URLScore]], urls: list[str], max_urls: int):
+def _add_urls(domains: Union[set[str], KeysView], domain_urls: dict[str, dict[str, float]], urls: list[str], max_urls: int):
     for domain in list(domains & domain_urls.keys()):
-        new_urls = domain_urls[domain][:max_urls]
-        urls += [url_score.url for url_score in new_urls]
-        new_domain_urls = domain_urls[domain][max_urls:]
+        urls += list(domain_urls[domain].keys())[:max_urls]
+        new_domain_urls = list(domain_urls[domain].items())[max_urls:]
         if len(new_domain_urls) > 0:
-            domain_urls[domain] = new_domain_urls
+            domain_urls[domain] = dict(new_domain_urls)
         else:
             del domain_urls[domain]
 

From 5c94dfa669799c65ba9ad4594d3b30af9f633b6c Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Sat, 25 Feb 2023 10:35:10 +0000
Subject: [PATCH 2/2] Shuffle URLs before batching

---
 mwmbl/url_queue.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py
index dd850e5..4cf0983 100644
--- a/mwmbl/url_queue.py
+++ b/mwmbl/url_queue.py
@@ -5,6 +5,7 @@ from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from queue import Empty
+from random import Random
 from typing import KeysView, Union
 
 from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS
@@ -26,6 +27,9 @@ MAX_URLS_PER_OTHER_DOMAIN = 5
 MAX_OTHER_DOMAINS = 10000
 
 
+random = Random(1)
+
+
 class URLQueue:
     def __init__(self, new_item_queue: Queue, queued_batches: Queue, min_top_domains: int = 5):
         """
@@ -105,6 +109,7 @@
         self._queue_urls(urls)
 
     def _queue_urls(self, valid_urls: list[str]):
+        random.shuffle(valid_urls)
        for url_batch in batch(valid_urls, BATCH_SIZE):
             self._queued_batches.put(url_batch, block=False)