From e5c08e0d24ab68d72cf69ee84a555be0b357dc6c Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sat, 25 Feb 2023 16:48:59 +0000 Subject: [PATCH] Fix bug with other URLs --- mwmbl/url_queue.py | 7 ++++--- test/test_url_queue.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py index fe7022f..ab0f1bc 100644 --- a/mwmbl/url_queue.py +++ b/mwmbl/url_queue.py @@ -42,6 +42,7 @@ class URLQueue: self._other_urls = defaultdict(dict) self._top_urls = defaultdict(dict) self._min_top_domains = min_top_domains + assert min_top_domains > 0, "Need a minimum greater than 0 to prevent a never-ending loop" def initialize(self): logger.info(f"Initializing URL queue") @@ -72,7 +73,7 @@ class URLQueue: self._sort_urls(valid_urls) logger.info(f"Queue size: {self.num_queued_batches}") - while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) > self._min_top_domains: + while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) >= self._min_top_domains: total_top_urls = sum(len(urls) for urls in self._top_urls.values()) logger.info(f"Total top URLs stored: {total_top_urls}") @@ -97,8 +98,8 @@ class URLQueue: _sort_and_limit_urls(self._other_urls, MAX_OTHER_URLS) # Keep only the top "other" domains, ranked by the top item for that domain - top_other_urls = sorted(self._other_urls.items(), key=lambda x: x[1][0].score, reverse=True)[:MAX_OTHER_DOMAINS] - self._other_urls = defaultdict(list, dict(top_other_urls)) + top_other_urls = sorted(self._other_urls.items(), key=lambda x: next(iter(x[1].values())), reverse=True)[:MAX_OTHER_DOMAINS] + self._other_urls = defaultdict(dict, dict(top_other_urls)) def _batch_urls(self): urls = [] diff --git a/test/test_url_queue.py b/test/test_url_queue.py index d6c15a2..5e754c4 100644 --- a/test/test_url_queue.py +++ b/test/test_url_queue.py @@ -17,3 +17,21 @@ def test_url_queue_empties(): items = queued_batches.get(block=False) assert items == 
["https://google.com"] + + +def test_url_queue_multiple_puts(): + new_item_queue = Queue() + queued_batches = Queue() + + url_queue = URLQueue(new_item_queue, queued_batches, min_top_domains=1) + new_item_queue.put([FoundURL("https://google.com", "123", 10.0, URLStatus.NEW.value, datetime(2023, 1, 19))]) + url_queue.update() + + new_item_queue.put([FoundURL("https://www.supermemo.com", "124", 10.0, URLStatus.NEW.value, datetime(2023, 1, 20))]) + url_queue.update() + + items = queued_batches.get(block=False) + assert items == ["https://google.com"] + + items_2 = queued_batches.get(block=False) + assert items_2 == ["https://www.supermemo.com"]