Fix bug with other URLs
This commit is contained in:
parent a24156ce5c
commit e5c08e0d24
2 changed files with 22 additions and 3 deletions
@@ -42,6 +42,7 @@ class URLQueue:
         self._other_urls = defaultdict(dict)
         self._top_urls = defaultdict(dict)
         self._min_top_domains = min_top_domains
+        assert min_top_domains > 0, "Need a minimum greater than 0 to prevent a never-ending loop"
 
     def initialize(self):
         logger.info(f"Initializing URL queue")
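The new assertion guards the batching loop changed in the next hunk: once every top domain has been drained, len(self._top_urls) >= 0 would still hold, so a minimum of zero could spin forever. A minimal sketch of that condition (toy variables, not the real class state):

    top_urls = {}          # every top domain has been drained
    min_top_domains = 0    # the value the new assert forbids
    print(len(top_urls) >= min_top_domains)  # True forever: the loop below could never exit
    print(len(top_urls) >= 1)                # False: any positive minimum lets it terminate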
@@ -72,7 +73,7 @@ class URLQueue:
 
         self._sort_urls(valid_urls)
         logger.info(f"Queue size: {self.num_queued_batches}")
-        while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) > self._min_top_domains:
+        while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) >= self._min_top_domains:
             total_top_urls = sum(len(urls) for urls in self._top_urls.values())
             logger.info(f"Total top URLs stored: {total_top_urls}")
 
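The relaxed comparison is what lets a queue holding exactly min_top_domains domains make progress: with a minimum of 1 and a single queued domain, the old strict test skipped the loop and nothing was ever batched. A toy illustration of the two conditions (made-up queue contents):

    min_top_domains = 1
    top_urls = {"google.com": {"https://google.com": 10.0}}  # exactly one queued domain
    print(len(top_urls) > min_top_domains)   # False: old condition, the URL is never batched
    print(len(top_urls) >= min_top_domains)  # True: fixed condition processes the domain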
@@ -97,8 +98,8 @@ class URLQueue:
         _sort_and_limit_urls(self._other_urls, MAX_OTHER_URLS)
 
         # Keep only the top "other" domains, ranked by the top item for that domain
-        top_other_urls = sorted(self._other_urls.items(), key=lambda x: x[1][0].score, reverse=True)[:MAX_OTHER_DOMAINS]
-        self._other_urls = defaultdict(list, dict(top_other_urls))
+        top_other_urls = sorted(self._other_urls.items(), key=lambda x: next(iter(x[1].values())), reverse=True)[:MAX_OTHER_DOMAINS]
+        self._other_urls = defaultdict(dict, dict(top_other_urls))
 
     def _batch_urls(self):
         urls = []
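The old sort key dates from when self._other_urls held lists of scored URLs; each domain now maps to a dict of URL to score, so x[1][0] raises KeyError instead of returning the first element. A sketch of the corrected key, assuming (as _sort_and_limit_urls appears to ensure) each domain's dict is ordered best score first; the domains and scores here are made up:

    other_urls = {
        "example.com": {"https://example.com/a": 8.0, "https://example.com/b": 3.0},
        "example.org": {"https://example.org/x": 9.5},
    }
    # next(iter(x[1].values())) reads the first (top) score in each domain's dict;
    # the old x[1][0].score assumed list values and fails on a dict.
    ranked = sorted(other_urls.items(), key=lambda x: next(iter(x[1].values())), reverse=True)
    print([domain for domain, _ in ranked])  # ['example.org', 'example.com']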
@@ -17,3 +17,21 @@ def test_url_queue_empties():
     items = queued_batches.get(block=False)
 
     assert items == ["https://google.com"]
+
+
+def test_url_queue_multiple_puts():
+    new_item_queue = Queue()
+    queued_batches = Queue()
+
+    url_queue = URLQueue(new_item_queue, queued_batches, min_top_domains=1)
+    new_item_queue.put([FoundURL("https://google.com", "123", 10.0, URLStatus.NEW.value, datetime(2023, 1, 19))])
+    url_queue.update()
+
+    new_item_queue.put([FoundURL("https://www.supermemo.com", "124", 10.0, URLStatus.NEW.value, datetime(2023, 1, 20))])
+    url_queue.update()
+
+    items = queued_batches.get(block=False)
+    assert items == ["https://google.com"]
+
+    items_2 = queued_batches.get(block=False)
+    assert items_2 == ["https://www.supermemo.com"]
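One reason the test uses block=False: Queue.get(block=False) raises queue.Empty immediately when nothing has been enqueued, so under the old strict comparison the first get would fail loudly instead of hanging. That is standard-library behaviour, shown in isolation:

    import queue

    q = queue.Queue()
    try:
        q.get(block=False)  # non-blocking get on an empty queue
    except queue.Empty:
        print("no batch was produced")  # how the asserts above would surface the old bug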