# Patch summary (text before the first "diff --git" line is not part of any hunk).
#
# mwmbl/url_queue.py
#   URLQueue.initialize now calls get_blacklist_domains() once, before opening
#   the Database context, and passes the result as a second argument to
#   self._process_found_urls(found_urls, blacklist_domains).
#
# test/test_blacklist.py (new file)
#   Adds three tests for mwmbl.indexer.blacklist.is_domain_blacklisted. Every
#   call passes an empty set as the second argument, so the expected results
#   cannot depend on that set's contents -- presumably the function also
#   applies a pattern-based check to the domain name (confirm in
#   mwmbl/indexer/blacklist.py).
#
# NOTE(review): line breaks and indentation below were reconstructed from a
# copy of this patch that had been collapsed onto a single line. Line counts
# match the hunk headers (-46,10 +46,11 and -0,0 +1,22), but leading
# whitespace is assumed to be 4-space Python indentation, and the final
# logger.info in initialize is assumed to sit outside the `with` block.
# Confirm against the repository before applying.
diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py
index 70528fd..3facf86 100644
--- a/mwmbl/url_queue.py
+++ b/mwmbl/url_queue.py
@@ -46,10 +46,11 @@ class URLQueue:
 
     def initialize(self):
         logger.info(f"Initializing URL queue")
+        blacklist_domains = get_blacklist_domains()
         with Database() as db:
             url_db = URLDatabase(db.connection)
             found_urls = url_db.get_urls(URLStatus.NEW, INITIALIZE_URLS)
-            self._process_found_urls(found_urls)
+            self._process_found_urls(found_urls, blacklist_domains)
         logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")
 
     def update(self):
diff --git a/test/test_blacklist.py b/test/test_blacklist.py
new file mode 100644
index 0000000..b9f376a
--- /dev/null
+++ b/test/test_blacklist.py
@@ -0,0 +1,22 @@
+from mwmbl.indexer.blacklist import is_domain_blacklisted
+
+
+def test_blacklist_excludes_bad_pattern():
+    bad_domains = [
+        "brofqpxj.uelinc.com",
+        "gwaspsag.enflightmultisport.com",
+        "fmcqgzvk.onlinejobs2day.com",
+        "btmjmhyj.universityslandown.com",
+        "djqfctsq.ropman.com",
+    ]
+
+    for domain in bad_domains:
+        assert is_domain_blacklisted(domain, set())
+
+
+def test_blacklist_allows_top_domains():
+    assert not is_domain_blacklisted("teamblog.supportbee.com", set())
+
+
+def test_blacklist_allows_other_domains():
+    assert not is_domain_blacklisted("something.com", set())