Use blacklist on initialisation, add tests
This commit is contained in:
parent
ce844b59ae
commit
8c7ddda7d9
2 changed files with 24 additions and 1 deletion
|
@ -46,10 +46,11 @@ class URLQueue:
|
|||
|
||||
def initialize(self):
    """Seed the queue from URLs stored in the database with status NEW.

    Fetches the blacklist domains once, reads URLs with ``URLStatus.NEW``
    from the URL database, and hands them to ``_process_found_urls``
    together with the blacklist domains. Logs the number of URLs found
    and the resulting ``num_queued_batches``.
    """
    logger.info(f"Initializing URL queue")
    # Fetched once up front and passed through to the processing step.
    blacklist_domains = get_blacklist_domains()
    with Database() as db:
        url_db = URLDatabase(db.connection)
        # NOTE(review): INITIALIZE_URLS is presumably the maximum number of
        # URLs to load here - confirm against URLDatabase.get_urls.
        found_urls = url_db.get_urls(URLStatus.NEW, INITIALIZE_URLS)
        # Only the blacklist-aware call is kept; the earlier call without
        # the blacklist argument would process the same URLs a second time
        # with no blacklist supplied.
        self._process_found_urls(found_urls, blacklist_domains)
    logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")
|
||||
|
||||
def update(self):
|
||||
|
|
22
test/test_blacklist.py
Normal file
22
test/test_blacklist.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
from mwmbl.indexer.blacklist import is_domain_blacklisted
|
||||
|
||||
|
||||
def test_blacklist_excludes_bad_pattern():
    """Each sample hostname must be reported as blacklisted.

    Every call passes a fresh empty set as the second argument, so a
    positive result cannot come from an entry in that set.
    """
    suspicious_hosts = (
        "brofqpxj.uelinc.com",
        "gwaspsag.enflightmultisport.com",
        "fmcqgzvk.onlinejobs2day.com",
        "btmjmhyj.universityslandown.com",
        "djqfctsq.ropman.com",
    )

    for host in suspicious_hosts:
        assert is_domain_blacklisted(host, set())
|
||||
|
||||
|
||||
def test_blacklist_allows_top_domains():
    """``teamblog.supportbee.com`` must not be reported as blacklisted
    when the second argument is an empty set."""
    flagged = is_domain_blacklisted("teamblog.supportbee.com", set())
    assert not flagged
|
||||
|
||||
|
||||
def test_blacklist_allows_other_domains():
    """``something.com`` must not be reported as blacklisted when the
    second argument is an empty set."""
    flagged = is_domain_blacklisted("something.com", set())
    assert not flagged
|
Loading…
Add table
Reference in a new issue