Filter out more spam domains

This commit is contained in:
Daoud Clarke 2023-10-17 22:05:53 +01:00
parent 8c7ddda7d9
commit 78a9bfbb11
2 changed files with 8 additions and 9 deletions

View file

@@ -22,11 +22,12 @@ def is_domain_blacklisted(domain: str, blacklist_domains: set[str]):
# TODO: this is to filter out spammy domains that look like: # TODO: this is to filter out spammy domains that look like:
# brofqpxj.uelinc.com # brofqpxj.uelinc.com
# gwaspsag.enflightmultisport.com # gzsmjc.fba01.com
# fmcqgzvk.onlinejobs2day.com # 59648.etnomurcia.com
# btmjmhyj.universityslandown.com #
# djqfctsq.ropman.com
# Eventually we can figure out a better way to identify SEO spam # Eventually we can figure out a better way to identify SEO spam
domain_parts = domain.split('.') domain_parts = domain.split('.')
if len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) == 8: if (len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) in {6, 8}) or (
set(domain_parts[0]) <= set("1234567890")
):
return True return True

View file

@@ -4,10 +4,8 @@ from mwmbl.indexer.blacklist import is_domain_blacklisted
def test_blacklist_excludes_bad_pattern(): def test_blacklist_excludes_bad_pattern():
bad_domains = [ bad_domains = [
"brofqpxj.uelinc.com", "brofqpxj.uelinc.com",
"gwaspsag.enflightmultisport.com", "gzsmjc.fba01.com",
"fmcqgzvk.onlinejobs2day.com", "59648.etnomurcia.com",
"btmjmhyj.universityslandown.com",
"djqfctsq.ropman.com",
] ]
for domain in bad_domains: for domain in bad_domains: