Filter out more spam domains

This commit is contained in:
Daoud Clarke 2023-10-17 22:05:53 +01:00
parent 8c7ddda7d9
commit 78a9bfbb11
2 changed files with 8 additions and 9 deletions

View file

@ -22,11 +22,12 @@ def is_domain_blacklisted(domain: str, blacklist_domains: set[str]):
# TODO: this is a stopgap heuristic to filter out spammy domains that look like:
# brofqpxj.uelinc.com
# gwaspsag.enflightmultisport.com
# fmcqgzvk.onlinejobs2day.com
# btmjmhyj.universityslandown.com
# djqfctsq.ropman.com
# gzsmjc.fba01.com
# 59648.etnomurcia.com
#
# Eventually we can figure out a better way to identify SEO spam
domain_parts = domain.split('.')
if len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) == 8:
if (len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) in {6, 8}) or (
set(domain_parts[0]) <= set("1234567890")
):
return True

View file

@ -4,10 +4,8 @@ from mwmbl.indexer.blacklist import is_domain_blacklisted
def test_blacklist_excludes_bad_pattern():
bad_domains = [
"brofqpxj.uelinc.com",
"gwaspsag.enflightmultisport.com",
"fmcqgzvk.onlinejobs2day.com",
"btmjmhyj.universityslandown.com",
"djqfctsq.ropman.com",
"gzsmjc.fba01.com",
"59648.etnomurcia.com",
]
for domain in bad_domains: