Filter out more spam domains
This commit is contained in:
parent
8c7ddda7d9
commit
78a9bfbb11
2 changed files with 8 additions and 9 deletions
|
@ -22,11 +22,12 @@ def is_domain_blacklisted(domain: str, blacklist_domains: set[str]):
|
||||||
|
|
||||||
# TODO: this is to filter out spammy domains that look like:
|
# TODO: this is to filter out spammy domains that look like:
|
||||||
# brofqpxj.uelinc.com
|
# brofqpxj.uelinc.com
|
||||||
# gwaspsag.enflightmultisport.com
|
# gzsmjc.fba01.com
|
||||||
# fmcqgzvk.onlinejobs2day.com
|
# 59648.etnomurcia.com
|
||||||
# btmjmhyj.universityslandown.com
|
#
|
||||||
# djqfctsq.ropman.com
|
|
||||||
# Eventually we can figure out a better way to identify SEO spam
|
# Eventually we can figure out a better way to identify SEO spam
|
||||||
domain_parts = domain.split('.')
|
domain_parts = domain.split('.')
|
||||||
if len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) == 8:
|
if (len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) in {6, 8}) or (
|
||||||
|
set(domain_parts[0]) <= set("1234567890")
|
||||||
|
):
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -4,10 +4,8 @@ from mwmbl.indexer.blacklist import is_domain_blacklisted
|
||||||
def test_blacklist_excludes_bad_pattern():
|
def test_blacklist_excludes_bad_pattern():
|
||||||
bad_domains = [
|
bad_domains = [
|
||||||
"brofqpxj.uelinc.com",
|
"brofqpxj.uelinc.com",
|
||||||
"gwaspsag.enflightmultisport.com",
|
"gzsmjc.fba01.com",
|
||||||
"fmcqgzvk.onlinejobs2day.com",
|
"59648.etnomurcia.com",
|
||||||
"btmjmhyj.universityslandown.com",
|
|
||||||
"djqfctsq.ropman.com",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for domain in bad_domains:
|
for domain in bad_domains:
|
||||||
|
|
Loading…
Reference in a new issue