From 78a9bfbb11d476aad2c290ece8677f300ced4d57 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Tue, 17 Oct 2023 22:05:53 +0100 Subject: [PATCH] Filter out more spam domains --- mwmbl/indexer/blacklist.py | 11 ++++++----- test/test_blacklist.py | 6 ++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/mwmbl/indexer/blacklist.py b/mwmbl/indexer/blacklist.py index cb480d6..e6e5adb 100644 --- a/mwmbl/indexer/blacklist.py +++ b/mwmbl/indexer/blacklist.py @@ -22,11 +22,12 @@ def is_domain_blacklisted(domain: str, blacklist_domains: set[str]): # TODO: this is to filter out spammy domains that look like: # brofqpxj.uelinc.com - # gwaspsag.enflightmultisport.com - # fmcqgzvk.onlinejobs2day.com - # btmjmhyj.universityslandown.com - # djqfctsq.ropman.com + # gzsmjc.fba01.com + # 59648.etnomurcia.com + # # Eventually we can figure out a better way to identify SEO spam domain_parts = domain.split('.') - if len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) == 8: + if (len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) in {6, 8}) or ( + set(domain_parts[0]) <= set("1234567890") + ): return True diff --git a/test/test_blacklist.py b/test/test_blacklist.py index b9f376a..b68c1ce 100644 --- a/test/test_blacklist.py +++ b/test/test_blacklist.py @@ -4,10 +4,8 @@ from mwmbl.indexer.blacklist import is_domain_blacklisted def test_blacklist_excludes_bad_pattern(): bad_domains = [ "brofqpxj.uelinc.com", - "gwaspsag.enflightmultisport.com", - "fmcqgzvk.onlinejobs2day.com", - "btmjmhyj.universityslandown.com", - "djqfctsq.ropman.com", + "gzsmjc.fba01.com", + "59648.etnomurcia.com", ] for domain in bad_domains: