Exclude domains with bad pattern

This commit is contained in:
Daoud Clarke 2023-10-17 17:45:26 +01:00
parent f00eacf8aa
commit b426fa3b7e
4 changed files with 45 additions and 18 deletions

View file

@ -0,0 +1,32 @@
from datetime import timedelta
from requests_cache import CachedSession
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.settings import BLACKLIST_DOMAINS_URL, EXCLUDED_DOMAINS, DOMAIN_BLACKLIST_REGEX
def get_blacklist_domains():
    """Download the domain blacklist and return its entries as a set.

    The list is fetched from ``BLACKLIST_DOMAINS_URL`` through a
    ``CachedSession`` created with a one-day ``expire_after``. The response
    text is split on whitespace, so every whitespace-separated token becomes
    one entry of the returned set.
    """
    cache_lifetime = timedelta(days=1)
    with CachedSession(expire_after=cache_lifetime) as cached_session:
        blacklist_text = cached_session.get(BLACKLIST_DOMAINS_URL).text
    return set(blacklist_text.split())
def is_domain_blacklisted(domain: str, blacklist_domains: set[str]) -> bool:
    """Return True if ``domain`` should be excluded, False otherwise.

    The checks run in this order:

    1. The domain is blacklisted if it is in ``EXCLUDED_DOMAINS``, matches
       ``DOMAIN_BLACKLIST_REGEX``, or is in ``blacklist_domains``.
    2. Otherwise, a domain listed in ``DOMAINS`` is never blacklisted.
    3. Otherwise, a pattern heuristic for spammy-looking domains applies
       (see the TODO below).

    Args:
        domain: The domain name to test.
        blacklist_domains: Set of blacklisted domains, e.g. the result of
            ``get_blacklist_domains()``.

    Returns:
        Always a real ``bool``. Previously the function fell off the end and
        returned ``None`` when nothing matched.
    """
    if (
        domain in EXCLUDED_DOMAINS
        or DOMAIN_BLACKLIST_REGEX.search(domain) is not None
        or domain in blacklist_domains
    ):
        return True

    # Domains in DOMAINS skip the pattern heuristic below, but not the
    # explicit blacklist checks above.
    if domain in DOMAINS:
        return False

    # TODO: this is to filter out spammy domains that look like:
    #  brofqpxj.uelinc.com
    #  gwaspsag.enflightmultisport.com
    #  fmcqgzvk.onlinejobs2day.com
    #  btmjmhyj.universityslandown.com
    #  djqfctsq.ropman.com
    # Eventually we can figure out a better way to identify SEO spam
    domain_parts = domain.split('.')
    # Exactly three labels, ending in "com", with an 8-character first label.
    return (
        len(domain_parts) == 3
        and domain_parts[2] == "com"
        and len(domain_parts[0]) == 8
    )

View file

@ -7,20 +7,18 @@ from time import sleep
from typing import Collection
from urllib.parse import urlparse
from requests_cache import CachedSession
from mwmbl.crawler.batch import HashedBatch
from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
from mwmbl.database import Database
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.indexer import process_batch
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.blacklist import get_blacklist_domains, is_domain_blacklisted
from mwmbl.indexer.index_batches import get_url_error_status
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.indexer.paths import BATCH_DIR_NAME
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL, \
DOMAIN_BLACKLIST_REGEX
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, SCORE_FOR_SAME_DOMAIN, \
SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
from mwmbl.utils import get_domain
logger = getLogger(__name__)
@ -85,16 +83,9 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
logger.info(f"Put {len(urls)} new items in the URL queue")
def get_blacklist_domains():
with CachedSession(expire_after=timedelta(days=1)) as session:
response = session.get(BLACKLIST_DOMAINS_URL)
return set(response.text.split())
def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
parsed_link = urlparse(link)
if parsed_link.netloc in EXCLUDED_DOMAINS or DOMAIN_BLACKLIST_REGEX.search(parsed_link.netloc) is not None \
or parsed_link.netloc in blacklist_domains:
if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
return

View file

@ -4,6 +4,6 @@ DEBUG = True
ALLOWED_HOSTS = ["localhost", "127.0.0.1"]
DATA_PATH = "./devdata"
RUN_BACKGROUND_PROCESSES = False
RUN_BACKGROUND_PROCESSES = True
NUM_PAGES = 2560

View file

@ -10,6 +10,7 @@ from typing import KeysView, Union
from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS
from mwmbl.database import Database
from mwmbl.hn_top_domains_filtered import DOMAINS as TOP_DOMAINS
from mwmbl.indexer.blacklist import is_domain_blacklisted, get_blacklist_domains
from mwmbl.settings import CORE_DOMAINS
from mwmbl.utils import batch, get_domain
@ -52,6 +53,7 @@ class URLQueue:
logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")
def update(self):
blacklist_domains = get_blacklist_domains()
num_processed = 0
while True:
try:
@ -59,10 +61,10 @@ class URLQueue:
num_processed += 1
except Empty:
break
self._process_found_urls(new_batch)
self._process_found_urls(new_batch, blacklist_domains)
return num_processed
def _process_found_urls(self, found_urls: list[FoundURL]):
def _process_found_urls(self, found_urls: list[FoundURL], blacklist_domains: set[str]):
min_updated_date = datetime.utcnow() - timedelta(hours=REASSIGN_MIN_HOURS)
logger.info(f"Found URLS: {len(found_urls)}")
@ -70,7 +72,7 @@ class URLQueue:
found_url.status == URLStatus.ASSIGNED.value and found_url.timestamp < min_updated_date)]
logger.info(f"Valid URLs: {len(valid_urls)}")
self._sort_urls(valid_urls)
self._sort_urls(valid_urls, blacklist_domains)
logger.info(f"Queue size: {self.num_queued_batches}")
while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) >= self._min_top_domains:
total_top_urls = sum(len(urls) for urls in self._top_urls.values())
@ -82,12 +84,14 @@ class URLQueue:
self._batch_urls()
logger.info(f"Queue size after batching: {self.num_queued_batches}")
def _sort_urls(self, valid_urls: list[FoundURL]):
def _sort_urls(self, valid_urls: list[FoundURL], blacklist_domains: set[str]):
for found_url in valid_urls:
try:
domain = get_domain(found_url.url)
except ValueError:
continue
if is_domain_blacklisted(domain, blacklist_domains):
continue
url_store = self._top_urls if domain in TOP_DOMAINS else self._other_urls
url_store[domain][found_url.url] = found_url.score