Exclude domains by keyword

This commit is contained in:
Daoud Clarke 2023-10-04 19:51:33 +01:00
parent a77dc3eb4c
commit 593c71f689
2 changed files with 7 additions and 3 deletions

View file

@@ -1,5 +1,6 @@
import os
import pickle
import re
from collections import defaultdict
from datetime import datetime, timezone, timedelta
from logging import getLogger
@@ -21,7 +22,8 @@ from mwmbl.indexer.index_batches import get_url_error_status
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.indexer.paths import BATCH_DIR_NAME
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL, \
DOMAIN_BLACKLIST_REGEX
from mwmbl.utils import get_domain
logger = getLogger(__name__)
@@ -94,7 +96,8 @@ def get_blacklist_domains():
def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
parsed_link = urlparse(link)
if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
if parsed_link.netloc in EXCLUDED_DOMAINS or DOMAIN_BLACKLIST_REGEX.search(parsed_link.netloc) is not None \
or parsed_link.netloc in blacklist_domains:
logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
return

View file

@@ -31,7 +31,8 @@ SCORE_FOR_DIFFERENT_DOMAIN = 1.0
SCORE_FOR_SAME_DOMAIN = 0.01
EXTRA_LINK_MULTIPLIER = 0.001
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com', 'changeporn.com'}
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx")
CORE_DOMAINS = {
'github.com',
'en.wikipedia.org',