Merge pull request #114 from mwmbl/exclude-domains-by-keyword
Exclude domains by keyword
commit d716cb347f
3 changed files with 55 additions and 8 deletions
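
In short: process_link previously dropped a link only if its domain was in the static EXCLUDED_DOMAINS set or in the remotely fetched blacklist; it now also drops any link whose host name matches a keyword regex. A minimal standalone sketch of the new check (the regex and constant values are taken from the diff below; the is_excluded helper is illustrative and not part of the codebase):

import re

# Values as introduced in mwmbl/settings.py by this PR.
DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx")
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}

def is_excluded(netloc: str, blacklist_domains) -> bool:
    # Mirrors the condition added to process_link in update_urls.py.
    return (netloc in EXCLUDED_DOMAINS
            or DOMAIN_BLACKLIST_REGEX.search(netloc) is not None
            or netloc in blacklist_domains)

assert is_excluded("changeporn.com", set())      # previously a hard-coded exclusion
assert not is_excluded("somesite.com", set())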
mwmbl/indexer/update_urls.py

@@ -1,5 +1,6 @@
 import os
 import pickle
+import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
@@ -21,7 +22,8 @@ from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.indexer.paths import BATCH_DIR_NAME
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL, \
+    DOMAIN_BLACKLIST_REGEX
 from mwmbl.utils import get_domain

 logger = getLogger(__name__)
@@ -69,12 +71,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
                 continue
             score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
             for link in item.content.links:
-                process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
+                process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                              url_timestamps, url_users, False, blacklist_domains)

             if item.content.extra_links:
                 for link in item.content.extra_links:
-                    process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
+                    process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                                  url_timestamps, url_users, True, blacklist_domains)

     found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
@@ -92,20 +94,21 @@ def get_blacklist_domains():
     return set(response.text.split())


-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
+def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
     parsed_link = urlparse(link)
-    if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
+    if parsed_link.netloc in EXCLUDED_DOMAINS or DOMAIN_BLACKLIST_REGEX.search(parsed_link.netloc) is not None \
+            or parsed_link.netloc in blacklist_domains:
         logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
         return

     extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
     score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
     url_scores[link] += score * unknown_domain_multiplier * extra_multiplier
-    url_users[link] = batch.user_id_hash
+    url_users[link] = user_id_hash
     url_timestamps[link] = timestamp
     domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
     url_scores[domain] += SCORE_FOR_ROOT_PATH * unknown_domain_multiplier
-    url_users[domain] = batch.user_id_hash
+    url_users[domain] = user_id_hash
     url_timestamps[domain] = timestamp


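For context on the scoring arithmetic in process_link: the base score is SCORE_FOR_SAME_DOMAIN or SCORE_FOR_DIFFERENT_DOMAIN, scaled by the unknown-domain and extra-link multipliers. A worked example using only the constants visible in the settings diff below (illustrative, not from the codebase):

# Constants as shown in mwmbl/settings.py below.
SCORE_FOR_DIFFERENT_DOMAIN = 1.0
EXTRA_LINK_MULTIPLIER = 0.001
UNKNOWN_DOMAIN_MULTIPLIER = 0.001

# A cross-domain link found in extra_links on a page from an unknown domain
# contributes roughly a millionth of a direct cross-domain link from a known domain:
score = SCORE_FOR_DIFFERENT_DOMAIN * UNKNOWN_DOMAIN_MULTIPLIER * EXTRA_LINK_MULTIPLIER
print(score)  # ~1e-06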
mwmbl/settings.py

@@ -31,7 +31,8 @@ SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
 EXTRA_LINK_MULTIPLIER = 0.001
 UNKNOWN_DOMAIN_MULTIPLIER = 0.001
-EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com', 'changeporn.com'}
+EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
+DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx")
 CORE_DOMAINS = {
     'github.com',
     'en.wikipedia.org',
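Note that re.search matches the keywords anywhere in the host name, which is why the hard-coded 'changeporn.com' entry could be dropped from EXCLUDED_DOMAINS. A quick illustration (the host names other than changeporn.com are made up):

import re

DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx")

for host in ("changeporn.com", "porn.example.org", "xxx-videos.net"):
    assert DOMAIN_BLACKLIST_REGEX.search(host) is not None
assert DOMAIN_BLACKLIST_REGEX.search("example.com") is None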
test/test_update_urls.py (new file, 43 lines)

@@ -0,0 +1,43 @@
+from mwmbl.indexer.update_urls import process_link
+
+
+def test_process_link_normal():
+    url_scores = {"https://somesite.com/something.html": 0.0, "https://somesite.com/": 0.0}
+    url_timestamps = {}
+    url_users = {}
+
+    process_link(
+        user_id_hash="abc123",
+        crawled_page_domain="somewhere.com",
+        link="https://somesite.com/something.html",
+        unknown_domain_multiplier=1,
+        timestamp=1234,
+        url_scores=url_scores,
+        url_timestamps=url_timestamps,
+        url_users=url_users,
+        is_extra=False, blacklist_domains=[]
+    )
+
+    assert url_scores["https://somesite.com/something.html"] > 0.0
+
+
+def test_process_link_excludes_porn():
+    url_scores = {}
+    url_timestamps = {}
+    url_users = {}
+
+    process_link(
+        user_id_hash="abc123",
+        crawled_page_domain="somewhere.com",
+        link="https://somepornsite.com/something.html",
+        unknown_domain_multiplier=1,
+        timestamp=1234,
+        url_scores=url_scores,
+        url_timestamps=url_timestamps,
+        url_users=url_users,
+        is_extra=False, blacklist_domains=[]
+    )
+
+    assert url_scores == {}
+    assert url_timestamps == {}
+    assert url_users == {}
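One detail worth noting in the first test: url_scores is a plain dict pre-seeded with 0.0 entries because process_link updates scores with +=, which raises KeyError for a missing key in an ordinary dict. The production caller presumably builds the score map as a defaultdict (update_urls.py imports it from collections), so unseen URLs need no pre-seeding there. An illustrative comparison:

from collections import defaultdict

url_scores = defaultdict(float)
url_scores["https://somesite.com/something.html"] += 0.5   # works: missing key defaults to 0.0

plain = {}
# plain["https://somesite.com/something.html"] += 0.5      # would raise KeyError
plain["https://somesite.com/something.html"] = 0.0         # the tests pre-seed like this
plain["https://somesite.com/something.html"] += 0.5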