Exclude blacklisted domains

Daoud Clarke 2023-09-22 21:53:53 +01:00
parent 18dc760a34
commit 019095a4c1
4 changed files with 131 additions and 8 deletions

mwmbl/indexer/update_urls.py

@@ -9,6 +9,8 @@ from time import sleep
from typing import Iterable, Collection
from urllib.parse import urlparse

+from requests_cache import CachedSession
+
from mwmbl.crawler.batch import HashedBatch
from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
from mwmbl.database import Database
@@ -19,7 +21,7 @@ from mwmbl.indexer.index_batches import get_url_error_status
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.indexer.paths import BATCH_DIR_NAME
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
from mwmbl.utils import get_domain

logger = getLogger(__name__)
@@ -40,7 +42,11 @@ def run(batch_cache: BatchCache, new_item_queue: Queue):


def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
-    logger.info(f"Recording URLs in database for {len(batches)} batches")
+    start = datetime.now()
+    blacklist_domains = get_blacklist_domains()
+    blacklist_retrieval_time = datetime.now() - start
+    logger.info(f"Recording URLs in database for {len(batches)} batches, with {len(blacklist_domains)} blacklist "
+                f"domains, retrieved in {blacklist_retrieval_time.total_seconds()} seconds")
    with Database() as db:
        url_db = URLDatabase(db.connection)
        url_scores = defaultdict(float)
@@ -64,12 +70,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
                score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                for link in item.content.links:
                    process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                                 url_timestamps, url_users, False)
+                                 url_timestamps, url_users, False, blacklist_domains)

                if item.content.extra_links:
                    for link in item.content.extra_links:
                        process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                                     url_timestamps, url_users, True)
+                                     url_timestamps, url_users, True, blacklist_domains)

    found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                  for url in url_scores.keys() | url_statuses.keys()]
@@ -80,9 +86,16 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
    logger.info(f"Put {len(urls)} new items in the URL queue")


-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):
+def get_blacklist_domains():
+    with CachedSession(expire_after=timedelta(days=1)) as session:
+        response = session.get(BLACKLIST_DOMAINS_URL)
+        return set(response.text.split())
+
+
+def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
    parsed_link = urlparse(link)
-    if parsed_link.netloc in EXCLUDED_DOMAINS:
+    if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
+        logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
        return

    extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
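The new helper is the heart of the change: the T145 block list is downloaded through requests-cache, so within the one-day expire_after window repeated calls read from a local cache instead of hitting GitHub again, and every link's netloc is then checked against the resulting set. Below is a minimal standalone sketch of the same pattern; is_blacklisted and the example URL are illustrative rather than part of the commit, and the block list is assumed to be whitespace-separated domains, which is what set(response.text.split()) expects.

from datetime import timedelta
from urllib.parse import urlparse

from requests_cache import CachedSession

BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"


def get_blacklist_domains() -> set[str]:
    # Responses are cached locally and expire after a day, so only the first
    # call per day actually downloads the (large) block list.
    with CachedSession(expire_after=timedelta(days=1)) as session:
        response = session.get(BLACKLIST_DOMAINS_URL)
        return set(response.text.split())


def is_blacklisted(link: str, blacklist_domains: set[str]) -> bool:
    # Same comparison process_link performs: match on the parsed netloc.
    return urlparse(link).netloc in blacklist_domains


if __name__ == "__main__":
    domains = get_blacklist_domains()
    print(f"Loaded {len(domains)} blacklisted domains")
    print(is_blacklisted("https://example.com/some/page", domains))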

mwmbl/settings.py

@@ -42,3 +42,5 @@ CORE_DOMAINS = {
    'arxiv.org',
    'www.python.org',
}
+
+BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"

poetry.lock (generated, 111 additions)

@@ -36,6 +36,25 @@ files = [
[package.extras]
tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
[[package]]
name = "attrs"
version = "23.1.0"
description = "Classes Without Boilerplate"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"},
{file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"},
]
[package.extras]
cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
dev = ["attrs[docs,tests]", "pre-commit"]
docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
tests = ["attrs[tests-no-zope]", "zope-interface"]
tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
[[package]]
name = "beautifulsoup4"
version = "4.10.0"
@@ -148,6 +167,32 @@ files = [
    {file = "catalogue-2.0.8.tar.gz", hash = "sha256:b325c77659208bfb6af1b0d93b1a1aa4112e1bb29a4c5ced816758a722f0e388"},
]
[[package]]
name = "cattrs"
version = "23.1.2"
description = "Composable complex class support for attrs and dataclasses."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "cattrs-23.1.2-py3-none-any.whl", hash = "sha256:b2bb14311ac17bed0d58785e5a60f022e5431aca3932e3fc5cc8ed8639de50a4"},
{file = "cattrs-23.1.2.tar.gz", hash = "sha256:db1c821b8c537382b2c7c66678c3790091ca0275ac486c76f3c8f3920e83c657"},
]
[package.dependencies]
attrs = ">=20"
exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
typing_extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""}
[package.extras]
bson = ["pymongo (>=4.2.0,<5.0.0)"]
cbor2 = ["cbor2 (>=5.4.6,<6.0.0)"]
msgpack = ["msgpack (>=1.0.2,<2.0.0)"]
orjson = ["orjson (>=3.5.2,<4.0.0)"]
pyyaml = ["PyYAML (>=6.0,<7.0)"]
tomlkit = ["tomlkit (>=0.11.4,<0.12.0)"]
ujson = ["ujson (>=5.4.0,<6.0.0)"]
[[package]]
name = "certifi"
version = "2022.12.7"
@@ -949,6 +994,22 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"]
s3 = ["boto3"]
test = ["mock", "pytest", "pytest-coverage", "typer-cli"]
[[package]]
name = "platformdirs"
version = "3.10.0"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"},
{file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"},
]
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"]
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"]
[[package]]
name = "pluggy"
version = "1.0.0"
@@ -1418,6 +1479,37 @@ urllib3 = ">=1.21.1,<1.27"
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "requests-cache"
version = "1.1.0"
description = "A persistent cache for python requests"
category = "main"
optional = false
python-versions = ">=3.7,<4.0"
files = [
{file = "requests_cache-1.1.0-py3-none-any.whl", hash = "sha256:178282bce704b912c59e7f88f367c42bddd6cde6bf511b2a3e3cfb7e5332a92a"},
{file = "requests_cache-1.1.0.tar.gz", hash = "sha256:41b79166aa8e300cc4de982f7ab7c52af914a785160be1eda25c6e9265969a67"},
]
[package.dependencies]
attrs = ">=21.2"
cattrs = ">=22.2"
platformdirs = ">=2.5"
requests = ">=2.22"
url-normalize = ">=1.4"
urllib3 = ">=1.25.5"
[package.extras]
all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=5.4)", "redis (>=3)", "ujson (>=5.4)"]
bson = ["bson (>=0.5)"]
docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.6)"]
dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"]
json = ["ujson (>=5.4)"]
mongodb = ["pymongo (>=3)"]
redis = ["redis (>=3)"]
security = ["itsdangerous (>=2.0)"]
yaml = ["pyyaml (>=5.4)"]
[[package]]
name = "s3transfer"
version = "0.6.0"
@@ -1944,6 +2036,21 @@ files = [
    {file = "ujson-4.3.0.tar.gz", hash = "sha256:baee56eca35cb5fbe02c28bd9c0936be41a96fa5c0812d9d4b7edeb5c3d568a0"},
]
[[package]]
name = "url-normalize"
version = "1.4.3"
description = "URL normalization for Python"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
files = [
{file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"},
{file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"},
]
[package.dependencies]
six = "*"
[[package]]
name = "urllib3"
version = "1.26.15"
@@ -2069,9 +2176,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""}
cffi = ["cffi (>=1.11)"]

[extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"]

[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.11"
-content-hash = "65eb26c5a9dda0e504632fa433334157fe5c1cbe19135ab18d59c74f47249a0b"
+content-hash = "0da699c2eea1ad81a41e5704a988b07cca28371b3260e082eb12bf9f21f985b4"
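Most of the lockfile churn above is indirect: attrs, cattrs, platformdirs and url-normalize appear under requests-cache's [package.dependencies], so they enter the lock as transitive dependencies of the one new direct requirement, and the content-hash changes because pyproject.toml changed. Assuming Poetry 1.x is in use (as the lock-version 2.0 format suggests), a command along the lines of poetry add requests-cache would produce this pair of edits, writing the caret constraint into pyproject.toml and regenerating these poetry.lock entries.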

pyproject.toml

@@ -33,6 +33,7 @@ langdetect = {version= "==1.0.9", optional = true}
pyarrow = {version= "==6.0.0", optional = true}
pyspark = {version= "==3.2.0", optional = true}
Levenshtein = {version= "==0.16.0", optional = true}
+requests-cache = "^1.1.0"

[tool.poetry.extras]
indexer = [