From 019095a4c16d2a1f9c869225a074c9a3fa80b698 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Fri, 22 Sep 2023 21:53:53 +0100
Subject: [PATCH 1/3] Exclude blacklisted domains

---
 mwmbl/indexer/update_urls.py |  25 ++++++--
 mwmbl/settings.py            |   2 +
 poetry.lock                  | 111 ++++++++++++++++++++++++++++++++++-
 pyproject.toml               |   1 +
 4 files changed, 131 insertions(+), 8 deletions(-)

diff --git a/mwmbl/indexer/update_urls.py b/mwmbl/indexer/update_urls.py
index d171eaa..ac3b909 100644
--- a/mwmbl/indexer/update_urls.py
+++ b/mwmbl/indexer/update_urls.py
@@ -9,6 +9,8 @@ from time import sleep
 from typing import Iterable, Collection
 from urllib.parse import urlparse
 
+from requests_cache import CachedSession
+
 from mwmbl.crawler.batch import HashedBatch
 from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
 from mwmbl.database import Database
@@ -19,7 +21,7 @@ from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.indexer.paths import BATCH_DIR_NAME
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
 from mwmbl.utils import get_domain
 
 logger = getLogger(__name__)
@@ -40,7 +42,11 @@ def run(batch_cache: BatchCache, new_item_queue: Queue):
 
 
 def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
-    logger.info(f"Recording URLs in database for {len(batches)} batches")
+    start = datetime.now()
+    blacklist_domains = get_blacklist_domains()
+    blacklist_retrieval_time = datetime.now() - start
+    logger.info(f"Recording URLs in database for {len(batches)} batches, with {len(blacklist_domains)} blacklist "
+                f"domains, retrieved in {blacklist_retrieval_time.total_seconds()} seconds")
     with Database() as db:
         url_db = URLDatabase(db.connection)
         url_scores = defaultdict(float)
@@ -64,12 +70,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
             score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
             for link in item.content.links:
                 process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                             url_timestamps, url_users, False)
+                             url_timestamps, url_users, False, blacklist_domains)
 
             if item.content.extra_links:
                 for link in item.content.extra_links:
                     process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                                 url_timestamps, url_users, True)
+                                 url_timestamps, url_users, True, blacklist_domains)
 
     found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                   for url in url_scores.keys() | url_statuses.keys()]
@@ -80,9 +86,16 @@
     logger.info(f"Put {len(urls)} new items in the URL queue")
 
 
-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):
+def get_blacklist_domains():
+    with CachedSession(expire_after=timedelta(days=1)) as session:
+        response = session.get(BLACKLIST_DOMAINS_URL)
+        return set(response.text.split())
+
+
+def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
     parsed_link = urlparse(link)
-    if parsed_link.netloc in EXCLUDED_DOMAINS:
+    if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
+        logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
         return
 
     extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
diff --git a/mwmbl/settings.py b/mwmbl/settings.py
index d700c46..296464c 100644
--- a/mwmbl/settings.py
+++ b/mwmbl/settings.py
@@ -42,3 +42,5 @@ CORE_DOMAINS = {
     'arxiv.org',
     'www.python.org',
 }
+
+BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"
diff --git a/poetry.lock b/poetry.lock
index b364b25..0924e79 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -36,6 +36,25 @@ files = [
 [package.extras]
 tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
 
+[[package]]
+name = "attrs"
+version = "23.1.0"
+description = "Classes Without Boilerplate"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"},
+    {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"},
+]
+
+[package.extras]
+cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
+dev = ["attrs[docs,tests]", "pre-commit"]
+docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
+tests = ["attrs[tests-no-zope]", "zope-interface"]
+tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.10.0"
@@ -148,6 +167,32 @@ files = [
     {file = "catalogue-2.0.8.tar.gz", hash = "sha256:b325c77659208bfb6af1b0d93b1a1aa4112e1bb29a4c5ced816758a722f0e388"},
 ]
 
+[[package]]
+name = "cattrs"
+version = "23.1.2"
+description = "Composable complex class support for attrs and dataclasses."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "cattrs-23.1.2-py3-none-any.whl", hash = "sha256:b2bb14311ac17bed0d58785e5a60f022e5431aca3932e3fc5cc8ed8639de50a4"},
+    {file = "cattrs-23.1.2.tar.gz", hash = "sha256:db1c821b8c537382b2c7c66678c3790091ca0275ac486c76f3c8f3920e83c657"},
+]
+
+[package.dependencies]
+attrs = ">=20"
+exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
+typing_extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""}
+
+[package.extras]
+bson = ["pymongo (>=4.2.0,<5.0.0)"]
+cbor2 = ["cbor2 (>=5.4.6,<6.0.0)"]
+msgpack = ["msgpack (>=1.0.2,<2.0.0)"]
+orjson = ["orjson (>=3.5.2,<4.0.0)"]
+pyyaml = ["PyYAML (>=6.0,<7.0)"]
+tomlkit = ["tomlkit (>=0.11.4,<0.12.0)"]
+ujson = ["ujson (>=5.4.0,<6.0.0)"]
+
 [[package]]
 name = "certifi"
 version = "2022.12.7"
@@ -949,6 +994,22 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"]
 s3 = ["boto3"]
 test = ["mock", "pytest", "pytest-coverage", "typer-cli"]
 
+[[package]]
+name = "platformdirs"
+version = "3.10.0"
+description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"},
+    {file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"},
+]
+
+[package.extras]
+docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"]
+test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"]
+
 [[package]]
 name = "pluggy"
 version = "1.0.0"
@@ -1418,6 +1479,37 @@ urllib3 = ">=1.21.1,<1.27"
 
 [package.extras]
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "requests-cache"
+version = "1.1.0"
+description = "A persistent cache for python requests"
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "requests_cache-1.1.0-py3-none-any.whl", hash = "sha256:178282bce704b912c59e7f88f367c42bddd6cde6bf511b2a3e3cfb7e5332a92a"},
+    {file = "requests_cache-1.1.0.tar.gz", hash = "sha256:41b79166aa8e300cc4de982f7ab7c52af914a785160be1eda25c6e9265969a67"},
+]
+
+[package.dependencies]
+attrs = ">=21.2"
+cattrs = ">=22.2"
+platformdirs = ">=2.5"
+requests = ">=2.22"
+url-normalize = ">=1.4"
+urllib3 = ">=1.25.5"
+
+[package.extras]
+all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=5.4)", "redis (>=3)", "ujson (>=5.4)"]
+bson = ["bson (>=0.5)"]
+docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.6)"]
+dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"]
+json = ["ujson (>=5.4)"]
+mongodb = ["pymongo (>=3)"]
+redis = ["redis (>=3)"]
+security = ["itsdangerous (>=2.0)"]
+yaml = ["pyyaml (>=5.4)"]
+
 [[package]]
 name = "s3transfer"
 version = "0.6.0"
@@ -1944,6 +2036,21 @@ files = [
     {file = "ujson-4.3.0.tar.gz", hash = "sha256:baee56eca35cb5fbe02c28bd9c0936be41a96fa5c0812d9d4b7edeb5c3d568a0"},
 ]
 
+[[package]]
+name = "url-normalize"
+version = "1.4.3"
+description = "URL normalization for Python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+files = [
+    {file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"},
+    {file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "urllib3"
 version = "1.26.15"
@@ -2069,9 +2176,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.11"
-content-hash = "65eb26c5a9dda0e504632fa433334157fe5c1cbe19135ab18d59c74f47249a0b"
+content-hash = "0da699c2eea1ad81a41e5704a988b07cca28371b3260e082eb12bf9f21f985b4"
diff --git a/pyproject.toml b/pyproject.toml
index 2def9aa..4ec424a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+requests-cache = "^1.1.0"
 
 [tool.poetry.extras]
 indexer = [
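
A note on the retrieval cost that patch 1's logging measures: get_blacklist_domains fetches a large domain list over HTTP, and requests_cache.CachedSession keeps the response in a local cache (SQLite by default), so repeat calls within the one-day expire_after window are served locally rather than re-downloaded. The sketch below is illustrative, not part of the patch: it reuses get_blacklist_domains as written in patch 1, with the patch 1 value of BLACKLIST_DOMAINS_URL inlined (patch 2 later replaces this URL), and the __main__ usage is an assumption added for demonstration.

from datetime import timedelta

from requests_cache import CachedSession

# Patch 1's value of the setting; patch 2 swaps in a different blocklist URL.
BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"


def get_blacklist_domains():
    # CachedSession persists responses to a local SQLite cache by default,
    # so only the first call in each 24-hour window pays the download cost.
    with CachedSession(expire_after=timedelta(days=1)) as session:
        response = session.get(BLACKLIST_DOMAINS_URL)
        # The blocklist is plain text, one domain per line; split() also
        # tolerates stray whitespace.
        return set(response.text.split())


if __name__ == "__main__":
    domains = get_blacklist_domains()
    print(f"Loaded {len(domains)} blacklisted domains")
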
+category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"}, + {file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"}, +] + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] + [[package]] name = "pluggy" version = "1.0.0" @@ -1418,6 +1479,37 @@ urllib3 = ">=1.21.1,<1.27" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-cache" +version = "1.1.0" +description = "A persistent cache for python requests" +category = "main" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "requests_cache-1.1.0-py3-none-any.whl", hash = "sha256:178282bce704b912c59e7f88f367c42bddd6cde6bf511b2a3e3cfb7e5332a92a"}, + {file = "requests_cache-1.1.0.tar.gz", hash = "sha256:41b79166aa8e300cc4de982f7ab7c52af914a785160be1eda25c6e9265969a67"}, +] + +[package.dependencies] +attrs = ">=21.2" +cattrs = ">=22.2" +platformdirs = ">=2.5" +requests = ">=2.22" +url-normalize = ">=1.4" +urllib3 = ">=1.25.5" + +[package.extras] +all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=5.4)", "redis (>=3)", "ujson (>=5.4)"] +bson = ["bson (>=0.5)"] +docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.6)"] +dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"] +json = ["ujson (>=5.4)"] +mongodb = ["pymongo (>=3)"] +redis = ["redis (>=3)"] +security = ["itsdangerous (>=2.0)"] +yaml = ["pyyaml (>=5.4)"] + [[package]] name = "s3transfer" version = "0.6.0" @@ -1944,6 +2036,21 @@ files = [ {file = "ujson-4.3.0.tar.gz", hash = "sha256:baee56eca35cb5fbe02c28bd9c0936be41a96fa5c0812d9d4b7edeb5c3d568a0"}, ] +[[package]] +name = "url-normalize" +version = "1.4.3" +description = "URL normalization for Python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ + {file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"}, + {file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"}, +] + +[package.dependencies] +six = "*" + [[package]] name = "urllib3" version = "1.26.15" @@ -2069,9 +2176,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"] +indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "65eb26c5a9dda0e504632fa433334157fe5c1cbe19135ab18d59c74f47249a0b" +content-hash = "0da699c2eea1ad81a41e5704a988b07cca28371b3260e082eb12bf9f21f985b4" diff --git a/pyproject.toml b/pyproject.toml index 2def9aa..4ec424a 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ langdetect = {version= "==1.0.9", optional = true} pyarrow = {version= "==6.0.0", optional = true} pyspark = {version= "==3.2.0", optional = true} Levenshtein = {version= "==0.16.0", optional = true} +requests-cache = "^1.1.0" [tool.poetry.extras] indexer = [ From 7e054d0854cba0d82c36717dd8222ad1febeb8b9 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Fri, 22 Sep 2023 23:04:37 +0100 Subject: [PATCH 2/3] Better blacklist --- mwmbl/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwmbl/settings.py b/mwmbl/settings.py index 296464c..cf904c1 100644 --- a/mwmbl/settings.py +++ b/mwmbl/settings.py @@ -43,4 +43,4 @@ CORE_DOMAINS = { 'www.python.org', } -BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt" +BLACKLIST_DOMAINS_URL = "https://get.domainsblacklists.com/blacklist.txt" From bec00cdab50ccce84ac86e83b50db599d027e849 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Fri, 22 Sep 2023 23:06:04 +0100 Subject: [PATCH 3/3] Exclude additional domain --- mwmbl/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwmbl/settings.py b/mwmbl/settings.py index cf904c1..0a350b9 100644 --- a/mwmbl/settings.py +++ b/mwmbl/settings.py @@ -31,7 +31,7 @@ SCORE_FOR_DIFFERENT_DOMAIN = 1.0 SCORE_FOR_SAME_DOMAIN = 0.01 EXTRA_LINK_MULTIPLIER = 0.001 UNKNOWN_DOMAIN_MULTIPLIER = 0.001 -EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'} +EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com', 'changeporn.com'} CORE_DOMAINS = { 'github.com', 'en.wikipedia.org',
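
A behavioural note on the series as a whole: process_link compares urlparse(link).netloc against EXCLUDED_DOMAINS and the downloaded blacklist by exact string match, so a subdomain of a blacklisted domain is only excluded if it appears in the list itself. Below is a minimal sketch of that check; should_process_link is a hypothetical helper name (the patches inline this logic in process_link), while the EXCLUDED_DOMAINS value is the real one after patch 3.

from urllib.parse import urlparse

# EXCLUDED_DOMAINS as it stands after patch 3.
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com',
                    'plus.google.com', 'changeporn.com'}


def should_process_link(link: str, blacklist_domains: set) -> bool:
    # netloc is the host (plus port, if present); this is an exact-match
    # test, so 'sub.spam.example.com' is not caught by a 'spam.example.com'
    # entry in either set.
    netloc = urlparse(link).netloc
    return netloc not in EXCLUDED_DOMAINS and netloc not in blacklist_domains


# A blacklisted host is dropped; an ordinary one passes.
blacklist = {'spam.example.com'}
assert not should_process_link("https://spam.example.com/page", blacklist)
assert should_process_link("https://en.wikipedia.org/wiki/Python", blacklist)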