Exclude blacklisted domains
parent 18dc760a34, commit 019095a4c1
4 changed files with 131 additions and 8 deletions
@@ -9,6 +9,8 @@ from time import sleep
 from typing import Iterable, Collection
 from urllib.parse import urlparse

+from requests_cache import CachedSession
+
 from mwmbl.crawler.batch import HashedBatch
 from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
 from mwmbl.database import Database
@@ -19,7 +21,7 @@ from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.indexer.paths import BATCH_DIR_NAME
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
 from mwmbl.utils import get_domain

 logger = getLogger(__name__)
@@ -40,7 +42,11 @@ def run(batch_cache: BatchCache, new_item_queue: Queue):


 def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
-    logger.info(f"Recording URLs in database for {len(batches)} batches")
+    start = datetime.now()
+    blacklist_domains = get_blacklist_domains()
+    blacklist_retrieval_time = datetime.now() - start
+    logger.info(f"Recording URLs in database for {len(batches)} batches, with {len(blacklist_domains)} blacklist "
+                f"domains, retrieved in {blacklist_retrieval_time.total_seconds()} seconds")
     with Database() as db:
         url_db = URLDatabase(db.connection)
         url_scores = defaultdict(float)
@@ -64,12 +70,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
             score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
             for link in item.content.links:
                 process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                             url_timestamps, url_users, False)
+                             url_timestamps, url_users, False, blacklist_domains)

             if item.content.extra_links:
                 for link in item.content.extra_links:
                     process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                                 url_timestamps, url_users, True)
+                                 url_timestamps, url_users, True, blacklist_domains)

     found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                   for url in url_scores.keys() | url_statuses.keys()]
@@ -80,9 +86,16 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
     logger.info(f"Put {len(urls)} new items in the URL queue")


-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):
+def get_blacklist_domains():
+    with CachedSession(expire_after=timedelta(days=1)) as session:
+        response = session.get(BLACKLIST_DOMAINS_URL)
+        return set(response.text.split())
+
+
+def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
     parsed_link = urlparse(link)
-    if parsed_link.netloc in EXCLUDED_DOMAINS:
+    if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
+        logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
         return

     extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
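A minimal sketch (not part of the commit) of how the new get_blacklist_domains fetch behaves, assuming requests-cache's default SQLite backend; BLOCKLIST_URL here is a stand-in for the BLACKLIST_DOMAINS_URL constant added to mwmbl.settings below:

from datetime import timedelta
from requests_cache import CachedSession

BLOCKLIST_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"

with CachedSession(expire_after=timedelta(days=1)) as session:
    first = session.get(BLOCKLIST_URL)    # network request; response is stored in the local cache
    second = session.get(BLOCKLIST_URL)   # served from the cache until the one-day expiry
    blacklist_domains = set(second.text.split())  # one domain per whitespace-separated token
    print(first.from_cache, second.from_cache, len(blacklist_domains))  # -> False True <domain count>

Because the session caches with a one-day expiry, repeated batch runs only pay for the blacklist download once per day, which is why the commit also logs the retrieval time.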
@@ -42,3 +42,5 @@ CORE_DOMAINS = {
     'arxiv.org',
     'www.python.org',
 }
+
+BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"
poetry.lock (generated, 111 lines changed)
@@ -36,6 +36,25 @@ files = [
 [package.extras]
 tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]

+[[package]]
+name = "attrs"
+version = "23.1.0"
+description = "Classes Without Boilerplate"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"},
+    {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"},
+]
+
+[package.extras]
+cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
+dev = ["attrs[docs,tests]", "pre-commit"]
+docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
+tests = ["attrs[tests-no-zope]", "zope-interface"]
+tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.10.0"
@@ -148,6 +167,32 @@ files = [
     {file = "catalogue-2.0.8.tar.gz", hash = "sha256:b325c77659208bfb6af1b0d93b1a1aa4112e1bb29a4c5ced816758a722f0e388"},
 ]

+[[package]]
+name = "cattrs"
+version = "23.1.2"
+description = "Composable complex class support for attrs and dataclasses."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "cattrs-23.1.2-py3-none-any.whl", hash = "sha256:b2bb14311ac17bed0d58785e5a60f022e5431aca3932e3fc5cc8ed8639de50a4"},
+    {file = "cattrs-23.1.2.tar.gz", hash = "sha256:db1c821b8c537382b2c7c66678c3790091ca0275ac486c76f3c8f3920e83c657"},
+]
+
+[package.dependencies]
+attrs = ">=20"
+exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
+typing_extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""}
+
+[package.extras]
+bson = ["pymongo (>=4.2.0,<5.0.0)"]
+cbor2 = ["cbor2 (>=5.4.6,<6.0.0)"]
+msgpack = ["msgpack (>=1.0.2,<2.0.0)"]
+orjson = ["orjson (>=3.5.2,<4.0.0)"]
+pyyaml = ["PyYAML (>=6.0,<7.0)"]
+tomlkit = ["tomlkit (>=0.11.4,<0.12.0)"]
+ujson = ["ujson (>=5.4.0,<6.0.0)"]
+
 [[package]]
 name = "certifi"
 version = "2022.12.7"
@@ -949,6 +994,22 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"]
 s3 = ["boto3"]
 test = ["mock", "pytest", "pytest-coverage", "typer-cli"]

+[[package]]
+name = "platformdirs"
+version = "3.10.0"
+description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"},
+    {file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"},
+]
+
+[package.extras]
+docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"]
+test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"]
+
 [[package]]
 name = "pluggy"
 version = "1.0.0"
@@ -1418,6 +1479,37 @@ urllib3 = ">=1.21.1,<1.27"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

+[[package]]
+name = "requests-cache"
+version = "1.1.0"
+description = "A persistent cache for python requests"
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "requests_cache-1.1.0-py3-none-any.whl", hash = "sha256:178282bce704b912c59e7f88f367c42bddd6cde6bf511b2a3e3cfb7e5332a92a"},
+    {file = "requests_cache-1.1.0.tar.gz", hash = "sha256:41b79166aa8e300cc4de982f7ab7c52af914a785160be1eda25c6e9265969a67"},
+]
+
+[package.dependencies]
+attrs = ">=21.2"
+cattrs = ">=22.2"
+platformdirs = ">=2.5"
+requests = ">=2.22"
+url-normalize = ">=1.4"
+urllib3 = ">=1.25.5"
+
+[package.extras]
+all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=5.4)", "redis (>=3)", "ujson (>=5.4)"]
+bson = ["bson (>=0.5)"]
+docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.6)"]
+dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"]
+json = ["ujson (>=5.4)"]
+mongodb = ["pymongo (>=3)"]
+redis = ["redis (>=3)"]
+security = ["itsdangerous (>=2.0)"]
+yaml = ["pyyaml (>=5.4)"]
+
 [[package]]
 name = "s3transfer"
 version = "0.6.0"
@@ -1944,6 +2036,21 @@ files = [
     {file = "ujson-4.3.0.tar.gz", hash = "sha256:baee56eca35cb5fbe02c28bd9c0936be41a96fa5c0812d9d4b7edeb5c3d568a0"},
 ]

+[[package]]
+name = "url-normalize"
+version = "1.4.3"
+description = "URL normalization for Python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+files = [
+    {file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"},
+    {file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "urllib3"
 version = "1.26.15"
@@ -2069,9 +2176,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]

 [extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"]

 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.11"
-content-hash = "65eb26c5a9dda0e504632fa433334157fe5c1cbe19135ab18d59c74f47249a0b"
+content-hash = "0da699c2eea1ad81a41e5704a988b07cca28371b3260e082eb12bf9f21f985b4"
@@ -33,6 +33,7 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+requests-cache = "^1.1.0"

 [tool.poetry.extras]
 indexer = [
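For reference, a dependency change like this would typically be produced with Poetry (a hedged sketch of the workflow, not recorded in the commit): running

poetry add requests-cache@^1.1.0

adds the constraint to pyproject.toml and regenerates poetry.lock with requests-cache and its transitive dependencies (attrs, cattrs, platformdirs, url-normalize), which matches the lock-file hunks above.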