
Merge pull request #110 from mwmbl/update-blacklist

Exclude blacklisted domains
Daoud Clarke · 1 year ago · commit ed96386f05
4 changed files with 131 additions and 8 deletions
  1. mwmbl/indexer/update_urls.py (+19 −6)
  2. mwmbl/settings.py (+2 −0)
  3. poetry.lock (+109 −2)
  4. pyproject.toml (+1 −0)

+ 19 - 6
mwmbl/indexer/update_urls.py

@@ -9,6 +9,8 @@ from time import sleep
 from typing import Iterable, Collection
 from urllib.parse import urlparse
 
+from requests_cache import CachedSession
+
 from mwmbl.crawler.batch import HashedBatch
 from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
 from mwmbl.database import Database
@@ -19,7 +21,7 @@ from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.indexer.paths import BATCH_DIR_NAME
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
 from mwmbl.utils import get_domain
 
 logger = getLogger(__name__)
@@ -40,7 +42,11 @@ def run(batch_cache: BatchCache, new_item_queue: Queue):
 
 
 def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
-    logger.info(f"Recording URLs in database for {len(batches)} batches")
+    start = datetime.now()
+    blacklist_domains = get_blacklist_domains()
+    blacklist_retrieval_time = datetime.now() - start
+    logger.info(f"Recording URLs in database for {len(batches)} batches, with {len(blacklist_domains)} blacklist "
+                f"domains, retrieved in {blacklist_retrieval_time.total_seconds()} seconds")
     with Database() as db:
         url_db = URLDatabase(db.connection)
         url_scores = defaultdict(float)
@@ -64,12 +70,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
                     score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                     for link in item.content.links:
                         process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                                     url_timestamps, url_users, False)
+                                     url_timestamps, url_users, False, blacklist_domains)
 
                     if item.content.extra_links:
                         for link in item.content.extra_links:
                             process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                                         url_timestamps, url_users, True)
+                                         url_timestamps, url_users, True, blacklist_domains)
 
         found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                       for url in url_scores.keys() | url_statuses.keys()]
@@ -80,9 +86,16 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
         logger.info(f"Put {len(urls)} new items in the URL queue")
 
 
-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):
+def get_blacklist_domains():
+    with CachedSession(expire_after=timedelta(days=1)) as session:
+        response = session.get(BLACKLIST_DOMAINS_URL)
+        return set(response.text.split())
+
+
+def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
     parsed_link = urlparse(link)
-    if parsed_link.netloc in EXCLUDED_DOMAINS:
+    if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
+        logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
         return
 
     extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
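
The exclusion check added to process_link() can be seen in isolation below. This is a minimal sketch with hypothetical stand-in sets: in the actual code, EXCLUDED_DOMAINS comes from mwmbl.settings and blacklist_domains is the set returned by get_blacklist_domains(). A link is dropped when its host appears in either set:

from urllib.parse import urlparse

EXCLUDED_DOMAINS = {"tracker.example"}   # stand-in for the static set in mwmbl.settings
blacklist_domains = {"spam.example"}     # stand-in for the downloaded blacklist

def is_excluded(link: str) -> bool:
    # Mirrors the new check in process_link(): compare the link's host
    # against both the static and the downloaded exclusion sets.
    netloc = urlparse(link).netloc
    return netloc in EXCLUDED_DOMAINS or netloc in blacklist_domains

assert is_excluded("https://spam.example/some-page")
assert not is_excluded("https://en.wikipedia.org/wiki/Mwmbl")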

+ 2 - 0
mwmbl/settings.py

@@ -42,3 +42,5 @@ CORE_DOMAINS = {
     'arxiv.org',
     'www.python.org',
 }
+
+BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"
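
For context, get_blacklist_domains() in update_urls.py fetches this URL through requests-cache, so the list is downloaded at most once per expiry window rather than on every batch. A minimal standalone sketch of that behaviour (the URL is the one added here; the cache lives in requests-cache's default SQLite backend):

from datetime import timedelta
from requests_cache import CachedSession

BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"

def get_blacklist_domains() -> set[str]:
    # expire_after=timedelta(days=1): responses older than a day are refetched;
    # within that window the cached body is returned without a network call.
    with CachedSession(expire_after=timedelta(days=1)) as session:
        response = session.get(BLACKLIST_DOMAINS_URL)
        # The file is one domain per line; split() also discards blank lines.
        return set(response.text.split())

domains = get_blacklist_domains()
print(f"loaded {len(domains)} blacklisted domains")  # a second run within a day hits the cache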

+ 109 - 2
poetry.lock

@@ -36,6 +36,25 @@ files = [
 [package.extras]
 tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
 
+[[package]]
+name = "attrs"
+version = "23.1.0"
+description = "Classes Without Boilerplate"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"},
+    {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"},
+]
+
+[package.extras]
+cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
+dev = ["attrs[docs,tests]", "pre-commit"]
+docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
+tests = ["attrs[tests-no-zope]", "zope-interface"]
+tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.10.0"
@@ -148,6 +167,32 @@ files = [
     {file = "catalogue-2.0.8.tar.gz", hash = "sha256:b325c77659208bfb6af1b0d93b1a1aa4112e1bb29a4c5ced816758a722f0e388"},
 ]
 
+[[package]]
+name = "cattrs"
+version = "23.1.2"
+description = "Composable complex class support for attrs and dataclasses."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "cattrs-23.1.2-py3-none-any.whl", hash = "sha256:b2bb14311ac17bed0d58785e5a60f022e5431aca3932e3fc5cc8ed8639de50a4"},
+    {file = "cattrs-23.1.2.tar.gz", hash = "sha256:db1c821b8c537382b2c7c66678c3790091ca0275ac486c76f3c8f3920e83c657"},
+]
+
+[package.dependencies]
+attrs = ">=20"
+exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
+typing_extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""}
+
+[package.extras]
+bson = ["pymongo (>=4.2.0,<5.0.0)"]
+cbor2 = ["cbor2 (>=5.4.6,<6.0.0)"]
+msgpack = ["msgpack (>=1.0.2,<2.0.0)"]
+orjson = ["orjson (>=3.5.2,<4.0.0)"]
+pyyaml = ["PyYAML (>=6.0,<7.0)"]
+tomlkit = ["tomlkit (>=0.11.4,<0.12.0)"]
+ujson = ["ujson (>=5.4.0,<6.0.0)"]
+
 [[package]]
 name = "certifi"
 version = "2022.12.7"
@@ -949,6 +994,22 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"]
 s3 = ["boto3"]
 test = ["mock", "pytest", "pytest-coverage", "typer-cli"]
 
+[[package]]
+name = "platformdirs"
+version = "3.10.0"
+description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"},
+    {file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"},
+]
+
+[package.extras]
+docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"]
+test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"]
+
 [[package]]
 name = "pluggy"
 version = "1.0.0"
@@ -1418,6 +1479,37 @@ urllib3 = ">=1.21.1,<1.27"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "requests-cache"
+version = "1.1.0"
+description = "A persistent cache for python requests"
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "requests_cache-1.1.0-py3-none-any.whl", hash = "sha256:178282bce704b912c59e7f88f367c42bddd6cde6bf511b2a3e3cfb7e5332a92a"},
+    {file = "requests_cache-1.1.0.tar.gz", hash = "sha256:41b79166aa8e300cc4de982f7ab7c52af914a785160be1eda25c6e9265969a67"},
+]
+
+[package.dependencies]
+attrs = ">=21.2"
+cattrs = ">=22.2"
+platformdirs = ">=2.5"
+requests = ">=2.22"
+url-normalize = ">=1.4"
+urllib3 = ">=1.25.5"
+
+[package.extras]
+all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=5.4)", "redis (>=3)", "ujson (>=5.4)"]
+bson = ["bson (>=0.5)"]
+docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.6)"]
+dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"]
+json = ["ujson (>=5.4)"]
+mongodb = ["pymongo (>=3)"]
+redis = ["redis (>=3)"]
+security = ["itsdangerous (>=2.0)"]
+yaml = ["pyyaml (>=5.4)"]
+
 [[package]]
 name = "s3transfer"
 version = "0.6.0"
@@ -1944,6 +2036,21 @@ files = [
     {file = "ujson-4.3.0.tar.gz", hash = "sha256:baee56eca35cb5fbe02c28bd9c0936be41a96fa5c0812d9d4b7edeb5c3d568a0"},
 ]
 
+[[package]]
+name = "url-normalize"
+version = "1.4.3"
+description = "URL normalization for Python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+files = [
+    {file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"},
+    {file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "urllib3"
 version = "1.26.15"
@@ -2069,9 +2176,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.11"
-content-hash = "65eb26c5a9dda0e504632fa433334157fe5c1cbe19135ab18d59c74f47249a0b"
+content-hash = "0da699c2eea1ad81a41e5704a988b07cca28371b3260e082eb12bf9f21f985b4"
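
The lockfile entries added above (attrs, cattrs, platformdirs, requests-cache, url-normalize) are requests-cache and its transitive dependencies, and the content-hash changes because the dependency set changed. Assuming the standard Poetry workflow, these lockfile changes together with the pyproject.toml line below are what a single "poetry add requests-cache" command typically produces.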

+ 1 - 0
pyproject.toml

@@ -33,6 +33,7 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+requests-cache = "^1.1.0"
 
 [tool.poetry.extras]
 indexer = [