Exclude blacklisted domains
parent 18dc760a34, commit 019095a4c1
4 changed files with 131 additions and 8 deletions
@@ -9,6 +9,8 @@ from time import sleep
 from typing import Iterable, Collection
 from urllib.parse import urlparse

+from requests_cache import CachedSession
+
 from mwmbl.crawler.batch import HashedBatch
 from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
 from mwmbl.database import Database
@@ -19,7 +21,7 @@ from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.indexer.paths import BATCH_DIR_NAME
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
 from mwmbl.utils import get_domain

 logger = getLogger(__name__)
@@ -40,7 +42,11 @@ def run(batch_cache: BatchCache, new_item_queue: Queue):


 def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
-    logger.info(f"Recording URLs in database for {len(batches)} batches")
+    start = datetime.now()
+    blacklist_domains = get_blacklist_domains()
+    blacklist_retrieval_time = datetime.now() - start
+    logger.info(f"Recording URLs in database for {len(batches)} batches, with {len(blacklist_domains)} blacklist "
+                f"domains, retrieved in {blacklist_retrieval_time.total_seconds()} seconds")
     with Database() as db:
         url_db = URLDatabase(db.connection)
         url_scores = defaultdict(float)
@@ -64,12 +70,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
             score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
             for link in item.content.links:
                 process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                             url_timestamps, url_users, False)
+                             url_timestamps, url_users, False, blacklist_domains)

             if item.content.extra_links:
                 for link in item.content.extra_links:
                     process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                                 url_timestamps, url_users, True)
+                                 url_timestamps, url_users, True, blacklist_domains)

     found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                   for url in url_scores.keys() | url_statuses.keys()]
@@ -80,9 +86,16 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
     logger.info(f"Put {len(urls)} new items in the URL queue")


-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):
+def get_blacklist_domains():
+    with CachedSession(expire_after=timedelta(days=1)) as session:
+        response = session.get(BLACKLIST_DOMAINS_URL)
+        return set(response.text.split())
+
+
+def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
     parsed_link = urlparse(link)
-    if parsed_link.netloc in EXCLUDED_DOMAINS:
+    if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
+        logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
         return

     extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
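A minimal sketch (not part of the commit) of how the new get_blacklist_domains fetch behaves, assuming requests-cache's default SQLite backend; BLOCKLIST_URL here is a stand-in for the BLACKLIST_DOMAINS_URL constant added to mwmbl.settings below:

from datetime import timedelta
from requests_cache import CachedSession

BLOCKLIST_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"

with CachedSession(expire_after=timedelta(days=1)) as session:
    first = session.get(BLOCKLIST_URL)    # network request; response is stored in the local cache
    second = session.get(BLOCKLIST_URL)   # served from the cache until the one-day expiry
    blacklist_domains = set(second.text.split())  # one domain per whitespace-separated token
    print(first.from_cache, second.from_cache, len(blacklist_domains))  # -> False True <domain count>

Because the session caches with a one-day expiry, repeated batch runs only pay for the blacklist download once per day, which is why the commit also logs the retrieval time.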
@@ -42,3 +42,5 @@ CORE_DOMAINS = {
     'arxiv.org',
     'www.python.org',
 }
+
+BLACKLIST_DOMAINS_URL = "https://github.com/T145/black-mirror/releases/download/latest/BLOCK_DOMAIN.txt"
poetry.lock (generated, 111 lines changed)
@@ -36,6 +36,25 @@ files = [
 [package.extras]
 tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]

+[[package]]
+name = "attrs"
+version = "23.1.0"
+description = "Classes Without Boilerplate"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"},
+    {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"},
+]
+
+[package.extras]
+cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
+dev = ["attrs[docs,tests]", "pre-commit"]
+docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
+tests = ["attrs[tests-no-zope]", "zope-interface"]
+tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.10.0"
@@ -148,6 +167,32 @@ files = [
     {file = "catalogue-2.0.8.tar.gz", hash = "sha256:b325c77659208bfb6af1b0d93b1a1aa4112e1bb29a4c5ced816758a722f0e388"},
 ]

+[[package]]
+name = "cattrs"
+version = "23.1.2"
+description = "Composable complex class support for attrs and dataclasses."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "cattrs-23.1.2-py3-none-any.whl", hash = "sha256:b2bb14311ac17bed0d58785e5a60f022e5431aca3932e3fc5cc8ed8639de50a4"},
+    {file = "cattrs-23.1.2.tar.gz", hash = "sha256:db1c821b8c537382b2c7c66678c3790091ca0275ac486c76f3c8f3920e83c657"},
+]
+
+[package.dependencies]
+attrs = ">=20"
+exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
+typing_extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""}
+
+[package.extras]
+bson = ["pymongo (>=4.2.0,<5.0.0)"]
+cbor2 = ["cbor2 (>=5.4.6,<6.0.0)"]
+msgpack = ["msgpack (>=1.0.2,<2.0.0)"]
+orjson = ["orjson (>=3.5.2,<4.0.0)"]
+pyyaml = ["PyYAML (>=6.0,<7.0)"]
+tomlkit = ["tomlkit (>=0.11.4,<0.12.0)"]
+ujson = ["ujson (>=5.4.0,<6.0.0)"]
+
 [[package]]
 name = "certifi"
 version = "2022.12.7"
@@ -949,6 +994,22 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"]
 s3 = ["boto3"]
 test = ["mock", "pytest", "pytest-coverage", "typer-cli"]

+[[package]]
+name = "platformdirs"
+version = "3.10.0"
+description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"},
+    {file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"},
+]
+
+[package.extras]
+docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"]
+test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"]
+
 [[package]]
 name = "pluggy"
 version = "1.0.0"
@@ -1418,6 +1479,37 @@ urllib3 = ">=1.21.1,<1.27"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

+[[package]]
+name = "requests-cache"
+version = "1.1.0"
+description = "A persistent cache for python requests"
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "requests_cache-1.1.0-py3-none-any.whl", hash = "sha256:178282bce704b912c59e7f88f367c42bddd6cde6bf511b2a3e3cfb7e5332a92a"},
+    {file = "requests_cache-1.1.0.tar.gz", hash = "sha256:41b79166aa8e300cc4de982f7ab7c52af914a785160be1eda25c6e9265969a67"},
+]
+
+[package.dependencies]
+attrs = ">=21.2"
+cattrs = ">=22.2"
+platformdirs = ">=2.5"
+requests = ">=2.22"
+url-normalize = ">=1.4"
+urllib3 = ">=1.25.5"
+
+[package.extras]
+all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=5.4)", "redis (>=3)", "ujson (>=5.4)"]
+bson = ["bson (>=0.5)"]
+docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.6)"]
+dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"]
+json = ["ujson (>=5.4)"]
+mongodb = ["pymongo (>=3)"]
+redis = ["redis (>=3)"]
+security = ["itsdangerous (>=2.0)"]
+yaml = ["pyyaml (>=5.4)"]
+
 [[package]]
 name = "s3transfer"
 version = "0.6.0"
@@ -1944,6 +2036,21 @@ files = [
     {file = "ujson-4.3.0.tar.gz", hash = "sha256:baee56eca35cb5fbe02c28bd9c0936be41a96fa5c0812d9d4b7edeb5c3d568a0"},
 ]

+[[package]]
+name = "url-normalize"
+version = "1.4.3"
+description = "URL normalization for Python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+files = [
+    {file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"},
+    {file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "urllib3"
 version = "1.26.15"
@@ -2069,9 +2176,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]

 [extras]
-indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"]
+indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"]

 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.11"
-content-hash = "65eb26c5a9dda0e504632fa433334157fe5c1cbe19135ab18d59c74f47249a0b"
+content-hash = "0da699c2eea1ad81a41e5704a988b07cca28371b3260e082eb12bf9f21f985b4"
@@ -33,6 +33,7 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+requests-cache = "^1.1.0"

 [tool.poetry.extras]
 indexer = [
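For reference, a dependency change like this would typically be produced with Poetry (a hedged sketch of the workflow, not recorded in the commit): running

poetry add requests-cache@^1.1.0

adds the constraint to pyproject.toml and regenerates poetry.lock with requests-cache and its transitive dependencies (attrs, cattrs, platformdirs, url-normalize), which matches the lock-file hunks above.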