diff --git a/mwmbl/indexer/blacklist.py b/mwmbl/indexer/blacklist.py
new file mode 100644
index 0000000..cb480d6
--- /dev/null
+++ b/mwmbl/indexer/blacklist.py
@@ -0,0 +1,32 @@
+from datetime import timedelta
+
+from requests_cache import CachedSession
+
+from mwmbl.hn_top_domains_filtered import DOMAINS
+from mwmbl.settings import BLACKLIST_DOMAINS_URL, EXCLUDED_DOMAINS, DOMAIN_BLACKLIST_REGEX
+
+
+def get_blacklist_domains():
+    with CachedSession(expire_after=timedelta(days=1)) as session:
+        response = session.get(BLACKLIST_DOMAINS_URL)
+        return set(response.text.split())
+
+
+def is_domain_blacklisted(domain: str, blacklist_domains: set[str]):
+    if domain in EXCLUDED_DOMAINS or DOMAIN_BLACKLIST_REGEX.search(domain) is not None \
+            or domain in blacklist_domains:
+        return True
+
+    if domain in DOMAINS:
+        return False
+
+    # TODO: this is to filter out spammy domains that look like:
+    #  brofqpxj.uelinc.com
+    #  gwaspsag.enflightmultisport.com
+    #  fmcqgzvk.onlinejobs2day.com
+    #  btmjmhyj.universityslandown.com
+    #  djqfctsq.ropman.com
+    #  Eventually we can figure out a better way to identify SEO spam
+    domain_parts = domain.split('.')
+    if len(domain_parts) == 3 and domain_parts[2] == "com" and len(domain_parts[0]) == 8:
+        return True
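
A minimal, offline sketch of how the new helper behaves (all domains below are hypothetical and assumed not to appear in EXCLUDED_DOMAINS or DOMAINS, nor to match DOMAIN_BLACKLIST_REGEX). One subtlety worth noting: when no rule matches, is_domain_blacklisted falls off the end and implicitly returns None, which is falsy, so callers that use the result as a boolean behave as if it returned False.

    from mwmbl.indexer.blacklist import is_domain_blacklisted

    # Hand-rolled stand-in for the set fetched from BLACKLIST_DOMAINS_URL,
    # so the sketch needs no network access.
    blacklist_domains = {"spam.example.com"}

    # Listed explicitly in the blacklist set.
    assert is_domain_blacklisted("spam.example.com", blacklist_domains)

    # Trips the SEO-spam heuristic: three dot-separated parts, a "com" suffix,
    # and an eight-character first label (like brofqpxj.uelinc.com above).
    assert is_domain_blacklisted("brofqpxj.uelinc.com", blacklist_domains)

    # Matches no rule: the function falls through and returns None (falsy).
    assert not is_domain_blacklisted("example.org", blacklist_domains)
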
ALLOWED_HOSTS = ["localhost", "127.0.0.1"] DATA_PATH = "./devdata" -RUN_BACKGROUND_PROCESSES = False +RUN_BACKGROUND_PROCESSES = True NUM_PAGES = 2560 diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py index 8151550..70528fd 100644 --- a/mwmbl/url_queue.py +++ b/mwmbl/url_queue.py @@ -10,6 +10,7 @@ from typing import KeysView, Union from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS from mwmbl.database import Database from mwmbl.hn_top_domains_filtered import DOMAINS as TOP_DOMAINS +from mwmbl.indexer.blacklist import is_domain_blacklisted, get_blacklist_domains from mwmbl.settings import CORE_DOMAINS from mwmbl.utils import batch, get_domain @@ -52,6 +53,7 @@ class URLQueue: logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}") def update(self): + blacklist_domains = get_blacklist_domains() num_processed = 0 while True: try: @@ -59,10 +61,10 @@ class URLQueue: num_processed += 1 except Empty: break - self._process_found_urls(new_batch) + self._process_found_urls(new_batch, blacklist_domains) return num_processed - def _process_found_urls(self, found_urls: list[FoundURL]): + def _process_found_urls(self, found_urls: list[FoundURL], blacklist_domains: set[str]): min_updated_date = datetime.utcnow() - timedelta(hours=REASSIGN_MIN_HOURS) logger.info(f"Found URLS: {len(found_urls)}") @@ -70,7 +72,7 @@ class URLQueue: found_url.status == URLStatus.ASSIGNED.value and found_url.timestamp < min_updated_date)] logger.info(f"Valid URLs: {len(valid_urls)}") - self._sort_urls(valid_urls) + self._sort_urls(valid_urls, blacklist_domains) logger.info(f"Queue size: {self.num_queued_batches}") while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) >= self._min_top_domains: total_top_urls = sum(len(urls) for urls in self._top_urls.values()) @@ -82,12 +84,14 @@ class URLQueue: self._batch_urls() logger.info(f"Queue size after batching: {self.num_queued_batches}") - def _sort_urls(self, valid_urls: list[FoundURL]): + def _sort_urls(self, valid_urls: list[FoundURL], blacklist_domains: set[str]): for found_url in valid_urls: try: domain = get_domain(found_url.url) except ValueError: continue + if is_domain_blacklisted(domain, blacklist_domains): + continue url_store = self._top_urls if domain in TOP_DOMAINS else self._other_urls url_store[domain][found_url.url] = found_url.score