Exclude domains with bad pattern
This commit is contained in:
parent
f00eacf8aa
commit
b426fa3b7e
4 changed files with 45 additions and 18 deletions
32
mwmbl/indexer/blacklist.py
Normal file
32
mwmbl/indexer/blacklist.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
from datetime import timedelta
|
||||
|
||||
from requests_cache import CachedSession
|
||||
|
||||
from mwmbl.hn_top_domains_filtered import DOMAINS
|
||||
from mwmbl.settings import BLACKLIST_DOMAINS_URL, EXCLUDED_DOMAINS, DOMAIN_BLACKLIST_REGEX
|
||||
|
||||
|
||||
def get_blacklist_domains():
    """Fetch the domain blacklist and return its entries as a set.

    The list is requested from ``BLACKLIST_DOMAINS_URL`` through a
    ``CachedSession`` created with a one-day expiry. The response body is
    split on whitespace, so every whitespace-separated token becomes one
    entry of the returned set.
    """
    cache_lifetime = timedelta(days=1)
    with CachedSession(expire_after=cache_lifetime) as http:
        body = http.get(BLACKLIST_DOMAINS_URL).text
        return set(body.split())
|
||||
|
||||
|
||||
def is_domain_blacklisted(domain: str, blacklist_domains: set[str]) -> bool:
    """Return whether ``domain`` is considered blacklisted.

    The checks run in this order:

    1. The domain is blacklisted if it is in ``EXCLUDED_DOMAINS``, matches
       ``DOMAIN_BLACKLIST_REGEX``, or is in ``blacklist_domains``.
    2. Otherwise, a domain listed in ``DOMAINS`` is never blacklisted.
    3. Otherwise, the domain is blacklisted if it matches the spam-subdomain
       heuristic described below.

    Args:
        domain: The domain name to check.
        blacklist_domains: Set of blacklisted domain names supplied by the
            caller.

    Returns:
        ``True`` if the domain is blacklisted, ``False`` otherwise. Always a
        real ``bool``; no path falls through to an implicit ``None``.
    """
    if (
        domain in EXCLUDED_DOMAINS
        or DOMAIN_BLACKLIST_REGEX.search(domain) is not None
        or domain in blacklist_domains
    ):
        return True

    # Explicit blacklists above take precedence over the known-domain list.
    if domain in DOMAINS:
        return False

    # TODO: this is to filter out spammy domains that look like:
    #   brofqpxj.uelinc.com
    #   gwaspsag.enflightmultisport.com
    #   fmcqgzvk.onlinejobs2day.com
    #   btmjmhyj.universityslandown.com
    #   djqfctsq.ropman.com
    # Eventually we can figure out a better way to identify SEO spam.
    #
    # NOTE(review): the heuristic is "exactly three labels, a .com TLD and an
    # 8-character first label", so any such host that is not in DOMAINS is
    # also rejected, not only random-looking ones.
    domain_parts = domain.split('.')
    return (
        len(domain_parts) == 3
        and domain_parts[2] == "com"
        and len(domain_parts[0]) == 8
    )
|
|
@ -7,20 +7,18 @@ from time import sleep
|
|||
from typing import Collection
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from requests_cache import CachedSession
|
||||
|
||||
from mwmbl.crawler.batch import HashedBatch
|
||||
from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
|
||||
from mwmbl.database import Database
|
||||
from mwmbl.hn_top_domains_filtered import DOMAINS
|
||||
from mwmbl.indexer import process_batch
|
||||
from mwmbl.indexer.batch_cache import BatchCache
|
||||
from mwmbl.indexer.blacklist import get_blacklist_domains, is_domain_blacklisted
|
||||
from mwmbl.indexer.index_batches import get_url_error_status
|
||||
from mwmbl.indexer.indexdb import BatchStatus
|
||||
from mwmbl.indexer.paths import BATCH_DIR_NAME
|
||||
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
|
||||
SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL, \
|
||||
DOMAIN_BLACKLIST_REGEX
|
||||
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, SCORE_FOR_SAME_DOMAIN, \
|
||||
SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
|
||||
from mwmbl.utils import get_domain
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
@ -85,16 +83,9 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
|
|||
logger.info(f"Put {len(urls)} new items in the URL queue")
|
||||
|
||||
|
||||
def get_blacklist_domains():
|
||||
with CachedSession(expire_after=timedelta(days=1)) as session:
|
||||
response = session.get(BLACKLIST_DOMAINS_URL)
|
||||
return set(response.text.split())
|
||||
|
||||
|
||||
def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
|
||||
parsed_link = urlparse(link)
|
||||
if parsed_link.netloc in EXCLUDED_DOMAINS or DOMAIN_BLACKLIST_REGEX.search(parsed_link.netloc) is not None \
|
||||
or parsed_link.netloc in blacklist_domains:
|
||||
if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
|
||||
logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
|
||||
return
|
||||
|
||||
|
|
|
@ -4,6 +4,6 @@ DEBUG = True
|
|||
ALLOWED_HOSTS = ["localhost", "127.0.0.1"]
|
||||
|
||||
DATA_PATH = "./devdata"
|
||||
RUN_BACKGROUND_PROCESSES = False
|
||||
RUN_BACKGROUND_PROCESSES = True
|
||||
NUM_PAGES = 2560
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@ from typing import KeysView, Union
|
|||
from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS
|
||||
from mwmbl.database import Database
|
||||
from mwmbl.hn_top_domains_filtered import DOMAINS as TOP_DOMAINS
|
||||
from mwmbl.indexer.blacklist import is_domain_blacklisted, get_blacklist_domains
|
||||
from mwmbl.settings import CORE_DOMAINS
|
||||
from mwmbl.utils import batch, get_domain
|
||||
|
||||
|
@ -52,6 +53,7 @@ class URLQueue:
|
|||
logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")
|
||||
|
||||
def update(self):
|
||||
blacklist_domains = get_blacklist_domains()
|
||||
num_processed = 0
|
||||
while True:
|
||||
try:
|
||||
|
@ -59,10 +61,10 @@ class URLQueue:
|
|||
num_processed += 1
|
||||
except Empty:
|
||||
break
|
||||
self._process_found_urls(new_batch)
|
||||
self._process_found_urls(new_batch, blacklist_domains)
|
||||
return num_processed
|
||||
|
||||
def _process_found_urls(self, found_urls: list[FoundURL]):
|
||||
def _process_found_urls(self, found_urls: list[FoundURL], blacklist_domains: set[str]):
|
||||
min_updated_date = datetime.utcnow() - timedelta(hours=REASSIGN_MIN_HOURS)
|
||||
|
||||
logger.info(f"Found URLS: {len(found_urls)}")
|
||||
|
@ -70,7 +72,7 @@ class URLQueue:
|
|||
found_url.status == URLStatus.ASSIGNED.value and found_url.timestamp < min_updated_date)]
|
||||
logger.info(f"Valid URLs: {len(valid_urls)}")
|
||||
|
||||
self._sort_urls(valid_urls)
|
||||
self._sort_urls(valid_urls, blacklist_domains)
|
||||
logger.info(f"Queue size: {self.num_queued_batches}")
|
||||
while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) >= self._min_top_domains:
|
||||
total_top_urls = sum(len(urls) for urls in self._top_urls.values())
|
||||
|
@ -82,12 +84,14 @@ class URLQueue:
|
|||
self._batch_urls()
|
||||
logger.info(f"Queue size after batching: {self.num_queued_batches}")
|
||||
|
||||
def _sort_urls(self, valid_urls: list[FoundURL]):
|
||||
def _sort_urls(self, valid_urls: list[FoundURL], blacklist_domains: set[str]):
|
||||
for found_url in valid_urls:
|
||||
try:
|
||||
domain = get_domain(found_url.url)
|
||||
except ValueError:
|
||||
continue
|
||||
if is_domain_blacklisted(domain, blacklist_domains):
|
||||
continue
|
||||
url_store = self._top_urls if domain in TOP_DOMAINS else self._other_urls
|
||||
url_store[domain][found_url.url] = found_url.score
|
||||
|
||||
|
|
Loading…
Reference in a new issue