diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py index 15713ca..da677cb 100644 --- a/mwmbl/crawler/urls.py +++ b/mwmbl/crawler/urls.py @@ -10,6 +10,7 @@ from logging import getLogger from psycopg2.extras import execute_values from mwmbl.hn_top_domains_filtered import DOMAINS +from mwmbl.settings import CORE_DOMAINS # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned from mwmbl.utils import batch @@ -161,7 +162,7 @@ class URLDatabase: now = datetime.utcnow() min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS) - domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS)) + domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS)) | CORE_DOMAINS domains = tuple(domain_sample) logger.info(f"Getting URLs for domains {domains}") with self.connection.cursor() as cursor: diff --git a/mwmbl/settings.py b/mwmbl/settings.py index 2c18aef..a450bc3 100644 --- a/mwmbl/settings.py +++ b/mwmbl/settings.py @@ -29,3 +29,13 @@ SCORE_FOR_SAME_DOMAIN = 0.01 EXTRA_LINK_MULTIPLIER = 0.001 UNKNOWN_DOMAIN_MULTIPLIER = 0.001 EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'} +CORE_DOMAINS = { + 'github.com', + 'en.wikipedia.org', + 'stackoverflow.com', + 'docs.google.com', + 'programmers.stackexchange.com', + 'developer.mozilla.org', + 'arxiv.org', + 'www.python.org', +}