From dda5a25ad0484e0891ebb4cb9cd75b0cc06371d8 Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Mon, 2 Jan 2023 12:05:22 +0000
Subject: [PATCH] Add core domains

---
 mwmbl/crawler/urls.py |  3 ++-
 mwmbl/settings.py     | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py
index 15713ca..da677cb 100644
--- a/mwmbl/crawler/urls.py
+++ b/mwmbl/crawler/urls.py
@@ -10,6 +10,7 @@ from logging import getLogger
 from psycopg2.extras import execute_values
 
 from mwmbl.hn_top_domains_filtered import DOMAINS
+from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch
 
@@ -161,7 +162,7 @@ class URLDatabase:
 
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
-        domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
+        domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS)) | CORE_DOMAINS
         domains = tuple(domain_sample)
         logger.info(f"Getting URLs for domains {domains}")
         with self.connection.cursor() as cursor:
diff --git a/mwmbl/settings.py b/mwmbl/settings.py
index 2c18aef..a450bc3 100644
--- a/mwmbl/settings.py
+++ b/mwmbl/settings.py
@@ -29,3 +29,13 @@ SCORE_FOR_SAME_DOMAIN = 0.01
 EXTRA_LINK_MULTIPLIER = 0.001
 UNKNOWN_DOMAIN_MULTIPLIER = 0.001
 EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
+CORE_DOMAINS = {
+    'github.com',
+    'en.wikipedia.org',
+    'stackoverflow.com',
+    'docs.google.com',
+    'programmers.stackexchange.com',
+    'developer.mozilla.org',
+    'arxiv.org',
+    'www.python.org',
+}