浏览代码

Add core domains

Daoud Clarke 2 年之前
父节点
当前提交
dda5a25ad0
共有 2 个文件被更改,包括 12 次插入 和 1 次删除
  1. 2 1
      mwmbl/crawler/urls.py
  2. 10 0
      mwmbl/settings.py

+ 2 - 1
mwmbl/crawler/urls.py

@@ -10,6 +10,7 @@ from logging import getLogger
 from psycopg2.extras import execute_values
 
 from mwmbl.hn_top_domains_filtered import DOMAINS
+from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch
 
@@ -161,7 +162,7 @@ class URLDatabase:
 
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
-        domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
+        domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS)) | CORE_DOMAINS
         domains = tuple(domain_sample)
         logger.info(f"Getting URLs for domains {domains}")
         with self.connection.cursor() as cursor:

+ 10 - 0
mwmbl/settings.py

@@ -29,3 +29,13 @@ SCORE_FOR_SAME_DOMAIN = 0.01
 EXTRA_LINK_MULTIPLIER = 0.001
 UNKNOWN_DOMAIN_MULTIPLIER = 0.001
 EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
+CORE_DOMAINS = {  # always unioned into the crawler's random top-domain sample (see mwmbl/crawler/urls.py)
+    'github.com',
+    'en.wikipedia.org',
+    'stackoverflow.com',
+    'docs.google.com',
+    'programmers.stackexchange.com',
+    'developer.mozilla.org',
+    'arxiv.org',
+    'www.python.org',
+}