Add core domains

This commit is contained in:
Daoud Clarke 2023-01-02 12:05:22 +00:00
parent ab37bbe0a5
commit dda5a25ad0
2 changed files with 12 additions and 1 deletions

View file

@ -10,6 +10,7 @@ from logging import getLogger
from psycopg2.extras import execute_values
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.settings import CORE_DOMAINS
# Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
from mwmbl.utils import batch
@ -161,7 +162,7 @@ class URLDatabase:
now = datetime.utcnow()
min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS)) | CORE_DOMAINS
domains = tuple(domain_sample)
logger.info(f"Getting URLs for domains {domains}")
with self.connection.cursor() as cursor:

View file

@ -29,3 +29,13 @@ SCORE_FOR_SAME_DOMAIN = 0.01
# Scoring multipliers. Presumably applied to extra links and to links from
# unknown domains respectively -- confirm against the scoring code.
EXTRA_LINK_MULTIPLIER = 0.001
UNKNOWN_DOMAIN_MULTIPLIER = 0.001

# Domains listed for exclusion (how they are excluded is decided by the
# callers, which are not visible here).
EXCLUDED_DOMAINS = {
    'web.archive.org',
    'forums.giantitp.com',
    'www.crutchfield.com',
    'plus.google.com',
}
# Domains that are always added to the randomly sampled top domains when
# URLs are selected for crawling (the URL database unions this set into its
# domain sample). Kept as a plain ``set`` so it can be combined with other
# sets via ``|``.
#
# Entries must be bare host names exactly as they appear in URLs: no scheme,
# no path, and no stray punctuation, otherwise they will never match.
CORE_DOMAINS = {
    'github.com',
    'en.wikipedia.org',
    'stackoverflow.com',
    'docs.google.com',
    'programmers.stackexchange.com',
    'developer.mozilla.org',
    'arxiv.org',
    'www.python.org',
}