Add core domains
This commit is contained in:
parent
ab37bbe0a5
commit
dda5a25ad0
2 changed files with 12 additions and 1 deletions
|
@ -10,6 +10,7 @@ from logging import getLogger
|
|||
from psycopg2.extras import execute_values
|
||||
|
||||
from mwmbl.hn_top_domains_filtered import DOMAINS
|
||||
from mwmbl.settings import CORE_DOMAINS
|
||||
# Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
|
||||
from mwmbl.utils import batch
|
||||
|
||||
|
@ -161,7 +162,7 @@ class URLDatabase:
|
|||
|
||||
now = datetime.utcnow()
|
||||
min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
|
||||
domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
|
||||
domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS)) | CORE_DOMAINS
|
||||
domains = tuple(domain_sample)
|
||||
logger.info(f"Getting URLs for domains {domains}")
|
||||
with self.connection.cursor() as cursor:
|
||||
|
|
|
@ -29,3 +29,13 @@ SCORE_FOR_SAME_DOMAIN = 0.01
|
|||
EXTRA_LINK_MULTIPLIER = 0.001
|
||||
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
|
||||
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
|
||||
# Domains that are always added to the randomly sampled top domains when
# choosing which domains to fetch URLs for.
# Each entry must be a bare hostname: no scheme, path, or stray punctuation.
CORE_DOMAINS = {
    'github.com',
    'en.wikipedia.org',
    'stackoverflow.com',
    'docs.google.com',
    'programmers.stackexchange.com',
    'developer.mozilla.org',
    'arxiv.org',
    'www.python.org',
}
|
||||
|
|
Loading…
Add table
Reference in a new issue