diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py index da677cb..4dc0db7 100644 --- a/mwmbl/crawler/urls.py +++ b/mwmbl/crawler/urls.py @@ -139,16 +139,16 @@ class URLDatabase: select_sql = f""" SELECT host, (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts WHERE host IN %(domains)s - AND status IN ({URLStatus.NEW.value}) OR ( + AND (status = {URLStatus.NEW.value} OR ( status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s - ) + )) GROUP BY host """ others_sql = f""" SELECT DISTINCT ON (host) url FROM ( SELECT * FROM url_and_hosts - WHERE status IN ({URLStatus.NEW.value}) OR ( + WHERE status = {URLStatus.NEW.value} OR ( status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s ) ORDER BY score DESC LIMIT {MAX_OTHER_DOMAINS}) u @@ -174,9 +174,8 @@ class URLDatabase: results = [] for host, urls in agg_results: - # There seems to be a bug in psql where we can get things we didn't ask for... - if host in domain_sample: - results += urls + results += urls + logger.info(f"Got {len(results)} top domain results") with self.connection.cursor() as cursor: diff --git a/mwmbl/settings.py b/mwmbl/settings.py index a450bc3..6dc5553 100644 --- a/mwmbl/settings.py +++ b/mwmbl/settings.py @@ -30,7 +30,7 @@ EXTRA_LINK_MULTIPLIER = 0.001 UNKNOWN_DOMAIN_MULTIPLIER = 0.001 EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'} CORE_DOMAINS = { - 'github.com,', + 'github.com', 'en.wikipedia.org', 'stackoverflow.com', 'docs.google.com',