Fix some bugs in URL fetching query

This commit is contained in:
Daoud Clarke 2023-01-02 20:51:23 +00:00
parent a50f1d8ae3
commit 7bd12c1ead
2 changed files with 6 additions and 7 deletions

View file

@ -139,16 +139,16 @@ class URLDatabase:
select_sql = f"""
SELECT host, (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
WHERE host IN %(domains)s
AND status IN ({URLStatus.NEW.value}) OR (
AND (status = {URLStatus.NEW.value} OR (
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
)
))
GROUP BY host
"""
others_sql = f"""
SELECT DISTINCT ON (host) url FROM (
SELECT * FROM url_and_hosts
WHERE status IN ({URLStatus.NEW.value}) OR (
WHERE status = {URLStatus.NEW.value} OR (
status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
)
ORDER BY score DESC LIMIT {MAX_OTHER_DOMAINS}) u
@ -174,9 +174,8 @@ class URLDatabase:
results = []
for host, urls in agg_results:
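# Hosts outside domain_sample used to show up in these results. That appears to have come from the
# unparenthesised OR in the WHERE clause (AND binds tighter than OR in SQL), not from a bug in psql.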
if host in domain_sample:
results += urls
results += urls
logger.info(f"Got {len(results)} top domain results")
with self.connection.cursor() as cursor:

View file

@ -30,7 +30,7 @@ EXTRA_LINK_MULTIPLIER = 0.001
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
CORE_DOMAINS = {
'github.com,',
'github.com',
'en.wikipedia.org',
'stackoverflow.com',
'docs.google.com',