Pārlūkot izejas kodu

Check the domain is correct, potential bug in psql

Daoud Clarke 2 gadi atpakaļ
vecāks
revīzija
6edf48693b
1 mainītis faili ar 8 papildinājumiem un 5 dzēšanām
  1. 8 5
      mwmbl/crawler/urls.py

+ 8 - 5
mwmbl/crawler/urls.py

@@ -136,7 +136,7 @@ class URLDatabase:
         work_mem = "SET work_mem = '512MB'"
 
         select_sql = f"""
-            SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
+            SELECT host, (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
             WHERE host IN %(domains)s
                 AND status IN ({URLStatus.NEW.value}) OR (
                     status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
@@ -161,7 +161,8 @@ class URLDatabase:
 
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
-        domains = tuple(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
+        domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
+        domains = tuple(domain_sample)
         logger.info(f"Getting URLs for domains {domains}")
         with self.connection.cursor() as cursor:
             cursor.execute(work_mem)
@@ -171,12 +172,14 @@ class URLDatabase:
             logger.info(f"Agg results: {agg_results}")
 
         results = []
-        for result in agg_results:
-            results += result[0]
+        for host, urls in agg_results:
+            # There seems to be a bug in psql where we can get things we didn't ask for...
+            if host in domain_sample:
+                results += urls
         logger.info(f"Got {len(results)} top domain results")
 
         with self.connection.cursor() as cursor:
-            cursor.execute(others_sql)
+            cursor.execute(others_sql, {'min_updated_date': min_updated_date})
             other_results = cursor.fetchall()
             other_results_list = [result[0] for result in other_results]
             logger.info(f"Got {len(other_results_list)} results from all domains")