Bläddra i källkod

Update URL status

Daoud Clarke 2 år sedan
förälder
incheckning
77f08d8f0a
1 ändrade filer med 23 tillägg och 16 borttagningar
  1. 23 16
      mwmbl/crawler/urls.py

+ 23 - 16
mwmbl/crawler/urls.py

@@ -133,33 +133,40 @@ class URLDatabase:
 
         work_mem = "SET work_mem = '512MB'"
 
-        sql = f"""
-         UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
-         WHERE url IN (
-           SELECT url FROM (
-               SELECT url, host, score, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos
-               FROM url_and_hosts
-               WHERE host IN %(domains)s
+        select_sql = f"""
+            SELECT (array_agg(url order by score desc))[:100] FROM url_and_hosts
+            WHERE host IN %(domains)s
                AND status IN ({URLStatus.NEW.value}) OR (
                    status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
                )
-           ) u
-           WHERE pos < {MAX_TOP_DOMAIN_URLS}
-         )
-         RETURNING url
+            GROUP BY host
+        """
+
+        update_sql = f"""
+         UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
+         WHERE url IN %(urls)s
          """
 
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
-        domains = tuple(random.sample(DOMAINS.keys(), 100))
+        domains = tuple(random.sample(DOMAINS.keys(), 500))
+        # domains = tuple(DOMAINS.keys())
         logger.info(f"Getting URLs for domains {domains}")
         with self.connection.cursor() as cursor:
             cursor.execute(work_mem)
-            cursor.execute(sql,
-                           {'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls, 'domains': domains})
-            results = cursor.fetchall()
+            cursor.execute(select_sql,
+                           {'min_updated_date': min_updated_date, 'domains': domains})
+            agg_results = cursor.fetchall()
+
+        results = []
+        for result in agg_results:
+            results += result[0]
+
+        with self.connection.cursor() as cursor:
+            cursor.execute(update_sql,
+                           {'now': now, 'urls': tuple(results)})
+            print("Results", agg_results)
 
-        results = [result[0] for result in results]
         total_time_seconds = (datetime.now() - start).total_seconds()
         logger.info(f"Got {len(results)} in {total_time_seconds} seconds")