Forráskód Böngészése

WIP: improve method of getting URLs for crawling

Daoud Clarke 2 éve
szülő
commit
ea16e7b5cd
1 módosított fájl, 12 hozzáadás és 13 törlés
  1. 12 13
      mwmbl/crawler/urls.py

+ 12 - 13
mwmbl/crawler/urls.py

@@ -130,20 +130,17 @@ class URLDatabase:
         start = datetime.utcnow()
         logger.info("Getting URLs for crawling")
 
-        work_mem = "SET work_mem = '512MB'"
-
         sql = f"""
         UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
         WHERE url IN (
-          SELECT url FROM (
-              SELECT url, host, score, rank() OVER (PARTITION BY host ORDER BY score DESC) AS pos
-              FROM url_and_hosts
-              WHERE host IN %(domains)s
+          SELECT url FROM url_and_hosts
+              WHERE host = %(domain)s
               AND status IN ({URLStatus.NEW.value}) OR (
                   status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
               )
-          ) u
-          WHERE pos < {MAX_TOP_DOMAIN_URLS}
+              ORDER BY score DESC
+              LIMIT {MAX_TOP_DOMAIN_URLS}
+          )
         )
         RETURNING url
         """
@@ -151,13 +148,15 @@ class URLDatabase:
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
         domains = tuple(DOMAINS.keys())
-        with self.connection.cursor() as cursor:
-            cursor.execute(work_mem)
-            cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls, 'domains': domains})
-            results = cursor.fetchall()
+
+        results = []
+        for domain in domains:
+            with self.connection.cursor() as cursor:
+                cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'domain': domain})
+                domain_results = cursor.fetchall()
+                results += [result[0] for result in domain_results]
 
         total_time_seconds = (datetime.now() - start).total_seconds()
-        results = [result[0] for result in results]
         logger.info(f"Got {len(results)} in {total_time_seconds} seconds")
 
         return results