
Get results from other domains

Daoud Clarke · 2 years ago
commit d9cd3c585b
1 changed file with 21 additions and 5 deletions

mwmbl/crawler/urls.py (+21 −5)

@@ -15,7 +15,9 @@ from mwmbl.utils import batch
 
 REASSIGN_MIN_HOURS = 5
 BATCH_SIZE = 100
-MAX_TOP_DOMAIN_URLS = 10
+MAX_URLS_PER_TOP_DOMAIN = 100
+MAX_TOP_DOMAINS = 500
+MAX_OTHER_DOMAINS = 50000
 
 
 logger = getLogger(__name__)
@@ -134,7 +136,7 @@ class URLDatabase:
         work_mem = "SET work_mem = '512MB'"
 
         select_sql = f"""
-            SELECT (array_agg(url order by score desc))[:100] FROM url_and_hosts
+            SELECT (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
             WHERE host IN %(domains)s
                AND status IN ({URLStatus.NEW.value}) OR (
                    status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
@@ -142,6 +144,14 @@ class URLDatabase:
             GROUP BY host
         """
 
+        others_sql = f"""
+            SELECT DISTINCT ON (host) url FROM (
+                SELECT * FROM url_and_hosts
+                WHERE status=0
+                ORDER BY score DESC LIMIT {MAX_OTHER_DOMAINS}) u
+            ORDER BY host
+        """
+
         update_sql = f"""
          UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
          WHERE url IN %(urls)s
@@ -149,8 +159,7 @@ class URLDatabase:
 
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
-        domains = tuple(random.sample(DOMAINS.keys(), 500))
-        # domains = tuple(DOMAINS.keys())
+        domains = tuple(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS))
         logger.info(f"Getting URLs for domains {domains}")
         with self.connection.cursor() as cursor:
             cursor.execute(work_mem)
@@ -161,11 +170,18 @@ class URLDatabase:
         results = []
         for result in agg_results:
             results += result[0]
+        logger.info(f"Got {len(results)} top domain results")
+
+        with self.connection.cursor() as cursor:
+            cursor.execute(others_sql)
+            other_results = cursor.fetchall()
+            other_results_list = [result[0] for result in other_results]
+            logger.info(f"Got {len(other_results_list)} results from all domains")
+            results += other_results_list
 
         with self.connection.cursor() as cursor:
             cursor.execute(update_sql,
                            {'now': now, 'urls': tuple(results)})
-            print("Results", agg_results)
 
         total_time_seconds = (datetime.now() - start).total_seconds()
         logger.info(f"Got {len(results)} in {total_time_seconds} seconds")