Browse Source

Use a randomised timeout for getting a new batch

Daoud Clarke 3 years ago
parent
commit
62ba9ddc7e
1 changed files with 6 additions and 1 deletions
  1. 6 1
      mwmbl/crawler/urls.py

+ 6 - 1
mwmbl/crawler/urls.py

@@ -2,6 +2,7 @@
 Database storing info on URLs
 """
 import os
+import random
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from enum import Enum
@@ -14,7 +15,7 @@ from psycopg2.extras import execute_values
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.database import Database
 
-REASSIGN_MIN_HOURS = 1
+REASSIGN_MIN_HOURS = 5
 BATCH_SIZE = 100
 
 
@@ -120,6 +121,9 @@ class URLDatabase:
                 execute_values(cursor, insert_sql, data)
 
     def get_new_batch_for_user(self, user_id_hash: str):
+        timeout_seconds = random.randint(10, 20)
+        timeout_sql = f"SET statement_timeout = '{timeout_seconds}s'"
+
         sql = f"""
         UPDATE urls SET status = {URLStatus.ASSIGNED.value}, user_id_hash = %(user_id_hash)s, updated = %(now)s
         WHERE url IN (
@@ -137,6 +141,7 @@ class URLDatabase:
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
         with self.connection.cursor() as cursor:
+            cursor.execute(timeout_sql)
             cursor.execute(sql, {'user_id_hash': user_id_hash, 'min_updated_date': min_updated_date, 'now': now})
             results = cursor.fetchall()