From 62ba9ddc7ec5441ea36fdb47363196e2706a1e94 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sat, 30 Jul 2022 23:10:37 +0100 Subject: [PATCH] Use a randomised timeout for getting a new batch --- mwmbl/crawler/urls.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py index e4eddfd..eee2ce4 100644 --- a/mwmbl/crawler/urls.py +++ b/mwmbl/crawler/urls.py @@ -2,6 +2,7 @@ Database storing info on URLs """ import os +import random from dataclasses import dataclass from datetime import datetime, timedelta from enum import Enum @@ -14,7 +15,7 @@ from psycopg2.extras import execute_values # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned from mwmbl.database import Database -REASSIGN_MIN_HOURS = 1 +REASSIGN_MIN_HOURS = 5 BATCH_SIZE = 100 @@ -120,6 +121,9 @@ class URLDatabase: execute_values(cursor, insert_sql, data) def get_new_batch_for_user(self, user_id_hash: str): + timeout_seconds = random.randint(10, 20) + timeout_sql = f"SET statement_timeout = '{timeout_seconds}s'" + sql = f""" UPDATE urls SET status = {URLStatus.ASSIGNED.value}, user_id_hash = %(user_id_hash)s, updated = %(now)s WHERE url IN ( @@ -137,6 +141,7 @@ class URLDatabase: now = datetime.utcnow() min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS) with self.connection.cursor() as cursor: + cursor.execute(timeout_sql) cursor.execute(sql, {'user_id_hash': user_id_hash, 'min_updated_date': min_updated_date, 'now': now}) results = cursor.fetchall()