Use a randomised timeout for getting a new batch

This commit is contained in:
Daoud Clarke 2022-07-30 23:10:37 +01:00
parent a54e093cf1
commit 62ba9ddc7e

View file

@ -2,6 +2,7 @@
Database storing info on URLs
"""
import os
import random
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
@ -14,7 +15,7 @@ from psycopg2.extras import execute_values
# A client has REASSIGN_MIN_HOURS hours to crawl a URL that has been assigned to them, or it will be reassigned
from mwmbl.database import Database
REASSIGN_MIN_HOURS = 1
REASSIGN_MIN_HOURS = 5
BATCH_SIZE = 100
@ -120,6 +121,9 @@ class URLDatabase:
execute_values(cursor, insert_sql, data)
def get_new_batch_for_user(self, user_id_hash: str):
timeout_seconds = random.randint(10, 20)
timeout_sql = f"SET statement_timeout = '{timeout_seconds}s'"
sql = f"""
UPDATE urls SET status = {URLStatus.ASSIGNED.value}, user_id_hash = %(user_id_hash)s, updated = %(now)s
WHERE url IN (
@ -137,6 +141,7 @@ class URLDatabase:
now = datetime.utcnow()
min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
with self.connection.cursor() as cursor:
cursor.execute(timeout_sql)
cursor.execute(sql, {'user_id_hash': user_id_hash, 'min_updated_date': min_updated_date, 'now': now})
results = cursor.fetchall()