Kaynağa Gözat

fix(api): reduce watchdog sensitivity

Peter Thomassen 1 yıl önce
ebeveyn
işleme
0cc1b4dfbb

+ 0 - 1
api/api/settings.py

@@ -230,7 +230,6 @@ CAPTCHA_VALIDITY_PERIOD = timedelta(hours=24)
 
 # Watchdog
 WATCHDOG_SECONDARIES = os.environ.get("DESECSTACK_WATCHDOG_SECONDARIES", "").split()
-WATCHDOG_WINDOW_SEC = 600
 
 # PCH
 PCH_API = os.environ.get("DESECSTACK_API_PCH_API", "")

+ 1 - 1
api/cronhook/crontab

@@ -1,3 +1,3 @@
 */5 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py chores >> /var/log/cron.log 2>&1
-*/5 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py check-secondaries >> /var/log/cron.log 2>&1
+*/15 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py check-secondaries >> /var/log/cron.log 2>&1
 7 11 * * * /usr/local/bin/python3 -u /usr/src/app/manage.py scavenge-unused >> /var/log/cron.log 2>&1

+ 7 - 4
api/desecapi/management/commands/check-secondaries.py

@@ -47,13 +47,13 @@ class Command(BaseCommand):
         parser.add_argument(
             "--delay",
             type=int,
-            default=120,
+            default=300,
             help="Delay SOA checks to allow pending AXFRs to finish.",
         )
         parser.add_argument(
             "--window",
             type=int,
-            default=settings.WATCHDOG_WINDOW_SEC,
+            default=1200,  # Should be sum of crontab interval and delay option (see above)
             help="Check domains that were published no longer than this many seconds ago.",
         )
 
@@ -129,10 +129,13 @@ class Command(BaseCommand):
         self.report(outdated_secondaries, output, timeouts)
 
     def report(self, outdated_secondaries, output, timeouts):
-        if not outdated_secondaries and not timeouts:
+        # Do not report when timeouts occur, unless there's also replication out-of-sync somewhere.
+        # Helps catch long-term unreachability, where subject will show timeouts for any emails.
+        # Individual node downtimes should be tracked by external monitoring.
+        if not outdated_secondaries:
             return
 
-        subject = f'{timeouts and "CRITICAL ALERT" or "ALERT"} {len(outdated_secondaries)} secondaries out of sync'
+        subject = f"{len(timeouts)} timeouts, {len(outdated_secondaries)} secondaries out of sync"
         message = ""
 
         if timeouts: