浏览代码

fix(api): reduce watchdog sensitivity

Peter Thomassen 1 年之前
父节点
当前提交
0cc1b4dfbb
共有 3 个文件被更改,包括 8 次插入和 6 次删除
  1. 0 1
      api/api/settings.py
  2. 1 1
      api/cronhook/crontab
  3. 7 4
      api/desecapi/management/commands/check-secondaries.py

+ 0 - 1
api/api/settings.py

@@ -230,7 +230,6 @@ CAPTCHA_VALIDITY_PERIOD = timedelta(hours=24)
 
 
 # Watchdog
 # Watchdog
 WATCHDOG_SECONDARIES = os.environ.get("DESECSTACK_WATCHDOG_SECONDARIES", "").split()
 WATCHDOG_SECONDARIES = os.environ.get("DESECSTACK_WATCHDOG_SECONDARIES", "").split()
-WATCHDOG_WINDOW_SEC = 600
 
 
 # PCH
 # PCH
 PCH_API = os.environ.get("DESECSTACK_API_PCH_API", "")
 PCH_API = os.environ.get("DESECSTACK_API_PCH_API", "")

+ 1 - 1
api/cronhook/crontab

@@ -1,3 +1,3 @@
 */5 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py chores >> /var/log/cron.log 2>&1
 */5 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py chores >> /var/log/cron.log 2>&1
-*/5 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py check-secondaries >> /var/log/cron.log 2>&1
+*/15 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py check-secondaries >> /var/log/cron.log 2>&1
 7 11 * * * /usr/local/bin/python3 -u /usr/src/app/manage.py scavenge-unused >> /var/log/cron.log 2>&1
 7 11 * * * /usr/local/bin/python3 -u /usr/src/app/manage.py scavenge-unused >> /var/log/cron.log 2>&1

+ 7 - 4
api/desecapi/management/commands/check-secondaries.py

@@ -47,13 +47,13 @@ class Command(BaseCommand):
         parser.add_argument(
         parser.add_argument(
             "--delay",
             "--delay",
             type=int,
             type=int,
-            default=120,
+            default=300,
             help="Delay SOA checks to allow pending AXFRs to finish.",
             help="Delay SOA checks to allow pending AXFRs to finish.",
         )
         )
         parser.add_argument(
         parser.add_argument(
             "--window",
             "--window",
             type=int,
             type=int,
-            default=settings.WATCHDOG_WINDOW_SEC,
+            default=1200,  # Should be sum of crontab interval and delay option (see above)
             help="Check domains that were published no longer than this many seconds ago.",
             help="Check domains that were published no longer than this many seconds ago.",
         )
         )
 
 
@@ -129,10 +129,13 @@ class Command(BaseCommand):
         self.report(outdated_secondaries, output, timeouts)
         self.report(outdated_secondaries, output, timeouts)
 
 
     def report(self, outdated_secondaries, output, timeouts):
     def report(self, outdated_secondaries, output, timeouts):
-        if not outdated_secondaries and not timeouts:
+        # Do not report when timeouts occur, unless there's also replication out-of-sync somewhere.
+        # Helps catch long-term unreachability, where subject will show timeouts for any emails.
+        # Individual node downtimes should be tracked by external monitoring.
+        if not outdated_secondaries:
             return
             return
 
 
-        subject = f'{timeouts and "CRITICAL ALERT" or "ALERT"} {len(outdated_secondaries)} secondaries out of sync'
+        subject = f"{len(timeouts)} timeouts, {len(outdated_secondaries)} secondaries out of sync"
         message = ""
         message = ""
 
 
         if timeouts:
         if timeouts: