check-secondaries.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. from datetime import timedelta
  2. from socket import gethostbyname
  3. from time import sleep
  4. from django.conf import settings
  5. from django.core.mail import get_connection, mail_admins
  6. from django.core.management import BaseCommand
  7. from django.utils import timezone
  8. import dns.exception, dns.message, dns.query, dns.rdatatype
  9. from desecapi import pdns
  10. from desecapi.models import Domain
  11. def query_serial(zone, server):
  12. """
  13. Checks a zone's serial on a server.
  14. :return: serial if received; None if the server did not know; False on error
  15. """
  16. query = dns.message.make_query(zone, "SOA")
  17. try:
  18. response = dns.query.tcp(query, server, timeout=5)
  19. except dns.exception.Timeout:
  20. return False
  21. for rrset in response.answer:
  22. if rrset.rdtype == dns.rdatatype.SOA:
  23. return int(rrset[0].serial)
  24. return None
  25. class Command(BaseCommand):
  26. help = "Check secondaries for consistency with nsmaster."
  27. def __init__(self, *args, **kwargs):
  28. self.servers = {
  29. gethostbyname(server): server for server in settings.WATCHDOG_SECONDARIES
  30. }
  31. super().__init__(*args, **kwargs)
  32. def add_arguments(self, parser):
  33. parser.add_argument(
  34. "domain-name",
  35. nargs="*",
  36. help="Domain name to check. If omitted, will check all recently published domains.",
  37. )
  38. parser.add_argument(
  39. "--delay",
  40. type=int,
  41. default=300,
  42. help="Delay SOA checks to allow pending AXFRs to finish.",
  43. )
  44. parser.add_argument(
  45. "--window",
  46. type=int,
  47. default=1200, # Should be sum of crontab interval and delay option (see above)
  48. help="Check domains that were published no longer than this many seconds ago.",
  49. )
  50. def find_outdated_servers(self, zone, local_serial):
  51. """
  52. Returns a dict, the key being the outdated secondary name, and the value being the node's current zone serial.
  53. """
  54. outdated = {}
  55. for server in self.servers:
  56. remote_serial = query_serial(zone, server)
  57. if not remote_serial or remote_serial < local_serial:
  58. outdated[self.servers[server]] = remote_serial
  59. return outdated
  60. def handle(self, *args, **options):
  61. threshold = timezone.now() - timedelta(seconds=options["window"])
  62. recent_domain_names = Domain.objects.filter(
  63. published__gt=threshold
  64. ).values_list("name", flat=True)
  65. serials = {
  66. zone: s
  67. for zone, s in pdns.get_serials().items()
  68. if zone.rstrip(".") in recent_domain_names
  69. }
  70. if options["domain-name"]:
  71. serials = {
  72. zone: serial
  73. for zone, serial in serials.items()
  74. if zone.rstrip(".") in options["domain-name"]
  75. }
  76. print(
  77. "Sleeping for {} seconds before checking {} domains ...".format(
  78. options["delay"], len(serials)
  79. )
  80. )
  81. sleep(options["delay"])
  82. outdated_zone_count = 0
  83. outdated_secondaries = set()
  84. output = []
  85. timeouts = {}
  86. for zone, local_serial in serials.items():
  87. outdated_serials = self.find_outdated_servers(zone, local_serial)
  88. for server, serial in outdated_serials.items():
  89. if serial is False:
  90. timeouts.setdefault(server, [])
  91. timeouts[server].append(zone)
  92. outdated_serials = {
  93. k: serial
  94. for k, serial in outdated_serials.items()
  95. if serial is not False
  96. }
  97. if outdated_serials:
  98. outdated_secondaries.update(outdated_serials.keys())
  99. output.append(
  100. f"{zone} ({local_serial}) is outdated on {outdated_serials}"
  101. )
  102. print(output[-1])
  103. outdated_zone_count += 1
  104. else:
  105. print(f"{zone} ok")
  106. output.append(
  107. f"Checked {len(serials)} domains, {outdated_zone_count} were outdated."
  108. )
  109. print(output[-1])
  110. self.report(outdated_secondaries, output, timeouts)
  111. def report(self, outdated_secondaries, output, timeouts):
  112. # Do not report when timeouts occur, unless there's also replication out-of-sync somwhere.
  113. # Helps catch long-term unreachability, where subject will show timeouts for any emails.
  114. # Individual node downtimes should be tracked by external monitoring.
  115. if not outdated_secondaries:
  116. return
  117. subject = f"{len(timeouts)} timeouts, {len(outdated_secondaries)} secondaries out of sync"
  118. message = ""
  119. if timeouts:
  120. message += f"The following servers had timeouts:\n\n{timeouts}\n\n"
  121. if outdated_secondaries:
  122. message += f"The following {len(outdated_secondaries)} secondaries are out of sync:\n"
  123. for outdated_secondary in outdated_secondaries:
  124. message += f"* {outdated_secondary}\n"
  125. message += "\n"
  126. message += f"Current secondary IPs: {self.servers}\n"
  127. message += "\n".join(output)
  128. mail_admins(
  129. subject,
  130. message,
  131. connection=get_connection("django.core.mail.backends.smtp.EmailBackend"),
  132. )