reconciler.go 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. package ca
  2. import (
  3. "bytes"
  4. "context"
  5. "fmt"
  6. "reflect"
  7. "sync"
  8. "time"
  9. "github.com/cloudflare/cfssl/helpers"
  10. "github.com/docker/swarmkit/api"
  11. "github.com/docker/swarmkit/api/equality"
  12. "github.com/docker/swarmkit/log"
  13. "github.com/docker/swarmkit/manager/state/store"
  14. "github.com/pkg/errors"
  15. )
// IssuanceStateRotateMaxBatchSize is the maximum number of nodes that the root rotation reconciler will
// mark for certificate rotation in any single batch update.
const IssuanceStateRotateMaxBatchSize = 30
  18. func hasIssuer(n *api.Node, info *IssuerInfo) bool {
  19. if n.Description == nil || n.Description.TLSInfo == nil {
  20. return false
  21. }
  22. return bytes.Equal(info.Subject, n.Description.TLSInfo.CertIssuerSubject) && bytes.Equal(info.PublicKey, n.Description.TLSInfo.CertIssuerPublicKey)
  23. }
// errRootRotationChanged is returned by finishRootRotation when the root CA stored on the cluster no
// longer matches the root CA that the caller expected to be finishing.
var errRootRotationChanged = errors.New("target root rotation has changed")
// rootRotationReconciler keeps track of all the nodes in the store so that we can determine which ones need reconciliation when nodes are updated
// or the root CA is updated.  This is meant to be used with watches on nodes and the cluster, and provides functions to be called when the
// cluster's RootCA has changed and when a node is added, updated, or removed.
type rootRotationReconciler struct {
	// mu is held while currentRootCA, currentIssuer, unconvergedNodes, and cancel are read or written.
	mu sync.Mutex
	// clusterID identifies the cluster object that finishRootRotation loads and updates.
	clusterID string
	// batchUpdateInterval is how long the reconciler loop waits between iterations.
	batchUpdateInterval time.Duration
	// ctx is the parent of every reconciler loop context, and is the context used for logging.
	ctx context.Context
	// store is the store from which nodes are listed and to which node and cluster updates are written.
	store *store.MemoryStore
	// currentRootCA is the root CA most recently accepted by UpdateRootCA.
	currentRootCA *api.RootCA
	// currentIssuer is the issuer that node certificates are compared against.
	currentIssuer IssuerInfo
	// unconvergedNodes holds, keyed by node ID, the nodes whose TLS info does not name currentIssuer.
	unconvergedNodes map[string]*api.Node
	// wg tracks the reconciler loop goroutine so that UpdateRootCA can wait for a previous loop to exit.
	wg sync.WaitGroup
	// cancel cancels the context of the most recently started reconciler loop; it is checked for nil before use.
	cancel func()
}
  40. // IssuerFromAPIRootCA returns the desired issuer given an API root CA object
  41. func IssuerFromAPIRootCA(rootCA *api.RootCA) (*IssuerInfo, error) {
  42. wantedIssuer := rootCA.CACert
  43. if rootCA.RootRotation != nil {
  44. wantedIssuer = rootCA.RootRotation.CACert
  45. }
  46. issuerCerts, err := helpers.ParseCertificatesPEM(wantedIssuer)
  47. if err != nil {
  48. return nil, errors.Wrap(err, "invalid certificate in cluster root CA object")
  49. }
  50. if len(issuerCerts) == 0 {
  51. return nil, errors.New("invalid certificate in cluster root CA object")
  52. }
  53. return &IssuerInfo{
  54. Subject: issuerCerts[0].RawSubject,
  55. PublicKey: issuerCerts[0].RawSubjectPublicKeyInfo,
  56. }, nil
  57. }
  58. // assumption: UpdateRootCA will never be called with a `nil` root CA because the caller will be acting in response to
  59. // a store update event
  60. func (r *rootRotationReconciler) UpdateRootCA(newRootCA *api.RootCA) {
  61. issuerInfo, err := IssuerFromAPIRootCA(newRootCA)
  62. if err != nil {
  63. log.G(r.ctx).WithError(err).Error("unable to update process the current root CA")
  64. return
  65. }
  66. var (
  67. shouldStartNewLoop, waitForPrevLoop bool
  68. loopCtx context.Context
  69. )
  70. r.mu.Lock()
  71. defer func() {
  72. r.mu.Unlock()
  73. if shouldStartNewLoop {
  74. if waitForPrevLoop {
  75. r.wg.Wait()
  76. }
  77. r.wg.Add(1)
  78. go r.runReconcilerLoop(loopCtx, newRootCA)
  79. }
  80. }()
  81. // check if the issuer has changed, first
  82. if reflect.DeepEqual(&r.currentIssuer, issuerInfo) {
  83. r.currentRootCA = newRootCA
  84. return
  85. }
  86. // If the issuer has changed, iterate through all the nodes to figure out which ones need rotation
  87. if newRootCA.RootRotation != nil {
  88. var nodes []*api.Node
  89. r.store.View(func(tx store.ReadTx) {
  90. nodes, err = store.FindNodes(tx, store.ByMembership(api.NodeMembershipAccepted))
  91. })
  92. if err != nil {
  93. log.G(r.ctx).WithError(err).Error("unable to list nodes, so unable to process the current root CA")
  94. return
  95. }
  96. // from here on out, there will be no more errors that cause us to have to abandon updating the Root CA,
  97. // so we can start making changes to r's fields
  98. r.unconvergedNodes = make(map[string]*api.Node)
  99. for _, n := range nodes {
  100. if !hasIssuer(n, issuerInfo) {
  101. r.unconvergedNodes[n.ID] = n
  102. }
  103. }
  104. shouldStartNewLoop = true
  105. if r.cancel != nil { // there's already a loop going, so cancel it
  106. r.cancel()
  107. waitForPrevLoop = true
  108. }
  109. loopCtx, r.cancel = context.WithCancel(r.ctx)
  110. } else {
  111. r.unconvergedNodes = nil
  112. }
  113. r.currentRootCA = newRootCA
  114. r.currentIssuer = *issuerInfo
  115. }
  116. // assumption: UpdateNode will never be called with a `nil` node because the caller will be acting in response to
  117. // a store update event
  118. func (r *rootRotationReconciler) UpdateNode(node *api.Node) {
  119. r.mu.Lock()
  120. defer r.mu.Unlock()
  121. // if we're not in the middle of a root rotation, or if this node does not have membership, ignore it
  122. if r.currentRootCA == nil || r.currentRootCA.RootRotation == nil || node.Spec.Membership != api.NodeMembershipAccepted {
  123. return
  124. }
  125. if hasIssuer(node, &r.currentIssuer) {
  126. delete(r.unconvergedNodes, node.ID)
  127. } else {
  128. r.unconvergedNodes[node.ID] = node
  129. }
  130. }
  131. // assumption: DeleteNode will never be called with a `nil` node because the caller will be acting in response to
  132. // a store update event
  133. func (r *rootRotationReconciler) DeleteNode(node *api.Node) {
  134. r.mu.Lock()
  135. delete(r.unconvergedNodes, node.ID)
  136. r.mu.Unlock()
  137. }
  138. func (r *rootRotationReconciler) runReconcilerLoop(ctx context.Context, loopRootCA *api.RootCA) {
  139. defer r.wg.Done()
  140. for {
  141. r.mu.Lock()
  142. if len(r.unconvergedNodes) == 0 {
  143. r.mu.Unlock()
  144. err := r.store.Update(func(tx store.Tx) error {
  145. return r.finishRootRotation(tx, loopRootCA)
  146. })
  147. if err == nil {
  148. log.G(r.ctx).Info("completed root rotation")
  149. return
  150. }
  151. log.G(r.ctx).WithError(err).Error("could not complete root rotation")
  152. if err == errRootRotationChanged {
  153. // if the root rotation has changed, this loop will be cancelled anyway, so may as well abort early
  154. return
  155. }
  156. } else {
  157. var toUpdate []*api.Node
  158. for _, n := range r.unconvergedNodes {
  159. iState := n.Certificate.Status.State
  160. if iState != api.IssuanceStateRenew && iState != api.IssuanceStatePending && iState != api.IssuanceStateRotate {
  161. n = n.Copy()
  162. n.Certificate.Status.State = api.IssuanceStateRotate
  163. toUpdate = append(toUpdate, n)
  164. if len(toUpdate) >= IssuanceStateRotateMaxBatchSize {
  165. break
  166. }
  167. }
  168. }
  169. r.mu.Unlock()
  170. if err := r.batchUpdateNodes(toUpdate); err != nil {
  171. log.G(r.ctx).WithError(err).Errorf("store error when trying to batch update %d nodes to request certificate rotation", len(toUpdate))
  172. }
  173. }
  174. select {
  175. case <-ctx.Done():
  176. return
  177. case <-time.After(r.batchUpdateInterval):
  178. }
  179. }
  180. }
  181. // This function assumes that the expected root CA has root rotation. This is intended to be used by
  182. // `reconcileNodeRootsAndCerts`, which uses the root CA from the `lastSeenClusterRootCA`, and checks
  183. // that it has a root rotation before calling this function.
  184. func (r *rootRotationReconciler) finishRootRotation(tx store.Tx, expectedRootCA *api.RootCA) error {
  185. cluster := store.GetCluster(tx, r.clusterID)
  186. if cluster == nil {
  187. return fmt.Errorf("unable to get cluster %s", r.clusterID)
  188. }
  189. // If the RootCA object has changed (because another root rotation was started or because some other node
  190. // had finished the root rotation), we cannot finish the root rotation that we were working on.
  191. if !equality.RootCAEqualStable(expectedRootCA, &cluster.RootCA) {
  192. return errRootRotationChanged
  193. }
  194. var signerCert []byte
  195. if len(cluster.RootCA.RootRotation.CAKey) > 0 {
  196. signerCert = cluster.RootCA.RootRotation.CACert
  197. }
  198. // we don't actually have to parse out the default node expiration from the cluster - we are just using
  199. // the ca.RootCA object to generate new tokens and the digest
  200. updatedRootCA, err := NewRootCA(cluster.RootCA.RootRotation.CACert, signerCert, cluster.RootCA.RootRotation.CAKey,
  201. DefaultNodeCertExpiration, nil)
  202. if err != nil {
  203. return errors.Wrap(err, "invalid cluster root rotation object")
  204. }
  205. cluster.RootCA = api.RootCA{
  206. CACert: cluster.RootCA.RootRotation.CACert,
  207. CAKey: cluster.RootCA.RootRotation.CAKey,
  208. CACertHash: updatedRootCA.Digest.String(),
  209. JoinTokens: api.JoinTokens{
  210. Worker: GenerateJoinToken(&updatedRootCA),
  211. Manager: GenerateJoinToken(&updatedRootCA),
  212. },
  213. LastForcedRotation: cluster.RootCA.LastForcedRotation,
  214. }
  215. return store.UpdateCluster(tx, cluster)
  216. }
  217. func (r *rootRotationReconciler) batchUpdateNodes(toUpdate []*api.Node) error {
  218. if len(toUpdate) == 0 {
  219. return nil
  220. }
  221. err := r.store.Batch(func(batch *store.Batch) error {
  222. // Directly update the nodes rather than get + update, and ignore version errors. Since
  223. // `rootRotationReconciler` should be hooked up to all node update/delete/create events, we should have
  224. // close to the latest versions of all the nodes. If not, the node will updated later and the
  225. // next batch of updates should catch it.
  226. for _, n := range toUpdate {
  227. if err := batch.Update(func(tx store.Tx) error {
  228. return store.UpdateNode(tx, n)
  229. }); err != nil && err != store.ErrSequenceConflict {
  230. log.G(r.ctx).WithError(err).Errorf("unable to update node %s to request a certificate rotation", n.ID)
  231. }
  232. }
  233. return nil
  234. })
  235. return err
  236. }