123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259 |
- package ca
- import (
- "bytes"
- "context"
- "fmt"
- "reflect"
- "sync"
- "time"
- "github.com/cloudflare/cfssl/helpers"
- "github.com/docker/swarmkit/api"
- "github.com/docker/swarmkit/api/equality"
- "github.com/docker/swarmkit/log"
- "github.com/docker/swarmkit/manager/state/store"
- "github.com/pkg/errors"
- )
// IssuanceStateRotateMaxBatchSize caps how many nodes are asked to rotate
// their certificates in a single batch update.
const (
	IssuanceStateRotateMaxBatchSize = 30
)
- func hasIssuer(n *api.Node, info *IssuerInfo) bool {
- if n.Description == nil || n.Description.TLSInfo == nil {
- return false
- }
- return bytes.Equal(info.Subject, n.Description.TLSInfo.CertIssuerSubject) && bytes.Equal(info.PublicKey, n.Description.TLSInfo.CertIssuerPublicKey)
- }
// errRootRotationChanged is the sentinel returned by finishRootRotation when
// the cluster's root CA no longer matches the one the reconciler loop was
// started for; runReconcilerLoop compares against it to stop early.
var (
	errRootRotationChanged = errors.New("target root rotation has changed")
)
- // rootRotationReconciler keeps track of all the nodes in the store so that we can determine which ones need reconciliation when nodes are updated
- // or the root CA is updated. This is meant to be used with watches on nodes and the cluster, and provides functions to be called when the
- // cluster's RootCA has changed and when a node is added, updated, or removed.
- type rootRotationReconciler struct {
- mu sync.Mutex
- clusterID string
- batchUpdateInterval time.Duration
- ctx context.Context
- store *store.MemoryStore
- currentRootCA *api.RootCA
- currentIssuer IssuerInfo
- unconvergedNodes map[string]*api.Node
- wg sync.WaitGroup
- cancel func()
- }
- // IssuerFromAPIRootCA returns the desired issuer given an API root CA object
- func IssuerFromAPIRootCA(rootCA *api.RootCA) (*IssuerInfo, error) {
- wantedIssuer := rootCA.CACert
- if rootCA.RootRotation != nil {
- wantedIssuer = rootCA.RootRotation.CACert
- }
- issuerCerts, err := helpers.ParseCertificatesPEM(wantedIssuer)
- if err != nil {
- return nil, errors.Wrap(err, "invalid certificate in cluster root CA object")
- }
- if len(issuerCerts) == 0 {
- return nil, errors.New("invalid certificate in cluster root CA object")
- }
- return &IssuerInfo{
- Subject: issuerCerts[0].RawSubject,
- PublicKey: issuerCerts[0].RawSubjectPublicKeyInfo,
- }, nil
- }
- // assumption: UpdateRootCA will never be called with a `nil` root CA because the caller will be acting in response to
- // a store update event
- func (r *rootRotationReconciler) UpdateRootCA(newRootCA *api.RootCA) {
- issuerInfo, err := IssuerFromAPIRootCA(newRootCA)
- if err != nil {
- log.G(r.ctx).WithError(err).Error("unable to update process the current root CA")
- return
- }
- var (
- shouldStartNewLoop, waitForPrevLoop bool
- loopCtx context.Context
- )
- r.mu.Lock()
- defer func() {
- r.mu.Unlock()
- if shouldStartNewLoop {
- if waitForPrevLoop {
- r.wg.Wait()
- }
- r.wg.Add(1)
- go r.runReconcilerLoop(loopCtx, newRootCA)
- }
- }()
- // check if the issuer has changed, first
- if reflect.DeepEqual(&r.currentIssuer, issuerInfo) {
- r.currentRootCA = newRootCA
- return
- }
- // If the issuer has changed, iterate through all the nodes to figure out which ones need rotation
- if newRootCA.RootRotation != nil {
- var nodes []*api.Node
- r.store.View(func(tx store.ReadTx) {
- nodes, err = store.FindNodes(tx, store.ByMembership(api.NodeMembershipAccepted))
- })
- if err != nil {
- log.G(r.ctx).WithError(err).Error("unable to list nodes, so unable to process the current root CA")
- return
- }
- // from here on out, there will be no more errors that cause us to have to abandon updating the Root CA,
- // so we can start making changes to r's fields
- r.unconvergedNodes = make(map[string]*api.Node)
- for _, n := range nodes {
- if !hasIssuer(n, issuerInfo) {
- r.unconvergedNodes[n.ID] = n
- }
- }
- shouldStartNewLoop = true
- if r.cancel != nil { // there's already a loop going, so cancel it
- r.cancel()
- waitForPrevLoop = true
- }
- loopCtx, r.cancel = context.WithCancel(r.ctx)
- } else {
- r.unconvergedNodes = nil
- }
- r.currentRootCA = newRootCA
- r.currentIssuer = *issuerInfo
- }
- // assumption: UpdateNode will never be called with a `nil` node because the caller will be acting in response to
- // a store update event
- func (r *rootRotationReconciler) UpdateNode(node *api.Node) {
- r.mu.Lock()
- defer r.mu.Unlock()
- // if we're not in the middle of a root rotation, or if this node does not have membership, ignore it
- if r.currentRootCA == nil || r.currentRootCA.RootRotation == nil || node.Spec.Membership != api.NodeMembershipAccepted {
- return
- }
- if hasIssuer(node, &r.currentIssuer) {
- delete(r.unconvergedNodes, node.ID)
- } else {
- r.unconvergedNodes[node.ID] = node
- }
- }
- // assumption: DeleteNode will never be called with a `nil` node because the caller will be acting in response to
- // a store update event
- func (r *rootRotationReconciler) DeleteNode(node *api.Node) {
- r.mu.Lock()
- delete(r.unconvergedNodes, node.ID)
- r.mu.Unlock()
- }
- func (r *rootRotationReconciler) runReconcilerLoop(ctx context.Context, loopRootCA *api.RootCA) {
- defer r.wg.Done()
- for {
- r.mu.Lock()
- if len(r.unconvergedNodes) == 0 {
- r.mu.Unlock()
- err := r.store.Update(func(tx store.Tx) error {
- return r.finishRootRotation(tx, loopRootCA)
- })
- if err == nil {
- log.G(r.ctx).Info("completed root rotation")
- return
- }
- log.G(r.ctx).WithError(err).Error("could not complete root rotation")
- if err == errRootRotationChanged {
- // if the root rotation has changed, this loop will be cancelled anyway, so may as well abort early
- return
- }
- } else {
- var toUpdate []*api.Node
- for _, n := range r.unconvergedNodes {
- iState := n.Certificate.Status.State
- if iState != api.IssuanceStateRenew && iState != api.IssuanceStatePending && iState != api.IssuanceStateRotate {
- n = n.Copy()
- n.Certificate.Status.State = api.IssuanceStateRotate
- toUpdate = append(toUpdate, n)
- if len(toUpdate) >= IssuanceStateRotateMaxBatchSize {
- break
- }
- }
- }
- r.mu.Unlock()
- if err := r.batchUpdateNodes(toUpdate); err != nil {
- log.G(r.ctx).WithError(err).Errorf("store error when trying to batch update %d nodes to request certificate rotation", len(toUpdate))
- }
- }
- select {
- case <-ctx.Done():
- return
- case <-time.After(r.batchUpdateInterval):
- }
- }
- }
- // This function assumes that the expected root CA has root rotation. This is intended to be used by
- // `reconcileNodeRootsAndCerts`, which uses the root CA from the `lastSeenClusterRootCA`, and checks
- // that it has a root rotation before calling this function.
- func (r *rootRotationReconciler) finishRootRotation(tx store.Tx, expectedRootCA *api.RootCA) error {
- cluster := store.GetCluster(tx, r.clusterID)
- if cluster == nil {
- return fmt.Errorf("unable to get cluster %s", r.clusterID)
- }
- // If the RootCA object has changed (because another root rotation was started or because some other node
- // had finished the root rotation), we cannot finish the root rotation that we were working on.
- if !equality.RootCAEqualStable(expectedRootCA, &cluster.RootCA) {
- return errRootRotationChanged
- }
- var signerCert []byte
- if len(cluster.RootCA.RootRotation.CAKey) > 0 {
- signerCert = cluster.RootCA.RootRotation.CACert
- }
- // we don't actually have to parse out the default node expiration from the cluster - we are just using
- // the ca.RootCA object to generate new tokens and the digest
- updatedRootCA, err := NewRootCA(cluster.RootCA.RootRotation.CACert, signerCert, cluster.RootCA.RootRotation.CAKey,
- DefaultNodeCertExpiration, nil)
- if err != nil {
- return errors.Wrap(err, "invalid cluster root rotation object")
- }
- cluster.RootCA = api.RootCA{
- CACert: cluster.RootCA.RootRotation.CACert,
- CAKey: cluster.RootCA.RootRotation.CAKey,
- CACertHash: updatedRootCA.Digest.String(),
- JoinTokens: api.JoinTokens{
- Worker: GenerateJoinToken(&updatedRootCA),
- Manager: GenerateJoinToken(&updatedRootCA),
- },
- LastForcedRotation: cluster.RootCA.LastForcedRotation,
- }
- return store.UpdateCluster(tx, cluster)
- }
- func (r *rootRotationReconciler) batchUpdateNodes(toUpdate []*api.Node) error {
- if len(toUpdate) == 0 {
- return nil
- }
- err := r.store.Batch(func(batch *store.Batch) error {
- // Directly update the nodes rather than get + update, and ignore version errors. Since
- // `rootRotationReconciler` should be hooked up to all node update/delete/create events, we should have
- // close to the latest versions of all the nodes. If not, the node will updated later and the
- // next batch of updates should catch it.
- for _, n := range toUpdate {
- if err := batch.Update(func(tx store.Tx) error {
- return store.UpdateNode(tx, n)
- }); err != nil && err != store.ErrSequenceConflict {
- log.G(r.ctx).WithError(err).Errorf("unable to update node %s to request a certificate rotation", n.ID)
- }
- }
- return nil
- })
- return err
- }
|