123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850 |
- package ca
- import (
- "bytes"
- "crypto/subtle"
- "crypto/x509"
- "sync"
- "time"
- "github.com/Sirupsen/logrus"
- "github.com/docker/swarmkit/api"
- "github.com/docker/swarmkit/api/equality"
- "github.com/docker/swarmkit/identity"
- "github.com/docker/swarmkit/log"
- "github.com/docker/swarmkit/manager/state/store"
- gogotypes "github.com/gogo/protobuf/types"
- "github.com/pkg/errors"
- "golang.org/x/net/context"
- "google.golang.org/grpc"
- "google.golang.org/grpc/codes"
- )
- const (
- defaultReconciliationRetryInterval = 10 * time.Second
- defaultRootReconciliationInterval = 3 * time.Second
- )
- // APISecurityConfigUpdater knows how to update a SecurityConfig from an api.Cluster object
- type APISecurityConfigUpdater interface {
- UpdateRootCA(ctx context.Context, cluster *api.Cluster) error
- }
- var _ APISecurityConfigUpdater = &Server{}
- // Server is the CA and NodeCA API gRPC server.
- // TODO(aaronl): At some point we may want to have separate implementations of
- // CA, NodeCA, and other hypothetical future CA services. At the moment,
- // breaking it apart doesn't seem worth it.
- type Server struct {
- mu sync.Mutex
- wg sync.WaitGroup
- ctx context.Context
- cancel func()
- store *store.MemoryStore
- securityConfig *SecurityConfig
- joinTokens *api.JoinTokens
- reconciliationRetryInterval time.Duration
- // pending is a map of nodes with pending certificates issuance or
- // renewal. They are indexed by node ID.
- pending map[string]*api.Node
- // started is a channel which gets closed once the server is running
- // and able to service RPCs.
- started chan struct{}
- // these are cached values to ensure we only update the security config when
- // the cluster root CA and external CAs have changed - the cluster object
- // can change for other reasons, and it would not be necessary to update
- // the security config as a result
- lastSeenClusterRootCA *api.RootCA
- lastSeenExternalCAs []*api.ExternalCA
- secConfigMu sync.Mutex
- // before we update the security config with the new root CA, we need to be able to save the root certs
- rootPaths CertPaths
- // lets us monitor and finish root rotations
- rootReconciler *rootRotationReconciler
- rootReconciliationRetryInterval time.Duration
- }
- // DefaultCAConfig returns the default CA Config, with a default expiration.
- func DefaultCAConfig() api.CAConfig {
- return api.CAConfig{
- NodeCertExpiry: gogotypes.DurationProto(DefaultNodeCertExpiration),
- }
- }
- // NewServer creates a CA API server.
- func NewServer(store *store.MemoryStore, securityConfig *SecurityConfig, rootCAPaths CertPaths) *Server {
- return &Server{
- store: store,
- securityConfig: securityConfig,
- pending: make(map[string]*api.Node),
- started: make(chan struct{}),
- reconciliationRetryInterval: defaultReconciliationRetryInterval,
- rootReconciliationRetryInterval: defaultRootReconciliationInterval,
- rootPaths: rootCAPaths,
- }
- }
- // SetReconciliationRetryInterval changes the time interval between
- // reconciliation attempts. This function must be called before Run.
- func (s *Server) SetReconciliationRetryInterval(reconciliationRetryInterval time.Duration) {
- s.reconciliationRetryInterval = reconciliationRetryInterval
- }
- // SetRootReconciliationInterval changes the time interval between root rotation
- // reconciliation attempts. This function must be called before Run.
- func (s *Server) SetRootReconciliationInterval(interval time.Duration) {
- s.rootReconciliationRetryInterval = interval
- }
- // GetUnlockKey is responsible for returning the current unlock key used for encrypting TLS private keys and
- // other at rest data. Access to this RPC call should only be allowed via mutual TLS from managers.
- func (s *Server) GetUnlockKey(ctx context.Context, request *api.GetUnlockKeyRequest) (*api.GetUnlockKeyResponse, error) {
- // This directly queries the store, rather than storing the unlock key and version on
- // the `Server` object and updating it `updateCluster` is called, because we need this
- // API to return the latest version of the key. Otherwise, there might be a slight delay
- // between when the cluster gets updated, and when this function returns the latest key.
- // This delay is currently unacceptable because this RPC call is the only way, after a
- // cluster update, to get the actual value of the unlock key, and we don't want to return
- // a cached value.
- resp := api.GetUnlockKeyResponse{}
- s.store.View(func(tx store.ReadTx) {
- cluster := store.GetCluster(tx, s.securityConfig.ClientTLSCreds.Organization())
- resp.Version = cluster.Meta.Version
- if cluster.Spec.EncryptionConfig.AutoLockManagers {
- for _, encryptionKey := range cluster.UnlockKeys {
- if encryptionKey.Subsystem == ManagerRole {
- resp.UnlockKey = encryptionKey.Key
- return
- }
- }
- }
- })
- return &resp, nil
- }
- // NodeCertificateStatus returns the current issuance status of an issuance request identified by the nodeID
- func (s *Server) NodeCertificateStatus(ctx context.Context, request *api.NodeCertificateStatusRequest) (*api.NodeCertificateStatusResponse, error) {
- if request.NodeID == "" {
- return nil, grpc.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
- }
- serverCtx, err := s.isRunningLocked()
- if err != nil {
- return nil, err
- }
- var node *api.Node
- event := api.EventUpdateNode{
- Node: &api.Node{ID: request.NodeID},
- Checks: []api.NodeCheckFunc{api.NodeCheckID},
- }
- // Retrieve the current value of the certificate with this token, and create a watcher
- updates, cancel, err := store.ViewAndWatch(
- s.store,
- func(tx store.ReadTx) error {
- node = store.GetNode(tx, request.NodeID)
- return nil
- },
- event,
- )
- if err != nil {
- return nil, err
- }
- defer cancel()
- // This node ID doesn't exist
- if node == nil {
- return nil, grpc.Errorf(codes.NotFound, codes.NotFound.String())
- }
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": node.ID,
- "status": node.Certificate.Status,
- "method": "NodeCertificateStatus",
- })
- // If this certificate has a final state, return it immediately (both pending and renew are transition states)
- if isFinalState(node.Certificate.Status) {
- return &api.NodeCertificateStatusResponse{
- Status: &node.Certificate.Status,
- Certificate: &node.Certificate,
- }, nil
- }
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": node.ID,
- "status": node.Certificate.Status,
- "method": "NodeCertificateStatus",
- }).Debugf("started watching for certificate updates")
- // Certificate is Pending or in an Unknown state, let's wait for changes.
- for {
- select {
- case event := <-updates:
- switch v := event.(type) {
- case api.EventUpdateNode:
- // We got an update on the certificate record. If the status is a final state,
- // return the certificate.
- if isFinalState(v.Node.Certificate.Status) {
- cert := v.Node.Certificate.Copy()
- return &api.NodeCertificateStatusResponse{
- Status: &cert.Status,
- Certificate: cert,
- }, nil
- }
- }
- case <-ctx.Done():
- return nil, ctx.Err()
- case <-serverCtx.Done():
- return nil, s.ctx.Err()
- }
- }
- }
- // IssueNodeCertificate is responsible for gatekeeping both certificate requests from new nodes in the swarm,
- // and authorizing certificate renewals.
- // If a node presented a valid certificate, the corresponding certificate is set in a RENEW state.
- // If a node failed to present a valid certificate, we check for a valid join token and set the
- // role accordingly. A new random node ID is generated, and the corresponding node entry is created.
- // IssueNodeCertificate is the only place where new node entries to raft should be created.
- func (s *Server) IssueNodeCertificate(ctx context.Context, request *api.IssueNodeCertificateRequest) (*api.IssueNodeCertificateResponse, error) {
- // First, let's see if the remote node is presenting a non-empty CSR
- if len(request.CSR) == 0 {
- return nil, grpc.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
- }
- if _, err := s.isRunningLocked(); err != nil {
- return nil, err
- }
- var (
- blacklistedCerts map[string]*api.BlacklistedCertificate
- clusters []*api.Cluster
- err error
- )
- s.store.View(func(readTx store.ReadTx) {
- clusters, err = store.FindClusters(readTx, store.ByName("default"))
- })
- // Not having a cluster object yet means we can't check
- // the blacklist.
- if err == nil && len(clusters) == 1 {
- blacklistedCerts = clusters[0].BlacklistedCertificates
- }
- // Renewing the cert with a local (unix socket) is always valid.
- localNodeInfo := ctx.Value(LocalRequestKey)
- if localNodeInfo != nil {
- nodeInfo, ok := localNodeInfo.(RemoteNodeInfo)
- if ok && nodeInfo.NodeID != "" {
- return s.issueRenewCertificate(ctx, nodeInfo.NodeID, request.CSR)
- }
- }
- // If the remote node is a worker (either forwarded by a manager, or calling directly),
- // issue a renew worker certificate entry with the correct ID
- nodeID, err := AuthorizeForwardedRoleAndOrg(ctx, []string{WorkerRole}, []string{ManagerRole}, s.securityConfig.ClientTLSCreds.Organization(), blacklistedCerts)
- if err == nil {
- return s.issueRenewCertificate(ctx, nodeID, request.CSR)
- }
- // If the remote node is a manager (either forwarded by another manager, or calling directly),
- // issue a renew certificate entry with the correct ID
- nodeID, err = AuthorizeForwardedRoleAndOrg(ctx, []string{ManagerRole}, []string{ManagerRole}, s.securityConfig.ClientTLSCreds.Organization(), blacklistedCerts)
- if err == nil {
- return s.issueRenewCertificate(ctx, nodeID, request.CSR)
- }
- // The remote node didn't successfully present a valid MTLS certificate, let's issue a
- // certificate with a new random ID
- role := api.NodeRole(-1)
- s.mu.Lock()
- if subtle.ConstantTimeCompare([]byte(s.joinTokens.Manager), []byte(request.Token)) == 1 {
- role = api.NodeRoleManager
- } else if subtle.ConstantTimeCompare([]byte(s.joinTokens.Worker), []byte(request.Token)) == 1 {
- role = api.NodeRoleWorker
- }
- s.mu.Unlock()
- if role < 0 {
- return nil, grpc.Errorf(codes.InvalidArgument, "A valid join token is necessary to join this cluster")
- }
- // Max number of collisions of ID or CN to tolerate before giving up
- maxRetries := 3
- // Generate a random ID for this new node
- for i := 0; ; i++ {
- nodeID = identity.NewID()
- // Create a new node
- err := s.store.Update(func(tx store.Tx) error {
- node := &api.Node{
- Role: role,
- ID: nodeID,
- Certificate: api.Certificate{
- CSR: request.CSR,
- CN: nodeID,
- Role: role,
- Status: api.IssuanceStatus{
- State: api.IssuanceStatePending,
- },
- },
- Spec: api.NodeSpec{
- DesiredRole: role,
- Membership: api.NodeMembershipAccepted,
- Availability: request.Availability,
- },
- }
- return store.CreateNode(tx, node)
- })
- if err == nil {
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": nodeID,
- "node.role": role,
- "method": "IssueNodeCertificate",
- }).Debugf("new certificate entry added")
- break
- }
- if err != store.ErrExist {
- return nil, err
- }
- if i == maxRetries {
- return nil, err
- }
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": nodeID,
- "node.role": role,
- "method": "IssueNodeCertificate",
- }).Errorf("randomly generated node ID collided with an existing one - retrying")
- }
- return &api.IssueNodeCertificateResponse{
- NodeID: nodeID,
- NodeMembership: api.NodeMembershipAccepted,
- }, nil
- }
- // issueRenewCertificate receives a nodeID and a CSR and modifies the node's certificate entry with the new CSR
- // and changes the state to RENEW, so it can be picked up and signed by the signing reconciliation loop
- func (s *Server) issueRenewCertificate(ctx context.Context, nodeID string, csr []byte) (*api.IssueNodeCertificateResponse, error) {
- var (
- cert api.Certificate
- node *api.Node
- )
- err := s.store.Update(func(tx store.Tx) error {
- // Attempt to retrieve the node with nodeID
- node = store.GetNode(tx, nodeID)
- if node == nil {
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": nodeID,
- "method": "issueRenewCertificate",
- }).Warnf("node does not exist")
- // If this node doesn't exist, we shouldn't be renewing a certificate for it
- return grpc.Errorf(codes.NotFound, "node %s not found when attempting to renew certificate", nodeID)
- }
- // Create a new Certificate entry for this node with the new CSR and a RENEW state
- cert = api.Certificate{
- CSR: csr,
- CN: node.ID,
- Role: node.Role,
- Status: api.IssuanceStatus{
- State: api.IssuanceStateRenew,
- },
- }
- node.Certificate = cert
- return store.UpdateNode(tx, node)
- })
- if err != nil {
- return nil, err
- }
- log.G(ctx).WithFields(logrus.Fields{
- "cert.cn": cert.CN,
- "cert.role": cert.Role,
- "method": "issueRenewCertificate",
- }).Debugf("node certificate updated")
- return &api.IssueNodeCertificateResponse{
- NodeID: nodeID,
- NodeMembership: node.Spec.Membership,
- }, nil
- }
- // GetRootCACertificate returns the certificate of the Root CA. It is used as a convenience for distributing
- // the root of trust for the swarm. Clients should be using the CA hash to verify if they weren't target to
- // a MiTM. If they fail to do so, node bootstrap works with TOFU semantics.
- func (s *Server) GetRootCACertificate(ctx context.Context, request *api.GetRootCACertificateRequest) (*api.GetRootCACertificateResponse, error) {
- log.G(ctx).WithFields(logrus.Fields{
- "method": "GetRootCACertificate",
- })
- return &api.GetRootCACertificateResponse{
- Certificate: s.securityConfig.RootCA().Certs,
- }, nil
- }
- // Run runs the CA signer main loop.
- // The CA signer can be stopped with cancelling ctx or calling Stop().
- func (s *Server) Run(ctx context.Context) error {
- s.mu.Lock()
- if s.isRunning() {
- s.mu.Unlock()
- return errors.New("CA signer is already running")
- }
- s.wg.Add(1)
- s.ctx, s.cancel = context.WithCancel(log.WithModule(ctx, "ca"))
- ctx = s.ctx
- // we need to set it on the server, because `Server.UpdateRootCA` can be called from outside the Run function
- s.rootReconciler = &rootRotationReconciler{
- ctx: log.WithField(ctx, "method", "(*Server).rootRotationReconciler"),
- clusterID: s.securityConfig.ClientTLSCreds.Organization(),
- store: s.store,
- batchUpdateInterval: s.rootReconciliationRetryInterval,
- }
- rootReconciler := s.rootReconciler
- s.mu.Unlock()
- defer s.wg.Done()
- defer func() {
- s.mu.Lock()
- s.rootReconciler = nil
- s.mu.Unlock()
- }()
- // Retrieve the channels to keep track of changes in the cluster
- // Retrieve all the currently registered nodes
- var nodes []*api.Node
- updates, cancel, err := store.ViewAndWatch(
- s.store,
- func(readTx store.ReadTx) error {
- clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
- if err != nil {
- return err
- }
- if len(clusters) != 1 {
- return errors.New("could not find cluster object")
- }
- s.UpdateRootCA(ctx, clusters[0]) // call once to ensure that the join tokens are always set
- nodes, err = store.FindNodes(readTx, store.All)
- return err
- },
- api.EventCreateNode{},
- api.EventUpdateNode{},
- api.EventDeleteNode{},
- )
- // Do this after updateCluster has been called, so isRunning never
- // returns true without joinTokens being set correctly.
- s.mu.Lock()
- close(s.started)
- s.mu.Unlock()
- if err != nil {
- log.G(ctx).WithFields(logrus.Fields{
- "method": "(*Server).Run",
- }).WithError(err).Errorf("snapshot store view failed")
- return err
- }
- defer cancel()
- // We might have missed some updates if there was a leader election,
- // so let's pick up the slack.
- if err := s.reconcileNodeCertificates(ctx, nodes); err != nil {
- // We don't return here because that means the Run loop would
- // never run. Log an error instead.
- log.G(ctx).WithFields(logrus.Fields{
- "method": "(*Server).Run",
- }).WithError(err).Errorf("error attempting to reconcile certificates")
- }
- ticker := time.NewTicker(s.reconciliationRetryInterval)
- defer ticker.Stop()
- // Watch for new nodes being created, new nodes being updated, and changes
- // to the cluster
- for {
- select {
- case <-ctx.Done():
- return nil
- default:
- }
- select {
- case event := <-updates:
- switch v := event.(type) {
- case api.EventCreateNode:
- s.evaluateAndSignNodeCert(ctx, v.Node)
- rootReconciler.UpdateNode(v.Node)
- case api.EventUpdateNode:
- // If this certificate is already at a final state
- // no need to evaluate and sign it.
- if !isFinalState(v.Node.Certificate.Status) {
- s.evaluateAndSignNodeCert(ctx, v.Node)
- }
- rootReconciler.UpdateNode(v.Node)
- case api.EventDeleteNode:
- rootReconciler.DeleteNode(v.Node)
- }
- case <-ticker.C:
- for _, node := range s.pending {
- if err := s.evaluateAndSignNodeCert(ctx, node); err != nil {
- // If this sign operation did not succeed, the rest are
- // unlikely to. Yield so that we don't hammer an external CA.
- // Since the map iteration order is randomized, there is no
- // risk of getting stuck on a problematic CSR.
- break
- }
- }
- case <-ctx.Done():
- return nil
- }
- }
- }
- // Stop stops the CA and closes all grpc streams.
- func (s *Server) Stop() error {
- s.mu.Lock()
- if !s.isRunning() {
- s.mu.Unlock()
- return errors.New("CA signer is already stopped")
- }
- s.cancel()
- s.started = make(chan struct{})
- s.mu.Unlock()
- // Wait for Run to complete
- s.wg.Wait()
- return nil
- }
- // Ready waits on the ready channel and returns when the server is ready to serve.
- func (s *Server) Ready() <-chan struct{} {
- s.mu.Lock()
- defer s.mu.Unlock()
- return s.started
- }
- func (s *Server) isRunningLocked() (context.Context, error) {
- s.mu.Lock()
- if !s.isRunning() {
- s.mu.Unlock()
- return nil, grpc.Errorf(codes.Aborted, "CA signer is stopped")
- }
- ctx := s.ctx
- s.mu.Unlock()
- return ctx, nil
- }
- func (s *Server) isRunning() bool {
- if s.ctx == nil {
- return false
- }
- select {
- case <-s.ctx.Done():
- return false
- default:
- }
- return true
- }
- // UpdateRootCA is called when there are cluster changes, and it ensures that the local RootCA is
- // always aware of changes in clusterExpiry and the Root CA key material - this can be called by
- // anything to update the root CA material
- func (s *Server) UpdateRootCA(ctx context.Context, cluster *api.Cluster) error {
- s.mu.Lock()
- s.joinTokens = cluster.RootCA.JoinTokens.Copy()
- reconciler := s.rootReconciler
- s.mu.Unlock()
- rCA := cluster.RootCA.Copy()
- if reconciler != nil {
- reconciler.UpdateRootCA(rCA)
- }
- s.secConfigMu.Lock()
- defer s.secConfigMu.Unlock()
- firstSeenCluster := s.lastSeenClusterRootCA == nil && s.lastSeenExternalCAs == nil
- rootCAChanged := len(rCA.CACert) != 0 && !equality.RootCAEqualStable(s.lastSeenClusterRootCA, rCA)
- externalCAChanged := !equality.ExternalCAsEqualStable(s.lastSeenExternalCAs, cluster.Spec.CAConfig.ExternalCAs)
- logger := log.G(ctx).WithFields(logrus.Fields{
- "cluster.id": cluster.ID,
- "method": "(*Server).UpdateRootCA",
- })
- if rootCAChanged {
- setOrUpdate := "set"
- if !firstSeenCluster {
- logger.Debug("Updating security config due to change in cluster Root CA")
- setOrUpdate = "updated"
- }
- expiry := DefaultNodeCertExpiration
- if cluster.Spec.CAConfig.NodeCertExpiry != nil {
- // NodeCertExpiry exists, let's try to parse the duration out of it
- clusterExpiry, err := gogotypes.DurationFromProto(cluster.Spec.CAConfig.NodeCertExpiry)
- if err != nil {
- logger.WithError(err).Warn("failed to parse certificate expiration, using default")
- } else {
- // We were able to successfully parse the expiration out of the cluster.
- expiry = clusterExpiry
- }
- } else {
- // NodeCertExpiry seems to be nil
- logger.Warn("no certificate expiration specified, using default")
- }
- // Attempt to update our local RootCA with the new parameters
- var intermediates []byte
- signingCert := rCA.CACert
- signingKey := rCA.CAKey
- if rCA.RootRotation != nil {
- signingCert = rCA.RootRotation.CrossSignedCACert
- signingKey = rCA.RootRotation.CAKey
- intermediates = rCA.RootRotation.CrossSignedCACert
- }
- if signingKey == nil {
- signingCert = nil
- }
- updatedRootCA, err := NewRootCA(rCA.CACert, signingCert, signingKey, expiry, intermediates)
- if err != nil {
- return errors.Wrap(err, "invalid Root CA object in cluster")
- }
- if err := SaveRootCA(updatedRootCA, s.rootPaths); err != nil {
- return errors.Wrap(err, "unable to save new root CA certificates")
- }
- externalCARootPool := updatedRootCA.Pool
- if rCA.RootRotation != nil {
- // the external CA has to trust the new CA cert
- externalCARootPool = x509.NewCertPool()
- externalCARootPool.AppendCertsFromPEM(rCA.CACert)
- externalCARootPool.AppendCertsFromPEM(rCA.RootRotation.CACert)
- }
- // Attempt to update our local RootCA with the new parameters
- if err := s.securityConfig.UpdateRootCA(&updatedRootCA, externalCARootPool); err != nil {
- return errors.Wrap(err, "updating Root CA failed")
- }
- // only update the server cache if we've successfully updated the root CA
- logger.Debugf("Root CA %s successfully", setOrUpdate)
- s.lastSeenClusterRootCA = rCA
- }
- // we want to update if the external CA changed, or if the root CA changed because the root CA could affect what
- // certificate for external CAs we want to filter by
- if rootCAChanged || externalCAChanged {
- if !firstSeenCluster {
- logger.Debug("Updating security config external CA URLs due to change in cluster Root CA or cluster spec")
- }
- wantedExternalCACert := rCA.CACert // we want to only add external CA URLs that use this cert
- if rCA.RootRotation != nil {
- // we're rotating to a new root, so we only want external CAs with the new root cert
- wantedExternalCACert = rCA.RootRotation.CACert
- }
- wantedExternalCACert = NormalizePEMs(wantedExternalCACert)
- // Update our security config with the list of External CA URLs
- // from the new cluster state.
- // TODO(aaronl): In the future, this will be abstracted with an
- // ExternalCA interface that has different implementations for
- // different CA types. At the moment, only CFSSL is supported.
- var cfsslURLs []string
- for i, extCA := range cluster.Spec.CAConfig.ExternalCAs {
- // We want to support old external CA specifications which did not have a CA cert. If there is no cert specified,
- // we assume it's the old cert
- certForExtCA := extCA.CACert
- if len(certForExtCA) == 0 {
- certForExtCA = rCA.CACert
- }
- certForExtCA = NormalizePEMs(certForExtCA)
- if extCA.Protocol != api.ExternalCA_CAProtocolCFSSL {
- logger.Debugf("skipping external CA %d (url: %s) due to unknown protocol type", i, extCA.URL)
- continue
- }
- if !bytes.Equal(certForExtCA, wantedExternalCACert) {
- logger.Debugf("skipping external CA %d (url: %s) because it has the wrong CA cert", i, extCA.URL)
- continue
- }
- cfsslURLs = append(cfsslURLs, extCA.URL)
- }
- s.securityConfig.externalCA.UpdateURLs(cfsslURLs...)
- s.lastSeenExternalCAs = cluster.Spec.CAConfig.Copy().ExternalCAs
- }
- return nil
- }
- // evaluateAndSignNodeCert implements the logic of which certificates to sign
- func (s *Server) evaluateAndSignNodeCert(ctx context.Context, node *api.Node) error {
- // If the desired membership and actual state are in sync, there's
- // nothing to do.
- certState := node.Certificate.Status.State
- if node.Spec.Membership == api.NodeMembershipAccepted &&
- (certState == api.IssuanceStateIssued || certState == api.IssuanceStateRotate) {
- return nil
- }
- // If the certificate state is renew, then it is a server-sided accepted cert (cert renewals)
- if certState == api.IssuanceStateRenew {
- return s.signNodeCert(ctx, node)
- }
- // Sign this certificate if a user explicitly changed it to Accepted, and
- // the certificate is in pending state
- if node.Spec.Membership == api.NodeMembershipAccepted && certState == api.IssuanceStatePending {
- return s.signNodeCert(ctx, node)
- }
- return nil
- }
- // signNodeCert does the bulk of the work for signing a certificate
- func (s *Server) signNodeCert(ctx context.Context, node *api.Node) error {
- rootCA := s.securityConfig.RootCA()
- externalCA := s.securityConfig.externalCA
- node = node.Copy()
- nodeID := node.ID
- // Convert the role from proto format
- role, err := ParseRole(node.Certificate.Role)
- if err != nil {
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": node.ID,
- "method": "(*Server).signNodeCert",
- }).WithError(err).Errorf("failed to parse role")
- return errors.New("failed to parse role")
- }
- s.pending[node.ID] = node
- // Attempt to sign the CSR
- var (
- rawCSR = node.Certificate.CSR
- cn = node.Certificate.CN
- ou = role
- org = s.securityConfig.ClientTLSCreds.Organization()
- )
- // Try using the external CA first.
- cert, err := externalCA.Sign(ctx, PrepareCSR(rawCSR, cn, ou, org))
- if err == ErrNoExternalCAURLs {
- // No external CA servers configured. Try using the local CA.
- cert, err = rootCA.ParseValidateAndSignCSR(rawCSR, cn, ou, org)
- }
- if err != nil {
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": node.ID,
- "method": "(*Server).signNodeCert",
- }).WithError(err).Errorf("failed to sign CSR")
- // If the current state is already Failed, no need to change it
- if node.Certificate.Status.State == api.IssuanceStateFailed {
- delete(s.pending, node.ID)
- return errors.New("failed to sign CSR")
- }
- if _, ok := err.(recoverableErr); ok {
- // Return without changing the state of the certificate. We may
- // retry signing it in the future.
- return errors.New("failed to sign CSR")
- }
- // We failed to sign this CSR, change the state to FAILED
- err = s.store.Update(func(tx store.Tx) error {
- node := store.GetNode(tx, nodeID)
- if node == nil {
- return errors.Errorf("node %s not found", nodeID)
- }
- node.Certificate.Status = api.IssuanceStatus{
- State: api.IssuanceStateFailed,
- Err: err.Error(),
- }
- return store.UpdateNode(tx, node)
- })
- if err != nil {
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": nodeID,
- "method": "(*Server).signNodeCert",
- }).WithError(err).Errorf("transaction failed when setting state to FAILED")
- }
- delete(s.pending, node.ID)
- return errors.New("failed to sign CSR")
- }
- // We were able to successfully sign the new CSR. Let's try to update the nodeStore
- for {
- err = s.store.Update(func(tx store.Tx) error {
- node.Certificate.Certificate = cert
- node.Certificate.Status = api.IssuanceStatus{
- State: api.IssuanceStateIssued,
- }
- err := store.UpdateNode(tx, node)
- if err != nil {
- node = store.GetNode(tx, nodeID)
- if node == nil {
- err = errors.Errorf("node %s does not exist", nodeID)
- }
- }
- return err
- })
- if err == nil {
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": node.ID,
- "node.role": node.Certificate.Role,
- "method": "(*Server).signNodeCert",
- }).Debugf("certificate issued")
- delete(s.pending, node.ID)
- break
- }
- if err == store.ErrSequenceConflict {
- continue
- }
- log.G(ctx).WithFields(logrus.Fields{
- "node.id": nodeID,
- "method": "(*Server).signNodeCert",
- }).WithError(err).Errorf("transaction failed")
- return errors.New("transaction failed")
- }
- return nil
- }
- // reconcileNodeCertificates is a helper method that calls evaluateAndSignNodeCert on all the
- // nodes.
- func (s *Server) reconcileNodeCertificates(ctx context.Context, nodes []*api.Node) error {
- for _, node := range nodes {
- s.evaluateAndSignNodeCert(ctx, node)
- }
- return nil
- }
- // A successfully issued certificate and a failed certificate are our current final states
- func isFinalState(status api.IssuanceStatus) bool {
- if status.State == api.IssuanceStateIssued || status.State == api.IssuanceStateFailed ||
- status.State == api.IssuanceStateRotate {
- return true
- }
- return false
- }
|