server.go 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850
  1. package ca
  2. import (
  3. "bytes"
  4. "crypto/subtle"
  5. "crypto/x509"
  6. "sync"
  7. "time"
  8. "github.com/Sirupsen/logrus"
  9. "github.com/docker/swarmkit/api"
  10. "github.com/docker/swarmkit/api/equality"
  11. "github.com/docker/swarmkit/identity"
  12. "github.com/docker/swarmkit/log"
  13. "github.com/docker/swarmkit/manager/state/store"
  14. gogotypes "github.com/gogo/protobuf/types"
  15. "github.com/pkg/errors"
  16. "golang.org/x/net/context"
  17. "google.golang.org/grpc"
  18. "google.golang.org/grpc/codes"
  19. )
  // Default intervals that NewServer installs on a freshly created Server.
  const (
  	// defaultReconciliationRetryInterval is the initial value of
  	// Server.reconciliationRetryInterval.
  	defaultReconciliationRetryInterval time.Duration = time.Second * 10
  	// defaultRootReconciliationInterval is the initial value of
  	// Server.rootReconciliationRetryInterval.
  	defaultRootReconciliationInterval time.Duration = time.Second * 3
  )
  24. // APISecurityConfigUpdater knows how to update a SecurityConfig from an api.Cluster object
  25. type APISecurityConfigUpdater interface {
  26. UpdateRootCA(ctx context.Context, cluster *api.Cluster) error
  27. }
  28. var _ APISecurityConfigUpdater = &Server{}
  29. // Server is the CA and NodeCA API gRPC server.
  30. // TODO(aaronl): At some point we may want to have separate implementations of
  31. // CA, NodeCA, and other hypothetical future CA services. At the moment,
  32. // breaking it apart doesn't seem worth it.
  33. type Server struct {
  34. mu sync.Mutex
  35. wg sync.WaitGroup
  36. ctx context.Context
  37. cancel func()
  38. store *store.MemoryStore
  39. securityConfig *SecurityConfig
  40. joinTokens *api.JoinTokens
  41. reconciliationRetryInterval time.Duration
  42. // pending is a map of nodes with pending certificates issuance or
  43. // renewal. They are indexed by node ID.
  44. pending map[string]*api.Node
  45. // started is a channel which gets closed once the server is running
  46. // and able to service RPCs.
  47. started chan struct{}
  48. // these are cached values to ensure we only update the security config when
  49. // the cluster root CA and external CAs have changed - the cluster object
  50. // can change for other reasons, and it would not be necessary to update
  51. // the security config as a result
  52. lastSeenClusterRootCA *api.RootCA
  53. lastSeenExternalCAs []*api.ExternalCA
  54. secConfigMu sync.Mutex
  55. // before we update the security config with the new root CA, we need to be able to save the root certs
  56. rootPaths CertPaths
  57. // lets us monitor and finish root rotations
  58. rootReconciler *rootRotationReconciler
  59. rootReconciliationRetryInterval time.Duration
  60. }
  61. // DefaultCAConfig returns the default CA Config, with a default expiration.
  62. func DefaultCAConfig() api.CAConfig {
  63. return api.CAConfig{
  64. NodeCertExpiry: gogotypes.DurationProto(DefaultNodeCertExpiration),
  65. }
  66. }
  67. // NewServer creates a CA API server.
  68. func NewServer(store *store.MemoryStore, securityConfig *SecurityConfig, rootCAPaths CertPaths) *Server {
  69. return &Server{
  70. store: store,
  71. securityConfig: securityConfig,
  72. pending: make(map[string]*api.Node),
  73. started: make(chan struct{}),
  74. reconciliationRetryInterval: defaultReconciliationRetryInterval,
  75. rootReconciliationRetryInterval: defaultRootReconciliationInterval,
  76. rootPaths: rootCAPaths,
  77. }
  78. }
  79. // SetReconciliationRetryInterval changes the time interval between
  80. // reconciliation attempts. This function must be called before Run.
  81. func (s *Server) SetReconciliationRetryInterval(reconciliationRetryInterval time.Duration) {
  82. s.reconciliationRetryInterval = reconciliationRetryInterval
  83. }
  84. // SetRootReconciliationInterval changes the time interval between root rotation
  85. // reconciliation attempts. This function must be called before Run.
  86. func (s *Server) SetRootReconciliationInterval(interval time.Duration) {
  87. s.rootReconciliationRetryInterval = interval
  88. }
  89. // GetUnlockKey is responsible for returning the current unlock key used for encrypting TLS private keys and
  90. // other at rest data. Access to this RPC call should only be allowed via mutual TLS from managers.
  91. func (s *Server) GetUnlockKey(ctx context.Context, request *api.GetUnlockKeyRequest) (*api.GetUnlockKeyResponse, error) {
  92. // This directly queries the store, rather than storing the unlock key and version on
  93. // the `Server` object and updating it `updateCluster` is called, because we need this
  94. // API to return the latest version of the key. Otherwise, there might be a slight delay
  95. // between when the cluster gets updated, and when this function returns the latest key.
  96. // This delay is currently unacceptable because this RPC call is the only way, after a
  97. // cluster update, to get the actual value of the unlock key, and we don't want to return
  98. // a cached value.
  99. resp := api.GetUnlockKeyResponse{}
  100. s.store.View(func(tx store.ReadTx) {
  101. cluster := store.GetCluster(tx, s.securityConfig.ClientTLSCreds.Organization())
  102. resp.Version = cluster.Meta.Version
  103. if cluster.Spec.EncryptionConfig.AutoLockManagers {
  104. for _, encryptionKey := range cluster.UnlockKeys {
  105. if encryptionKey.Subsystem == ManagerRole {
  106. resp.UnlockKey = encryptionKey.Key
  107. return
  108. }
  109. }
  110. }
  111. })
  112. return &resp, nil
  113. }
  114. // NodeCertificateStatus returns the current issuance status of an issuance request identified by the nodeID
  115. func (s *Server) NodeCertificateStatus(ctx context.Context, request *api.NodeCertificateStatusRequest) (*api.NodeCertificateStatusResponse, error) {
  116. if request.NodeID == "" {
  117. return nil, grpc.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
  118. }
  119. serverCtx, err := s.isRunningLocked()
  120. if err != nil {
  121. return nil, err
  122. }
  123. var node *api.Node
  124. event := api.EventUpdateNode{
  125. Node: &api.Node{ID: request.NodeID},
  126. Checks: []api.NodeCheckFunc{api.NodeCheckID},
  127. }
  128. // Retrieve the current value of the certificate with this token, and create a watcher
  129. updates, cancel, err := store.ViewAndWatch(
  130. s.store,
  131. func(tx store.ReadTx) error {
  132. node = store.GetNode(tx, request.NodeID)
  133. return nil
  134. },
  135. event,
  136. )
  137. if err != nil {
  138. return nil, err
  139. }
  140. defer cancel()
  141. // This node ID doesn't exist
  142. if node == nil {
  143. return nil, grpc.Errorf(codes.NotFound, codes.NotFound.String())
  144. }
  145. log.G(ctx).WithFields(logrus.Fields{
  146. "node.id": node.ID,
  147. "status": node.Certificate.Status,
  148. "method": "NodeCertificateStatus",
  149. })
  150. // If this certificate has a final state, return it immediately (both pending and renew are transition states)
  151. if isFinalState(node.Certificate.Status) {
  152. return &api.NodeCertificateStatusResponse{
  153. Status: &node.Certificate.Status,
  154. Certificate: &node.Certificate,
  155. }, nil
  156. }
  157. log.G(ctx).WithFields(logrus.Fields{
  158. "node.id": node.ID,
  159. "status": node.Certificate.Status,
  160. "method": "NodeCertificateStatus",
  161. }).Debugf("started watching for certificate updates")
  162. // Certificate is Pending or in an Unknown state, let's wait for changes.
  163. for {
  164. select {
  165. case event := <-updates:
  166. switch v := event.(type) {
  167. case api.EventUpdateNode:
  168. // We got an update on the certificate record. If the status is a final state,
  169. // return the certificate.
  170. if isFinalState(v.Node.Certificate.Status) {
  171. cert := v.Node.Certificate.Copy()
  172. return &api.NodeCertificateStatusResponse{
  173. Status: &cert.Status,
  174. Certificate: cert,
  175. }, nil
  176. }
  177. }
  178. case <-ctx.Done():
  179. return nil, ctx.Err()
  180. case <-serverCtx.Done():
  181. return nil, s.ctx.Err()
  182. }
  183. }
  184. }
  185. // IssueNodeCertificate is responsible for gatekeeping both certificate requests from new nodes in the swarm,
  186. // and authorizing certificate renewals.
  187. // If a node presented a valid certificate, the corresponding certificate is set in a RENEW state.
  188. // If a node failed to present a valid certificate, we check for a valid join token and set the
  189. // role accordingly. A new random node ID is generated, and the corresponding node entry is created.
  190. // IssueNodeCertificate is the only place where new node entries to raft should be created.
  191. func (s *Server) IssueNodeCertificate(ctx context.Context, request *api.IssueNodeCertificateRequest) (*api.IssueNodeCertificateResponse, error) {
  192. // First, let's see if the remote node is presenting a non-empty CSR
  193. if len(request.CSR) == 0 {
  194. return nil, grpc.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
  195. }
  196. if _, err := s.isRunningLocked(); err != nil {
  197. return nil, err
  198. }
  199. var (
  200. blacklistedCerts map[string]*api.BlacklistedCertificate
  201. clusters []*api.Cluster
  202. err error
  203. )
  204. s.store.View(func(readTx store.ReadTx) {
  205. clusters, err = store.FindClusters(readTx, store.ByName("default"))
  206. })
  207. // Not having a cluster object yet means we can't check
  208. // the blacklist.
  209. if err == nil && len(clusters) == 1 {
  210. blacklistedCerts = clusters[0].BlacklistedCertificates
  211. }
  212. // Renewing the cert with a local (unix socket) is always valid.
  213. localNodeInfo := ctx.Value(LocalRequestKey)
  214. if localNodeInfo != nil {
  215. nodeInfo, ok := localNodeInfo.(RemoteNodeInfo)
  216. if ok && nodeInfo.NodeID != "" {
  217. return s.issueRenewCertificate(ctx, nodeInfo.NodeID, request.CSR)
  218. }
  219. }
  220. // If the remote node is a worker (either forwarded by a manager, or calling directly),
  221. // issue a renew worker certificate entry with the correct ID
  222. nodeID, err := AuthorizeForwardedRoleAndOrg(ctx, []string{WorkerRole}, []string{ManagerRole}, s.securityConfig.ClientTLSCreds.Organization(), blacklistedCerts)
  223. if err == nil {
  224. return s.issueRenewCertificate(ctx, nodeID, request.CSR)
  225. }
  226. // If the remote node is a manager (either forwarded by another manager, or calling directly),
  227. // issue a renew certificate entry with the correct ID
  228. nodeID, err = AuthorizeForwardedRoleAndOrg(ctx, []string{ManagerRole}, []string{ManagerRole}, s.securityConfig.ClientTLSCreds.Organization(), blacklistedCerts)
  229. if err == nil {
  230. return s.issueRenewCertificate(ctx, nodeID, request.CSR)
  231. }
  232. // The remote node didn't successfully present a valid MTLS certificate, let's issue a
  233. // certificate with a new random ID
  234. role := api.NodeRole(-1)
  235. s.mu.Lock()
  236. if subtle.ConstantTimeCompare([]byte(s.joinTokens.Manager), []byte(request.Token)) == 1 {
  237. role = api.NodeRoleManager
  238. } else if subtle.ConstantTimeCompare([]byte(s.joinTokens.Worker), []byte(request.Token)) == 1 {
  239. role = api.NodeRoleWorker
  240. }
  241. s.mu.Unlock()
  242. if role < 0 {
  243. return nil, grpc.Errorf(codes.InvalidArgument, "A valid join token is necessary to join this cluster")
  244. }
  245. // Max number of collisions of ID or CN to tolerate before giving up
  246. maxRetries := 3
  247. // Generate a random ID for this new node
  248. for i := 0; ; i++ {
  249. nodeID = identity.NewID()
  250. // Create a new node
  251. err := s.store.Update(func(tx store.Tx) error {
  252. node := &api.Node{
  253. Role: role,
  254. ID: nodeID,
  255. Certificate: api.Certificate{
  256. CSR: request.CSR,
  257. CN: nodeID,
  258. Role: role,
  259. Status: api.IssuanceStatus{
  260. State: api.IssuanceStatePending,
  261. },
  262. },
  263. Spec: api.NodeSpec{
  264. DesiredRole: role,
  265. Membership: api.NodeMembershipAccepted,
  266. Availability: request.Availability,
  267. },
  268. }
  269. return store.CreateNode(tx, node)
  270. })
  271. if err == nil {
  272. log.G(ctx).WithFields(logrus.Fields{
  273. "node.id": nodeID,
  274. "node.role": role,
  275. "method": "IssueNodeCertificate",
  276. }).Debugf("new certificate entry added")
  277. break
  278. }
  279. if err != store.ErrExist {
  280. return nil, err
  281. }
  282. if i == maxRetries {
  283. return nil, err
  284. }
  285. log.G(ctx).WithFields(logrus.Fields{
  286. "node.id": nodeID,
  287. "node.role": role,
  288. "method": "IssueNodeCertificate",
  289. }).Errorf("randomly generated node ID collided with an existing one - retrying")
  290. }
  291. return &api.IssueNodeCertificateResponse{
  292. NodeID: nodeID,
  293. NodeMembership: api.NodeMembershipAccepted,
  294. }, nil
  295. }
  296. // issueRenewCertificate receives a nodeID and a CSR and modifies the node's certificate entry with the new CSR
  297. // and changes the state to RENEW, so it can be picked up and signed by the signing reconciliation loop
  298. func (s *Server) issueRenewCertificate(ctx context.Context, nodeID string, csr []byte) (*api.IssueNodeCertificateResponse, error) {
  299. var (
  300. cert api.Certificate
  301. node *api.Node
  302. )
  303. err := s.store.Update(func(tx store.Tx) error {
  304. // Attempt to retrieve the node with nodeID
  305. node = store.GetNode(tx, nodeID)
  306. if node == nil {
  307. log.G(ctx).WithFields(logrus.Fields{
  308. "node.id": nodeID,
  309. "method": "issueRenewCertificate",
  310. }).Warnf("node does not exist")
  311. // If this node doesn't exist, we shouldn't be renewing a certificate for it
  312. return grpc.Errorf(codes.NotFound, "node %s not found when attempting to renew certificate", nodeID)
  313. }
  314. // Create a new Certificate entry for this node with the new CSR and a RENEW state
  315. cert = api.Certificate{
  316. CSR: csr,
  317. CN: node.ID,
  318. Role: node.Role,
  319. Status: api.IssuanceStatus{
  320. State: api.IssuanceStateRenew,
  321. },
  322. }
  323. node.Certificate = cert
  324. return store.UpdateNode(tx, node)
  325. })
  326. if err != nil {
  327. return nil, err
  328. }
  329. log.G(ctx).WithFields(logrus.Fields{
  330. "cert.cn": cert.CN,
  331. "cert.role": cert.Role,
  332. "method": "issueRenewCertificate",
  333. }).Debugf("node certificate updated")
  334. return &api.IssueNodeCertificateResponse{
  335. NodeID: nodeID,
  336. NodeMembership: node.Spec.Membership,
  337. }, nil
  338. }
  339. // GetRootCACertificate returns the certificate of the Root CA. It is used as a convenience for distributing
  340. // the root of trust for the swarm. Clients should be using the CA hash to verify if they weren't target to
  341. // a MiTM. If they fail to do so, node bootstrap works with TOFU semantics.
  342. func (s *Server) GetRootCACertificate(ctx context.Context, request *api.GetRootCACertificateRequest) (*api.GetRootCACertificateResponse, error) {
  343. log.G(ctx).WithFields(logrus.Fields{
  344. "method": "GetRootCACertificate",
  345. })
  346. return &api.GetRootCACertificateResponse{
  347. Certificate: s.securityConfig.RootCA().Certs,
  348. }, nil
  349. }
  350. // Run runs the CA signer main loop.
  351. // The CA signer can be stopped with cancelling ctx or calling Stop().
  352. func (s *Server) Run(ctx context.Context) error {
  353. s.mu.Lock()
  354. if s.isRunning() {
  355. s.mu.Unlock()
  356. return errors.New("CA signer is already running")
  357. }
  358. s.wg.Add(1)
  359. s.ctx, s.cancel = context.WithCancel(log.WithModule(ctx, "ca"))
  360. ctx = s.ctx
  361. // we need to set it on the server, because `Server.UpdateRootCA` can be called from outside the Run function
  362. s.rootReconciler = &rootRotationReconciler{
  363. ctx: log.WithField(ctx, "method", "(*Server).rootRotationReconciler"),
  364. clusterID: s.securityConfig.ClientTLSCreds.Organization(),
  365. store: s.store,
  366. batchUpdateInterval: s.rootReconciliationRetryInterval,
  367. }
  368. rootReconciler := s.rootReconciler
  369. s.mu.Unlock()
  370. defer s.wg.Done()
  371. defer func() {
  372. s.mu.Lock()
  373. s.rootReconciler = nil
  374. s.mu.Unlock()
  375. }()
  376. // Retrieve the channels to keep track of changes in the cluster
  377. // Retrieve all the currently registered nodes
  378. var nodes []*api.Node
  379. updates, cancel, err := store.ViewAndWatch(
  380. s.store,
  381. func(readTx store.ReadTx) error {
  382. clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
  383. if err != nil {
  384. return err
  385. }
  386. if len(clusters) != 1 {
  387. return errors.New("could not find cluster object")
  388. }
  389. s.UpdateRootCA(ctx, clusters[0]) // call once to ensure that the join tokens are always set
  390. nodes, err = store.FindNodes(readTx, store.All)
  391. return err
  392. },
  393. api.EventCreateNode{},
  394. api.EventUpdateNode{},
  395. api.EventDeleteNode{},
  396. )
  397. // Do this after updateCluster has been called, so isRunning never
  398. // returns true without joinTokens being set correctly.
  399. s.mu.Lock()
  400. close(s.started)
  401. s.mu.Unlock()
  402. if err != nil {
  403. log.G(ctx).WithFields(logrus.Fields{
  404. "method": "(*Server).Run",
  405. }).WithError(err).Errorf("snapshot store view failed")
  406. return err
  407. }
  408. defer cancel()
  409. // We might have missed some updates if there was a leader election,
  410. // so let's pick up the slack.
  411. if err := s.reconcileNodeCertificates(ctx, nodes); err != nil {
  412. // We don't return here because that means the Run loop would
  413. // never run. Log an error instead.
  414. log.G(ctx).WithFields(logrus.Fields{
  415. "method": "(*Server).Run",
  416. }).WithError(err).Errorf("error attempting to reconcile certificates")
  417. }
  418. ticker := time.NewTicker(s.reconciliationRetryInterval)
  419. defer ticker.Stop()
  420. // Watch for new nodes being created, new nodes being updated, and changes
  421. // to the cluster
  422. for {
  423. select {
  424. case <-ctx.Done():
  425. return nil
  426. default:
  427. }
  428. select {
  429. case event := <-updates:
  430. switch v := event.(type) {
  431. case api.EventCreateNode:
  432. s.evaluateAndSignNodeCert(ctx, v.Node)
  433. rootReconciler.UpdateNode(v.Node)
  434. case api.EventUpdateNode:
  435. // If this certificate is already at a final state
  436. // no need to evaluate and sign it.
  437. if !isFinalState(v.Node.Certificate.Status) {
  438. s.evaluateAndSignNodeCert(ctx, v.Node)
  439. }
  440. rootReconciler.UpdateNode(v.Node)
  441. case api.EventDeleteNode:
  442. rootReconciler.DeleteNode(v.Node)
  443. }
  444. case <-ticker.C:
  445. for _, node := range s.pending {
  446. if err := s.evaluateAndSignNodeCert(ctx, node); err != nil {
  447. // If this sign operation did not succeed, the rest are
  448. // unlikely to. Yield so that we don't hammer an external CA.
  449. // Since the map iteration order is randomized, there is no
  450. // risk of getting stuck on a problematic CSR.
  451. break
  452. }
  453. }
  454. case <-ctx.Done():
  455. return nil
  456. }
  457. }
  458. }
  459. // Stop stops the CA and closes all grpc streams.
  460. func (s *Server) Stop() error {
  461. s.mu.Lock()
  462. if !s.isRunning() {
  463. s.mu.Unlock()
  464. return errors.New("CA signer is already stopped")
  465. }
  466. s.cancel()
  467. s.started = make(chan struct{})
  468. s.mu.Unlock()
  469. // Wait for Run to complete
  470. s.wg.Wait()
  471. return nil
  472. }
  473. // Ready waits on the ready channel and returns when the server is ready to serve.
  474. func (s *Server) Ready() <-chan struct{} {
  475. s.mu.Lock()
  476. defer s.mu.Unlock()
  477. return s.started
  478. }
  479. func (s *Server) isRunningLocked() (context.Context, error) {
  480. s.mu.Lock()
  481. if !s.isRunning() {
  482. s.mu.Unlock()
  483. return nil, grpc.Errorf(codes.Aborted, "CA signer is stopped")
  484. }
  485. ctx := s.ctx
  486. s.mu.Unlock()
  487. return ctx, nil
  488. }
  489. func (s *Server) isRunning() bool {
  490. if s.ctx == nil {
  491. return false
  492. }
  493. select {
  494. case <-s.ctx.Done():
  495. return false
  496. default:
  497. }
  498. return true
  499. }
  500. // UpdateRootCA is called when there are cluster changes, and it ensures that the local RootCA is
  501. // always aware of changes in clusterExpiry and the Root CA key material - this can be called by
  502. // anything to update the root CA material
  503. func (s *Server) UpdateRootCA(ctx context.Context, cluster *api.Cluster) error {
  504. s.mu.Lock()
  505. s.joinTokens = cluster.RootCA.JoinTokens.Copy()
  506. reconciler := s.rootReconciler
  507. s.mu.Unlock()
  508. rCA := cluster.RootCA.Copy()
  509. if reconciler != nil {
  510. reconciler.UpdateRootCA(rCA)
  511. }
  512. s.secConfigMu.Lock()
  513. defer s.secConfigMu.Unlock()
  514. firstSeenCluster := s.lastSeenClusterRootCA == nil && s.lastSeenExternalCAs == nil
  515. rootCAChanged := len(rCA.CACert) != 0 && !equality.RootCAEqualStable(s.lastSeenClusterRootCA, rCA)
  516. externalCAChanged := !equality.ExternalCAsEqualStable(s.lastSeenExternalCAs, cluster.Spec.CAConfig.ExternalCAs)
  517. logger := log.G(ctx).WithFields(logrus.Fields{
  518. "cluster.id": cluster.ID,
  519. "method": "(*Server).UpdateRootCA",
  520. })
  521. if rootCAChanged {
  522. setOrUpdate := "set"
  523. if !firstSeenCluster {
  524. logger.Debug("Updating security config due to change in cluster Root CA")
  525. setOrUpdate = "updated"
  526. }
  527. expiry := DefaultNodeCertExpiration
  528. if cluster.Spec.CAConfig.NodeCertExpiry != nil {
  529. // NodeCertExpiry exists, let's try to parse the duration out of it
  530. clusterExpiry, err := gogotypes.DurationFromProto(cluster.Spec.CAConfig.NodeCertExpiry)
  531. if err != nil {
  532. logger.WithError(err).Warn("failed to parse certificate expiration, using default")
  533. } else {
  534. // We were able to successfully parse the expiration out of the cluster.
  535. expiry = clusterExpiry
  536. }
  537. } else {
  538. // NodeCertExpiry seems to be nil
  539. logger.Warn("no certificate expiration specified, using default")
  540. }
  541. // Attempt to update our local RootCA with the new parameters
  542. var intermediates []byte
  543. signingCert := rCA.CACert
  544. signingKey := rCA.CAKey
  545. if rCA.RootRotation != nil {
  546. signingCert = rCA.RootRotation.CrossSignedCACert
  547. signingKey = rCA.RootRotation.CAKey
  548. intermediates = rCA.RootRotation.CrossSignedCACert
  549. }
  550. if signingKey == nil {
  551. signingCert = nil
  552. }
  553. updatedRootCA, err := NewRootCA(rCA.CACert, signingCert, signingKey, expiry, intermediates)
  554. if err != nil {
  555. return errors.Wrap(err, "invalid Root CA object in cluster")
  556. }
  557. if err := SaveRootCA(updatedRootCA, s.rootPaths); err != nil {
  558. return errors.Wrap(err, "unable to save new root CA certificates")
  559. }
  560. externalCARootPool := updatedRootCA.Pool
  561. if rCA.RootRotation != nil {
  562. // the external CA has to trust the new CA cert
  563. externalCARootPool = x509.NewCertPool()
  564. externalCARootPool.AppendCertsFromPEM(rCA.CACert)
  565. externalCARootPool.AppendCertsFromPEM(rCA.RootRotation.CACert)
  566. }
  567. // Attempt to update our local RootCA with the new parameters
  568. if err := s.securityConfig.UpdateRootCA(&updatedRootCA, externalCARootPool); err != nil {
  569. return errors.Wrap(err, "updating Root CA failed")
  570. }
  571. // only update the server cache if we've successfully updated the root CA
  572. logger.Debugf("Root CA %s successfully", setOrUpdate)
  573. s.lastSeenClusterRootCA = rCA
  574. }
  575. // we want to update if the external CA changed, or if the root CA changed because the root CA could affect what
  576. // certificate for external CAs we want to filter by
  577. if rootCAChanged || externalCAChanged {
  578. if !firstSeenCluster {
  579. logger.Debug("Updating security config external CA URLs due to change in cluster Root CA or cluster spec")
  580. }
  581. wantedExternalCACert := rCA.CACert // we want to only add external CA URLs that use this cert
  582. if rCA.RootRotation != nil {
  583. // we're rotating to a new root, so we only want external CAs with the new root cert
  584. wantedExternalCACert = rCA.RootRotation.CACert
  585. }
  586. wantedExternalCACert = NormalizePEMs(wantedExternalCACert)
  587. // Update our security config with the list of External CA URLs
  588. // from the new cluster state.
  589. // TODO(aaronl): In the future, this will be abstracted with an
  590. // ExternalCA interface that has different implementations for
  591. // different CA types. At the moment, only CFSSL is supported.
  592. var cfsslURLs []string
  593. for i, extCA := range cluster.Spec.CAConfig.ExternalCAs {
  594. // We want to support old external CA specifications which did not have a CA cert. If there is no cert specified,
  595. // we assume it's the old cert
  596. certForExtCA := extCA.CACert
  597. if len(certForExtCA) == 0 {
  598. certForExtCA = rCA.CACert
  599. }
  600. certForExtCA = NormalizePEMs(certForExtCA)
  601. if extCA.Protocol != api.ExternalCA_CAProtocolCFSSL {
  602. logger.Debugf("skipping external CA %d (url: %s) due to unknown protocol type", i, extCA.URL)
  603. continue
  604. }
  605. if !bytes.Equal(certForExtCA, wantedExternalCACert) {
  606. logger.Debugf("skipping external CA %d (url: %s) because it has the wrong CA cert", i, extCA.URL)
  607. continue
  608. }
  609. cfsslURLs = append(cfsslURLs, extCA.URL)
  610. }
  611. s.securityConfig.externalCA.UpdateURLs(cfsslURLs...)
  612. s.lastSeenExternalCAs = cluster.Spec.CAConfig.Copy().ExternalCAs
  613. }
  614. return nil
  615. }
  616. // evaluateAndSignNodeCert implements the logic of which certificates to sign
  617. func (s *Server) evaluateAndSignNodeCert(ctx context.Context, node *api.Node) error {
  618. // If the desired membership and actual state are in sync, there's
  619. // nothing to do.
  620. certState := node.Certificate.Status.State
  621. if node.Spec.Membership == api.NodeMembershipAccepted &&
  622. (certState == api.IssuanceStateIssued || certState == api.IssuanceStateRotate) {
  623. return nil
  624. }
  625. // If the certificate state is renew, then it is a server-sided accepted cert (cert renewals)
  626. if certState == api.IssuanceStateRenew {
  627. return s.signNodeCert(ctx, node)
  628. }
  629. // Sign this certificate if a user explicitly changed it to Accepted, and
  630. // the certificate is in pending state
  631. if node.Spec.Membership == api.NodeMembershipAccepted && certState == api.IssuanceStatePending {
  632. return s.signNodeCert(ctx, node)
  633. }
  634. return nil
  635. }
  636. // signNodeCert does the bulk of the work for signing a certificate
  637. func (s *Server) signNodeCert(ctx context.Context, node *api.Node) error {
  638. rootCA := s.securityConfig.RootCA()
  639. externalCA := s.securityConfig.externalCA
  640. node = node.Copy()
  641. nodeID := node.ID
  642. // Convert the role from proto format
  643. role, err := ParseRole(node.Certificate.Role)
  644. if err != nil {
  645. log.G(ctx).WithFields(logrus.Fields{
  646. "node.id": node.ID,
  647. "method": "(*Server).signNodeCert",
  648. }).WithError(err).Errorf("failed to parse role")
  649. return errors.New("failed to parse role")
  650. }
  651. s.pending[node.ID] = node
  652. // Attempt to sign the CSR
  653. var (
  654. rawCSR = node.Certificate.CSR
  655. cn = node.Certificate.CN
  656. ou = role
  657. org = s.securityConfig.ClientTLSCreds.Organization()
  658. )
  659. // Try using the external CA first.
  660. cert, err := externalCA.Sign(ctx, PrepareCSR(rawCSR, cn, ou, org))
  661. if err == ErrNoExternalCAURLs {
  662. // No external CA servers configured. Try using the local CA.
  663. cert, err = rootCA.ParseValidateAndSignCSR(rawCSR, cn, ou, org)
  664. }
  665. if err != nil {
  666. log.G(ctx).WithFields(logrus.Fields{
  667. "node.id": node.ID,
  668. "method": "(*Server).signNodeCert",
  669. }).WithError(err).Errorf("failed to sign CSR")
  670. // If the current state is already Failed, no need to change it
  671. if node.Certificate.Status.State == api.IssuanceStateFailed {
  672. delete(s.pending, node.ID)
  673. return errors.New("failed to sign CSR")
  674. }
  675. if _, ok := err.(recoverableErr); ok {
  676. // Return without changing the state of the certificate. We may
  677. // retry signing it in the future.
  678. return errors.New("failed to sign CSR")
  679. }
  680. // We failed to sign this CSR, change the state to FAILED
  681. err = s.store.Update(func(tx store.Tx) error {
  682. node := store.GetNode(tx, nodeID)
  683. if node == nil {
  684. return errors.Errorf("node %s not found", nodeID)
  685. }
  686. node.Certificate.Status = api.IssuanceStatus{
  687. State: api.IssuanceStateFailed,
  688. Err: err.Error(),
  689. }
  690. return store.UpdateNode(tx, node)
  691. })
  692. if err != nil {
  693. log.G(ctx).WithFields(logrus.Fields{
  694. "node.id": nodeID,
  695. "method": "(*Server).signNodeCert",
  696. }).WithError(err).Errorf("transaction failed when setting state to FAILED")
  697. }
  698. delete(s.pending, node.ID)
  699. return errors.New("failed to sign CSR")
  700. }
  701. // We were able to successfully sign the new CSR. Let's try to update the nodeStore
  702. for {
  703. err = s.store.Update(func(tx store.Tx) error {
  704. node.Certificate.Certificate = cert
  705. node.Certificate.Status = api.IssuanceStatus{
  706. State: api.IssuanceStateIssued,
  707. }
  708. err := store.UpdateNode(tx, node)
  709. if err != nil {
  710. node = store.GetNode(tx, nodeID)
  711. if node == nil {
  712. err = errors.Errorf("node %s does not exist", nodeID)
  713. }
  714. }
  715. return err
  716. })
  717. if err == nil {
  718. log.G(ctx).WithFields(logrus.Fields{
  719. "node.id": node.ID,
  720. "node.role": node.Certificate.Role,
  721. "method": "(*Server).signNodeCert",
  722. }).Debugf("certificate issued")
  723. delete(s.pending, node.ID)
  724. break
  725. }
  726. if err == store.ErrSequenceConflict {
  727. continue
  728. }
  729. log.G(ctx).WithFields(logrus.Fields{
  730. "node.id": nodeID,
  731. "method": "(*Server).signNodeCert",
  732. }).WithError(err).Errorf("transaction failed")
  733. return errors.New("transaction failed")
  734. }
  735. return nil
  736. }
  737. // reconcileNodeCertificates is a helper method that calls evaluateAndSignNodeCert on all the
  738. // nodes.
  739. func (s *Server) reconcileNodeCertificates(ctx context.Context, nodes []*api.Node) error {
  740. for _, node := range nodes {
  741. s.evaluateAndSignNodeCert(ctx, node)
  742. }
  743. return nil
  744. }
  745. // A successfully issued certificate and a failed certificate are our current final states
  746. func isFinalState(status api.IssuanceStatus) bool {
  747. if status.State == api.IssuanceStateIssued || status.State == api.IssuanceStateFailed ||
  748. status.State == api.IssuanceStateRotate {
  749. return true
  750. }
  751. return false
  752. }