agent.go

package agent

import (
    "bytes"
    "fmt"
    "math/rand"
    "reflect"
    "sync"
    "time"

    "github.com/docker/swarmkit/agent/exec"
    "github.com/docker/swarmkit/api"
    "github.com/docker/swarmkit/log"
    "golang.org/x/net/context"
)

const (
    initialSessionFailureBackoff = 100 * time.Millisecond
    maxSessionFailureBackoff     = 8 * time.Second
    nodeUpdatePeriod             = 20 * time.Second
)
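
// How these constants combine in the run loop below: after each failed
// session, the backoff grows as b' = initialSessionFailureBackoff + 2*b
// (100ms, 300ms, 700ms, 1.5s, 3.1s, 6.3s, ...), capped at
// maxSessionFailureBackoff. The actual delay before rebuilding a session is
// drawn uniformly at random from [0, b) to spread out reconnections.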

// Agent implements the primary node functionality for a member of a swarm
// cluster. The primary functionality is to run and report on the status of
// tasks assigned to the node.
type Agent struct {
    config *Config

    // The latest node object state from manager
    // for this node known to the agent.
    node *api.Node

    keys []*api.EncryptionKey

    sessionq chan sessionOperation
    worker   Worker

    started   chan struct{}
    startOnce sync.Once // start only once
    ready     chan struct{}
    leaving   chan struct{}
    leaveOnce sync.Once
    left      chan struct{} // closed after "run" processes "leaving" and will no longer accept new assignments
    stopped   chan struct{} // requests shutdown
    stopOnce  sync.Once     // only allow stop to be called once
    closed    chan struct{} // only closed in run
    err       error         // read only after closed is closed

    nodeUpdatePeriod time.Duration
}
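
// Lifecycle of the signaling channels above, as implemented in this file:
// Start closes started; the first successful registration closes ready;
// Leave closes leaving, and run closes left once all assignments have been
// removed; stop closes stopped to request shutdown; run closes closed on
// full shutdown, after which err may be read.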

// New returns a new agent, ready for task dispatch.
func New(config *Config) (*Agent, error) {
    if err := config.validate(); err != nil {
        return nil, err
    }

    a := &Agent{
        config:           config,
        sessionq:         make(chan sessionOperation),
        started:          make(chan struct{}),
        leaving:          make(chan struct{}),
        left:             make(chan struct{}),
        stopped:          make(chan struct{}),
        closed:           make(chan struct{}),
        ready:            make(chan struct{}),
        nodeUpdatePeriod: nodeUpdatePeriod,
    }

    a.worker = newWorker(config.DB, config.Executor, a)
    return a, nil
}
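
// A minimal usage sketch (error handling elided; the Config value is
// hypothetical and depends on the caller's executor, DB, and connection
// broker):
//
//    a, err := New(config)
//    if err != nil {
//        return err
//    }
//    if err := a.Start(ctx); err != nil {
//        return err
//    }
//    defer a.Stop(ctx) // blocks until full shutdown
//    <-a.Ready()       // closed once the first session registers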

// Start begins execution of the agent in the provided context, if not already
// started.
//
// Start returns an error if the agent has already started.
func (a *Agent) Start(ctx context.Context) error {
    err := errAgentStarted

    a.startOnce.Do(func() {
        close(a.started)
        go a.run(ctx)
        err = nil // clear error above, only once.
    })

    return err
}

// Leave instructs the agent to leave the cluster. This method will shut down
// assignment processing and remove all assignments from the node.
// Leave blocks until the worker has finished closing all task managers or the
// agent is closed.
func (a *Agent) Leave(ctx context.Context) error {
    select {
    case <-a.started:
    default:
        return errAgentNotStarted
    }

    a.leaveOnce.Do(func() {
        close(a.leaving)
    })

    // Do not call Wait until we have confirmed that the agent is no longer
    // accepting assignments. Starting a worker might race with Wait.
    select {
    case <-a.left:
    case <-a.closed:
        return ErrClosed
    case <-ctx.Done():
        return ctx.Err()
    }

    // The agent could be closed while Leave is in progress, so wait on both.
    var err error
    ch := make(chan struct{})
    go func() {
        err = a.worker.Wait(ctx)
        close(ch)
    }()

    select {
    case <-ch:
        return err
    case <-a.closed:
        return ErrClosed
    }
}

// Stop shuts down the agent, blocking until full shutdown. If the agent has
// not been started, Stop returns errAgentNotStarted immediately.
func (a *Agent) Stop(ctx context.Context) error {
    select {
    case <-a.started:
    default:
        return errAgentNotStarted
    }

    a.stop()

    // wait till closed or context cancelled
    select {
    case <-a.closed:
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

// stop signals the agent shutdown process, returning true if this call was the
// first to actually shut down the agent.
func (a *Agent) stop() bool {
    var stopped bool
    a.stopOnce.Do(func() {
        close(a.stopped)
        stopped = true
    })

    return stopped
}

// Err returns the error that caused the agent to shut down, or nil. Err blocks
// until the agent is fully shut down.
func (a *Agent) Err(ctx context.Context) error {
    select {
    case <-a.closed:
        return a.err
    case <-ctx.Done():
        return ctx.Err()
    }
}

// Ready returns a channel that will be closed when the agent first becomes
// ready.
func (a *Agent) Ready() <-chan struct{} {
    return a.ready
}
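
// Note on the event loop below: run relies on the Go idiom of setting a
// channel variable to nil to disable its select case. sessionq stays nil
// until a session registers (so session operations only run against a live
// session), registered is nilled after it fires once per session, and
// leaving is nilled once the leave request has been processed.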

func (a *Agent) run(ctx context.Context) {
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()
    defer close(a.closed) // full shutdown.

    ctx = log.WithModule(ctx, "agent")

    log.G(ctx).Debug("(*Agent).run")
    defer log.G(ctx).Debug("(*Agent).run exited")

    nodeTLSInfo := a.config.NodeTLSInfo

    // get the node description
    nodeDescription, err := a.nodeDescriptionWithHostname(ctx, nodeTLSInfo)
    if err != nil {
        log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Error("agent: node description unavailable")
    }
    // nodeUpdateTicker is used to periodically check for updates to node description
    nodeUpdateTicker := time.NewTicker(a.nodeUpdatePeriod)
    defer nodeUpdateTicker.Stop()

    var (
        backoff       time.Duration
        session       = newSession(ctx, a, backoff, "", nodeDescription) // start the initial session
        registered    = session.registered
        ready         = a.ready // first session ready
        sessionq      chan sessionOperation
        leaving       = a.leaving
        subscriptions = map[string]context.CancelFunc{}
    )

    if err := a.worker.Init(ctx); err != nil {
        log.G(ctx).WithError(err).Error("worker initialization failed")
        a.err = err
        return // fatal?
    }
    defer a.worker.Close()

    // setup a reliable reporter to call back to us.
    reporter := newStatusReporter(ctx, a)
    defer reporter.Close()

    a.worker.Listen(ctx, reporter)

    updateNode := func() {
        // skip updating if the registration isn't finished
        if registered != nil {
            return
        }
        // get the current node description
        newNodeDescription, err := a.nodeDescriptionWithHostname(ctx, nodeTLSInfo)
        if err != nil {
            log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Error("agent: updated node description unavailable")
        }

        // if newNodeDescription is nil, it will cause a panic when
        // trying to create a session. Typically this can happen
        // if the engine goes down
        if newNodeDescription == nil {
            return
        }

        // if the node description has changed, update it to the new one
        // and close the session. The old session will be stopped and a
        // new one will be created with the updated description
        if !reflect.DeepEqual(nodeDescription, newNodeDescription) {
            nodeDescription = newNodeDescription
            // close the session
            log.G(ctx).Info("agent: found node update")
            if err := session.close(); err != nil {
                log.G(ctx).WithError(err).Error("agent: closing session failed")
            }
            sessionq = nil
            registered = nil
        }
    }

    for {
        select {
        case operation := <-sessionq:
            operation.response <- operation.fn(session)
        case <-leaving:
            leaving = nil

            // TODO(stevvooe): Signal to the manager that the node is leaving.

            // when leaving we remove all assignments.
            if err := a.worker.Assign(ctx, nil); err != nil {
                log.G(ctx).WithError(err).Error("failed removing all assignments")
            }

            close(a.left)
        case msg := <-session.assignments:
            // if we have left, accept no more assignments
            if leaving == nil {
                continue
            }

            switch msg.Type {
            case api.AssignmentsMessage_COMPLETE:
                // Need to assign secrets and configs before tasks,
                // because tasks might depend on new secrets or configs
                if err := a.worker.Assign(ctx, msg.Changes); err != nil {
                    log.G(ctx).WithError(err).Error("failed to synchronize worker assignments")
                }
            case api.AssignmentsMessage_INCREMENTAL:
                if err := a.worker.Update(ctx, msg.Changes); err != nil {
                    log.G(ctx).WithError(err).Error("failed to update worker assignments")
                }
            }
        case msg := <-session.messages:
            if err := a.handleSessionMessage(ctx, msg, nodeTLSInfo); err != nil {
                log.G(ctx).WithError(err).Error("session message handler failed")
            }
        case sub := <-session.subscriptions:
            if sub.Close {
                if cancel, ok := subscriptions[sub.ID]; ok {
                    cancel()
                }
                delete(subscriptions, sub.ID)
                continue
            }

            if _, ok := subscriptions[sub.ID]; ok {
                // Duplicate subscription
                continue
            }

            subCtx, subCancel := context.WithCancel(ctx)
            subscriptions[sub.ID] = subCancel
            // TODO(dperny) we're tossing the error here, that seems wrong
            go a.worker.Subscribe(subCtx, sub)
        case <-registered:
            log.G(ctx).Debugln("agent: registered")
            if ready != nil {
                close(ready)
            }
            ready = nil
            registered = nil // we only care about this once per session
            backoff = 0      // reset backoff
            sessionq = a.sessionq
        case err := <-session.errs:
            // TODO(stevvooe): This may actually block if a session is closed
            // but no error was sent. This must be the only place
            // session.close is called in response to errors, for this to work.
            if err != nil {
                log.G(ctx).WithError(err).Error("agent: session failed")
                backoff = initialSessionFailureBackoff + 2*backoff
                if backoff > maxSessionFailureBackoff {
                    backoff = maxSessionFailureBackoff
                }
            }

            if err := session.close(); err != nil {
                log.G(ctx).WithError(err).Error("agent: closing session failed")
            }
            sessionq = nil
            // if we're here before <-registered, do nothing for that event
            registered = nil
        case <-session.closed:
            log.G(ctx).Debugf("agent: rebuild session")

            // select a session registration delay from backoff range.
            delay := time.Duration(0)
            if backoff > 0 {
                delay = time.Duration(rand.Int63n(int64(backoff)))
            }
            session = newSession(ctx, a, delay, session.sessionID, nodeDescription)
            registered = session.registered
        case ev := <-a.config.NotifyTLSChange:
            // the TLS info has changed, so force a check to see if we need to restart the session
            if tlsInfo, ok := ev.(*api.NodeTLSInfo); ok {
                nodeTLSInfo = tlsInfo
                updateNode()
                nodeUpdateTicker.Stop()
                nodeUpdateTicker = time.NewTicker(a.nodeUpdatePeriod)
            }
        case <-nodeUpdateTicker.C:
            // periodically check to see whether the node information has
            // changed, and if so, restart the session
            updateNode()
        case <-a.stopped:
            // TODO(stevvooe): Wait on shutdown and cleanup. May need to pump
            // this loop a few times.
            return
        case <-ctx.Done():
            if a.err == nil {
                a.err = ctx.Err()
            }
            session.close()
            return
        }
    }
}
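
// handleSessionMessage applies a dispatcher session message: it observes the
// advertised managers (pruning any no longer listed), applies node changes to
// the executor, forwards node and root-CA changes on the NotifyNodeChange
// channel, and installs updated network bootstrap keys on the executor.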
func (a *Agent) handleSessionMessage(ctx context.Context, message *api.SessionMessage, nti *api.NodeTLSInfo) error {
    seen := map[api.Peer]struct{}{}
    for _, manager := range message.Managers {
        if manager.Peer.Addr == "" {
            continue
        }

        a.config.ConnBroker.Remotes().Observe(*manager.Peer, int(manager.Weight))
        seen[*manager.Peer] = struct{}{}
    }

    var changes *NodeChanges
    if message.Node != nil && (a.node == nil || !nodesEqual(a.node, message.Node)) {
        if a.config.NotifyNodeChange != nil {
            changes = &NodeChanges{Node: message.Node.Copy()}
        }
        a.node = message.Node.Copy()
        if err := a.config.Executor.Configure(ctx, a.node); err != nil {
            log.G(ctx).WithError(err).Error("node configure failed")
        }
    }
    if len(message.RootCA) > 0 && !bytes.Equal(message.RootCA, nti.TrustRoot) {
        if changes == nil {
            changes = &NodeChanges{RootCert: message.RootCA}
        } else {
            changes.RootCert = message.RootCA
        }
    }

    if changes != nil {
        a.config.NotifyNodeChange <- changes
    }

    // prune managers not in list.
    for peer := range a.config.ConnBroker.Remotes().Weights() {
        if _, ok := seen[peer]; !ok {
            a.config.ConnBroker.Remotes().Remove(peer)
        }
    }

    if message.NetworkBootstrapKeys == nil {
        return nil
    }

    for _, key := range message.NetworkBootstrapKeys {
        same := false
        for _, agentKey := range a.keys {
            if agentKey.LamportTime == key.LamportTime {
                same = true
            }
        }
        if !same {
            a.keys = message.NetworkBootstrapKeys
            if err := a.config.Executor.SetNetworkBootstrapKeys(a.keys); err != nil {
                panic(fmt.Errorf("configuring network key failed"))
            }
        }
    }

    return nil
}

type sessionOperation struct {
    fn       func(session *session) error
    response chan error
}

// withSession runs fn with the current session.
func (a *Agent) withSession(ctx context.Context, fn func(session *session) error) error {
    response := make(chan error, 1)
    select {
    case a.sessionq <- sessionOperation{
        fn:       fn,
        response: response,
    }:
        select {
        case err := <-response:
            return err
        case <-a.closed:
            return ErrClosed
        case <-ctx.Done():
            return ctx.Err()
        }
    case <-a.closed:
        return ErrClosed
    case <-ctx.Done():
        return ctx.Err()
    }
}
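
// A sketch of how callers use withSession (mirroring UpdateTaskStatus and
// Publisher below): the closure runs on the run loop's goroutine once a
// session has registered, so it should return quickly and hand long-running
// work off to another goroutine:
//
//    err := a.withSession(ctx, func(session *session) error {
//        // inspect or use the live session here; return quickly
//        return nil
//    })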

// UpdateTaskStatus attempts to send a task status update over the current session,
// blocking until the operation is completed.
//
// If an error is returned, the operation should be retried.
func (a *Agent) UpdateTaskStatus(ctx context.Context, taskID string, status *api.TaskStatus) error {
    log.G(ctx).WithField("task.id", taskID).Debug("(*Agent).UpdateTaskStatus")
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()

    errs := make(chan error, 1)
    if err := a.withSession(ctx, func(session *session) error {
        go func() {
            err := session.sendTaskStatus(ctx, taskID, status)
            if err != nil {
                if err == errTaskUnknown {
                    err = nil // dispatcher no longer cares about this task.
                } else {
                    log.G(ctx).WithError(err).Error("closing session after fatal error")
                    session.sendError(err)
                }
            } else {
                log.G(ctx).Debug("task status reported")
            }

            errs <- err
        }()

        return nil
    }); err != nil {
        return err
    }

    select {
    case err := <-errs:
        return err
    case <-ctx.Done():
        return ctx.Err()
    }
}

// Publisher returns a LogPublisher for the given subscription
// as well as a cancel function that should be called when the log stream
// is completed.
func (a *Agent) Publisher(ctx context.Context, subscriptionID string) (exec.LogPublisher, func(), error) {
    // TODO(stevvooe): The level of coordination here is WAY too much for logs.
    // These should only be best effort and really just buffer until a session is
    // ready. Ideally, they would use a separate connection completely.
    var (
        err       error
        publisher api.LogBroker_PublishLogsClient
    )

    err = a.withSession(ctx, func(session *session) error {
        publisher, err = api.NewLogBrokerClient(session.conn.ClientConn).PublishLogs(ctx)
        return err
    })
    if err != nil {
        return nil, nil, err
    }

    // make a little closure for ending the log stream
    sendCloseMsg := func() {
        // send a close message, to tell the manager our logs are done
        publisher.Send(&api.PublishLogsMessage{
            SubscriptionID: subscriptionID,
            Close:          true,
        })
        // close the stream for real
        publisher.CloseSend()
    }

    return exec.LogPublisherFunc(func(ctx context.Context, message api.LogMessage) error {
        select {
        case <-ctx.Done():
            sendCloseMsg()
            return ctx.Err()
        default:
        }

        return publisher.Send(&api.PublishLogsMessage{
            SubscriptionID: subscriptionID,
            Messages:       []api.LogMessage{message},
        })
    }), func() {
        sendCloseMsg()
    }, nil
}
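
// A usage sketch, assuming exec.LogPublisher exposes a
// Publish(ctx, message) method as implied by exec.LogPublisherFunc above
// (subscriptionID and msg are placeholders):
//
//    pub, done, err := a.Publisher(ctx, subscriptionID)
//    if err != nil {
//        return err
//    }
//    defer done() // sends the close message and half-closes the stream
//    if err := pub.Publish(ctx, msg); err != nil {
//        return err
//    }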

// nodeDescriptionWithHostname retrieves the node description, overriding the
// hostname if one is configured.
func (a *Agent) nodeDescriptionWithHostname(ctx context.Context, tlsInfo *api.NodeTLSInfo) (*api.NodeDescription, error) {
    desc, err := a.config.Executor.Describe(ctx)

    // Override hostname and TLS info
    if desc != nil {
        if a.config.Hostname != "" {
            desc.Hostname = a.config.Hostname
        }
        desc.TLSInfo = tlsInfo
    }
    return desc, err
}

// nodesEqual returns true if the node states are functionally equal, ignoring
// status, version and other superfluous fields.
//
// This is used to decide whether or not to propagate a node update to the
// executor.
func nodesEqual(a, b *api.Node) bool {
    a, b = a.Copy(), b.Copy()

    a.Status, b.Status = api.NodeStatus{}, api.NodeStatus{}
    a.Meta, b.Meta = api.Meta{}, api.Meta{}

    return reflect.DeepEqual(a, b)
}
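
// An illustrative sketch (the field values are hypothetical): two nodes that
// differ only in Status or Meta compare equal here, so a heartbeat that only
// bumps status does not trigger an executor reconfiguration:
//
//    n1 := &api.Node{ID: "node-1", Status: api.NodeStatus{State: api.NodeStatus_READY}}
//    n2 := n1.Copy()
//    n2.Status.State = api.NodeStatus_DOWN
//    _ = nodesEqual(n1, n2) // true: status is ignored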