// grpclb_remote_balancer.go
  1. /*
  2. *
  3. * Copyright 2017 gRPC authors.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. */
  18. package grpclb
  19. import (
  20. "context"
  21. "fmt"
  22. "io"
  23. "net"
  24. "sync"
  25. "time"
  26. "github.com/golang/protobuf/proto"
  27. timestamppb "github.com/golang/protobuf/ptypes/timestamp"
  28. "github.com/google/go-cmp/cmp"
  29. "google.golang.org/grpc"
  30. "google.golang.org/grpc/balancer"
  31. lbpb "google.golang.org/grpc/balancer/grpclb/grpc_lb_v1"
  32. "google.golang.org/grpc/connectivity"
  33. "google.golang.org/grpc/credentials/insecure"
  34. "google.golang.org/grpc/internal/backoff"
  35. imetadata "google.golang.org/grpc/internal/metadata"
  36. "google.golang.org/grpc/keepalive"
  37. "google.golang.org/grpc/metadata"
  38. "google.golang.org/grpc/resolver"
  39. )
  40. // processServerList updates balancer's internal state, create/remove SubConns
  41. // and regenerates picker using the received serverList.
  42. func (lb *lbBalancer) processServerList(l *lbpb.ServerList) {
  43. if logger.V(2) {
  44. logger.Infof("lbBalancer: processing server list: %+v", l)
  45. }
  46. lb.mu.Lock()
  47. defer lb.mu.Unlock()
  48. // Set serverListReceived to true so fallback will not take effect if it has
  49. // not hit timeout.
  50. lb.serverListReceived = true
  51. // If the new server list == old server list, do nothing.
  52. if cmp.Equal(lb.fullServerList, l.Servers, cmp.Comparer(proto.Equal)) {
  53. if logger.V(2) {
  54. logger.Infof("lbBalancer: new serverlist same as the previous one, ignoring")
  55. }
  56. return
  57. }
  58. lb.fullServerList = l.Servers
  59. var backendAddrs []resolver.Address
  60. for i, s := range l.Servers {
  61. if s.Drop {
  62. continue
  63. }
  64. md := metadata.Pairs(lbTokenKey, s.LoadBalanceToken)
  65. ip := net.IP(s.IpAddress)
  66. ipStr := ip.String()
  67. if ip.To4() == nil {
  68. // Add square brackets to ipv6 addresses, otherwise net.Dial() and
  69. // net.SplitHostPort() will return too many colons error.
  70. ipStr = fmt.Sprintf("[%s]", ipStr)
  71. }
  72. addr := imetadata.Set(resolver.Address{Addr: fmt.Sprintf("%s:%d", ipStr, s.Port)}, md)
  73. if logger.V(2) {
  74. logger.Infof("lbBalancer: server list entry[%d]: ipStr:|%s|, port:|%d|, load balancer token:|%v|",
  75. i, ipStr, s.Port, s.LoadBalanceToken)
  76. }
  77. backendAddrs = append(backendAddrs, addr)
  78. }
  79. // Call refreshSubConns to create/remove SubConns. If we are in fallback,
  80. // this is also exiting fallback.
  81. lb.refreshSubConns(backendAddrs, false, lb.usePickFirst)
  82. }
// refreshSubConns creates/removes SubConns with backendAddrs, and refreshes
// balancer state and picker.
//
// Caller must hold lb.mu.
func (lb *lbBalancer) refreshSubConns(backendAddrs []resolver.Address, fallback bool, pickFirst bool) {
	opts := balancer.NewSubConnOptions{}
	if !fallback {
		// Balancer-provided backends get the grpclb backend creds bundle;
		// fallback addresses use the channel's default credentials.
		opts.CredsBundle = lb.grpclbBackendCreds
	}
	lb.backendAddrs = backendAddrs
	lb.backendAddrsWithoutMetadata = nil
	fallbackModeChanged := lb.inFallback != fallback
	lb.inFallback = fallback
	if fallbackModeChanged && lb.inFallback {
		// Clear previous received list when entering fallback, so if the server
		// comes back and sends the same list again, the new addresses will be
		// used.
		lb.fullServerList = nil
	}
	balancingPolicyChanged := lb.usePickFirst != pickFirst
	lb.usePickFirst = pickFirst
	if fallbackModeChanged || balancingPolicyChanged {
		// Remove all SubConns when switching balancing policy or switching
		// fallback mode.
		//
		// For fallback mode switching with pickfirst, we want to recreate the
		// SubConn because the creds could be different.
		for a, sc := range lb.subConns {
			sc.Shutdown()
			delete(lb.subConns, a)
		}
	}
	if lb.usePickFirst {
		// Pick-first mode keeps at most ONE SubConn carrying the whole
		// address list.
		var (
			scKey resolver.Address
			sc    balancer.SubConn
		)
		// Grab the single existing SubConn (if any) from the map.
		for scKey, sc = range lb.subConns {
			break
		}
		if sc != nil {
			if len(backendAddrs) == 0 {
				sc.Shutdown()
				delete(lb.subConns, scKey)
				return
			}
			lb.cc.ClientConn.UpdateAddresses(sc, backendAddrs)
			sc.Connect()
			return
		}
		// NOTE: the closure captures the `sc` declared above; the `sc, err :=`
		// below ASSIGNS (does not redeclare) sc in this scope, so the listener
		// observes the newly created SubConn.
		opts.StateListener = func(scs balancer.SubConnState) { lb.updateSubConnState(sc, scs) }
		// This bypasses the cc wrapper with SubConn cache.
		sc, err := lb.cc.ClientConn.NewSubConn(backendAddrs, opts)
		if err != nil {
			logger.Warningf("grpclb: failed to create new SubConn: %v", err)
			return
		}
		sc.Connect()
		lb.subConns[backendAddrs[0]] = sc
		lb.scStates[sc] = connectivity.Idle
		return
	}
	// Round-robin mode: one SubConn per backend address.
	// addrsSet is the set converted from backendAddrsWithoutMetadata, it's used to quick
	// lookup for an address.
	addrsSet := make(map[resolver.Address]struct{})
	// Create new SubConns.
	for _, addr := range backendAddrs {
		// Strip attributes (the LB token metadata) so map keys compare by
		// address only.
		addrWithoutAttrs := addr
		addrWithoutAttrs.Attributes = nil
		addrsSet[addrWithoutAttrs] = struct{}{}
		lb.backendAddrsWithoutMetadata = append(lb.backendAddrsWithoutMetadata, addrWithoutAttrs)
		if _, ok := lb.subConns[addrWithoutAttrs]; !ok {
			// Use addrWithMD to create the SubConn.
			var sc balancer.SubConn
			opts.StateListener = func(scs balancer.SubConnState) { lb.updateSubConnState(sc, scs) }
			sc, err := lb.cc.NewSubConn([]resolver.Address{addr}, opts)
			if err != nil {
				logger.Warningf("grpclb: failed to create new SubConn: %v", err)
				continue
			}
			lb.subConns[addrWithoutAttrs] = sc // Use the addr without MD as key for the map.
			if _, ok := lb.scStates[sc]; !ok {
				// Only set state of new sc to IDLE. The state could already be
				// READY for cached SubConns.
				lb.scStates[sc] = connectivity.Idle
			}
			sc.Connect()
		}
	}
	for a, sc := range lb.subConns {
		// a was removed by resolver.
		if _, ok := addrsSet[a]; !ok {
			sc.Shutdown()
			delete(lb.subConns, a)
			// Keep the state of this sc in b.scStates until sc's state becomes Shutdown.
			// The entry will be deleted in UpdateSubConnState.
		}
	}
	// Regenerate and update picker after refreshing subconns because with
	// cache, even if SubConn was newed/removed, there might be no state
	// changes (the subconn will be kept in cache, not actually
	// newed/removed).
	lb.updateStateAndPicker(true, true)
}
// remoteBalancerCCWrapper wraps the ClientConn to the remote balancer server
// and the goroutines that maintain the balancer stream.
type remoteBalancerCCWrapper struct {
	// cc is the dedicated connection to the remote balancer.
	cc *grpc.ClientConn
	// lb is the parent balancer whose state is updated from the stream.
	lb      *lbBalancer
	backoff backoff.Strategy
	// done is closed by close() to tell goroutines to exit.
	done chan struct{}

	// streamMu guards streamCancel, which cancels the context of the current
	// in-flight balancer stream (nil when no cancelable stream exists).
	streamMu     sync.Mutex
	streamCancel func()

	// waitgroup to wait for all goroutines to exit.
	wg sync.WaitGroup
}
  197. func (lb *lbBalancer) newRemoteBalancerCCWrapper() {
  198. var dopts []grpc.DialOption
  199. if creds := lb.opt.DialCreds; creds != nil {
  200. dopts = append(dopts, grpc.WithTransportCredentials(creds))
  201. } else if bundle := lb.grpclbClientConnCreds; bundle != nil {
  202. dopts = append(dopts, grpc.WithCredentialsBundle(bundle))
  203. } else {
  204. dopts = append(dopts, grpc.WithTransportCredentials(insecure.NewCredentials()))
  205. }
  206. if lb.opt.Dialer != nil {
  207. dopts = append(dopts, grpc.WithContextDialer(lb.opt.Dialer))
  208. }
  209. if lb.opt.CustomUserAgent != "" {
  210. dopts = append(dopts, grpc.WithUserAgent(lb.opt.CustomUserAgent))
  211. }
  212. // Explicitly set pickfirst as the balancer.
  213. dopts = append(dopts, grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy":"pick_first"}`))
  214. dopts = append(dopts, grpc.WithResolvers(lb.manualResolver))
  215. dopts = append(dopts, grpc.WithChannelzParentID(lb.opt.ChannelzParentID))
  216. // Enable Keepalive for grpclb client.
  217. dopts = append(dopts, grpc.WithKeepaliveParams(keepalive.ClientParameters{
  218. Time: 20 * time.Second,
  219. Timeout: 10 * time.Second,
  220. PermitWithoutStream: true,
  221. }))
  222. // The dial target is not important.
  223. //
  224. // The grpclb server addresses will set field ServerName, and creds will
  225. // receive ServerName as authority.
  226. cc, err := grpc.DialContext(context.Background(), lb.manualResolver.Scheme()+":///grpclb.subClientConn", dopts...)
  227. if err != nil {
  228. logger.Fatalf("failed to dial: %v", err)
  229. }
  230. ccw := &remoteBalancerCCWrapper{
  231. cc: cc,
  232. lb: lb,
  233. backoff: lb.backoff,
  234. done: make(chan struct{}),
  235. }
  236. lb.ccRemoteLB = ccw
  237. ccw.wg.Add(1)
  238. go ccw.watchRemoteBalancer()
  239. }
  240. // close closed the ClientConn to remote balancer, and waits until all
  241. // goroutines to finish.
  242. func (ccw *remoteBalancerCCWrapper) close() {
  243. close(ccw.done)
  244. ccw.cc.Close()
  245. ccw.wg.Wait()
  246. }
  247. func (ccw *remoteBalancerCCWrapper) readServerList(s *balanceLoadClientStream) error {
  248. for {
  249. reply, err := s.Recv()
  250. if err != nil {
  251. if err == io.EOF {
  252. return errServerTerminatedConnection
  253. }
  254. return fmt.Errorf("grpclb: failed to recv server list: %v", err)
  255. }
  256. if serverList := reply.GetServerList(); serverList != nil {
  257. ccw.lb.processServerList(serverList)
  258. }
  259. if reply.GetFallbackResponse() != nil {
  260. // Eagerly enter fallback
  261. ccw.lb.mu.Lock()
  262. ccw.lb.refreshSubConns(ccw.lb.resolvedBackendAddrs, true, ccw.lb.usePickFirst)
  263. ccw.lb.mu.Unlock()
  264. }
  265. }
  266. }
  267. func (ccw *remoteBalancerCCWrapper) sendLoadReport(s *balanceLoadClientStream, interval time.Duration) {
  268. ticker := time.NewTicker(interval)
  269. defer ticker.Stop()
  270. lastZero := false
  271. for {
  272. select {
  273. case <-ticker.C:
  274. case <-s.Context().Done():
  275. return
  276. }
  277. stats := ccw.lb.clientStats.toClientStats()
  278. zero := isZeroStats(stats)
  279. if zero && lastZero {
  280. // Quash redundant empty load reports.
  281. continue
  282. }
  283. lastZero = zero
  284. t := time.Now()
  285. stats.Timestamp = &timestamppb.Timestamp{
  286. Seconds: t.Unix(),
  287. Nanos: int32(t.Nanosecond()),
  288. }
  289. if err := s.Send(&lbpb.LoadBalanceRequest{
  290. LoadBalanceRequestType: &lbpb.LoadBalanceRequest_ClientStats{
  291. ClientStats: stats,
  292. },
  293. }); err != nil {
  294. return
  295. }
  296. }
  297. }
  298. func (ccw *remoteBalancerCCWrapper) callRemoteBalancer(ctx context.Context) (backoff bool, _ error) {
  299. lbClient := &loadBalancerClient{cc: ccw.cc}
  300. stream, err := lbClient.BalanceLoad(ctx, grpc.WaitForReady(true))
  301. if err != nil {
  302. return true, fmt.Errorf("grpclb: failed to perform RPC to the remote balancer: %v", err)
  303. }
  304. ccw.lb.mu.Lock()
  305. ccw.lb.remoteBalancerConnected = true
  306. ccw.lb.mu.Unlock()
  307. // grpclb handshake on the stream.
  308. initReq := &lbpb.LoadBalanceRequest{
  309. LoadBalanceRequestType: &lbpb.LoadBalanceRequest_InitialRequest{
  310. InitialRequest: &lbpb.InitialLoadBalanceRequest{
  311. Name: ccw.lb.target,
  312. },
  313. },
  314. }
  315. if err := stream.Send(initReq); err != nil {
  316. return true, fmt.Errorf("grpclb: failed to send init request: %v", err)
  317. }
  318. reply, err := stream.Recv()
  319. if err != nil {
  320. return true, fmt.Errorf("grpclb: failed to recv init response: %v", err)
  321. }
  322. initResp := reply.GetInitialResponse()
  323. if initResp == nil {
  324. return true, fmt.Errorf("grpclb: reply from remote balancer did not include initial response")
  325. }
  326. ccw.wg.Add(1)
  327. go func() {
  328. defer ccw.wg.Done()
  329. if d := convertDuration(initResp.ClientStatsReportInterval); d > 0 {
  330. ccw.sendLoadReport(stream, d)
  331. }
  332. }()
  333. // No backoff if init req/resp handshake was successful.
  334. return false, ccw.readServerList(stream)
  335. }
  336. // cancelRemoteBalancerCall cancels the context used by the stream to the remote
  337. // balancer. watchRemoteBalancer() takes care of restarting this call after the
  338. // stream fails.
  339. func (ccw *remoteBalancerCCWrapper) cancelRemoteBalancerCall() {
  340. ccw.streamMu.Lock()
  341. if ccw.streamCancel != nil {
  342. ccw.streamCancel()
  343. ccw.streamCancel = nil
  344. }
  345. ccw.streamMu.Unlock()
  346. }
// watchRemoteBalancer repeatedly calls the remote balancer, handling stream
// failures by re-resolving, optionally entering fallback, and retrying with
// exponential backoff. It runs until ccw.done is closed (see close()).
func (ccw *remoteBalancerCCWrapper) watchRemoteBalancer() {
	defer func() {
		ccw.wg.Done()
		ccw.streamMu.Lock()
		if ccw.streamCancel != nil {
			// This is to make sure that we don't leak the context when we are
			// directly returning from inside of the below `for` loop.
			ccw.streamCancel()
			ccw.streamCancel = nil
		}
		ccw.streamMu.Unlock()
	}()
	var retryCount int
	var ctx context.Context
	for {
		// Replace any leftover cancel func with a fresh one for this attempt,
		// so cancelRemoteBalancerCall() can abort the current stream.
		ccw.streamMu.Lock()
		if ccw.streamCancel != nil {
			ccw.streamCancel()
			ccw.streamCancel = nil
		}
		ctx, ccw.streamCancel = context.WithCancel(context.Background())
		ccw.streamMu.Unlock()
		// Blocks until the stream to the remote balancer breaks.
		doBackoff, err := ccw.callRemoteBalancer(ctx)
		select {
		case <-ccw.done:
			// Shutdown: the stream error (if any) is expected; exit quietly.
			return
		default:
			if err != nil {
				if err == errServerTerminatedConnection {
					logger.Info(err)
				} else {
					logger.Warning(err)
				}
			}
		}
		// Trigger a re-resolve when the stream errors.
		ccw.lb.cc.ClientConn.ResolveNow(resolver.ResolveNowOptions{})
		ccw.lb.mu.Lock()
		ccw.lb.remoteBalancerConnected = false
		ccw.lb.fullServerList = nil
		// Enter fallback when connection to remote balancer is lost, and the
		// aggregated state is not Ready.
		if !ccw.lb.inFallback && ccw.lb.state != connectivity.Ready {
			// Entering fallback.
			ccw.lb.refreshSubConns(ccw.lb.resolvedBackendAddrs, true, ccw.lb.usePickFirst)
		}
		ccw.lb.mu.Unlock()
		if !doBackoff {
			// The handshake succeeded before this failure, so retry
			// immediately and reset the backoff sequence.
			retryCount = 0
			continue
		}
		timer := time.NewTimer(ccw.backoff.Backoff(retryCount)) // Copy backoff
		select {
		case <-timer.C:
		case <-ccw.done:
			timer.Stop()
			return
		}
		retryCount++
	}
}
  407. }