service_linux.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605
  1. package libnetwork
  2. import (
  3. "fmt"
  4. "io"
  5. "io/ioutil"
  6. "net"
  7. "os"
  8. "os/exec"
  9. "runtime"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "syscall"
  14. "github.com/Sirupsen/logrus"
  15. "github.com/docker/docker/pkg/reexec"
  16. "github.com/docker/libnetwork/iptables"
  17. "github.com/docker/libnetwork/ipvs"
  18. "github.com/gogo/protobuf/proto"
  19. "github.com/vishvananda/netlink/nl"
  20. "github.com/vishvananda/netns"
  21. )
  22. func init() {
  23. reexec.Register("fwmarker", fwMarker)
  24. }
  25. func newService(name string, id string, ingressPorts []*PortConfig) *service {
  26. return &service{
  27. name: name,
  28. id: id,
  29. ingressPorts: ingressPorts,
  30. loadBalancers: make(map[string]*loadBalancer),
  31. }
  32. }
  33. func (c *controller) addServiceBinding(name, sid, nid, eid string, vip net.IP, ingressPorts []*PortConfig, ip net.IP) error {
  34. var (
  35. s *service
  36. addService bool
  37. )
  38. n, err := c.NetworkByID(nid)
  39. if err != nil {
  40. return err
  41. }
  42. c.Lock()
  43. s, ok := c.serviceBindings[sid]
  44. if !ok {
  45. // Create a new service if we are seeing this service
  46. // for the first time.
  47. s = newService(name, sid, ingressPorts)
  48. c.serviceBindings[sid] = s
  49. }
  50. c.Unlock()
  51. // Add endpoint IP to special "tasks.svc_name" so that the
  52. // applications have access to DNS RR.
  53. n.(*network).addSvcRecords("tasks."+name, ip, nil, false)
  54. // Add service name to vip in DNS, if vip is valid. Otherwise resort to DNS RR
  55. svcIP := vip
  56. if len(svcIP) == 0 {
  57. svcIP = ip
  58. }
  59. n.(*network).addSvcRecords(name, svcIP, nil, false)
  60. s.Lock()
  61. defer s.Unlock()
  62. lb, ok := s.loadBalancers[nid]
  63. if !ok {
  64. // Create a new load balancer if we are seeing this
  65. // network attachment on the service for the first
  66. // time.
  67. lb = &loadBalancer{
  68. vip: vip,
  69. fwMark: fwMarkCtr,
  70. backEnds: make(map[string]net.IP),
  71. service: s,
  72. }
  73. fwMarkCtrMu.Lock()
  74. fwMarkCtr++
  75. fwMarkCtrMu.Unlock()
  76. s.loadBalancers[nid] = lb
  77. // Since we just created this load balancer make sure
  78. // we add a new service service in IPVS rules.
  79. addService = true
  80. }
  81. lb.backEnds[eid] = ip
  82. // Add loadbalancer service and backend in all sandboxes in
  83. // the network only if vip is valid.
  84. if len(vip) != 0 {
  85. n.(*network).addLBBackend(ip, vip, lb.fwMark, ingressPorts, addService)
  86. }
  87. return nil
  88. }
  89. func (c *controller) rmServiceBinding(name, sid, nid, eid string, vip net.IP, ingressPorts []*PortConfig, ip net.IP) error {
  90. var rmService bool
  91. n, err := c.NetworkByID(nid)
  92. if err != nil {
  93. return err
  94. }
  95. c.Lock()
  96. s, ok := c.serviceBindings[sid]
  97. if !ok {
  98. c.Unlock()
  99. return nil
  100. }
  101. c.Unlock()
  102. // Delete the special "tasks.svc_name" backend record.
  103. n.(*network).deleteSvcRecords("tasks."+name, ip, nil, false)
  104. // Make sure to remove the right IP since if vip is
  105. // not valid we would have added a DNS RR record.
  106. svcIP := vip
  107. if len(svcIP) == 0 {
  108. svcIP = ip
  109. }
  110. n.(*network).deleteSvcRecords(name, svcIP, nil, false)
  111. s.Lock()
  112. defer s.Unlock()
  113. lb, ok := s.loadBalancers[nid]
  114. if !ok {
  115. return nil
  116. }
  117. delete(lb.backEnds, eid)
  118. if len(lb.backEnds) == 0 {
  119. // All the backends for this service have been
  120. // removed. Time to remove the load balancer and also
  121. // remove the service entry in IPVS.
  122. rmService = true
  123. delete(s.loadBalancers, nid)
  124. }
  125. if len(s.loadBalancers) == 0 {
  126. // All loadbalancers for the service removed. Time to
  127. // remove the service itself.
  128. delete(c.serviceBindings, sid)
  129. }
  130. // Remove loadbalancer service(if needed) and backend in all
  131. // sandboxes in the network only if the vip is valid.
  132. if len(vip) != 0 {
  133. n.(*network).rmLBBackend(ip, vip, lb.fwMark, ingressPorts, rmService)
  134. }
  135. return nil
  136. }
  137. // Get all loadbalancers on this network that is currently discovered
  138. // on this node.
  139. func (n *network) connectedLoadbalancers() []*loadBalancer {
  140. c := n.getController()
  141. c.Lock()
  142. defer c.Unlock()
  143. var lbs []*loadBalancer
  144. for _, s := range c.serviceBindings {
  145. if lb, ok := s.loadBalancers[n.ID()]; ok {
  146. lbs = append(lbs, lb)
  147. }
  148. }
  149. return lbs
  150. }
  151. // Populate all loadbalancers on the network that the passed endpoint
  152. // belongs to, into this sandbox.
  153. func (sb *sandbox) populateLoadbalancers(ep *endpoint) {
  154. var gwIP net.IP
  155. n := ep.getNetwork()
  156. eIP := ep.Iface().Address()
  157. if sb.ingress {
  158. // For the ingress sandbox if this is not gateway
  159. // endpoint do nothing.
  160. if ep != sb.getGatewayEndpoint() {
  161. return
  162. }
  163. // This is the gateway endpoint. Now get the ingress
  164. // network and plumb the loadbalancers.
  165. gwIP = ep.Iface().Address().IP
  166. for _, ep := range sb.getConnectedEndpoints() {
  167. if !ep.endpointInGWNetwork() {
  168. n = ep.getNetwork()
  169. eIP = ep.Iface().Address()
  170. }
  171. }
  172. }
  173. for _, lb := range n.connectedLoadbalancers() {
  174. // Skip if vip is not valid.
  175. if len(lb.vip) == 0 {
  176. continue
  177. }
  178. addService := true
  179. for _, ip := range lb.backEnds {
  180. sb.addLBBackend(ip, lb.vip, lb.fwMark, lb.service.ingressPorts,
  181. eIP, gwIP, addService)
  182. addService = false
  183. }
  184. }
  185. }
  186. // Add loadbalancer backend to all sandboxes which has a connection to
  187. // this network. If needed add the service as well, as specified by
  188. // the addService bool.
  189. func (n *network) addLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*PortConfig, addService bool) {
  190. n.WalkEndpoints(func(e Endpoint) bool {
  191. ep := e.(*endpoint)
  192. if sb, ok := ep.getSandbox(); ok {
  193. var gwIP net.IP
  194. if ep := sb.getGatewayEndpoint(); ep != nil {
  195. gwIP = ep.Iface().Address().IP
  196. }
  197. sb.addLBBackend(ip, vip, fwMark, ingressPorts, ep.Iface().Address(), gwIP, addService)
  198. }
  199. return false
  200. })
  201. }
  202. // Remove loadbalancer backend from all sandboxes which has a
  203. // connection to this network. If needed remove the service entry as
  204. // well, as specified by the rmService bool.
  205. func (n *network) rmLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*PortConfig, rmService bool) {
  206. n.WalkEndpoints(func(e Endpoint) bool {
  207. ep := e.(*endpoint)
  208. if sb, ok := ep.getSandbox(); ok {
  209. var gwIP net.IP
  210. if ep := sb.getGatewayEndpoint(); ep != nil {
  211. gwIP = ep.Iface().Address().IP
  212. }
  213. sb.rmLBBackend(ip, vip, fwMark, ingressPorts, ep.Iface().Address(), gwIP, rmService)
  214. }
  215. return false
  216. })
  217. }
  218. // Add loadbalancer backend into one connected sandbox.
  219. func (sb *sandbox) addLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*PortConfig, eIP *net.IPNet, gwIP net.IP, addService bool) {
  220. if sb.osSbox == nil {
  221. return
  222. }
  223. i, err := ipvs.New(sb.Key())
  224. if err != nil {
  225. logrus.Errorf("Failed to create a ipvs handle for sbox %s: %v", sb.Key(), err)
  226. return
  227. }
  228. defer i.Close()
  229. s := &ipvs.Service{
  230. AddressFamily: nl.FAMILY_V4,
  231. FWMark: fwMark,
  232. SchedName: ipvs.RoundRobin,
  233. }
  234. if addService {
  235. var iPorts []*PortConfig
  236. if sb.ingress {
  237. iPorts = ingressPorts
  238. if err := programIngress(gwIP, iPorts, false); err != nil {
  239. logrus.Errorf("Failed to add ingress: %v", err)
  240. return
  241. }
  242. }
  243. logrus.Debugf("Creating service for vip %s fwMark %d ingressPorts %#v", vip, fwMark, iPorts)
  244. if err := invokeFWMarker(sb.Key(), vip, fwMark, iPorts, eIP, false); err != nil {
  245. logrus.Errorf("Failed to add firewall mark rule in sbox %s: %v", sb.Key(), err)
  246. return
  247. }
  248. if err := i.NewService(s); err != nil {
  249. logrus.Errorf("Failed to create a new service for vip %s fwmark %d: %v", vip, fwMark, err)
  250. return
  251. }
  252. }
  253. d := &ipvs.Destination{
  254. AddressFamily: nl.FAMILY_V4,
  255. Address: ip,
  256. Weight: 1,
  257. }
  258. // Remove the sched name before using the service to add
  259. // destination.
  260. s.SchedName = ""
  261. if err := i.NewDestination(s, d); err != nil && err != syscall.EEXIST {
  262. logrus.Errorf("Failed to create real server %s for vip %s fwmark %d in sb %s: %v", ip, vip, fwMark, sb.containerID, err)
  263. }
  264. }
  265. // Remove loadbalancer backend from one connected sandbox.
  266. func (sb *sandbox) rmLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*PortConfig, eIP *net.IPNet, gwIP net.IP, rmService bool) {
  267. if sb.osSbox == nil {
  268. return
  269. }
  270. i, err := ipvs.New(sb.Key())
  271. if err != nil {
  272. logrus.Errorf("Failed to create a ipvs handle for sbox %s: %v", sb.Key(), err)
  273. return
  274. }
  275. defer i.Close()
  276. s := &ipvs.Service{
  277. AddressFamily: nl.FAMILY_V4,
  278. FWMark: fwMark,
  279. }
  280. d := &ipvs.Destination{
  281. AddressFamily: nl.FAMILY_V4,
  282. Address: ip,
  283. Weight: 1,
  284. }
  285. if err := i.DelDestination(s, d); err != nil {
  286. logrus.Errorf("Failed to delete real server %s for vip %s fwmark %d: %v", ip, vip, fwMark, err)
  287. return
  288. }
  289. if rmService {
  290. s.SchedName = ipvs.RoundRobin
  291. if err := i.DelService(s); err != nil {
  292. logrus.Errorf("Failed to create a new service for vip %s fwmark %d: %v", vip, fwMark, err)
  293. return
  294. }
  295. var iPorts []*PortConfig
  296. if sb.ingress {
  297. iPorts = ingressPorts
  298. if err := programIngress(gwIP, iPorts, true); err != nil {
  299. logrus.Errorf("Failed to delete ingress: %v", err)
  300. return
  301. }
  302. }
  303. if err := invokeFWMarker(sb.Key(), vip, fwMark, iPorts, eIP, true); err != nil {
  304. logrus.Errorf("Failed to add firewall mark rule in sbox %s: %v", sb.Key(), err)
  305. return
  306. }
  307. }
  308. }
  // ingressChain is the name of the nat-table chain that holds the ingress
  // DNAT rules.
  const ingressChain = "DOCKER-INGRESS"

  var (
  	// ingressOnce gates the one-time flush of a pre-existing ingress chain.
  	ingressOnce sync.Once

  	// ingressProxyMu guards ingressProxyTbl.
  	ingressProxyMu sync.Mutex

  	// ingressProxyTbl maps a "<port>/<protocol>" spec to the listener
  	// opened on that node port.
  	ingressProxyTbl = map[string]io.Closer{}
  )
  315. func programIngress(gwIP net.IP, ingressPorts []*PortConfig, isDelete bool) error {
  316. addDelOpt := "-I"
  317. if isDelete {
  318. addDelOpt = "-D"
  319. }
  320. chainExists := iptables.ExistChain(ingressChain, iptables.Nat)
  321. ingressOnce.Do(func() {
  322. if chainExists {
  323. // Flush ingress chain rules during init if it
  324. // exists. It might contain stale rules from
  325. // previous life.
  326. if err := iptables.RawCombinedOutput("-t", "nat", "-F", ingressChain); err != nil {
  327. logrus.Errorf("Could not flush ingress chain rules during init: %v", err)
  328. }
  329. }
  330. })
  331. if !isDelete {
  332. if !chainExists {
  333. if err := iptables.RawCombinedOutput("-t", "nat", "-N", ingressChain); err != nil {
  334. return fmt.Errorf("failed to create ingress chain: %v", err)
  335. }
  336. }
  337. if !iptables.Exists(iptables.Nat, ingressChain, "-j", "RETURN") {
  338. if err := iptables.RawCombinedOutput("-t", "nat", "-A", ingressChain, "-j", "RETURN"); err != nil {
  339. return fmt.Errorf("failed to add return rule in ingress chain: %v", err)
  340. }
  341. }
  342. for _, chain := range []string{"OUTPUT", "PREROUTING"} {
  343. if !iptables.Exists(iptables.Nat, chain, "-j", ingressChain) {
  344. if err := iptables.RawCombinedOutput("-t", "nat", "-I", chain, "-j", ingressChain); err != nil {
  345. return fmt.Errorf("failed to add jump rule in %s to ingress chain: %v", chain, err)
  346. }
  347. }
  348. }
  349. }
  350. for _, iPort := range ingressPorts {
  351. if iptables.ExistChain(ingressChain, iptables.Nat) {
  352. rule := strings.Fields(fmt.Sprintf("-t nat %s %s -p %s --dport %d -j DNAT --to-destination %s:%d",
  353. addDelOpt, ingressChain, strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)]), iPort.NodePort, gwIP, iPort.NodePort))
  354. if err := iptables.RawCombinedOutput(rule...); err != nil {
  355. return fmt.Errorf("setting up rule failed, %v: %v", rule, err)
  356. }
  357. }
  358. if err := plumbProxy(iPort, isDelete); err != nil {
  359. return fmt.Errorf("failed to create proxy for port %d: %v", iPort.NodePort, err)
  360. }
  361. }
  362. return nil
  363. }
  364. func plumbProxy(iPort *PortConfig, isDelete bool) error {
  365. var (
  366. err error
  367. l io.Closer
  368. )
  369. portSpec := fmt.Sprintf("%d/%s", iPort.NodePort, strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)]))
  370. if isDelete {
  371. ingressProxyMu.Lock()
  372. if listener, ok := ingressProxyTbl[portSpec]; ok {
  373. if listener != nil {
  374. listener.Close()
  375. }
  376. }
  377. ingressProxyMu.Unlock()
  378. return nil
  379. }
  380. switch iPort.Protocol {
  381. case ProtocolTCP:
  382. l, err = net.ListenTCP("tcp", &net.TCPAddr{Port: int(iPort.NodePort)})
  383. case ProtocolUDP:
  384. l, err = net.ListenUDP("udp", &net.UDPAddr{Port: int(iPort.NodePort)})
  385. }
  386. if err != nil {
  387. return err
  388. }
  389. ingressProxyMu.Lock()
  390. ingressProxyTbl[portSpec] = l
  391. ingressProxyMu.Unlock()
  392. return nil
  393. }
  394. // Invoke fwmarker reexec routine to mark vip destined packets with
  395. // the passed firewall mark.
  396. func invokeFWMarker(path string, vip net.IP, fwMark uint32, ingressPorts []*PortConfig, eIP *net.IPNet, isDelete bool) error {
  397. var ingressPortsFile string
  398. if len(ingressPorts) != 0 {
  399. f, err := ioutil.TempFile("", "port_configs")
  400. if err != nil {
  401. return err
  402. }
  403. buf, err := proto.Marshal(&EndpointRecord{
  404. IngressPorts: ingressPorts,
  405. })
  406. n, err := f.Write(buf)
  407. if err != nil {
  408. f.Close()
  409. return err
  410. }
  411. if n < len(buf) {
  412. f.Close()
  413. return io.ErrShortWrite
  414. }
  415. ingressPortsFile = f.Name()
  416. f.Close()
  417. }
  418. addDelOpt := "-A"
  419. if isDelete {
  420. addDelOpt = "-D"
  421. }
  422. cmd := &exec.Cmd{
  423. Path: reexec.Self(),
  424. Args: append([]string{"fwmarker"}, path, vip.String(), fmt.Sprintf("%d", fwMark), addDelOpt, ingressPortsFile, eIP.IP.String()),
  425. Stdout: os.Stdout,
  426. Stderr: os.Stderr,
  427. }
  428. if err := cmd.Run(); err != nil {
  429. return fmt.Errorf("reexec failed: %v", err)
  430. }
  431. return nil
  432. }
  433. // Firewall marker reexec function.
  434. func fwMarker() {
  435. runtime.LockOSThread()
  436. defer runtime.UnlockOSThread()
  437. if len(os.Args) < 7 {
  438. logrus.Error("invalid number of arguments..")
  439. os.Exit(1)
  440. }
  441. var ingressPorts []*PortConfig
  442. if os.Args[5] != "" {
  443. buf, err := ioutil.ReadFile(os.Args[5])
  444. if err != nil {
  445. logrus.Errorf("Failed to read ports config file: %v", err)
  446. os.Exit(6)
  447. }
  448. var epRec EndpointRecord
  449. err = proto.Unmarshal(buf, &epRec)
  450. if err != nil {
  451. logrus.Errorf("Failed to unmarshal ports config data: %v", err)
  452. os.Exit(7)
  453. }
  454. ingressPorts = epRec.IngressPorts
  455. }
  456. vip := os.Args[2]
  457. fwMark, err := strconv.ParseUint(os.Args[3], 10, 32)
  458. if err != nil {
  459. logrus.Errorf("bad fwmark value(%s) passed: %v", os.Args[3], err)
  460. os.Exit(2)
  461. }
  462. addDelOpt := os.Args[4]
  463. rules := [][]string{}
  464. for _, iPort := range ingressPorts {
  465. rule := strings.Fields(fmt.Sprintf("-t nat %s PREROUTING -p %s --dport %d -j REDIRECT --to-port %d",
  466. addDelOpt, strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)]), iPort.NodePort, iPort.Port))
  467. rules = append(rules, rule)
  468. rule = strings.Fields(fmt.Sprintf("-t mangle %s PREROUTING -p %s --dport %d -j MARK --set-mark %d",
  469. addDelOpt, strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)]), iPort.NodePort, fwMark))
  470. rules = append(rules, rule)
  471. }
  472. ns, err := netns.GetFromPath(os.Args[1])
  473. if err != nil {
  474. logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
  475. os.Exit(3)
  476. }
  477. defer ns.Close()
  478. if err := netns.Set(ns); err != nil {
  479. logrus.Errorf("setting into container net ns %v failed, %v", os.Args[1], err)
  480. os.Exit(4)
  481. }
  482. if len(ingressPorts) != 0 && addDelOpt == "-A" {
  483. ruleParams := strings.Fields(fmt.Sprintf("-m ipvs --ipvs -j SNAT --to-source %s", os.Args[6]))
  484. if !iptables.Exists("nat", "POSTROUTING", ruleParams...) {
  485. rule := append(strings.Fields("-t nat -A POSTROUTING"), ruleParams...)
  486. rules = append(rules, rule)
  487. err := ioutil.WriteFile("/proc/sys/net/ipv4/vs/conntrack", []byte{'1', '\n'}, 0644)
  488. if err != nil {
  489. logrus.Errorf("Failed to write to /proc/sys/net/ipv4/vs/conntrack: %v", err)
  490. os.Exit(8)
  491. }
  492. }
  493. }
  494. rule := strings.Fields(fmt.Sprintf("-t mangle %s OUTPUT -d %s/32 -j MARK --set-mark %d", addDelOpt, vip, fwMark))
  495. rules = append(rules, rule)
  496. for _, rule := range rules {
  497. if err := iptables.RawCombinedOutputNative(rule...); err != nil {
  498. logrus.Errorf("setting up rule failed, %v: %v", rule, err)
  499. os.Exit(5)
  500. }
  501. }
  502. }