2016-05-25 05:46:18 +00:00
package libnetwork
import (
"fmt"
2016-05-31 06:55:51 +00:00
"io"
2016-05-25 05:46:18 +00:00
"net"
"os"
2016-06-13 00:28:28 +00:00
"path/filepath"
2016-05-25 05:46:18 +00:00
"strconv"
"strings"
2016-06-07 18:50:17 +00:00
"sync"
2016-06-05 20:45:52 +00:00
"syscall"
2016-05-25 05:46:18 +00:00
2021-04-06 00:24:47 +00:00
"github.com/docker/docker/libnetwork/iptables"
"github.com/docker/docker/libnetwork/ns"
2017-06-13 05:29:56 +00:00
"github.com/ishidawataru/sctp"
2020-03-11 16:36:12 +00:00
"github.com/moby/ipvs"
2017-07-26 21:18:31 +00:00
"github.com/sirupsen/logrus"
2016-05-25 05:46:18 +00:00
"github.com/vishvananda/netlink/nl"
)
// Populate all loadbalancers on the network that the passed endpoint
// belongs to, into this sandbox.
2023-01-12 01:42:24 +00:00
func ( sb * Sandbox ) populateLoadBalancers ( ep * Endpoint ) {
2016-08-30 16:41:16 +00:00
// This is an interface less endpoint. Nothing to do.
if ep . Iface ( ) == nil {
return
}
2016-05-25 05:46:18 +00:00
n := ep . getNetwork ( )
2016-05-31 06:55:51 +00:00
eIP := ep . Iface ( ) . Address ( )
2016-09-21 19:15:14 +00:00
if n . ingress {
2022-11-01 16:26:33 +00:00
if err := sb . addRedirectRules ( eIP , ep . ingressPorts ) ; err != nil {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to add redirect rules for ep %s (%.7s): %v" , ep . Name ( ) , ep . ID ( ) , err )
2016-09-21 19:15:14 +00:00
}
}
2018-04-10 16:34:41 +00:00
}
2016-09-21 19:15:14 +00:00
2023-01-12 01:42:24 +00:00
func ( n * network ) findLBEndpointSandbox ( ) ( * Endpoint , * Sandbox , error ) {
2018-04-10 16:34:41 +00:00
// TODO: get endpoint from store? See EndpointInfo()
2023-01-12 01:42:24 +00:00
var ep * Endpoint
2018-04-10 16:34:41 +00:00
// Find this node's LB sandbox endpoint: there should be exactly one
for _ , e := range n . Endpoints ( ) {
epi := e . Info ( )
if epi != nil && epi . LoadBalancer ( ) {
2023-01-12 01:42:24 +00:00
ep = e
2018-04-10 16:34:41 +00:00
break
2016-05-31 06:55:51 +00:00
}
}
2018-04-10 16:34:41 +00:00
if ep == nil {
return nil , nil , fmt . Errorf ( "Unable to find load balancing endpoint for network %s" , n . ID ( ) )
}
// Get the load balancer sandbox itself as well
sb , ok := ep . getSandbox ( )
if ! ok {
return nil , nil , fmt . Errorf ( "Unable to get sandbox for %s(%s) in for %s" , ep . Name ( ) , ep . ID ( ) , n . ID ( ) )
}
2021-05-28 00:15:56 +00:00
sep := sb . getEndpoint ( ep . ID ( ) )
2020-09-22 06:56:03 +00:00
if sep == nil {
2018-04-10 16:34:41 +00:00
return nil , nil , fmt . Errorf ( "Load balancing endpoint %s(%s) removed from %s" , ep . Name ( ) , ep . ID ( ) , n . ID ( ) )
}
2020-09-22 06:56:03 +00:00
return sep , sb , nil
2018-04-10 16:34:41 +00:00
}
2016-05-31 06:55:51 +00:00
2018-04-10 16:34:41 +00:00
// Searches the OS sandbox for the name of the endpoint interface
// within the sandbox. This is required for adding/removing IP
// aliases to the interface.
2023-01-12 01:42:24 +00:00
func findIfaceDstName ( sb * Sandbox , ep * Endpoint ) string {
2018-04-10 16:34:41 +00:00
srcName := ep . Iface ( ) . SrcName ( )
for _ , i := range sb . osSbox . Info ( ) . Interfaces ( ) {
if i . SrcName ( ) == srcName {
return i . DstName ( )
2016-05-25 05:46:18 +00:00
}
}
2018-04-10 16:34:41 +00:00
return ""
2016-05-25 05:46:18 +00:00
}
2018-04-10 16:34:41 +00:00
// Add loadbalancer backend to the loadbalncer sandbox for the network.
// If needed add the service as well.
2018-04-10 04:36:19 +00:00
func ( n * network ) addLBBackend ( ip net . IP , lb * loadBalancer ) {
if len ( lb . vip ) == 0 {
return
}
2018-04-10 16:34:41 +00:00
ep , sb , err := n . findLBEndpointSandbox ( )
if err != nil {
2018-06-20 19:54:25 +00:00
logrus . Errorf ( "addLBBackend %s/%s: %v" , n . ID ( ) , n . Name ( ) , err )
2018-04-10 04:36:19 +00:00
return
}
2016-06-05 20:45:52 +00:00
if sb . osSbox == nil {
return
}
2016-08-04 21:20:54 +00:00
2018-04-10 16:34:41 +00:00
eIP := ep . Iface ( ) . Address ( )
2016-05-25 05:46:18 +00:00
i , err := ipvs . New ( sb . Key ( ) )
if err != nil {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to create an ipvs handle for sbox %.7s (%.7s,%s) for lb addition: %v" , sb . ID ( ) , sb . ContainerID ( ) , sb . Key ( ) , err )
2016-05-25 05:46:18 +00:00
return
}
defer i . Close ( )
s := & ipvs . Service {
AddressFamily : nl . FAMILY_V4 ,
2018-04-10 16:34:41 +00:00
FWMark : lb . fwMark ,
2016-05-25 05:46:18 +00:00
SchedName : ipvs . RoundRobin ,
}
2017-02-01 22:55:39 +00:00
if ! i . IsServicePresent ( s ) {
2018-04-10 16:34:41 +00:00
// Add IP alias for the VIP to the endpoint
ifName := findIfaceDstName ( sb , ep )
if ifName == "" {
logrus . Errorf ( "Failed find interface name for endpoint %s(%s) to create LB alias" , ep . ID ( ) , ep . Name ( ) )
return
}
err := sb . osSbox . AddAliasIP ( ifName , & net . IPNet { IP : lb . vip , Mask : net . CIDRMask ( 32 , 32 ) } )
if err != nil {
logrus . Errorf ( "Failed add IP alias %s to network %s LB endpoint interface %s: %v" , lb . vip , n . ID ( ) , ifName , err )
return
}
2016-05-31 06:55:51 +00:00
if sb . ingress {
2018-04-10 16:34:41 +00:00
var gwIP net . IP
if ep := sb . getGatewayEndpoint ( ) ; ep != nil {
gwIP = ep . Iface ( ) . Address ( ) . IP
}
2018-02-05 06:17:07 +00:00
if err := programIngress ( gwIP , lb . service . ingressPorts , false ) ; err != nil {
2016-05-31 06:55:51 +00:00
logrus . Errorf ( "Failed to add ingress: %v" , err )
return
}
}
2018-07-05 20:33:01 +00:00
logrus . Debugf ( "Creating service for vip %s fwMark %d ingressPorts %#v in sbox %.7s (%.7s)" , lb . vip , lb . fwMark , lb . service . ingressPorts , sb . ID ( ) , sb . ContainerID ( ) )
2022-10-31 22:04:50 +00:00
if err := sb . configureFWMark ( lb . vip , lb . fwMark , lb . service . ingressPorts , eIP , false , n . loadBalancerMode ) ; err != nil {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to add firewall mark rule in sbox %.7s (%.7s): %v" , sb . ID ( ) , sb . ContainerID ( ) , err )
2016-05-25 05:46:18 +00:00
return
}
2017-02-01 00:25:56 +00:00
if err := i . NewService ( s ) ; err != nil && err != syscall . EEXIST {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to create a new service for vip %s fwmark %d in sbox %.7s (%.7s): %v" , lb . vip , lb . fwMark , sb . ID ( ) , sb . ContainerID ( ) , err )
2016-05-25 05:46:18 +00:00
return
}
}
d := & ipvs . Destination {
AddressFamily : nl . FAMILY_V4 ,
Address : ip ,
Weight : 1 ,
}
2018-10-09 14:04:31 +00:00
if n . loadBalancerMode == loadBalancerModeDSR {
2018-09-07 13:48:05 +00:00
d . ConnectionFlags = ipvs . ConnFwdDirectRoute
}
2016-05-25 05:46:18 +00:00
// Remove the sched name before using the service to add
// destination.
s . SchedName = ""
2016-06-05 20:45:52 +00:00
if err := i . NewDestination ( s , d ) ; err != nil && err != syscall . EEXIST {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to create real server %s for vip %s fwmark %d in sbox %.7s (%.7s): %v" , ip , lb . vip , lb . fwMark , sb . ID ( ) , sb . ContainerID ( ) , err )
2016-05-25 05:46:18 +00:00
}
2022-05-31 15:44:34 +00:00
// Ensure that kernel tweaks are applied in case this is the first time
// we've initialized ip_vs
sb . osSbox . ApplyOSTweaks ( sb . oslTypes )
2016-05-25 05:46:18 +00:00
}
2018-04-10 16:34:41 +00:00
// Remove loadbalancer backend the load balancing endpoint for this
// network. If 'rmService' is true, then remove the service entry as well.
// If 'fullRemove' is true then completely remove the entry, otherwise
// just deweight it for now.
func ( n * network ) rmLBBackend ( ip net . IP , lb * loadBalancer , rmService bool , fullRemove bool ) {
if len ( lb . vip ) == 0 {
return
}
ep , sb , err := n . findLBEndpointSandbox ( )
if err != nil {
2018-06-20 19:54:25 +00:00
logrus . Debugf ( "rmLBBackend for %s/%s: %v -- probably transient state" , n . ID ( ) , n . Name ( ) , err )
2018-04-10 16:34:41 +00:00
return
}
2016-06-05 20:45:52 +00:00
if sb . osSbox == nil {
return
}
2016-08-04 21:20:54 +00:00
2018-04-10 16:34:41 +00:00
eIP := ep . Iface ( ) . Address ( )
2016-05-25 05:46:18 +00:00
i , err := ipvs . New ( sb . Key ( ) )
if err != nil {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to create an ipvs handle for sbox %.7s (%.7s,%s) for lb removal: %v" , sb . ID ( ) , sb . ContainerID ( ) , sb . Key ( ) , err )
2016-05-25 05:46:18 +00:00
return
}
defer i . Close ( )
s := & ipvs . Service {
AddressFamily : nl . FAMILY_V4 ,
2018-04-10 16:34:41 +00:00
FWMark : lb . fwMark ,
2016-05-25 05:46:18 +00:00
}
d := & ipvs . Destination {
AddressFamily : nl . FAMILY_V4 ,
Address : ip ,
Weight : 1 ,
}
2018-10-09 14:04:31 +00:00
if n . loadBalancerMode == loadBalancerModeDSR {
2018-09-07 13:48:05 +00:00
d . ConnectionFlags = ipvs . ConnFwdDirectRoute
}
2016-05-25 05:46:18 +00:00
Gracefully remove LB endpoints from services
This patch attempts to allow endpoints to complete servicing connections
while being removed from a service. The change adds a flag to the
endpoint.deleteServiceInfoFromCluster() method to indicate whether this
removal should fully remove connectivity through the load balancer
to the endpoint or should just disable directing further connections to
the endpoint. If the flag is 'false', then the load balancer assigns
a weight of 0 to the endpoint but does not remove it as a linux load
balancing destination. It does remove the endpoint as a docker load
balancing endpoint but tracks it in a special map of "disabled-but-not-
destroyed" load balancing endpoints. This allows traffic to continue
flowing, at least under Linux. If the flag is 'true', then the code
removes the endpoint entirely as a load balancing destination.
The sandbox.DisableService() method invokes deleteServiceInfoFromCluster()
with the flag sent to 'false', while the endpoint.sbLeave() method invokes
it with the flag set to 'true' to complete the removal on endpoint
finalization. Renaming the endpoint invokes deleteServiceInfoFromCluster()
with the flag set to 'true' because renaming attempts to completely
remove and then re-add each endpoint service entry.
The controller.rmServiceBinding() method, which carries out the operation,
similarly gets a new flag for whether to fully remove the endpoint. If
the flag is false, it does the job of moving the endpoint from the
load balancing set to the 'disabled' set. It then removes or
de-weights the entry in the OS load balancing table via
network.rmLBBackend(). It removes the service entirely via said method
ONLY IF there are no more live or disabled load balancing endpoints.
Similarly network.addLBBackend() requires slight tweaking to properly
manage the disabled set.
Finally, this change requires propagating the status of disabled
service endpoints via the networkDB. Accordingly, the patch includes
both code to generate and handle service update messages. It also
augments the service structure with a ServiceDisabled boolean to convey
whether an endpoint should ultimately be removed or just disabled.
This, naturally, required a rebuild of the protocol buffer code as well.
Signed-off-by: Chris Telfer <ctelfer@docker.com>
2018-02-14 22:04:23 +00:00
if fullRemove {
if err := i . DelDestination ( s , d ) ; err != nil && err != syscall . ENOENT {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to delete real server %s for vip %s fwmark %d in sbox %.7s (%.7s): %v" , ip , lb . vip , lb . fwMark , sb . ID ( ) , sb . ContainerID ( ) , err )
Gracefully remove LB endpoints from services
This patch attempts to allow endpoints to complete servicing connections
while being removed from a service. The change adds a flag to the
endpoint.deleteServiceInfoFromCluster() method to indicate whether this
removal should fully remove connectivity through the load balancer
to the endpoint or should just disable directing further connections to
the endpoint. If the flag is 'false', then the load balancer assigns
a weight of 0 to the endpoint but does not remove it as a linux load
balancing destination. It does remove the endpoint as a docker load
balancing endpoint but tracks it in a special map of "disabled-but-not-
destroyed" load balancing endpoints. This allows traffic to continue
flowing, at least under Linux. If the flag is 'true', then the code
removes the endpoint entirely as a load balancing destination.
The sandbox.DisableService() method invokes deleteServiceInfoFromCluster()
with the flag sent to 'false', while the endpoint.sbLeave() method invokes
it with the flag set to 'true' to complete the removal on endpoint
finalization. Renaming the endpoint invokes deleteServiceInfoFromCluster()
with the flag set to 'true' because renaming attempts to completely
remove and then re-add each endpoint service entry.
The controller.rmServiceBinding() method, which carries out the operation,
similarly gets a new flag for whether to fully remove the endpoint. If
the flag is false, it does the job of moving the endpoint from the
load balancing set to the 'disabled' set. It then removes or
de-weights the entry in the OS load balancing table via
network.rmLBBackend(). It removes the service entirely via said method
ONLY IF there are no more live or disabled load balancing endpoints.
Similarly network.addLBBackend() requires slight tweaking to properly
manage the disabled set.
Finally, this change requires propagating the status of disabled
service endpoints via the networkDB. Accordingly, the patch includes
both code to generate and handle service update messages. It also
augments the service structure with a ServiceDisabled boolean to convey
whether an endpoint should ultimately be removed or just disabled.
This, naturally, required a rebuild of the protocol buffer code as well.
Signed-off-by: Chris Telfer <ctelfer@docker.com>
2018-02-14 22:04:23 +00:00
}
} else {
d . Weight = 0
if err := i . UpdateDestination ( s , d ) ; err != nil && err != syscall . ENOENT {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to set LB weight of real server %s to 0 for vip %s fwmark %d in sbox %.7s (%.7s): %v" , ip , lb . vip , lb . fwMark , sb . ID ( ) , sb . ContainerID ( ) , err )
Gracefully remove LB endpoints from services
This patch attempts to allow endpoints to complete servicing connections
while being removed from a service. The change adds a flag to the
endpoint.deleteServiceInfoFromCluster() method to indicate whether this
removal should fully remove connectivity through the load balancer
to the endpoint or should just disable directing further connections to
the endpoint. If the flag is 'false', then the load balancer assigns
a weight of 0 to the endpoint but does not remove it as a linux load
balancing destination. It does remove the endpoint as a docker load
balancing endpoint but tracks it in a special map of "disabled-but-not-
destroyed" load balancing endpoints. This allows traffic to continue
flowing, at least under Linux. If the flag is 'true', then the code
removes the endpoint entirely as a load balancing destination.
The sandbox.DisableService() method invokes deleteServiceInfoFromCluster()
with the flag sent to 'false', while the endpoint.sbLeave() method invokes
it with the flag set to 'true' to complete the removal on endpoint
finalization. Renaming the endpoint invokes deleteServiceInfoFromCluster()
with the flag set to 'true' because renaming attempts to completely
remove and then re-add each endpoint service entry.
The controller.rmServiceBinding() method, which carries out the operation,
similarly gets a new flag for whether to fully remove the endpoint. If
the flag is false, it does the job of moving the endpoint from the
load balancing set to the 'disabled' set. It then removes or
de-weights the entry in the OS load balancing table via
network.rmLBBackend(). It removes the service entirely via said method
ONLY IF there are no more live or disabled load balancing endpoints.
Similarly network.addLBBackend() requires slight tweaking to properly
manage the disabled set.
Finally, this change requires propagating the status of disabled
service endpoints via the networkDB. Accordingly, the patch includes
both code to generate and handle service update messages. It also
augments the service structure with a ServiceDisabled boolean to convey
whether an endpoint should ultimately be removed or just disabled.
This, naturally, required a rebuild of the protocol buffer code as well.
Signed-off-by: Chris Telfer <ctelfer@docker.com>
2018-02-14 22:04:23 +00:00
}
2016-05-25 05:46:18 +00:00
}
if rmService {
s . SchedName = ipvs . RoundRobin
2017-02-01 23:10:24 +00:00
if err := i . DelService ( s ) ; err != nil && err != syscall . ENOENT {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to delete service for vip %s fwmark %d in sbox %.7s (%.7s): %v" , lb . vip , lb . fwMark , sb . ID ( ) , sb . ContainerID ( ) , err )
2016-05-25 05:46:18 +00:00
}
2016-05-31 06:55:51 +00:00
if sb . ingress {
2018-04-10 16:34:41 +00:00
var gwIP net . IP
if ep := sb . getGatewayEndpoint ( ) ; ep != nil {
gwIP = ep . Iface ( ) . Address ( ) . IP
}
2018-02-05 06:17:07 +00:00
if err := programIngress ( gwIP , lb . service . ingressPorts , true ) ; err != nil {
2016-05-31 06:55:51 +00:00
logrus . Errorf ( "Failed to delete ingress: %v" , err )
}
}
2022-10-31 22:04:50 +00:00
if err := sb . configureFWMark ( lb . vip , lb . fwMark , lb . service . ingressPorts , eIP , true , n . loadBalancerMode ) ; err != nil {
2018-07-05 20:33:01 +00:00
logrus . Errorf ( "Failed to delete firewall mark rule in sbox %.7s (%.7s): %v" , sb . ID ( ) , sb . ContainerID ( ) , err )
2016-05-25 05:46:18 +00:00
}
2018-04-10 16:34:41 +00:00
// Remove IP alias from the VIP to the endpoint
ifName := findIfaceDstName ( sb , ep )
if ifName == "" {
logrus . Errorf ( "Failed find interface name for endpoint %s(%s) to create LB alias" , ep . ID ( ) , ep . Name ( ) )
return
}
err := sb . osSbox . RemoveAliasIP ( ifName , & net . IPNet { IP : lb . vip , Mask : net . CIDRMask ( 32 , 32 ) } )
if err != nil {
logrus . Errorf ( "Failed add IP alias %s to network %s LB endpoint interface %s: %v" , lb . vip , n . ID ( ) , ifName , err )
}
2016-05-25 05:46:18 +00:00
}
}
2016-06-07 18:50:17 +00:00
const ingressChain = "DOCKER-INGRESS"
var (
ingressOnce sync . Once
2018-06-07 20:02:04 +00:00
ingressMu sync . Mutex // lock for operations on ingress
2016-06-07 18:50:17 +00:00
ingressProxyTbl = make ( map [ string ] io . Closer )
2016-06-24 23:37:14 +00:00
portConfigMu sync . Mutex
portConfigTbl = make ( map [ PortConfig ] int )
2016-06-07 18:50:17 +00:00
)
2016-06-24 23:37:14 +00:00
// filterPortConfigs reference-counts the given port configs in portConfigTbl
// and returns only those that actually need to be plumbed.
//
// When isDelete is false, a config is returned only the first time it is
// seen; later additions of an equal config just bump its count. When
// isDelete is true, a config is returned only when its last reference is
// dropped (and it is then removed from the table); configs that are not in
// the table are ignored.
//
// The returned slice holds the same pointers that were passed in.
func filterPortConfigs(ingressPorts []*PortConfig, isDelete bool) []*PortConfig {
	portConfigMu.Lock()
	// Deferred so the lock is released even if dereferencing an entry panics.
	defer portConfigMu.Unlock()

	iPorts := make([]*PortConfig, 0, len(ingressPorts))
	for _, pc := range ingressPorts {
		cnt, ok := portConfigTbl[*pc]

		if isDelete {
			switch {
			case !ok:
				// Unknown port config: nothing to release.
			case cnt == 1:
				// This is the last reference to this port config.
				// Delete the port config and add it to filtered list
				// to be plumbed.
				delete(portConfigTbl, *pc)
				iPorts = append(iPorts, pc)
			default:
				portConfigTbl[*pc] = cnt - 1
			}
			continue
		}

		if ok {
			portConfigTbl[*pc] = cnt + 1
			continue
		}

		// We are adding it for the first time. Add it to the
		// filter list to be plumbed.
		portConfigTbl[*pc] = 1
		iPorts = append(iPorts, pc)
	}
	return iPorts
}
2016-05-31 06:55:51 +00:00
func programIngress ( gwIP net . IP , ingressPorts [ ] * PortConfig , isDelete bool ) error {
2020-07-23 14:52:40 +00:00
// TODO IPv6 support
2017-11-28 21:15:55 +00:00
iptable := iptables . GetIptable ( iptables . IPv4 )
2016-06-07 18:50:17 +00:00
addDelOpt := "-I"
2018-02-05 06:17:07 +00:00
rollbackAddDelOpt := "-D"
2016-05-31 06:55:51 +00:00
if isDelete {
addDelOpt = "-D"
2018-02-05 06:17:07 +00:00
rollbackAddDelOpt = "-I"
2016-05-31 06:55:51 +00:00
}
2018-06-07 20:02:04 +00:00
ingressMu . Lock ( )
defer ingressMu . Unlock ( )
2017-11-28 21:15:55 +00:00
chainExists := iptable . ExistChain ( ingressChain , iptables . Nat )
filterChainExists := iptable . ExistChain ( ingressChain , iptables . Filter )
2016-06-07 18:50:17 +00:00
ingressOnce . Do ( func ( ) {
2016-08-15 17:54:18 +00:00
// Flush nat table and filter table ingress chain rules during init if it
// exists. It might contain stale rules from previous life.
2016-06-07 18:50:17 +00:00
if chainExists {
2017-11-28 21:15:55 +00:00
if err := iptable . RawCombinedOutput ( "-t" , "nat" , "-F" , ingressChain ) ; err != nil {
2016-08-15 17:54:18 +00:00
logrus . Errorf ( "Could not flush nat table ingress chain rules during init: %v" , err )
}
}
if filterChainExists {
2017-11-28 21:15:55 +00:00
if err := iptable . RawCombinedOutput ( "-F" , ingressChain ) ; err != nil {
2016-08-15 17:54:18 +00:00
logrus . Errorf ( "Could not flush filter table ingress chain rules during init: %v" , err )
2016-06-07 18:50:17 +00:00
}
}
} )
if ! isDelete {
if ! chainExists {
2017-11-28 21:15:55 +00:00
if err := iptable . RawCombinedOutput ( "-t" , "nat" , "-N" , ingressChain ) ; err != nil {
2016-06-07 18:50:17 +00:00
return fmt . Errorf ( "failed to create ingress chain: %v" , err )
}
}
2016-08-15 17:54:18 +00:00
if ! filterChainExists {
2017-11-28 21:15:55 +00:00
if err := iptable . RawCombinedOutput ( "-N" , ingressChain ) ; err != nil {
2016-08-15 17:54:18 +00:00
return fmt . Errorf ( "failed to create filter table ingress chain: %v" , err )
}
}
2016-06-07 18:50:17 +00:00
2017-11-28 21:15:55 +00:00
if ! iptable . Exists ( iptables . Nat , ingressChain , "-j" , "RETURN" ) {
if err := iptable . RawCombinedOutput ( "-t" , "nat" , "-A" , ingressChain , "-j" , "RETURN" ) ; err != nil {
2016-08-15 17:54:18 +00:00
return fmt . Errorf ( "failed to add return rule in nat table ingress chain: %v" , err )
}
}
2017-11-28 21:15:55 +00:00
if ! iptable . Exists ( iptables . Filter , ingressChain , "-j" , "RETURN" ) {
if err := iptable . RawCombinedOutput ( "-A" , ingressChain , "-j" , "RETURN" ) ; err != nil {
2016-08-15 17:54:18 +00:00
return fmt . Errorf ( "failed to add return rule to filter table ingress chain: %v" , err )
2016-06-07 18:50:17 +00:00
}
}
for _ , chain := range [ ] string { "OUTPUT" , "PREROUTING" } {
2017-11-28 21:15:55 +00:00
if ! iptable . Exists ( iptables . Nat , chain , "-m" , "addrtype" , "--dst-type" , "LOCAL" , "-j" , ingressChain ) {
if err := iptable . RawCombinedOutput ( "-t" , "nat" , "-I" , chain , "-m" , "addrtype" , "--dst-type" , "LOCAL" , "-j" , ingressChain ) ; err != nil {
2016-06-07 18:50:17 +00:00
return fmt . Errorf ( "failed to add jump rule in %s to ingress chain: %v" , chain , err )
}
}
}
2016-06-13 00:28:28 +00:00
2017-11-28 21:15:55 +00:00
if ! iptable . Exists ( iptables . Filter , "FORWARD" , "-j" , ingressChain ) {
if err := iptable . RawCombinedOutput ( "-I" , "FORWARD" , "-j" , ingressChain ) ; err != nil {
2016-08-15 17:54:18 +00:00
return fmt . Errorf ( "failed to add jump rule to %s in filter table forward chain: %v" , ingressChain , err )
}
2017-07-20 07:51:32 +00:00
arrangeUserFilterRule ( )
2016-08-15 17:54:18 +00:00
}
2016-06-13 00:28:28 +00:00
oifName , err := findOIFName ( gwIP )
if err != nil {
return fmt . Errorf ( "failed to find gateway bridge interface name for %s: %v" , gwIP , err )
}
path := filepath . Join ( "/proc/sys/net/ipv4/conf" , oifName , "route_localnet" )
2022-10-31 22:04:50 +00:00
if err := os . WriteFile ( path , [ ] byte { '1' , '\n' } , 0 o644 ) ; err != nil { //nolint:gosec // gosec complains about perms here, which must be 0644 in this case
2016-06-13 00:28:28 +00:00
return fmt . Errorf ( "could not write to %s: %v" , path , err )
}
2022-04-20 12:43:07 +00:00
ruleArgs := [ ] string { "-m" , "addrtype" , "--src-type" , "LOCAL" , "-o" , oifName , "-j" , "MASQUERADE" }
2017-11-28 21:15:55 +00:00
if ! iptable . Exists ( iptables . Nat , "POSTROUTING" , ruleArgs ... ) {
if err := iptable . RawCombinedOutput ( append ( [ ] string { "-t" , "nat" , "-I" , "POSTROUTING" } , ruleArgs ... ) ... ) ; err != nil {
2016-06-13 00:28:28 +00:00
return fmt . Errorf ( "failed to add ingress localhost POSTROUTING rule for %s: %v" , oifName , err )
}
}
2016-06-07 18:50:17 +00:00
}
2022-04-20 12:43:07 +00:00
// Filter the ingress ports until port rules start to be added/deleted
2018-02-05 06:17:07 +00:00
filteredPorts := filterPortConfigs ( ingressPorts , isDelete )
rollbackRules := make ( [ ] [ ] string , 0 , len ( filteredPorts ) * 3 )
var portErr error
defer func ( ) {
if portErr != nil && ! isDelete {
filterPortConfigs ( filteredPorts , ! isDelete )
for _ , rule := range rollbackRules {
2017-11-28 21:15:55 +00:00
if err := iptable . RawCombinedOutput ( rule ... ) ; err != nil {
2018-02-05 06:17:07 +00:00
logrus . Warnf ( "roll back rule failed, %v: %v" , rule , err )
}
}
}
} ( )
for _ , iPort := range filteredPorts {
2022-04-20 12:43:07 +00:00
var (
protocol = strings . ToLower ( PortConfig_Protocol_name [ int32 ( iPort . Protocol ) ] )
publishedPort = strconv . FormatUint ( uint64 ( iPort . PublishedPort ) , 10 )
destination = net . JoinHostPort ( gwIP . String ( ) , publishedPort )
)
2017-11-28 21:15:55 +00:00
if iptable . ExistChain ( ingressChain , iptables . Nat ) {
2022-04-20 12:43:07 +00:00
rule := [ ] string { "-t" , "nat" , addDelOpt , ingressChain , "-p" , protocol , "--dport" , publishedPort , "-j" , "DNAT" , "--to-destination" , destination }
2017-11-28 21:15:55 +00:00
if portErr = iptable . RawCombinedOutput ( rule ... ) ; portErr != nil {
2022-04-20 12:43:07 +00:00
err := fmt . Errorf ( "set up rule failed, %v: %v" , rule , portErr )
2016-06-15 21:00:48 +00:00
if ! isDelete {
2022-04-20 12:43:07 +00:00
return err
2016-06-15 21:00:48 +00:00
}
2022-04-20 12:43:07 +00:00
logrus . Info ( err )
2016-06-07 18:50:17 +00:00
}
2022-04-20 12:43:07 +00:00
rollbackRule := [ ] string { "-t" , "nat" , rollbackAddDelOpt , ingressChain , "-p" , protocol , "--dport" , publishedPort , "-j" , "DNAT" , "--to-destination" , destination }
2018-02-05 06:17:07 +00:00
rollbackRules = append ( rollbackRules , rollbackRule )
2016-06-07 18:50:17 +00:00
}
2016-08-15 17:54:18 +00:00
// Filter table rules to allow a published service to be accessible in the local node from..
// 1) service tasks attached to other networks
// 2) unmanaged containers on bridge networks
2022-04-20 12:43:07 +00:00
rule := [ ] string { addDelOpt , ingressChain , "-m" , "state" , "-p" , protocol , "--sport" , publishedPort , "--state" , "ESTABLISHED,RELATED" , "-j" , "ACCEPT" }
2017-11-28 21:15:55 +00:00
if portErr = iptable . RawCombinedOutput ( rule ... ) ; portErr != nil {
2022-04-20 12:43:07 +00:00
err := fmt . Errorf ( "set up rule failed, %v: %v" , rule , portErr )
2016-08-15 17:54:18 +00:00
if ! isDelete {
2022-04-20 12:43:07 +00:00
return err
2016-08-15 17:54:18 +00:00
}
2022-04-20 12:43:07 +00:00
logrus . Warn ( err )
2016-08-15 17:54:18 +00:00
}
2022-04-20 12:43:07 +00:00
rollbackRule := [ ] string { rollbackAddDelOpt , ingressChain , "-m" , "state" , "-p" , protocol , "--sport" , publishedPort , "--state" , "ESTABLISHED,RELATED" , "-j" , "ACCEPT" }
2018-02-05 06:17:07 +00:00
rollbackRules = append ( rollbackRules , rollbackRule )
2016-08-15 17:54:18 +00:00
2022-04-20 12:43:07 +00:00
rule = [ ] string { addDelOpt , ingressChain , "-p" , protocol , "--dport" , publishedPort , "-j" , "ACCEPT" }
2017-11-28 21:15:55 +00:00
if portErr = iptable . RawCombinedOutput ( rule ... ) ; portErr != nil {
2022-04-20 12:43:07 +00:00
err := fmt . Errorf ( "set up rule failed, %v: %v" , rule , portErr )
2016-08-15 17:54:18 +00:00
if ! isDelete {
2022-04-20 12:43:07 +00:00
return err
2016-08-15 17:54:18 +00:00
}
2022-04-20 12:43:07 +00:00
logrus . Warn ( err )
2016-08-15 17:54:18 +00:00
}
2022-04-20 12:43:07 +00:00
rollbackRule = [ ] string { rollbackAddDelOpt , ingressChain , "-p" , protocol , "--dport" , publishedPort , "-j" , "ACCEPT" }
2018-02-05 06:17:07 +00:00
rollbackRules = append ( rollbackRules , rollbackRule )
2016-08-15 17:54:18 +00:00
2016-06-07 18:50:17 +00:00
if err := plumbProxy ( iPort , isDelete ) ; err != nil {
2022-04-20 12:43:07 +00:00
logrus . Warnf ( "failed to create proxy for port %s: %v" , publishedPort , err )
2016-05-31 06:55:51 +00:00
}
}
return nil
}
2017-07-20 07:51:32 +00:00
// In the filter table FORWARD chain the first rule should be to jump to
// DOCKER-USER so the user is able to filter packet first.
// The second rule should be jump to INGRESS-CHAIN.
2016-08-15 17:54:18 +00:00
// This chain has the rules to allow access to the published ports for swarm tasks
2017-05-22 02:25:52 +00:00
// from local bridge networks and docker_gwbridge (ie:taks on other swarm networks)
2016-08-15 17:54:18 +00:00
func arrangeIngressFilterRule ( ) {
2020-07-23 14:52:40 +00:00
// TODO IPv6 support
2017-11-28 21:15:55 +00:00
iptable := iptables . GetIptable ( iptables . IPv4 )
if iptable . ExistChain ( ingressChain , iptables . Filter ) {
if iptable . Exists ( iptables . Filter , "FORWARD" , "-j" , ingressChain ) {
if err := iptable . RawCombinedOutput ( "-D" , "FORWARD" , "-j" , ingressChain ) ; err != nil {
2016-08-15 17:54:18 +00:00
logrus . Warnf ( "failed to delete jump rule to ingressChain in filter table: %v" , err )
}
}
2017-11-28 21:15:55 +00:00
if err := iptable . RawCombinedOutput ( "-I" , "FORWARD" , "-j" , ingressChain ) ; err != nil {
2016-08-15 17:54:18 +00:00
logrus . Warnf ( "failed to add jump rule to ingressChain in filter table: %v" , err )
}
}
}
2016-06-13 00:28:28 +00:00
// findOIFName returns the name of the link used by the first route that the
// netlink handle reports for ip. It returns an error when the route lookup
// fails, when there is no route, or when the link cannot be resolved by index.
func findOIFName(ip net.IP) (string, error) {
	handle := ns.NlHandle()

	routes, err := handle.RouteGet(ip)
	switch {
	case err != nil:
		return "", err
	case len(routes) == 0:
		return "", fmt.Errorf("no route to %s", ip)
	}

	// Pick the first route(typically there is only one route). We
	// don't support multipath.
	first := routes[0]
	link, err := handle.LinkByIndex(first.LinkIndex)
	if err != nil {
		return "", err
	}
	return link.Attrs().Name, nil
}
2016-06-07 18:50:17 +00:00
func plumbProxy ( iPort * PortConfig , isDelete bool ) error {
var (
err error
l io . Closer
)
2016-06-13 21:11:18 +00:00
portSpec := fmt . Sprintf ( "%d/%s" , iPort . PublishedPort , strings . ToLower ( PortConfig_Protocol_name [ int32 ( iPort . Protocol ) ] ) )
2016-06-07 18:50:17 +00:00
if isDelete {
if listener , ok := ingressProxyTbl [ portSpec ] ; ok {
if listener != nil {
listener . Close ( )
}
}
return nil
}
switch iPort . Protocol {
case ProtocolTCP :
2016-06-13 21:11:18 +00:00
l , err = net . ListenTCP ( "tcp" , & net . TCPAddr { Port : int ( iPort . PublishedPort ) } )
2016-06-07 18:50:17 +00:00
case ProtocolUDP :
2016-06-13 21:11:18 +00:00
l , err = net . ListenUDP ( "udp" , & net . UDPAddr { Port : int ( iPort . PublishedPort ) } )
2017-06-13 05:29:56 +00:00
case ProtocolSCTP :
l , err = sctp . ListenSCTP ( "sctp" , & sctp . SCTPAddr { Port : int ( iPort . PublishedPort ) } )
default :
err = fmt . Errorf ( "unknown protocol %v" , iPort . Protocol )
2016-06-07 18:50:17 +00:00
}
if err != nil {
return err
}
ingressProxyTbl [ portSpec ] = l
return nil
}
2022-10-31 22:04:50 +00:00
// configureFWMark configures the sandbox firewall to mark vip destined packets
// with the firewall mark fwMark.
2023-01-12 01:10:09 +00:00
func ( sb * Sandbox ) configureFWMark ( vip net . IP , fwMark uint32 , ingressPorts [ ] * PortConfig , eIP * net . IPNet , isDelete bool , lbMode string ) error {
2022-10-31 22:04:50 +00:00
// TODO IPv6 support
iptable := iptables . GetIptable ( iptables . IPv4 )
2016-05-31 06:55:51 +00:00
2022-10-31 22:04:50 +00:00
fwMarkStr := strconv . FormatUint ( uint64 ( fwMark ) , 10 )
2016-05-25 05:46:18 +00:00
addDelOpt := "-A"
if isDelete {
addDelOpt = "-D"
}
2022-04-20 12:43:07 +00:00
rules := make ( [ ] [ ] string , 0 , len ( ingressPorts ) )
2016-09-07 17:10:00 +00:00
for _ , iPort := range ingressPorts {
2022-04-20 12:43:07 +00:00
var (
protocol = strings . ToLower ( PortConfig_Protocol_name [ int32 ( iPort . Protocol ) ] )
publishedPort = strconv . FormatUint ( uint64 ( iPort . PublishedPort ) , 10 )
)
2022-10-31 22:04:50 +00:00
rule := [ ] string { "-t" , "mangle" , addDelOpt , "PREROUTING" , "-p" , protocol , "--dport" , publishedPort , "-j" , "MARK" , "--set-mark" , fwMarkStr }
2016-05-31 06:55:51 +00:00
rules = append ( rules , rule )
}
2022-10-31 22:04:50 +00:00
var innerErr error
err := sb . ExecFunc ( func ( ) {
if ! isDelete && lbMode == loadBalancerModeNAT {
subnet := net . IPNet { IP : eIP . IP . Mask ( eIP . Mask ) , Mask : eIP . Mask }
ruleParams := [ ] string { "-m" , "ipvs" , "--ipvs" , "-d" , subnet . String ( ) , "-j" , "SNAT" , "--to-source" , eIP . IP . String ( ) }
if ! iptable . Exists ( "nat" , "POSTROUTING" , ruleParams ... ) {
rule := append ( [ ] string { "-t" , "nat" , "-A" , "POSTROUTING" } , ruleParams ... )
rules = append ( rules , rule )
2016-05-25 05:46:18 +00:00
2022-10-31 22:04:50 +00:00
err := os . WriteFile ( "/proc/sys/net/ipv4/vs/conntrack" , [ ] byte { '1' , '\n' } , 0644 )
if err != nil {
innerErr = err
return
}
2016-05-31 06:55:51 +00:00
}
}
2022-10-31 22:04:50 +00:00
rule := [ ] string { "-t" , "mangle" , addDelOpt , "INPUT" , "-d" , vip . String ( ) + "/32" , "-j" , "MARK" , "--set-mark" , fwMarkStr }
rules = append ( rules , rule )
2016-11-21 00:54:32 +00:00
2022-10-31 22:04:50 +00:00
for _ , rule := range rules {
if err := iptable . RawCombinedOutputNative ( rule ... ) ; err != nil {
innerErr = fmt . Errorf ( "set up rule failed, %v: %w" , rule , err )
return
}
2016-05-31 06:55:51 +00:00
}
2022-10-31 22:04:50 +00:00
} )
if err != nil {
return err
2016-05-25 05:46:18 +00:00
}
2022-10-31 22:04:50 +00:00
return innerErr
2016-05-25 05:46:18 +00:00
}
2016-09-21 19:15:14 +00:00
2023-01-12 01:10:09 +00:00
func ( sb * Sandbox ) addRedirectRules ( eIP * net . IPNet , ingressPorts [ ] * PortConfig ) error {
2020-07-23 14:52:40 +00:00
// TODO IPv6 support
2017-11-28 21:15:55 +00:00
iptable := iptables . GetIptable ( iptables . IPv4 )
2022-11-01 16:26:33 +00:00
ipAddr := eIP . IP . String ( )
2016-09-21 19:15:14 +00:00
2022-04-20 12:43:07 +00:00
rules := make ( [ ] [ ] string , 0 , len ( ingressPorts ) * 3 ) // 3 rules per port
2016-09-21 19:15:14 +00:00
for _ , iPort := range ingressPorts {
2022-04-20 12:43:07 +00:00
var (
protocol = strings . ToLower ( PortConfig_Protocol_name [ int32 ( iPort . Protocol ) ] )
publishedPort = strconv . FormatUint ( uint64 ( iPort . PublishedPort ) , 10 )
targetPort = strconv . FormatUint ( uint64 ( iPort . TargetPort ) , 10 )
)
rules = append ( rules ,
[ ] string { "-t" , "nat" , "-A" , "PREROUTING" , "-d" , ipAddr , "-p" , protocol , "--dport" , publishedPort , "-j" , "REDIRECT" , "--to-port" , targetPort } ,
// Allow only incoming connections to exposed ports
[ ] string { "-I" , "INPUT" , "-d" , ipAddr , "-p" , protocol , "--dport" , targetPort , "-m" , "conntrack" , "--ctstate" , "NEW,ESTABLISHED" , "-j" , "ACCEPT" } ,
// Allow only outgoing connections from exposed ports
[ ] string { "-I" , "OUTPUT" , "-s" , ipAddr , "-p" , protocol , "--sport" , targetPort , "-m" , "conntrack" , "--ctstate" , "ESTABLISHED" , "-j" , "ACCEPT" } ,
)
2016-09-21 19:15:14 +00:00
}
2022-11-01 16:26:33 +00:00
var innerErr error
err := sb . ExecFunc ( func ( ) {
for _ , rule := range rules {
if err := iptable . RawCombinedOutputNative ( rule ... ) ; err != nil {
innerErr = fmt . Errorf ( "set up rule failed, %v: %w" , rule , err )
return
}
2016-10-05 07:04:05 +00:00
}
2022-11-01 16:26:33 +00:00
if len ( ingressPorts ) == 0 {
return
}
2016-10-05 07:04:05 +00:00
2022-11-01 16:26:33 +00:00
// Ensure blocking rules for anything else in/to ingress network
for _ , rule := range [ ] [ ] string {
{ "-d" , ipAddr , "-p" , "sctp" , "-j" , "DROP" } ,
{ "-d" , ipAddr , "-p" , "udp" , "-j" , "DROP" } ,
{ "-d" , ipAddr , "-p" , "tcp" , "-j" , "DROP" } ,
} {
if ! iptable . ExistsNative ( iptables . Filter , "INPUT" , rule ... ) {
if err := iptable . RawCombinedOutputNative ( append ( [ ] string { "-A" , "INPUT" } , rule ... ) ... ) ; err != nil {
innerErr = fmt . Errorf ( "set up rule failed, %v: %w" , rule , err )
return
}
2016-10-05 07:04:05 +00:00
}
2022-11-01 16:26:33 +00:00
rule [ 0 ] = "-s"
if ! iptable . ExistsNative ( iptables . Filter , "OUTPUT" , rule ... ) {
if err := iptable . RawCombinedOutputNative ( append ( [ ] string { "-A" , "OUTPUT" } , rule ... ) ... ) ; err != nil {
innerErr = fmt . Errorf ( "set up rule failed, %v: %w" , rule , err )
return
}
2016-10-05 07:04:05 +00:00
}
2016-09-21 19:15:14 +00:00
}
2022-11-01 16:26:33 +00:00
} )
if err != nil {
return err
2016-09-21 19:15:14 +00:00
}
2022-11-01 16:26:33 +00:00
return innerErr
2016-09-21 19:15:14 +00:00
}