From 441a3f9592ba60712099a49e14725ac3240b4db0 Mon Sep 17 00:00:00 2001 From: Santhosh Manohar Date: Sat, 10 Jun 2017 16:51:58 -0700 Subject: [PATCH 1/2] Cleanup interfaces properly when vxlan plumbling fails Signed-off-by: Santhosh Manohar Signed-off-by: Chris Telfer --- drivers/overlay/ov_network.go | 20 ++++++++++++++++++++ osl/interface_linux.go | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index 4dda2801fb..628a706933 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -588,6 +588,23 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error if err := sbox.AddInterface(vxlanName, "vxlan", sbox.InterfaceOptions().Master(brName)); err != nil { + // If adding vxlan device to the overlay namespace fails, remove the bridge interface we + // already added to the namespace. This allows the caller to try the setup again. + for _, iface := range sbox.Info().Interfaces() { + if iface.SrcName() == brName { + if ierr := iface.Remove(); ierr != nil { + logrus.Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr) + } + } + } + + // Also, delete the vxlan interface. Since a global vni id is associated + // with the vxlan interface, an orphaned vxlan interface will result in + // failure of vxlan device creation if the vni is assigned to some other + // network. + if deleteErr := deleteInterface(vxlanName); deleteErr != nil { + logrus.Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err) + } return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err) } @@ -629,6 +646,9 @@ func (n *network) initSubnetSandbox(s *subnet, restore bool) error { } } else { if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil { + // The error in setupSubnetSandbox could be a temporary glitch. reset the + // subnet once object to allow the setup to be retried on another endpoint join. + s.once = &sync.Once{} return err } } diff --git a/osl/interface_linux.go b/osl/interface_linux.go index 0ecda09f6e..a924af4bdf 100644 --- a/osl/interface_linux.go +++ b/osl/interface_linux.go @@ -289,6 +289,16 @@ func (n *networkNamespace) AddInterface(srcName, dstPrefix string, options ...If // Configure the interface now this is moved in the proper namespace. if err := configureInterface(nlh, iface, i); err != nil { + // If configuring the device fails move it back to the host namespace + // and change the name back to the source name. This allows the caller + // to properly cleanup the interface. Its important especially for + // interfaces with global attributes, ex: vni id for vxlan interfaces. + if nerr := nlh.LinkSetName(iface, i.SrcName()); nerr != nil { + logrus.Errorf("renaming interface (%s->%s) failed, %v after config error %v", i.DstName(), i.SrcName(), nerr, err) + } + if nerr := nlh.LinkSetNsFd(iface, ns.ParseHandlerInt()); nerr != nil { + logrus.Errorf("moving inteface %s to host ns failed, %v, after config error %v", i.SrcName(), nerr, err) + } return err } From 5d113d19f93b14b7b6a2b2d1f2fc1a2c1b6f9b0d Mon Sep 17 00:00:00 2001 From: Chris Telfer Date: Fri, 4 May 2018 14:33:00 -0400 Subject: [PATCH 2/2] Refactor locking for join/leave to avoid race Instead of using "sync.Once" to determine whether to initialize a network sandbox or subnet sandbox, we use a traditional mutex + initialization boolean. This is because the initialization state isn't truly a once-and-done condition. Rather, libnetwork destroys network and subnet sandboxes when the last endpoint leaves them. The use of sync.Once in this kind of scenario requires, therefore, re-initializing the Once which is impoissible. So the approach that libnetwork currently takes is to use a pointer to a Once and redirect that pointer to a new Once on reset. This leads to nasty race conditions. In addition to refactoring the locking, this patch merges the functions joinSandbox(), and joinSubnetSandbox(). This makes the code both cleaner and it also holds the network and subnet locks through the series of read-modify-writes avoiding further potential races. This does reduce the potential parallelism which could be applied should there be many joins coming in on many different subnets in the same overlay network. However, this should be an extremely minor performance hit for a very obscure case. One important pattern in this commit is that it is crucial to avoid sending peerDB messages while holding a driver or network lock. The changes herein defer such (asynchronous) notifications until after release of such locks. This prevents deadlocks where the peerDB blocks acquiring said locks while the network method blocks trying to send to the peerDB's channel. Signed-off-by: Chris Telfer --- drivers/overlay/joinleave.go | 10 +-- drivers/overlay/ov_network.go | 157 +++++++++++++++++----------------- drivers/overlay/overlay.go | 19 +--- drivers/overlay/peerdb.go | 2 +- 4 files changed, 82 insertions(+), 106 deletions(-) diff --git a/drivers/overlay/joinleave.go b/drivers/overlay/joinleave.go index 985d997784..a51bcd8985 100644 --- a/drivers/overlay/joinleave.go +++ b/drivers/overlay/joinleave.go @@ -47,18 +47,10 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err) } - if err := n.joinSandbox(false); err != nil { + if err := n.joinSandbox(s, false, true); err != nil { return fmt.Errorf("network sandbox join failed: %v", err) } - if err := n.joinSubnetSandbox(s, false); err != nil { - return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err) - } - - // joinSubnetSandbox gets called when an endpoint comes up on a new subnet in the - // overlay network. Hence the Endpoint count should be updated outside joinSubnetSandbox - n.incEndpointCount() - sbox := n.sandbox() overlayIfName, containerIfName, err := createVethPair() diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index 628a706933..cf32e45951 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -39,7 +39,7 @@ var ( type networkTable map[string]*network type subnet struct { - once *sync.Once + sboxInit bool vxlanName string brName string vni uint32 @@ -63,7 +63,7 @@ type network struct { endpoints endpointTable driver *driver joinCnt int - once *sync.Once + sboxInit bool initEpoch int initErr error subnets []*subnet @@ -150,7 +150,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d id: id, driver: d, endpoints: endpointTable{}, - once: &sync.Once{}, subnets: []*subnet{}, } @@ -193,7 +192,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d s := &subnet{ subnetIP: ipd.Pool, gwIP: ipd.Gateway, - once: &sync.Once{}, } if len(vnis) != 0 { @@ -277,7 +275,7 @@ func (d *driver) DeleteNetwork(nid string) error { logrus.Warnf("Failed to delete overlay endpoint %.7s from local store: %v", ep.id, err) } } - // flush the peerDB entries + doPeerFlush = true delete(d.networks, nid) @@ -304,29 +302,54 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error { return nil } -func (n *network) incEndpointCount() { - n.Lock() - defer n.Unlock() - n.joinCnt++ -} - -func (n *network) joinSandbox(restore bool) error { +func (n *network) joinSandbox(s *subnet, restore bool, incJoinCount bool) error { // If there is a race between two go routines here only one will win // the other will wait. - n.once.Do(func() { - // save the error status of initSandbox in n.initErr so that - // all the racing go routines are able to know the status. + networkOnce.Do(networkOnceInit) + + n.Lock() + // If non-restore initialization occurred and was successful then + // tell the peerDB to initialize the sandbox with all the peers + // previously received from networkdb. But only do this after + // unlocking the network. Otherwise we could deadlock with + // on the peerDB channel while peerDB is waiting for the network lock. + var doInitPeerDB bool + defer func() { + n.Unlock() + if doInitPeerDB { + n.driver.initSandboxPeerDB(n.id) + } + }() + + if !n.sboxInit { n.initErr = n.initSandbox(restore) - }) + doInitPeerDB = n.initErr == nil && !restore + // If there was an error, we cannot recover it + n.sboxInit = true + } - return n.initErr -} + if n.initErr != nil { + return fmt.Errorf("network sandbox join failed: %v", n.initErr) + } -func (n *network) joinSubnetSandbox(s *subnet, restore bool) error { - s.once.Do(func() { - s.initErr = n.initSubnetSandbox(s, restore) - }) - return s.initErr + subnetErr := s.initErr + if !s.sboxInit { + subnetErr = n.initSubnetSandbox(s, restore) + // We can recover from these errors, but not on restore + if restore || subnetErr == nil { + s.initErr = subnetErr + s.sboxInit = true + } + } + if subnetErr != nil { + return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr) + } + + if incJoinCount { + n.joinCnt++ + } + + return nil } func (n *network) leaveSandbox() { @@ -337,15 +360,14 @@ func (n *network) leaveSandbox() { return } - // We are about to destroy sandbox since the container is leaving the network - // Reinitialize the once variable so that we will be able to trigger one time - // sandbox initialization(again) when another container joins subsequently. - n.once = &sync.Once{} + n.destroySandbox() + + n.sboxInit = false + n.initErr = nil for _, s := range n.subnets { - s.once = &sync.Once{} + s.sboxInit = false + s.initErr = nil } - - n.destroySandbox() } // to be called while holding network lock @@ -478,7 +500,7 @@ func (n *network) generateVxlanName(s *subnet) string { id = n.id[:5] } - return "vx-" + fmt.Sprintf("%06x", n.vxlanID(s)) + "-" + id + return fmt.Sprintf("vx-%06x-%v", s.vni, id) } func (n *network) generateBridgeName(s *subnet) string { @@ -491,7 +513,7 @@ func (n *network) generateBridgeName(s *subnet) string { } func (n *network) getBridgeNamePrefix(s *subnet) string { - return "ov-" + fmt.Sprintf("%06x", n.vxlanID(s)) + return fmt.Sprintf("ov-%06x", s.vni) } func checkOverlap(nw *net.IPNet) error { @@ -513,7 +535,7 @@ func checkOverlap(nw *net.IPNet) error { } func (n *network) restoreSubnetSandbox(s *subnet, brName, vxlanName string) error { - sbox := n.sandbox() + sbox := n.sbox // restore overlay osl sandbox Ifaces := make(map[string][]osl.IfaceOption) @@ -542,7 +564,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error deleteInterfaceBySubnet(n.getBridgeNamePrefix(s), s) } // Try to delete the vxlan interface by vni if already present - deleteVxlanByVNI("", n.vxlanID(s)) + deleteVxlanByVNI("", s.vni) if err := checkOverlap(s.subnetIP); err != nil { return err @@ -556,24 +578,24 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error // it must a stale namespace from previous // life. Destroy it completely and reclaim resourced. networkMu.Lock() - path, ok := vniTbl[n.vxlanID(s)] + path, ok := vniTbl[s.vni] networkMu.Unlock() if ok { - deleteVxlanByVNI(path, n.vxlanID(s)) + deleteVxlanByVNI(path, s.vni) if err := syscall.Unmount(path, syscall.MNT_FORCE); err != nil { logrus.Errorf("unmount of %s failed: %v", path, err) } os.Remove(path) networkMu.Lock() - delete(vniTbl, n.vxlanID(s)) + delete(vniTbl, s.vni) networkMu.Unlock() } } // create a bridge and vxlan device for this subnet and move it to the sandbox - sbox := n.sandbox() + sbox := n.sbox if err := sbox.AddInterface(brName, "br", sbox.InterfaceOptions().Address(s.gwIP), @@ -581,7 +603,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err) } - err := createVxlan(vxlanName, n.vxlanID(s), n.maxMTU()) + err := createVxlan(vxlanName, s.vni, n.maxMTU()) if err != nil { return err } @@ -636,6 +658,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error return nil } +// Must be called with the network lock func (n *network) initSubnetSandbox(s *subnet, restore bool) error { brName := n.generateBridgeName(s) vxlanName := n.generateVxlanName(s) @@ -646,17 +669,12 @@ func (n *network) initSubnetSandbox(s *subnet, restore bool) error { } } else { if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil { - // The error in setupSubnetSandbox could be a temporary glitch. reset the - // subnet once object to allow the setup to be retried on another endpoint join. - s.once = &sync.Once{} return err } } - n.Lock() s.vxlanName = vxlanName s.brName = brName - n.Unlock() return nil } @@ -697,11 +715,7 @@ func (n *network) cleanupStaleSandboxes() { } func (n *network) initSandbox(restore bool) error { - n.Lock() n.initEpoch++ - n.Unlock() - - networkOnce.Do(networkOnceInit) if !restore { if hostMode { @@ -731,12 +745,7 @@ func (n *network) initSandbox(restore bool) error { } // this is needed to let the peerAdd configure the sandbox - n.setSandbox(sbox) - - if !restore { - // Initialize the sandbox with all the peers previously received from networkdb - n.driver.initSandboxPeerDB(n.id) - } + n.sbox = sbox // If we are in swarm mode, we don't need anymore the watchMiss routine. // This will save 1 thread and 1 netlink socket per network @@ -754,7 +763,7 @@ func (n *network) initSandbox(restore bool) error { tv := syscall.NsecToTimeval(soTimeout.Nanoseconds()) err = nlSock.SetReceiveTimeout(&tv) }) - n.setNetlinkSocket(nlSock) + n.nlSocket = nlSock if err == nil { go n.watchMiss(nlSock, key) @@ -856,7 +865,6 @@ func (d *driver) restoreNetworkFromStore(nid string) *network { if n != nil { n.driver = d n.endpoints = endpointTable{} - n.once = &sync.Once{} d.networks[nid] = n } return n @@ -864,11 +872,11 @@ func (d *driver) restoreNetworkFromStore(nid string) *network { func (d *driver) network(nid string) *network { d.Lock() - defer d.Unlock() n, ok := d.networks[nid] if !ok { n = d.restoreNetworkFromStore(nid) } + d.Unlock() return n } @@ -889,26 +897,12 @@ func (d *driver) getNetworkFromStore(nid string) *network { func (n *network) sandbox() osl.Sandbox { n.Lock() defer n.Unlock() - return n.sbox } -func (n *network) setSandbox(sbox osl.Sandbox) { - n.Lock() - n.sbox = sbox - n.Unlock() -} - -func (n *network) setNetlinkSocket(nlSk *nl.NetlinkSocket) { - n.Lock() - n.nlSocket = nlSk - n.Unlock() -} - func (n *network) vxlanID(s *subnet) uint32 { n.Lock() defer n.Unlock() - return s.vni } @@ -1017,7 +1011,6 @@ func (n *network) SetValue(value []byte) error { subnetIP: subnetIP, gwIP: gwIP, vni: vni, - once: &sync.Once{}, } n.subnets = append(n.subnets, s) } else { @@ -1043,7 +1036,10 @@ func (n *network) writeToStore() error { } func (n *network) releaseVxlanID() ([]uint32, error) { - if len(n.subnets) == 0 { + n.Lock() + nSubnets := len(n.subnets) + n.Unlock() + if nSubnets == 0 { return nil, nil } @@ -1059,14 +1055,17 @@ func (n *network) releaseVxlanID() ([]uint32, error) { } } var vnis []uint32 + n.Lock() for _, s := range n.subnets { if n.driver.vxlanIdm != nil { - vni := n.vxlanID(s) - vnis = append(vnis, vni) - n.driver.vxlanIdm.Release(uint64(vni)) + vnis = append(vnis, s.vni) } + s.vni = 0 + } + n.Unlock() - n.setVxlanID(s, 0) + for _, vni := range vnis { + n.driver.vxlanIdm.Release(uint64(vni)) } return vnis, nil @@ -1074,7 +1073,7 @@ func (n *network) releaseVxlanID() ([]uint32, error) { func (n *network) obtainVxlanID(s *subnet) error { //return if the subnet already has a vxlan id assigned - if s.vni != 0 { + if n.vxlanID(s) != 0 { return nil } @@ -1087,7 +1086,7 @@ func (n *network) obtainVxlanID(s *subnet) error { return fmt.Errorf("getting network %q from datastore failed %v", n.id, err) } - if s.vni == 0 { + if n.vxlanID(s) == 0 { vxlanID, err := n.driver.vxlanIdm.GetID(true) if err != nil { return fmt.Errorf("failed to allocate vxlan id: %v", err) diff --git a/drivers/overlay/overlay.go b/drivers/overlay/overlay.go index f31a8ca597..1bbd761c2f 100644 --- a/drivers/overlay/overlay.go +++ b/drivers/overlay/overlay.go @@ -105,17 +105,6 @@ func Init(dc driverapi.DriverCallback, config map[string]interface{}) error { logrus.Warnf("Failure during overlay endpoints restore: %v", err) } - // If an error happened when the network join the sandbox during the endpoints restore - // we should reset it now along with the once variable, so that subsequent endpoint joins - // outside of the restore path can potentially fix the network join and succeed. - for nid, n := range d.networks { - if n.initErr != nil { - logrus.Infof("resetting init error and once variable for network %s after unsuccessful endpoint restore: %v", nid, n.initErr) - n.initErr = nil - n.once = &sync.Once{} - } - } - return dc.RegisterDriver(networkType, d, c) } @@ -151,14 +140,10 @@ func (d *driver) restoreEndpoints() error { return fmt.Errorf("could not find subnet for endpoint %s", ep.id) } - if err := n.joinSandbox(true); err != nil { + if err := n.joinSandbox(s, true, true); err != nil { return fmt.Errorf("restore network sandbox failed: %v", err) } - if err := n.joinSubnetSandbox(s, true); err != nil { - return fmt.Errorf("restore subnet sandbox failed for %q: %v", s.subnetIP.String(), err) - } - Ifaces := make(map[string][]osl.IfaceOption) vethIfaceOption := make([]osl.IfaceOption, 1) vethIfaceOption = append(vethIfaceOption, n.sbox.InterfaceOptions().Master(s.brName)) @@ -166,10 +151,10 @@ func (d *driver) restoreEndpoints() error { err := n.sbox.Restore(Ifaces, nil, nil, nil) if err != nil { + n.leaveSandbox() return fmt.Errorf("failed to restore overlay sandbox: %v", err) } - n.incEndpointCount() d.peerAdd(ep.nid, ep.id, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), false, false, true) } return nil diff --git a/drivers/overlay/peerdb.go b/drivers/overlay/peerdb.go index bdd3cb12af..819b68ff90 100644 --- a/drivers/overlay/peerdb.go +++ b/drivers/overlay/peerdb.go @@ -384,7 +384,7 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err) } - if err := n.joinSubnetSandbox(s, false); err != nil { + if err := n.joinSandbox(s, false, false); err != nil { return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err) }