From 49200cbd8405988bdc55cfd62abc06ab16f33750 Mon Sep 17 00:00:00 2001 From: Flavio Crisciani Date: Tue, 5 Sep 2017 10:43:20 -0700 Subject: [PATCH 1/5] Handle IP reuse in overlay In case of IP reuse locally there was a race condition that was leaving the overlay namespace with wrong configuration causing connectivity issues. This commit introduces the use of setMatrix to handle the transient state and make sure that the proper configuration is maintained Signed-off-by: Flavio Crisciani --- drivers/overlay/joinleave.go | 10 +- drivers/overlay/ov_network.go | 35 +----- drivers/overlay/ov_serf.go | 8 +- drivers/overlay/overlay.go | 2 +- drivers/overlay/overlay.proto | 2 +- drivers/overlay/peerdb.go | 205 +++++++++++++++++++++------------- osl/neigh_linux.go | 35 ++++-- 7 files changed, 171 insertions(+), 126 deletions(-) diff --git a/drivers/overlay/joinleave.go b/drivers/overlay/joinleave.go index a07838b760..11edf43765 100644 --- a/drivers/overlay/joinleave.go +++ b/drivers/overlay/joinleave.go @@ -200,7 +200,7 @@ func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key stri } if etype == driverapi.Delete { - d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep) + d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep, false) return } @@ -234,9 +234,11 @@ func (d *driver) Leave(nid, eid string) error { n.leaveSandbox() - if err := d.checkEncryption(nid, nil, 0, true, false); err != nil { - logrus.Warn(err) - } + // if err := d.checkEncryption(nid, nil, 0, true, false); err != nil { + // logrus.Warn(err) + // } + + d.peerDelete(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), true) return nil } diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index 0e9ca77866..419f22d03d 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -760,12 +760,11 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { continue } - logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac) - if neigh.State&(netlink.NUD_STALE|netlink.NUD_INCOMPLETE) == 0 { continue } + logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac) if n.driver.isSerfAlive() { mac, IPmask, vtep, err := n.driver.resolvePeer(n.id, ip) if err != nil { @@ -775,43 +774,17 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { n.driver.peerAdd(n.id, "dummy", ip, IPmask, mac, vtep, l2Miss, l3Miss, false) } else { // If the gc_thresh values are lower kernel might knock off the neighor entries. - // When we get a L3 miss check if its a valid peer and reprogram the neighbor - // entry again. Rate limit it to once attempt every 500ms, just in case a faulty - // container sends a flood of packets to invalid peers - if !l3Miss { - continue - } + // This case can happen only for local entries, from the documentation (http://man7.org/linux/man-pages/man7/arp.7.html): + // Entries which are marked as permanent are never deleted by the garbage-collector. if time.Since(t) > 500*time.Millisecond { t = time.Now() - n.programNeighbor(ip) + logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresholds", neigh, l3Miss, l2Miss) } } } } } -func (n *network) programNeighbor(ip net.IP) { - peerMac, _, _, err := n.driver.peerDbSearch(n.id, ip) - if err != nil { - logrus.Errorf("Reprogramming on L3 miss failed for %s, no peer entry", ip) - return - } - s := n.getSubnetforIPAddr(ip) - if s == nil { - logrus.Errorf("Reprogramming on L3 miss failed for %s, not a valid subnet", ip) - return - } - sbox := n.sandbox() - if sbox == nil { - logrus.Errorf("Reprogramming on L3 miss failed for %s, overlay sandbox missing", ip) - return - } - if err := sbox.AddNeighbor(ip, peerMac, true, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil { - logrus.Errorf("Reprogramming on L3 miss failed for %s: %v", ip, err) - return - } -} - func (d *driver) addNetwork(n *network) { d.Lock() d.networks[n.id] = n diff --git a/drivers/overlay/ov_serf.go b/drivers/overlay/ov_serf.go index 6e034ada46..f644799afd 100644 --- a/drivers/overlay/ov_serf.go +++ b/drivers/overlay/ov_serf.go @@ -122,7 +122,7 @@ func (d *driver) processEvent(u serf.UserEvent) { case "join": d.peerAdd(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false, false, false) case "leave": - d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr)) + d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false) } } @@ -135,13 +135,13 @@ func (d *driver) processQuery(q *serf.Query) { fmt.Printf("Failed to scan query payload string: %v\n", err) } - peerMac, peerIPMask, vtep, err := d.peerDbSearch(nid, net.ParseIP(ipStr)) + pKey, pEntry, err := d.peerDbSearch(nid, net.ParseIP(ipStr)) if err != nil { return } - logrus.Debugf("Sending peer query resp mac %s, mask %s, vtep %s", peerMac, net.IP(peerIPMask), vtep) - q.Respond([]byte(fmt.Sprintf("%s %s %s", peerMac.String(), net.IP(peerIPMask).String(), vtep.String()))) + logrus.Debugf("Sending peer query resp mac %v, mask %s, vtep %s", pKey.peerMac, net.IP(pEntry.peerIPMask).String(), pEntry.vtep) + q.Respond([]byte(fmt.Sprintf("%s %s %s", pKey.peerMac.String(), net.IP(pEntry.peerIPMask).String(), pEntry.vtep.String()))) } func (d *driver) resolvePeer(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) { diff --git a/drivers/overlay/overlay.go b/drivers/overlay/overlay.go index 2bae0823e1..f029c5cce4 100644 --- a/drivers/overlay/overlay.go +++ b/drivers/overlay/overlay.go @@ -262,7 +262,7 @@ func (d *driver) nodeJoin(advertiseAddress, bindAddress string, self bool) { d.Unlock() // If containers are already running on this network update the - // advertiseaddress in the peerDB + // advertise address in the peerDB d.localJoinOnce.Do(func() { d.peerDBUpdateSelf() }) diff --git a/drivers/overlay/overlay.proto b/drivers/overlay/overlay.proto index 45b8c9de7e..3133386e03 100644 --- a/drivers/overlay/overlay.proto +++ b/drivers/overlay/overlay.proto @@ -24,4 +24,4 @@ message PeerRecord { // which this container is running and can be reached by // building a tunnel to that host IP. string tunnel_endpoint_ip = 3 [(gogoproto.customname) = "TunnelEndpointIP"]; -} \ No newline at end of file +} diff --git a/drivers/overlay/peerdb.go b/drivers/overlay/peerdb.go index f953e3c872..7779f3fd59 100644 --- a/drivers/overlay/peerdb.go +++ b/drivers/overlay/peerdb.go @@ -8,6 +8,7 @@ import ( "syscall" "github.com/docker/libnetwork/common" + "github.com/docker/libnetwork/osl" "github.com/sirupsen/logrus" ) @@ -22,12 +23,42 @@ type peerEntry struct { eid string vtep net.IP peerIPMask net.IPMask - inSandbox bool isLocal bool } +func (p *peerEntry) MarshalDB() peerEntryDB { + ones, bits := p.peerIPMask.Size() + return peerEntryDB{ + eid: p.eid, + vtep: p.vtep.String(), + isLocal: p.isLocal, + peerIPMaskOnes: ones, + peerIPMaskBits: bits, + } +} + +// This the structure saved into the set (SetMatrix), due to the implementation of it +// the value inserted in the set has to be Hashable so the []byte had to be converted into +// strings +type peerEntryDB struct { + eid string + vtep string + peerIPMaskOnes int + peerIPMaskBits int + isLocal bool +} + +func (p *peerEntryDB) UnMarshalDB() peerEntry { + return peerEntry{ + eid: p.eid, + vtep: net.ParseIP(p.vtep), + peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits), + isLocal: p.isLocal, + } +} + type peerMap struct { - mp map[string]peerEntry + mp common.SetMatrix sync.Mutex } @@ -54,11 +85,7 @@ func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error { } pKey.peerMac, err = net.ParseMAC(string(macB)) - if err != nil { - return err - } - - return nil + return err } func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error { @@ -87,10 +114,13 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool } mp := map[string]peerEntry{} - pMap.Lock() - for pKeyStr, pEntry := range pMap.mp { - mp[pKeyStr] = pEntry + for _, pKeyStr := range pMap.mp.Keys() { + entryDBList, ok := pMap.mp.Get(pKeyStr) + if ok { + peerEntryDB := entryDBList[0].(peerEntryDB) + mp[pKeyStr] = peerEntryDB.UnMarshalDB() + } } pMap.Unlock() @@ -107,45 +137,38 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool return nil } -func (d *driver) peerDbSearch(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) { - var ( - peerMac net.HardwareAddr - vtep net.IP - peerIPMask net.IPMask - found bool - ) - +func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) { + var pKeyMatched *peerKey + var pEntryMatched *peerEntry err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { if pKey.peerIP.Equal(peerIP) { - peerMac = pKey.peerMac - peerIPMask = pEntry.peerIPMask - vtep = pEntry.vtep - found = true - return found + pKeyMatched = pKey + pEntryMatched = pEntry + return true } - return found + return false }) if err != nil { - return nil, nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err) + return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err) } - if !found { - return nil, nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP) + if pKeyMatched == nil || pEntryMatched == nil { + return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP) } - return peerMac, peerIPMask, vtep, nil + return pKeyMatched, pEntryMatched, nil } func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP, isLocal bool) { + peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { d.peerDb.Lock() pMap, ok := d.peerDb.mp[nid] if !ok { d.peerDb.mp[nid] = &peerMap{ - mp: make(map[string]peerEntry), + mp: common.NewSetMatrix(), } pMap = d.peerDb.mp[nid] @@ -165,18 +188,24 @@ func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask } pMap.Lock() - pMap.mp[pKey.String()] = pEntry - pMap.Unlock() + defer pMap.Unlock() + b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB()) + if i != 1 { + // Transient case, there is more than one endpoint that is using the same IP,MAC pair + s, _ := pMap.mp.String(pKey.String()) + logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) + } + return b, i } func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP) peerEntry { + peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { d.peerDb.Lock() pMap, ok := d.peerDb.mp[nid] if !ok { d.peerDb.Unlock() - return peerEntry{} + return false, 0 } d.peerDb.Unlock() @@ -185,22 +214,22 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPM peerMac: peerMac, } - pMap.Lock() - - pEntry, ok := pMap.mp[pKey.String()] - if ok { - // Mismatched endpoint ID(possibly outdated). Do not - // delete peerdb - if pEntry.eid != eid { - pMap.Unlock() - return pEntry - } + pEntry := peerEntry{ + eid: eid, + vtep: vtep, + peerIPMask: peerIPMask, + isLocal: isLocal, } - delete(pMap.mp, pKey.String()) - pMap.Unlock() - - return pEntry + pMap.Lock() + defer pMap.Unlock() + b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB()) + if i != 0 { + // Transient case, there is more than one endpoint that is using the same IP,MAC pair + s, _ := pMap.mp.String(pKey.String()) + logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) + } + return b, i } // The overlay uses a lazy initialization approach, this means that when a network is created @@ -253,7 +282,7 @@ func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) { case peerOperationADD: err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer) case peerOperationDELETE: - err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP) + err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer) } if err != nil { logrus.Warnf("Peer operation failed:%s op:%v", err, op) @@ -303,19 +332,27 @@ func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, } func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, updateOnlyDB bool) error { + peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error { if err := validateID(nid, eid); err != nil { return err } + var dbEntries int + var inserted bool if updateDB { - d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, false) - if updateOnlyDB { - return nil + inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) + if !inserted { + logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", + nid, eid, peerIP, peerMac, localPeer, vtep) } } + // Local peers do not need any further configuration + if localPeer { + return nil + } + n := d.network(nid) if n == nil { return nil @@ -353,20 +390,26 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask // Add neighbor entry for the peer IP if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil { - return fmt.Errorf("could not add neighbor entry into the sandbox: %v", err) + if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 { + // We are in the transient case so only the first configuration is programmed into the kernel + // Upon deletion if the active configuration is deleted the next one from the database will be restored + // Note we are skipping also the next configuration + return nil + } + return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) } // Add fdb entry to the bridge for the peer mac if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName), sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil { - return fmt.Errorf("could not add fdb entry into the sandbox: %v", err) + return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) } return nil } func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP) { + peerMac net.HardwareAddr, vtep net.IP, localPeer bool) { callerName := common.CallerName(1) d.peerOpCh <- &peerOperation{ opType: peerOperationDELETE, @@ -377,17 +420,22 @@ func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMas peerMac: peerMac, vtepIP: vtep, callerName: callerName, + localPeer: localPeer, } } func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP) error { + peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error { if err := validateID(nid, eid); err != nil { return err } - pEntry := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep) + deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) + if !deleted { + logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", + nid, eid, peerIP, peerMac, localPeer, vtep) + } n := d.network(nid) if n == nil { @@ -399,31 +447,38 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPM return nil } - // Delete fdb entry to the bridge for the peer mac only if the - // entry existed in local peerdb. If it is a stale delete - // request, still call DeleteNeighbor but only to cleanup any - // leftover sandbox neighbor cache and not actually delete the - // kernel state. - if (eid == pEntry.eid && vtep.Equal(pEntry.vtep)) || - (eid != pEntry.eid && !vtep.Equal(pEntry.vtep)) { - if err := sbox.DeleteNeighbor(vtep, peerMac, - eid == pEntry.eid && vtep.Equal(pEntry.vtep)); err != nil { - return fmt.Errorf("could not delete fdb entry into the sandbox: %v", err) + if err := d.checkEncryption(nid, vtep, 0, false, false); err != nil { + logrus.Warn(err) + } + + // Remove fdb entry to the bridge for the peer mac + if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil { + if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 { + // We fall in here if there is a transient state and if the neighbor that is being deleted + // was never been configured into the kernel (we allow only 1 configuration at the time per mapping) + return nil } + return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) } // Delete neighbor entry for the peer IP - if eid == pEntry.eid { - if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { - return fmt.Errorf("could not delete neighbor entry into the sandbox: %v", err) - } + if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { + return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) } - if err := d.checkEncryption(nid, vtep, 0, false, false); err != nil { - logrus.Warn(err) + if dbEntries == 0 { + return nil } - return nil + // If there is still an entry into the database and the deletion went through without errors means that there is now no + // configuration active in the kernel. + // Restore one configuration for the directly from the database, note that is guaranteed that there is one + peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP) + if err != nil { + logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err) + return err + } + return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal) } func (d *driver) pushLocalDb() { diff --git a/osl/neigh_linux.go b/osl/neigh_linux.go index 4e479489fa..fa5ac9af04 100644 --- a/osl/neigh_linux.go +++ b/osl/neigh_linux.go @@ -9,6 +9,17 @@ import ( "github.com/vishvananda/netlink" ) +// NeighborSearchError indicates that the neighbor is already present +type NeighborSearchError struct { + ip net.IP + mac net.HardwareAddr + present bool +} + +func (n NeighborSearchError) Error() string { + return fmt.Sprintf("Search neighbor failed for IP %v, mac %v, present in db:%t", n.ip, n.mac, n.present) +} + // NeighOption is a function option type to set interface options type NeighOption func(nh *neigh) @@ -41,7 +52,7 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr, nh := n.findNeighbor(dstIP, dstMac) if nh == nil { - return fmt.Errorf("could not find the neighbor entry to delete") + return NeighborSearchError{dstIP, dstMac, false} } if osDelete { @@ -103,26 +114,27 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr, } } n.Unlock() - logrus.Debugf("Neighbor entry deleted for IP %v, mac %v", dstIP, dstMac) + logrus.Debugf("Neighbor entry deleted for IP %v, mac %v osDelete:%t", dstIP, dstMac, osDelete) return nil } func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, force bool, options ...NeighOption) error { var ( - iface netlink.Link - err error + iface netlink.Link + err error + neighborAlreadyPresent bool ) // If the namespace already has the neighbor entry but the AddNeighbor is called // because of a miss notification (force flag) program the kernel anyway. nh := n.findNeighbor(dstIP, dstMac) if nh != nil { + neighborAlreadyPresent = true + logrus.Warnf("Neighbor entry already present for IP %v, mac %v neighbor:%+v forceUpdate:%t", dstIP, dstMac, nh, force) if !force { - logrus.Warnf("Neighbor entry already present for IP %v, mac %v", dstIP, dstMac) - return nil + return NeighborSearchError{dstIP, dstMac, true} } - logrus.Warnf("Force kernel update, Neighbor entry already present for IP %v, mac %v", dstIP, dstMac) } nh = &neigh{ @@ -146,8 +158,7 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo if nh.linkDst != "" { iface, err = nlh.LinkByName(nh.linkDst) if err != nil { - return fmt.Errorf("could not find interface with destination name %s: %v", - nh.linkDst, err) + return fmt.Errorf("could not find interface with destination name %s: %v", nh.linkDst, err) } } @@ -167,7 +178,11 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo } if err := nlh.NeighSet(nlnh); err != nil { - return fmt.Errorf("could not add neighbor entry: %v", err) + return fmt.Errorf("could not add neighbor entry:%+v error:%v", nlnh, err) + } + + if neighborAlreadyPresent { + return nil } n.Lock() From 2ec096ace3e55967de909408a512f1fb01505577 Mon Sep 17 00:00:00 2001 From: Flavio Crisciani Date: Thu, 7 Sep 2017 11:25:06 -0700 Subject: [PATCH 2/5] flush peerdb entries on network delete peerDB was never being flushed on network delete leaveing behind stale entries Signed-off-by: Flavio Crisciani --- drivers/overlay/encryption.go | 1 - drivers/overlay/joinleave.go | 19 ++++++++----------- drivers/overlay/ov_network.go | 20 ++++---------------- drivers/overlay/peerdb.go | 32 +++++++++++++++++++++++++++----- osl/neigh_linux.go | 2 +- 5 files changed, 40 insertions(+), 34 deletions(-) diff --git a/drivers/overlay/encryption.go b/drivers/overlay/encryption.go index f12d7a8c67..802d7bc36d 100644 --- a/drivers/overlay/encryption.go +++ b/drivers/overlay/encryption.go @@ -21,7 +21,6 @@ import ( const ( r = 0xD0C4E3 - timeout = 30 pktExpansion = 26 // SPI(4) + SeqN(4) + IV(8) + PadLength(1) + NextHeader(1) + ICV(8) ) diff --git a/drivers/overlay/joinleave.go b/drivers/overlay/joinleave.go index 11edf43765..b97cc88f05 100644 --- a/drivers/overlay/joinleave.go +++ b/drivers/overlay/joinleave.go @@ -68,7 +68,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, ep.ifName = containerIfName - if err := d.writeEndpointToStore(ep); err != nil { + if err = d.writeEndpointToStore(ep); err != nil { return fmt.Errorf("failed to update overlay endpoint %s to local data store: %v", ep.id[0:7], err) } @@ -86,7 +86,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, return err } - if err := sbox.AddInterface(overlayIfName, "veth", + if err = sbox.AddInterface(overlayIfName, "veth", sbox.InterfaceOptions().Master(s.brName)); err != nil { return fmt.Errorf("could not add veth pair inside the network sandbox: %v", err) } @@ -100,7 +100,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, return err } - if err := nlh.LinkSetHardwareAddr(veth, ep.mac); err != nil { + if err = nlh.LinkSetHardwareAddr(veth, ep.mac); err != nil { return fmt.Errorf("could not set mac address (%v) to the container interface: %v", ep.mac, err) } @@ -108,7 +108,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, if sub == s { continue } - if err := jinfo.AddStaticRoute(sub.subnetIP, types.NEXTHOP, s.gwIP.IP); err != nil { + if err = jinfo.AddStaticRoute(sub.subnetIP, types.NEXTHOP, s.gwIP.IP); err != nil { logrus.Errorf("Adding subnet %s static route in network %q failed\n", s.subnetIP, n.id) } } @@ -122,7 +122,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, d.peerAdd(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), false, false, true) - if err := d.checkEncryption(nid, nil, n.vxlanID(s), true, true); err != nil { + if err = d.checkEncryption(nid, nil, n.vxlanID(s), true, true); err != nil { logrus.Warn(err) } @@ -224,6 +224,7 @@ func (d *driver) Leave(nid, eid string) error { return types.InternalMaskableErrorf("could not find endpoint with id %s", eid) } + logrus.Errorf("The channel is valid:%t", d.notifyCh != nil) if d.notifyCh != nil { d.notifyCh <- ovNotify{ action: "leave", @@ -232,13 +233,9 @@ func (d *driver) Leave(nid, eid string) error { } } - n.leaveSandbox() - - // if err := d.checkEncryption(nid, nil, 0, true, false); err != nil { - // logrus.Warn(err) - // } - d.peerDelete(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), true) + n.leaveSandbox() + return nil } diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index 419f22d03d..b320432da5 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -251,8 +251,9 @@ func (d *driver) DeleteNetwork(nid string) error { if err := d.deleteEndpointFromStore(ep); err != nil { logrus.Warnf("Failed to delete overlay endpoint %s from local store: %v", ep.id[0:7], err) } - } + // flush the peerDB entries + d.peerFlush(nid) d.deleteNetwork(nid) vnis, err := n.releaseVxlanID() @@ -505,11 +506,7 @@ func (n *network) restoreSubnetSandbox(s *subnet, brName, vxlanName string) erro vxlanIfaceOption := make([]osl.IfaceOption, 1) vxlanIfaceOption = append(vxlanIfaceOption, sbox.InterfaceOptions().Master(brName)) Ifaces[vxlanName+"+vxlan"] = vxlanIfaceOption - err = sbox.Restore(Ifaces, nil, nil, nil) - if err != nil { - return err - } - return nil + return sbox.Restore(Ifaces, nil, nil, nil) } func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error { @@ -764,8 +761,8 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { continue } - logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac) if n.driver.isSerfAlive() { + logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac) mac, IPmask, vtep, err := n.driver.resolvePeer(n.id, ip) if err != nil { logrus.Errorf("could not resolve peer %q: %v", ip, err) @@ -1063,15 +1060,6 @@ func (n *network) contains(ip net.IP) bool { return false } -func (n *network) getSubnetforIPAddr(ip net.IP) *subnet { - for _, s := range n.subnets { - if s.subnetIP.Contains(ip) { - return s - } - } - return nil -} - // getSubnetforIP returns the subnet to which the given IP belongs func (n *network) getSubnetforIP(ip *net.IPNet) *subnet { for _, s := range n.subnets { diff --git a/drivers/overlay/peerdb.go b/drivers/overlay/peerdb.go index 7779f3fd59..b6e7be080a 100644 --- a/drivers/overlay/peerdb.go +++ b/drivers/overlay/peerdb.go @@ -58,11 +58,13 @@ func (p *peerEntryDB) UnMarshalDB() peerEntry { } type peerMap struct { + // set of peerEntry, note they have to be objects and not pointers to maintain the proper equality checks mp common.SetMatrix sync.Mutex } type peerNetworkMap struct { + // map with key peerKey mp map[string]*peerMap sync.Mutex } @@ -253,6 +255,7 @@ const ( peerOperationINIT peerOperationType = iota peerOperationADD peerOperationDELETE + peerOperationFLUSH ) type peerOperation struct { @@ -283,6 +286,8 @@ func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) { err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer) case peerOperationDELETE: err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer) + case peerOperationFLUSH: + err = d.peerFlushOp(op.networkID) } if err != nil { logrus.Warnf("Peer operation failed:%s op:%v", err, op) @@ -315,7 +320,6 @@ func (d *driver) peerInitOp(nid string) error { func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) { - callerName := common.CallerName(1) d.peerOpCh <- &peerOperation{ opType: peerOperationADD, networkID: nid, @@ -327,7 +331,7 @@ func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, l2Miss: l2Miss, l3Miss: l3Miss, localPeer: localPeer, - callerName: callerName, + callerName: common.CallerName(1), } } @@ -410,7 +414,6 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, localPeer bool) { - callerName := common.CallerName(1) d.peerOpCh <- &peerOperation{ opType: peerOperationDELETE, networkID: nid, @@ -419,7 +422,7 @@ func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMas peerIPMask: peerIPMask, peerMac: peerMac, vtepIP: vtep, - callerName: callerName, + callerName: common.CallerName(1), localPeer: localPeer, } } @@ -447,7 +450,7 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPM return nil } - if err := d.checkEncryption(nid, vtep, 0, false, false); err != nil { + if err := d.checkEncryption(nid, vtep, 0, localPeer, false); err != nil { logrus.Warn(err) } @@ -481,6 +484,25 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPM return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal) } +func (d *driver) peerFlush(nid string) { + d.peerOpCh <- &peerOperation{ + opType: peerOperationFLUSH, + networkID: nid, + callerName: common.CallerName(1), + } +} + +func (d *driver) peerFlushOp(nid string) error { + d.peerDb.Lock() + defer d.peerDb.Unlock() + _, ok := d.peerDb.mp[nid] + if !ok { + return fmt.Errorf("Unable to find the peerDB for nid:%s", nid) + } + delete(d.peerDb.mp, nid) + return nil +} + func (d *driver) pushLocalDb() { d.peerDbWalk(func(nid string, pKey *peerKey, pEntry *peerEntry) bool { if pEntry.isLocal { diff --git a/osl/neigh_linux.go b/osl/neigh_linux.go index fa5ac9af04..ae854fc1b7 100644 --- a/osl/neigh_linux.go +++ b/osl/neigh_linux.go @@ -188,7 +188,7 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo n.Lock() n.neighbors = append(n.neighbors, nh) n.Unlock() - logrus.Debugf("Neighbor entry added for IP %v, mac %v", dstIP, dstMac) + logrus.Debugf("Neighbor entry added for IP %v, mac %v on ifc:%s", dstIP, dstMac, nh.linkName) return nil } From 097b363e6f9f1f65338aa26e8ef16a0c335d3550 Mon Sep 17 00:00:00 2001 From: Flavio Crisciani Date: Thu, 7 Sep 2017 14:56:13 -0700 Subject: [PATCH 3/5] log for miss notification Signed-off-by: Flavio Crisciani --- drivers/overlay/ov_network.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index b320432da5..d8700e92c3 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -770,12 +770,21 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { } n.driver.peerAdd(n.id, "dummy", ip, IPmask, mac, vtep, l2Miss, l3Miss, false) } else { - // If the gc_thresh values are lower kernel might knock off the neighor entries. - // This case can happen only for local entries, from the documentation (http://man7.org/linux/man-pages/man7/arp.7.html): + // All the local peers will trigger a miss notification but this one is expected and the local container will reply + // autonomously to the ARP request + // In case the gc_thresh3 values is low kernel might reject new entries during peerAdd. This will trigger the following + // extra logs that will inform of the possible issue. + // Entries created would not be deleted see documentation http://man7.org/linux/man-pages/man7/arp.7.html: // Entries which are marked as permanent are never deleted by the garbage-collector. - if time.Since(t) > 500*time.Millisecond { + // The time limit here is to guarantee that the dbSearch is not + // done too frequently causing a stall of the peerDB operations. + if l3Miss && time.Since(t) > 500*time.Millisecond { t = time.Now() - logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresholds", neigh, l3Miss, l2Miss) + pKey, pEntry, err := n.driver.peerDbSearch(n.id, ip) + if !pEntry.isLocal { + logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresh on the host pKey:%+v pEntry:%+v err:%v", + neigh, l3Miss, l2Miss, *pKey, *pEntry, err) + } } } } From b12d63c76d9e43b521e37acc5b2209cd189edc18 Mon Sep 17 00:00:00 2001 From: Flavio Crisciani Date: Wed, 20 Sep 2017 08:37:18 -0700 Subject: [PATCH 4/5] Addressing code review comments Signed-off-by: Flavio Crisciani --- drivers/overlay/joinleave.go | 1 - drivers/overlay/ov_network.go | 14 ++++++-------- osl/neigh_linux.go | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/overlay/joinleave.go b/drivers/overlay/joinleave.go index b97cc88f05..0770513e7d 100644 --- a/drivers/overlay/joinleave.go +++ b/drivers/overlay/joinleave.go @@ -224,7 +224,6 @@ func (d *driver) Leave(nid, eid string) error { return types.InternalMaskableErrorf("could not find endpoint with id %s", eid) } - logrus.Errorf("The channel is valid:%t", d.notifyCh != nil) if d.notifyCh != nil { d.notifyCh <- ovNotify{ action: "leave", diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index d8700e92c3..55bb49b30f 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -769,7 +769,7 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { continue } n.driver.peerAdd(n.id, "dummy", ip, IPmask, mac, vtep, l2Miss, l3Miss, false) - } else { + } else if l3Miss && time.Since(t) > 500*time.Millisecond { // All the local peers will trigger a miss notification but this one is expected and the local container will reply // autonomously to the ARP request // In case the gc_thresh3 values is low kernel might reject new entries during peerAdd. This will trigger the following @@ -778,13 +778,11 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { // Entries which are marked as permanent are never deleted by the garbage-collector. // The time limit here is to guarantee that the dbSearch is not // done too frequently causing a stall of the peerDB operations. - if l3Miss && time.Since(t) > 500*time.Millisecond { - t = time.Now() - pKey, pEntry, err := n.driver.peerDbSearch(n.id, ip) - if !pEntry.isLocal { - logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresh on the host pKey:%+v pEntry:%+v err:%v", - neigh, l3Miss, l2Miss, *pKey, *pEntry, err) - } + t = time.Now() + pKey, pEntry, err := n.driver.peerDbSearch(n.id, ip) + if !pEntry.isLocal { + logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresh on the host pKey:%+v pEntry:%+v err:%v", + neigh, l3Miss, l2Miss, *pKey, *pEntry, err) } } } diff --git a/osl/neigh_linux.go b/osl/neigh_linux.go index ae854fc1b7..6bf1c16dc5 100644 --- a/osl/neigh_linux.go +++ b/osl/neigh_linux.go @@ -188,7 +188,7 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo n.Lock() n.neighbors = append(n.neighbors, nh) n.Unlock() - logrus.Debugf("Neighbor entry added for IP %v, mac %v on ifc:%s", dstIP, dstMac, nh.linkName) + logrus.Debugf("Neighbor entry added for IP:%v, mac:%v on ifc:%s", dstIP, dstMac, nh.linkName) return nil } From d93b9b086a7dba24e76d36b9f770d95cad9bc789 Mon Sep 17 00:00:00 2001 From: Flavio Crisciani Date: Mon, 2 Oct 2017 17:05:12 -0700 Subject: [PATCH 5/5] Changed ipMask to string Avoid error logs in case of local peer case, there is no need for deleteNeighbor Avoid the network leave to readvertise already deleted entries to upper layer Signed-off-by: Flavio Crisciani --- drivers/overlay/ov_network.go | 6 ++--- drivers/overlay/peerdb.go | 46 +++++++++++++++++------------------ networkdb/networkdb.go | 5 +++- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index 55bb49b30f..758b329187 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -769,7 +769,7 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { continue } n.driver.peerAdd(n.id, "dummy", ip, IPmask, mac, vtep, l2Miss, l3Miss, false) - } else if l3Miss && time.Since(t) > 500*time.Millisecond { + } else if l3Miss && time.Since(t) > time.Second { // All the local peers will trigger a miss notification but this one is expected and the local container will reply // autonomously to the ARP request // In case the gc_thresh3 values is low kernel might reject new entries during peerAdd. This will trigger the following @@ -778,9 +778,9 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { // Entries which are marked as permanent are never deleted by the garbage-collector. // The time limit here is to guarantee that the dbSearch is not // done too frequently causing a stall of the peerDB operations. - t = time.Now() pKey, pEntry, err := n.driver.peerDbSearch(n.id, ip) - if !pEntry.isLocal { + if err == nil && !pEntry.isLocal { + t = time.Now() logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresh on the host pKey:%+v pEntry:%+v err:%v", neigh, l3Miss, l2Miss, *pKey, *pEntry, err) } diff --git a/drivers/overlay/peerdb.go b/drivers/overlay/peerdb.go index b6e7be080a..a27687bcbe 100644 --- a/drivers/overlay/peerdb.go +++ b/drivers/overlay/peerdb.go @@ -27,13 +27,11 @@ type peerEntry struct { } func (p *peerEntry) MarshalDB() peerEntryDB { - ones, bits := p.peerIPMask.Size() return peerEntryDB{ - eid: p.eid, - vtep: p.vtep.String(), - isLocal: p.isLocal, - peerIPMaskOnes: ones, - peerIPMaskBits: bits, + eid: p.eid, + vtep: p.vtep.String(), + peerIPMask: p.peerIPMask.String(), + isLocal: p.isLocal, } } @@ -41,18 +39,17 @@ func (p *peerEntry) MarshalDB() peerEntryDB { // the value inserted in the set has to be Hashable so the []byte had to be converted into // strings type peerEntryDB struct { - eid string - vtep string - peerIPMaskOnes int - peerIPMaskBits int - isLocal bool + eid string + vtep string + peerIPMask string + isLocal bool } func (p *peerEntryDB) UnMarshalDB() peerEntry { return peerEntry{ eid: p.eid, vtep: net.ParseIP(p.vtep), - peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits), + peerIPMask: net.IPMask(net.ParseIP(p.peerIPMask)), isLocal: p.isLocal, } } @@ -454,19 +451,22 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPM logrus.Warn(err) } - // Remove fdb entry to the bridge for the peer mac - if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil { - if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 { - // We fall in here if there is a transient state and if the neighbor that is being deleted - // was never been configured into the kernel (we allow only 1 configuration at the time per mapping) - return nil + // Local peers do not have any local configuration to delete + if !localPeer { + // Remove fdb entry to the bridge for the peer mac + if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil { + if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 { + // We fall in here if there is a transient state and if the neighbor that is being deleted + // was never been configured into the kernel (we allow only 1 configuration at the time per mapping) + return nil + } + return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) } - return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) - } - // Delete neighbor entry for the peer IP - if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { - return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) + // Delete neighbor entry for the peer IP + if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { + return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) + } } if dbEntries == 0 { diff --git a/networkdb/networkdb.go b/networkdb/networkdb.go index 9a2195ea54..02502cb483 100644 --- a/networkdb/networkdb.go +++ b/networkdb/networkdb.go @@ -505,7 +505,10 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) { nDB.deleteEntry(nid, tname, key) } - nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, entry.value)) + // Notify to the upper layer only entries not already marked for deletion + if !oldEntry.deleting { + nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, entry.value)) + } return false }) }