From 28dbfc2ed3305af6a45f1a50a3578a1f9bd66b6c Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Wed, 15 Oct 2025 03:06:22 +0000 Subject: [PATCH 1/7] feat: Add a wrapper for netlink to retry on ErrDumpInterrupted --- internal/nlwrap/netlink.go | 326 +++++++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 internal/nlwrap/netlink.go diff --git a/internal/nlwrap/netlink.go b/internal/nlwrap/netlink.go new file mode 100644 index 00000000..5f42c993 --- /dev/null +++ b/internal/nlwrap/netlink.go @@ -0,0 +1,326 @@ +// Package nlwrap wraps vishvandanda/netlink functions that may return EINTR. +// +// A Handle instantiated using [NewHandle] or [NewHandleAt] can be used in place +// of a netlink.Handle, it's a wrapper that replaces methods that need to be +// wrapped. Functions that use the package handle need to be called as "nlwrap.X" +// instead of "netlink.X". +// +// The wrapped functions currently return EINTR when NLM_F_DUMP_INTR flagged +// in a netlink response, meaning something changed during the dump so results +// may be incomplete or inconsistent. +// +// At present, the possibly incomplete/inconsistent results are not returned +// by netlink functions along with the EINTR. So, it's not possible to do +// anything but retry. After maxAttempts the EINTR will be returned to the +// caller. + +// SPDX-License-Identifier: Apache-2.0 +// Copyright Docker, Inc. +// Copied from https://github.com/moby/moby/blob/e6fd1ce06bc00db6501528a6917c1c96cc8ee5cc/daemon/libnetwork/nlwrap/nlwrap_linux.go + +package nlwrap + +import ( + "log" + + "github.com/pkg/errors" + "github.com/vishvananda/netlink" + "github.com/vishvananda/netlink/nl" + "github.com/vishvananda/netns" +) + +// Arbitrary limit on max attempts at netlink calls if they are repeatedly interrupted. +const maxAttempts = 5 + +type Handle struct { + *netlink.Handle +} + +func NewHandle(nlFamilies ...int) (Handle, error) { + nlh, err := netlink.NewHandle(nlFamilies...) + if err != nil { + return Handle{}, err + } + return Handle{nlh}, nil +} + +func NewHandleAt(ns netns.NsHandle, nlFamilies ...int) (Handle, error) { + nlh, err := netlink.NewHandleAt(ns, nlFamilies...) + if err != nil { + return Handle{}, err + } + return Handle{nlh}, nil +} + +func (h Handle) Close() { + if h.Handle != nil { + h.Handle.Close() + } +} + +func retryOnIntr(f func() error) { + for attempt := 0; attempt < maxAttempts; attempt++ { + if err := f(); !errors.Is(err, netlink.ErrDumpInterrupted) { + return + } + } + log.Printf("netlink call interrupted after %d attempts", maxAttempts) +} + +func discardErrDumpInterrupted(err error) error { + if errors.Is(err, netlink.ErrDumpInterrupted) { + // The netlink function has returned possibly-inconsistent data along with the + // error. Discard the error and return the data. This restores the behaviour of + // the netlink package prior to v1.2.1, in which NLM_F_DUMP_INTR was ignored in + // the netlink response. + log.Printf("discarding ErrDumpInterrupted: %+v", errors.WithStack(err)) + return nil + } + return err +} + +// AddrList calls netlink.AddrList, retrying if necessary. +func AddrList(link netlink.Link, family int) ([]netlink.Addr, error) { + var addrs []netlink.Addr + var err error + retryOnIntr(func() error { + addrs, err = netlink.AddrList(link, family) //nolint:forbidigo + return err + }) + return addrs, discardErrDumpInterrupted(err) +} + +// LinkByName calls h.Handle.LinkByName, retrying if necessary. The netlink function +// doesn't normally ask the kernel for a dump of links. But, on an old kernel, it +// will do as a fallback and that dump may get inconsistent results. +func (h Handle) LinkByName(name string) (netlink.Link, error) { + var link netlink.Link + var err error + retryOnIntr(func() error { + link, err = h.Handle.LinkByName(name) //nolint:forbidigo + return err + }) + return link, discardErrDumpInterrupted(err) +} + +// LinkByName calls netlink.LinkByName, retrying if necessary. The netlink +// function doesn't normally ask the kernel for a dump of links. But, on an old +// kernel, it will do as a fallback and that dump may get inconsistent results. +func LinkByName(name string) (netlink.Link, error) { + var link netlink.Link + var err error + retryOnIntr(func() error { + link, err = netlink.LinkByName(name) //nolint:forbidigo + return err + }) + return link, discardErrDumpInterrupted(err) +} + +// LinkList calls h.Handle.LinkList, retrying if necessary. +func (h Handle) LinkList() ([]netlink.Link, error) { + var links []netlink.Link + var err error + retryOnIntr(func() error { + links, err = h.Handle.LinkList() //nolint:forbidigo + return err + }) + return links, discardErrDumpInterrupted(err) +} + +// LinkList calls netlink.Handle.LinkList, retrying if necessary. +func LinkList() ([]netlink.Link, error) { + var links []netlink.Link + var err error + retryOnIntr(func() error { + links, err = netlink.LinkList() //nolint:forbidigo + return err + }) + return links, discardErrDumpInterrupted(err) +} + +// RouteList calls h.Handle.RouteList, retrying if necessary. +func (h Handle) RouteList(link netlink.Link, family int) ([]netlink.Route, error) { + var routes []netlink.Route + var err error + retryOnIntr(func() error { + routes, err = h.Handle.RouteList(link, family) //nolint:forbidigo + return err + }) + return routes, err +} + +// RouteList calls netlink.RouteList, retrying if necessary. +func RouteList(link netlink.Link, family int) ([]netlink.Route, error) { + var route []netlink.Route + var err error + retryOnIntr(func() error { + route, err = netlink.RouteList(link, family) //nolint:forbidigo + return err + }) + return route, discardErrDumpInterrupted(err) +} + +// BridgeVlanList calls netlink.BridgeVlanList, retrying if necessary. +func BridgeVlanList() (map[int32][]*nl.BridgeVlanInfo, error) { + var err error + var info map[int32][]*nl.BridgeVlanInfo + retryOnIntr(func() error { + info, err = netlink.BridgeVlanList() //nolint:forbidigo + return err + }) + return info, discardErrDumpInterrupted(err) +} + +// RouteListFiltered calls h.Handle.RouteListFiltered, retrying if necessary. +func (h Handle) RouteListFiltered(family int, filter *netlink.Route, filterMask uint64) ([]netlink.Route, error) { + var routes []netlink.Route + var err error + retryOnIntr(func() error { + routes, err = h.Handle.RouteListFiltered(family, filter, filterMask) //nolint:forbidigo + return err + }) + return routes, err +} + +// RouteListFiltered calls netlink.RouteListFiltered, retrying if necessary. +func RouteListFiltered(family int, filter *netlink.Route, filterMask uint64) ([]netlink.Route, error) { + var route []netlink.Route + var err error + retryOnIntr(func() error { + route, err = netlink.RouteListFiltered(family, filter, filterMask) //nolint:forbidigo + return err + }) + return route, discardErrDumpInterrupted(err) +} + +// QdiscList calls netlink.QdiscList, retrying if necessary. +func QdiscList(link netlink.Link) ([]netlink.Qdisc, error) { + var qdisc []netlink.Qdisc + var err error + retryOnIntr(func() error { + qdisc, err = netlink.QdiscList(link) //nolint:forbidigo + return err + }) + return qdisc, discardErrDumpInterrupted(err) +} + +// QdiscList calls h.Handle.QdiscList, retrying if necessary. +func (h *Handle) QdiscList(link netlink.Link) ([]netlink.Qdisc, error) { + var qdisc []netlink.Qdisc + var err error + retryOnIntr(func() error { + qdisc, err = h.Handle.QdiscList(link) //nolint:forbidigo + return err + }) + return qdisc, err +} + +// LinkGetProtinfo calls netlink.LinkGetProtinfo, retrying if necessary. +func LinkGetProtinfo(link netlink.Link) (netlink.Protinfo, error) { + var protinfo netlink.Protinfo + var err error + retryOnIntr(func() error { + protinfo, err = netlink.LinkGetProtinfo(link) //nolint:forbidigo + return err + }) + return protinfo, discardErrDumpInterrupted(err) +} + +// LinkGetProtinfo calls h.Handle.LinkGetProtinfo, retrying if necessary. +func (h *Handle) LinkGetProtinfo(link netlink.Link) (netlink.Protinfo, error) { + var protinfo netlink.Protinfo + var err error + retryOnIntr(func() error { + protinfo, err = h.Handle.LinkGetProtinfo(link) //nolint:forbidigo + return err + }) + return protinfo, err +} + +// RuleListFiltered calls netlink.RuleListFiltered, retrying if necessary. +func RuleListFiltered(family int, filter *netlink.Rule, filterMask uint64) ([]netlink.Rule, error) { + var rules []netlink.Rule + var err error + retryOnIntr(func() error { + rules, err = netlink.RuleListFiltered(family, filter, filterMask) //nolint:forbidigo + return err + }) + return rules, discardErrDumpInterrupted(err) +} + +// RuleListFiltered calls h.Handle.RuleListFiltered, retrying if necessary. +func (h *Handle) RuleListFiltered(family int, filter *netlink.Rule, filterMask uint64) ([]netlink.Rule, error) { + var rules []netlink.Rule + var err error + retryOnIntr(func() error { + rules, err = h.Handle.RuleListFiltered(family, filter, filterMask) //nolint:forbidigo + return err + }) + return rules, err +} + +// FilterList calls netlink.FilterList, retrying if necessary. +func FilterList(link netlink.Link, parent uint32) ([]netlink.Filter, error) { + var filters []netlink.Filter + var err error + retryOnIntr(func() error { + filters, err = netlink.FilterList(link, parent) //nolint:forbidigo + return err + }) + return filters, discardErrDumpInterrupted(err) +} + +// FilterList calls h.Handle.FilterList, retrying if necessary. +func (h *Handle) FilterList(link netlink.Link, parent uint32) ([]netlink.Filter, error) { + var filters []netlink.Filter + var err error + retryOnIntr(func() error { + filters, err = h.Handle.FilterList(link, parent) //nolint:forbidigo + return err + }) + return filters, err +} + +// RuleList calls netlink.RuleList, retrying if necessary. +func RuleList(family int) ([]netlink.Rule, error) { + var rules []netlink.Rule + var err error + retryOnIntr(func() error { + rules, err = netlink.RuleList(family) //nolint:forbidigo + return err + }) + return rules, discardErrDumpInterrupted(err) +} + +// RuleList calls h.Handle.RuleList, retrying if necessary. +func (h *Handle) RuleList(family int) ([]netlink.Rule, error) { + var rules []netlink.Rule + var err error + retryOnIntr(func() error { + rules, err = h.Handle.RuleList(family) //nolint:forbidigo + return err + }) + return rules, err +} + +// ConntrackDeleteFilters calls netlink.ConntrackDeleteFilters, retrying if necessary. +func ConntrackDeleteFilters(table netlink.ConntrackTableType, family netlink.InetFamily, filters ...netlink.CustomConntrackFilter) (uint, error) { + var deleted uint + var err error + retryOnIntr(func() error { + deleted, err = netlink.ConntrackDeleteFilters(table, family, filters...) //nolint:forbidigo + return err + }) + return deleted, discardErrDumpInterrupted(err) +} + +// ConntrackDeleteFilters calls h.Handle.ConntrackDeleteFilters, retrying if necessary. +func (h *Handle) ConntrackDeleteFilters(table netlink.ConntrackTableType, family netlink.InetFamily, filters ...netlink.CustomConntrackFilter) (uint, error) { + var deleted uint + var err error + retryOnIntr(func() error { + deleted, err = h.Handle.ConntrackDeleteFilters(table, family, filters...) //nolint:forbidigo + return err + }) + return deleted, err +} From 482a38583ff62e0660af529feb587d229047b2e9 Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Wed, 15 Oct 2025 04:09:21 +0000 Subject: [PATCH 2/7] refactor: Replace direct netlink calls with nlwrap This commit replaces numerous direct calls to the `netlink` library with the `nlwrap` wrapper. This ensures that the updated calls will benefit from the retry logic implemented in the wrapper, making the code more resilient to `ErrDumpInterrupted` errors. Most of the wrapped functions were already available in the forked wrapper. --- pkg/driver/dhcp.go | 3 ++- pkg/driver/dra_hooks.go | 3 ++- pkg/driver/ebpf.go | 11 ++++++----- pkg/driver/ethtool_test.go | 3 ++- pkg/driver/hostdevice.go | 9 +++++---- pkg/driver/hostdevice_test.go | 5 +++-- pkg/driver/netnamespace.go | 5 +++-- pkg/driver/rdmadevice.go | 3 ++- pkg/driver/subinterfaces.go | 5 +++-- pkg/inventory/db.go | 5 +++-- pkg/inventory/net.go | 5 +++-- 11 files changed, 34 insertions(+), 23 deletions(-) diff --git a/pkg/driver/dhcp.go b/pkg/driver/dhcp.go index 50e73210..4ab7d292 100644 --- a/pkg/driver/dhcp.go +++ b/pkg/driver/dhcp.go @@ -23,12 +23,13 @@ import ( "github.com/google/dranet/pkg/apis" + "github.com/google/dranet/internal/nlwrap" "github.com/insomniacslk/dhcp/dhcpv4/nclient4" "github.com/vishvananda/netlink" ) func getDHCP(ctx context.Context, ifName string) (ip string, routes []apis.RouteConfig, err error) { - link, err := netlink.LinkByName(ifName) + link, err := nlwrap.LinkByName(ifName) if err != nil { return "", nil, err } diff --git a/pkg/driver/dra_hooks.go b/pkg/driver/dra_hooks.go index 5867aa45..1955ea20 100644 --- a/pkg/driver/dra_hooks.go +++ b/pkg/driver/dra_hooks.go @@ -27,6 +27,7 @@ import ( "github.com/google/dranet/pkg/filter" "github.com/Mellanox/rdmamap" + "github.com/google/dranet/internal/nlwrap" "github.com/vishvananda/netlink" "golang.org/x/sys/unix" @@ -152,7 +153,7 @@ func (np *NetworkDriver) prepareResourceClaim(ctx context.Context, claim *resour return kubeletplugin.PrepareResult{} } - nlHandle, err := netlink.NewHandle() + nlHandle, err := nlwrap.NewHandle() if err != nil { return kubeletplugin.PrepareResult{ Err: fmt.Errorf("error creating netlink handle %v", err), diff --git a/pkg/driver/ebpf.go b/pkg/driver/ebpf.go index 87618e85..8d72cf84 100644 --- a/pkg/driver/ebpf.go +++ b/pkg/driver/ebpf.go @@ -25,6 +25,7 @@ import ( "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" + "github.com/google/dranet/internal/nlwrap" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" "k8s.io/klog/v2" @@ -32,8 +33,8 @@ import ( // unpinBPFPrograms runs in the host namespace to delete all the pinned bpf programs func unpinBPFPrograms(ifName string) error { - device, err := netlink.LinkByName(ifName) - if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { + device, err := nlwrap.LinkByName(ifName) + if err != nil { return err } ifIndex := uint32(device.Attrs().Index) @@ -120,15 +121,15 @@ func detachEBPFPrograms(containerNsPAth string, ifName string) error { defer netns.Set(origns) // nolint:errcheck var errs []error - device, err := netlink.LinkByName(ifName) - if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { + device, err := nlwrap.LinkByName(ifName) + if err != nil { return err } // Detach TC filters (legacy) klog.V(2).Infof("Attempting to detach TC filters from interface %s", device.Attrs().Name) for _, parent := range []uint32{netlink.HANDLE_MIN_INGRESS, netlink.HANDLE_MIN_EGRESS} { - filters, err := netlink.FilterList(device, parent) + filters, err := nlwrap.FilterList(device, parent) if err != nil { klog.V(4).Infof("Could not list TC filters for interface %s (parent %d): %v", device.Attrs().Name, parent, err) continue diff --git a/pkg/driver/ethtool_test.go b/pkg/driver/ethtool_test.go index be37e69d..1dfe767c 100644 --- a/pkg/driver/ethtool_test.go +++ b/pkg/driver/ethtool_test.go @@ -26,6 +26,7 @@ import ( "strings" "testing" + "github.com/google/dranet/internal/nlwrap" "github.com/google/dranet/pkg/apis" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" @@ -59,7 +60,7 @@ func Test_applyEthtoolConfig(t *testing.T) { netns.Set(origns) // Create a dummy interface in the test namespace - nhNs, err := netlink.NewHandleAt(testNS) + nhNs, err := nlwrap.NewHandleAt(testNS) if err != nil { t.Fatalf("fail to open netlink handle: %v", err) } diff --git a/pkg/driver/hostdevice.go b/pkg/driver/hostdevice.go index 15cfb3fc..921dccea 100644 --- a/pkg/driver/hostdevice.go +++ b/pkg/driver/hostdevice.go @@ -23,6 +23,7 @@ import ( "github.com/google/dranet/pkg/apis" + "github.com/google/dranet/internal/nlwrap" "github.com/vishvananda/netlink" "github.com/vishvananda/netlink/nl" "github.com/vishvananda/netns" @@ -33,7 +34,7 @@ import ( ) func nsAttachNetdev(hostIfName string, containerNsPAth string, interfaceConfig apis.InterfaceConfig) (*resourceapi.NetworkDeviceData, error) { - hostDev, err := netlink.LinkByName(hostIfName) + hostDev, err := nlwrap.LinkByName(hostIfName) // recover same behavior on vishvananda/netlink@1.2.1 and do not fail when the kernel returns NLM_F_DUMP_INTR. if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { return nil, fmt.Errorf("failed to get link for interface %s: %w", hostIfName, err) @@ -126,7 +127,7 @@ func nsAttachNetdev(hostIfName string, containerNsPAth string, interfaceConfig a // to avoid golang problem with goroutines we create the socket in the // namespace and use it directly - nhNs, err := netlink.NewHandleAt(containerNs) + nhNs, err := nlwrap.NewHandleAt(containerNs) if err != nil { return nil, fmt.Errorf("failed to get netlink handle in container namespace %s: %w", containerNsPAth, err) } @@ -171,7 +172,7 @@ func nsDetachNetdev(containerNsPAth string, devName string, outName string) erro defer containerNs.Close() // to avoid golang problem with goroutines we create the socket in the // namespace and use it directly - nhNs, err := netlink.NewHandleAt(containerNs) + nhNs, err := nlwrap.NewHandleAt(containerNs) if err != nil { return fmt.Errorf("could not get network namespace handle: %w", err) } @@ -233,7 +234,7 @@ func nsDetachNetdev(containerNsPAth string, devName string, outName string) erro } // Set up the interface in case host network workloads depend on it - hostDev, err := netlink.LinkByName(ifName) + hostDev, err := nlwrap.LinkByName(ifName) // recover same behavior on vishvananda/netlink@1.2.1 and do not fail when the kernel returns NLM_F_DUMP_INTR. if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { return fmt.Errorf("failed to get link for interface %s: %w", ifName, err) diff --git a/pkg/driver/hostdevice_test.go b/pkg/driver/hostdevice_test.go index 659c4d0d..1cbb7d0b 100644 --- a/pkg/driver/hostdevice_test.go +++ b/pkg/driver/hostdevice_test.go @@ -26,6 +26,7 @@ import ( "strings" "testing" + "github.com/google/dranet/internal/nlwrap" "github.com/google/dranet/pkg/apis" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" @@ -60,7 +61,7 @@ func Test_nhNetdev(t *testing.T) { netns.Set(origns) // Create a dummy interface in the test namespace - nhNs, err := netlink.NewHandleAt(testNS) + nhNs, err := nlwrap.NewHandleAt(testNS) if err != nil { t.Fatalf("fail to open netlink handle: %v", err) } @@ -86,7 +87,7 @@ func Test_nhNetdev(t *testing.T) { } t.Cleanup(func() { - link, err := netlink.LinkByName(ifaceName) + link, err := nlwrap.LinkByName(ifaceName) if err == nil { _ = netlink.LinkDel(link) } diff --git a/pkg/driver/netnamespace.go b/pkg/driver/netnamespace.go index de37321d..4c31744c 100644 --- a/pkg/driver/netnamespace.go +++ b/pkg/driver/netnamespace.go @@ -25,6 +25,7 @@ import ( "github.com/google/dranet/pkg/apis" + "github.com/google/dranet/internal/nlwrap" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" ) @@ -38,7 +39,7 @@ func applyRoutingConfig(containerNsPAth string, ifName string, routeConfig []api // to avoid golang problem with goroutines we create the socket in the // namespace and use it directly - nhNs, err := netlink.NewHandleAt(containerNs) + nhNs, err := nlwrap.NewHandleAt(containerNs) if err != nil { return fmt.Errorf("can not get netlink handle: %v", err) } @@ -95,7 +96,7 @@ func applyNeighborConfig(containerNsPAth string, ifName string, neighConfig []ap } defer containerNs.Close() - nhNs, err := netlink.NewHandleAt(containerNs) + nhNs, err := nlwrap.NewHandleAt(containerNs) if err != nil { return fmt.Errorf("can not get netlink handle: %v", err) } diff --git a/pkg/driver/rdmadevice.go b/pkg/driver/rdmadevice.go index 782e43d9..a66f50ae 100644 --- a/pkg/driver/rdmadevice.go +++ b/pkg/driver/rdmadevice.go @@ -21,6 +21,7 @@ import ( "os" "syscall" + "github.com/google/dranet/internal/nlwrap" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" "golang.org/x/sys/unix" @@ -55,7 +56,7 @@ func nsDetachRdmadev(containerNsPAth string, ifName string) error { // to avoid golang problem with goroutines we create the socket in the // namespace and use it directly - nhNs, err := netlink.NewHandleAt(containerNs) + nhNs, err := nlwrap.NewHandleAt(containerNs) if err != nil { return fmt.Errorf("could not get network namespace handle: %w", err) } diff --git a/pkg/driver/subinterfaces.go b/pkg/driver/subinterfaces.go index 32270e96..0c1e1b2c 100644 --- a/pkg/driver/subinterfaces.go +++ b/pkg/driver/subinterfaces.go @@ -19,6 +19,7 @@ package driver import ( "fmt" + "github.com/google/dranet/internal/nlwrap" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" ) @@ -30,7 +31,7 @@ func addMacVlan(containerNsPAth string, devName string, mode netlink.MacvlanMode } defer containerNs.Close() - parentLink, err := netlink.LinkByName(devName) + parentLink, err := nlwrap.LinkByName(devName) if err != nil { return fmt.Errorf("could not find parent interface %s : %w", devName, err) } @@ -58,7 +59,7 @@ func addIPVlan(containerNsPAth string, devName string, mode netlink.IPVlanMode) } defer containerNs.Close() - parentLink, err := netlink.LinkByName(devName) + parentLink, err := nlwrap.LinkByName(devName) if err != nil { return fmt.Errorf("could not find parent interface %s : %w", devName, err) } diff --git a/pkg/inventory/db.go b/pkg/inventory/db.go index e4bd84d5..c0386e8f 100644 --- a/pkg/inventory/db.go +++ b/pkg/inventory/db.go @@ -30,6 +30,7 @@ import ( "github.com/google/dranet/pkg/names" "github.com/Mellanox/rdmamap" + "github.com/google/dranet/internal/nlwrap" "github.com/jaypipes/ghw" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" @@ -261,7 +262,7 @@ func (db *DB) discoverPCIDevices() []resourceapi.Device { // - For Network interfaces which are not associated with a PCI Device (like // virtual interfaces), they are added as their own device. func (db *DB) discoverNetworkInterfaces(pciDevices []resourceapi.Device) []resourceapi.Device { - links, err := netlink.LinkList() + links, err := nlwrap.LinkList() if err != nil { klog.Errorf("Could not list network interfaces: %v", err) return pciDevices @@ -334,7 +335,7 @@ func addLinkAttributes(device *resourceapi.Device, link netlink.Link) { v4 := sets.Set[string]{} v6 := sets.Set[string]{} - if ips, err := netlink.AddrList(link, netlink.FAMILY_ALL); err == nil && len(ips) > 0 { + if ips, err := nlwrap.AddrList(link, netlink.FAMILY_ALL); err == nil && len(ips) > 0 { for _, address := range ips { if !address.IP.IsGlobalUnicast() { continue diff --git a/pkg/inventory/net.go b/pkg/inventory/net.go index 8b137742..8ff56df3 100644 --- a/pkg/inventory/net.go +++ b/pkg/inventory/net.go @@ -19,6 +19,7 @@ package inventory import ( "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" + "github.com/google/dranet/internal/nlwrap" "github.com/vishvananda/netlink" "k8s.io/apimachinery/pkg/util/sets" @@ -28,7 +29,7 @@ import ( func getDefaultGwInterfaces() sets.Set[string] { interfaces := sets.Set[string]{} filter := &netlink.Route{} - routes, err := netlink.RouteListFiltered(netlink.FAMILY_ALL, filter, netlink.RT_FILTER_TABLE) + routes, err := nlwrap.RouteListFiltered(netlink.FAMILY_ALL, filter, netlink.RT_FILTER_TABLE) if err != nil { return interfaces } @@ -75,7 +76,7 @@ func getTcFilters(link netlink.Link) ([]string, bool) { isTcEBPF := false filterNames := sets.Set[string]{} for _, parent := range []uint32{netlink.HANDLE_MIN_INGRESS, netlink.HANDLE_MIN_EGRESS} { - filters, err := netlink.FilterList(link, parent) + filters, err := nlwrap.FilterList(link, parent) if err == nil { for _, f := range filters { if bpffFilter, ok := f.(*netlink.BpfFilter); ok { From e63e8f2551a8c778361d2c884c536e34f8a15623 Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Wed, 15 Oct 2025 04:22:29 +0000 Subject: [PATCH 3/7] refactor: Add wrappers for RDMA netlink functions This commit introduces wrappers for the `RdmaLinkByName` and `RdmaSystemGetNetnsMode` netlink functions. These wrappers provide retry logic to handle `ErrDumpInterrupted` errors. The corresponding callsites in the driver have been updated to use the new wrappers. --- internal/nlwrap/netlink.go | 44 ++++++++++++++++++++++++++++++++++++++ pkg/driver/driver.go | 4 ++-- pkg/driver/rdmadevice.go | 2 +- 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/internal/nlwrap/netlink.go b/internal/nlwrap/netlink.go index 5f42c993..2081b989 100644 --- a/internal/nlwrap/netlink.go +++ b/internal/nlwrap/netlink.go @@ -324,3 +324,47 @@ func (h *Handle) ConntrackDeleteFilters(table netlink.ConntrackTableType, family }) return deleted, err } + +// RdmaLinkByName calls netlink.RdmaLinkByName, retrying if necessary. +func RdmaLinkByName(name string) (*netlink.RdmaLink, error) { + var rdmaLink *netlink.RdmaLink + var err error + retryOnIntr(func() error { + rdmaLink, err = netlink.RdmaLinkByName(name) //nolint:forbidigo + return err + }) + return rdmaLink, discardErrDumpInterrupted(err) +} + +// RdmaLinkByName calls h.Handle.RdmaLinkByName, retrying if necessary. +func (h *Handle) RdmaLinkByName(name string) (*netlink.RdmaLink, error) { + var rdmaLink *netlink.RdmaLink + var err error + retryOnIntr(func() error { + rdmaLink, err = h.Handle.RdmaLinkByName(name) //nolint:forbidigo + return err + }) + return rdmaLink, discardErrDumpInterrupted(err) +} + +// RdmaSystemGetNetnsMode calls netlink.RdmaSystemGetNetnsMode, retrying if necessary. +func RdmaSystemGetNetnsMode() (string, error) { + var mode string + var err error + retryOnIntr(func() error { + mode, err = netlink.RdmaSystemGetNetnsMode() //nolint:forbidigo + return err + }) + return mode, discardErrDumpInterrupted(err) +} + +// RdmaSystemGetNetnsMode calls h.Handle.RdmaSystemGetNetnsMode, retrying if necessary. +func (h *Handle) RdmaSystemGetNetnsMode() (string, error) { + var mode string + var err error + retryOnIntr(func() error { + mode, err = h.Handle.RdmaSystemGetNetnsMode() //nolint:forbidigo + return err + }) + return mode, discardErrDumpInterrupted(err) +} diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index e95bd72d..186f484c 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -28,7 +28,7 @@ import ( "github.com/google/dranet/pkg/inventory" "github.com/containerd/nri/pkg/stub" - "github.com/vishvananda/netlink" + "github.com/google/dranet/internal/nlwrap" resourceapi "k8s.io/api/resource/v1" "k8s.io/apimachinery/pkg/util/wait" @@ -101,7 +101,7 @@ type Option func(*NetworkDriver) func Start(ctx context.Context, driverName string, kubeClient kubernetes.Interface, nodeName string, opts ...Option) (*NetworkDriver, error) { registerMetrics() - rdmaNetnsMode, err := netlink.RdmaSystemGetNetnsMode() + rdmaNetnsMode, err := nlwrap.RdmaSystemGetNetnsMode() if err != nil { klog.Infof("failed to determine the RDMA subsystem's network namespace mode, assume shared mode: %v", err) rdmaNetnsMode = apis.RdmaNetnsModeShared diff --git a/pkg/driver/rdmadevice.go b/pkg/driver/rdmadevice.go index a66f50ae..001a9559 100644 --- a/pkg/driver/rdmadevice.go +++ b/pkg/driver/rdmadevice.go @@ -36,7 +36,7 @@ func nsAttachRdmadev(hostIfName string, containerNsPAth string) error { return fmt.Errorf("could not get network namespace from path %s for network device %s : %w", containerNsPAth, hostIfName, err) } - hostDev, err := netlink.RdmaLinkByName(hostIfName) + hostDev, err := nlwrap.RdmaLinkByName(hostIfName) if err != nil { return err } From 9faa7e901c7503ee18f7a221426e7377e546549c Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Wed, 15 Oct 2025 05:12:26 +0000 Subject: [PATCH 4/7] feat: Add AddrList to nlwrap handle --- internal/nlwrap/netlink.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/internal/nlwrap/netlink.go b/internal/nlwrap/netlink.go index 2081b989..981792f2 100644 --- a/internal/nlwrap/netlink.go +++ b/internal/nlwrap/netlink.go @@ -90,6 +90,17 @@ func AddrList(link netlink.Link, family int) ([]netlink.Addr, error) { return addrs, discardErrDumpInterrupted(err) } +// AddrList calls h.Handle.AddrList, retrying if necessary. +func (h Handle) AddrList(link netlink.Link, family int) ([]netlink.Addr, error) { + var addrs []netlink.Addr + var err error + retryOnIntr(func() error { + addrs, err = h.Handle.AddrList(link, family) //nolint:forbidigo + return err + }) + return addrs, discardErrDumpInterrupted(err) +} + // LinkByName calls h.Handle.LinkByName, retrying if necessary. The netlink function // doesn't normally ask the kernel for a dump of links. But, on an old kernel, it // will do as a fallback and that dump may get inconsistent results. From aee299eed92456aefec2dfa416bbfbd9e5a52ec0 Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Wed, 15 Oct 2025 05:14:06 +0000 Subject: [PATCH 5/7] refactor: Remove redundant error checks involving netlink.ErrDumpInterrupted --- pkg/driver/dra_hooks.go | 2 +- pkg/driver/hostdevice.go | 8 +++----- pkg/driver/netnamespace.go | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pkg/driver/dra_hooks.go b/pkg/driver/dra_hooks.go index 1955ea20..51a079ea 100644 --- a/pkg/driver/dra_hooks.go +++ b/pkg/driver/dra_hooks.go @@ -238,7 +238,7 @@ func (np *NetworkDriver) prepareResourceClaim(ctx context.Context, claim *resour // If there is no custom addresses and no DHCP, then use the existing ones // get the existing IP addresses nlAddresses, err := nlHandle.AddrList(link, netlink.FAMILY_ALL) - if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { + if err != nil { errorList = append(errorList, fmt.Errorf("fail to get ip addresses for interface %s : %w", ifName, err)) } else { for _, address := range nlAddresses { diff --git a/pkg/driver/hostdevice.go b/pkg/driver/hostdevice.go index 921dccea..1b214579 100644 --- a/pkg/driver/hostdevice.go +++ b/pkg/driver/hostdevice.go @@ -35,8 +35,7 @@ import ( func nsAttachNetdev(hostIfName string, containerNsPAth string, interfaceConfig apis.InterfaceConfig) (*resourceapi.NetworkDeviceData, error) { hostDev, err := nlwrap.LinkByName(hostIfName) - // recover same behavior on vishvananda/netlink@1.2.1 and do not fail when the kernel returns NLM_F_DUMP_INTR. - if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { + if err != nil { return nil, fmt.Errorf("failed to get link for interface %s: %w", hostIfName, err) } @@ -134,7 +133,7 @@ func nsAttachNetdev(hostIfName string, containerNsPAth string, interfaceConfig a defer nhNs.Close() nsLink, err := nhNs.LinkByName(ifName) - if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { + if err != nil { return nil, fmt.Errorf("link not found for interface %s on namespace %s: %w", ifName, containerNsPAth, err) } @@ -235,8 +234,7 @@ func nsDetachNetdev(containerNsPAth string, devName string, outName string) erro // Set up the interface in case host network workloads depend on it hostDev, err := nlwrap.LinkByName(ifName) - // recover same behavior on vishvananda/netlink@1.2.1 and do not fail when the kernel returns NLM_F_DUMP_INTR. - if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { + if err != nil { return fmt.Errorf("failed to get link for interface %s: %w", ifName, err) } diff --git a/pkg/driver/netnamespace.go b/pkg/driver/netnamespace.go index 4c31744c..ebe9c2a4 100644 --- a/pkg/driver/netnamespace.go +++ b/pkg/driver/netnamespace.go @@ -46,7 +46,7 @@ func applyRoutingConfig(containerNsPAth string, ifName string, routeConfig []api defer nhNs.Close() nsLink, err := nhNs.LinkByName(ifName) - if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { + if err != nil { return fmt.Errorf("link not found for interface %s on namespace %s: %w", ifName, containerNsPAth, err) } @@ -103,7 +103,7 @@ func applyNeighborConfig(containerNsPAth string, ifName string, neighConfig []ap defer nhNs.Close() nsLink, err := nhNs.LinkByName(ifName) - if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) { + if err != nil { return fmt.Errorf("link not found for interface %s on namespace %s: %w", ifName, containerNsPAth, err) } From 4b85d348ae8713a2274d3cb6ec3259f5928beb73 Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Wed, 15 Oct 2025 15:19:00 +0000 Subject: [PATCH 6/7] refactor: Use klog in netlink wrapper library --- internal/nlwrap/netlink.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/internal/nlwrap/netlink.go b/internal/nlwrap/netlink.go index 981792f2..f284c0c9 100644 --- a/internal/nlwrap/netlink.go +++ b/internal/nlwrap/netlink.go @@ -21,12 +21,11 @@ package nlwrap import ( - "log" - "github.com/pkg/errors" "github.com/vishvananda/netlink" "github.com/vishvananda/netlink/nl" "github.com/vishvananda/netns" + "k8s.io/klog/v2" ) // Arbitrary limit on max attempts at netlink calls if they are repeatedly interrupted. @@ -64,7 +63,7 @@ func retryOnIntr(f func() error) { return } } - log.Printf("netlink call interrupted after %d attempts", maxAttempts) + klog.Infof("netlink call interrupted after %d attempts", maxAttempts) } func discardErrDumpInterrupted(err error) error { @@ -73,7 +72,7 @@ func discardErrDumpInterrupted(err error) error { // error. Discard the error and return the data. This restores the behaviour of // the netlink package prior to v1.2.1, in which NLM_F_DUMP_INTR was ignored in // the netlink response. - log.Printf("discarding ErrDumpInterrupted: %+v", errors.WithStack(err)) + klog.Infof("discarding ErrDumpInterrupted: %+v", errors.WithStack(err)) return nil } return err From 7cf5d3ac1922602d2b0334d431ad544d1a2b33b0 Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Wed, 15 Oct 2025 16:08:00 +0000 Subject: [PATCH 7/7] feat: Update golinting for netlink functions returning ErrDumpInterrupted --- .golangci.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.golangci.yaml b/.golangci.yaml index 4cb216d2..23669ee7 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -8,6 +8,7 @@ linters: - gocritic - govet - ineffassign + - forbidigo - staticcheck exclusions: generated: lax @@ -20,6 +21,14 @@ linters: - third_party$ - builtin$ - examples$ + settings: + forbidigo: + forbid: + # List based on https://github.com/vishvananda/netlink/pull/1018 + - pattern: ^netlink\.(Handle\.)?("AddrList|BridgeVlanList|ChainList|ClassList|ConntrackTableList|DevLinkGetDeviceList|DevLinkGetAllPortList|DevlinkGetDeviceParams|FilterList|FouList|GenlFamilyList|GTPPDPList|LinkByName|LinkByAlias|LinkList|LinkSubscribeWithOptions|NeighList|NeighProxyList|NeighListExecute|LinkGetProtinfo|QdiscList|RdmaLinkList|RdmaLinkByName|RdmaLinkDel|RouteList|RouteListFiltered|RouteListFilteredIter|RouteSubscribeWithOptions|RuleList|RuleListFiltered|SocketGet|SocketDiagTCPInfo|SocketDiagTCP|SocketDiagUDPInfo|SocketDiagUDP|UnixSocketDiagInfo|UnixSocketDiag|SocketXDPGetInfo|SocketDiagXDP|VDPAGetDevList|VDPAGetDevConfigList|VDPAGetMGMTDevList|XfrmPolicyList|XfrmStateList) + pkg: ^github.com/vishvananda/netlink$ + msg: Found netlink function which can return ErrDumpInterrupted. Use nlwrap package instead. + analyze-types: true formatters: exclusions: generated: lax