From 4e2b864a4e182df9b231b4c120081c1e82b874da Mon Sep 17 00:00:00 2001 From: Aman Agrawal Date: Tue, 2 Sep 2025 23:50:33 +0000 Subject: [PATCH] add DRA and NRI latency, success metrics and tests for metrics --- go.mod | 1 + pkg/driver/dra_hooks.go | 67 +++++++- pkg/driver/dra_hooks_test.go | 241 +++++++++++++++++++++++++++ pkg/driver/metrics.go | 53 +++++- pkg/driver/nri_hooks.go | 50 +++++- pkg/driver/nri_hooks_test.go | 309 ++++++++++++++++++++++++++++++++++- 6 files changed, 708 insertions(+), 13 deletions(-) create mode 100644 pkg/driver/dra_hooks_test.go diff --git a/go.mod b/go.mod index 2fb39316..51a33a7c 100644 --- a/go.mod +++ b/go.mod @@ -66,6 +66,7 @@ require ( github.com/josharian/native v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/knqyf263/go-plugin v0.9.0 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/mdlayher/packet v1.1.2 // indirect github.com/mdlayher/socket v0.5.1 // indirect diff --git a/pkg/driver/dra_hooks.go b/pkg/driver/dra_hooks.go index ec1936aa..c84bccee 100644 --- a/pkg/driver/dra_hooks.go +++ b/pkg/driver/dra_hooks.go @@ -58,6 +58,9 @@ func (np *NetworkDriver) PublishResources(ctx context.Context) { klog.V(4).Infof("Received %d devices", len(devices)) devices = filter.FilterDevices(np.celProgram, devices) klog.V(4).Infof("After filtering %d devices", len(devices)) + + np.publishResourcesPrometheusMetrics(devices) + resources := resourceslice.DriverResources{ Pools: map[string]resourceslice.Pool{ np.nodeName: {Slices: []resourceslice.Slice{{Devices: devices}}}}, @@ -75,11 +78,45 @@ func (np *NetworkDriver) PublishResources(ctx context.Context) { } } +func (np *NetworkDriver) publishResourcesPrometheusMetrics(devices []resourceapi.Device) { + rdmaCount := 0 + for _, device := range devices { + if attr, ok := device.Attributes[apis.AttrRDMA]; ok && attr.BoolValue != nil && *attr.BoolValue { + rdmaCount++ + } + } + 
publishedDevicesTotal.WithLabelValues("rdma").Set(float64(rdmaCount)) + publishedDevicesTotal.WithLabelValues("total").Set(float64(len(devices))) +} + func (np *NetworkDriver) PrepareResourceClaims(ctx context.Context, claims []*resourceapi.ResourceClaim) (map[types.UID]kubeletplugin.PrepareResult, error) { klog.V(2).Infof("PrepareResourceClaims is called: number of claims: %d", len(claims)) + start := time.Now() + defer func() { + draPluginRequestsLatencySeconds.WithLabelValues(methodPrepareResourceClaims).Observe(time.Since(start).Seconds()) + }() + result, err := np.prepareResourceClaims(ctx, claims) + if err != nil { + draPluginRequestsTotal.WithLabelValues(methodPrepareResourceClaims, statusFailed).Inc() + return result, err + } + // identify errors and log metrics + isError := false + for _, res := range result { + if res.Err != nil { + isError = true + break + } + } + if isError { + draPluginRequestsTotal.WithLabelValues(methodPrepareResourceClaims, statusFailed).Inc() + } else { + draPluginRequestsTotal.WithLabelValues(methodPrepareResourceClaims, statusSuccess).Inc() + } + return result, err +} - nodePrepareRequestsTotal.Inc() - +func (np *NetworkDriver) prepareResourceClaims(ctx context.Context, claims []*resourceapi.ResourceClaim) (map[types.UID]kubeletplugin.PrepareResult, error) { if len(claims) == 0 { return nil, nil } @@ -323,6 +360,32 @@ func (np *NetworkDriver) prepareResourceClaim(ctx context.Context, claim *resour func (np *NetworkDriver) UnprepareResourceClaims(ctx context.Context, claims []kubeletplugin.NamespacedObject) (map[types.UID]error, error) { klog.V(2).Infof("UnprepareResourceClaims is called: number of claims: %d", len(claims)) + start := time.Now() + defer func() { + draPluginRequestsLatencySeconds.WithLabelValues(methodUnprepareResourceClaims).Observe(time.Since(start).Seconds()) + }() + result, err := np.unprepareResourceClaims(ctx, claims) + if err != nil { + draPluginRequestsTotal.WithLabelValues(methodUnprepareResourceClaims, 
statusFailed).Inc() + return result, err + } + // identify errors and log metrics + isError := false + for _, res := range result { + if res != nil { + isError = true + break + } + } + if isError { + draPluginRequestsTotal.WithLabelValues(methodUnprepareResourceClaims, statusFailed).Inc() + } else { + draPluginRequestsTotal.WithLabelValues(methodUnprepareResourceClaims, statusSuccess).Inc() + } + return result, err +} + +func (np *NetworkDriver) unprepareResourceClaims(ctx context.Context, claims []kubeletplugin.NamespacedObject) (map[types.UID]error, error) { if len(claims) == 0 { return nil, nil } diff --git a/pkg/driver/dra_hooks_test.go b/pkg/driver/dra_hooks_test.go new file mode 100644 index 00000000..8d5f9488 --- /dev/null +++ b/pkg/driver/dra_hooks_test.go @@ -0,0 +1,241 @@ +/* +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package driver + +import ( + "context" + "strings" + "testing" + + "github.com/google/dranet/pkg/apis" + "github.com/google/dranet/pkg/inventory" + "github.com/prometheus/client_golang/prometheus/testutil" + resourcev1 "k8s.io/api/resource/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/dynamic-resource-allocation/kubeletplugin" +) + +func TestPublishResourcesPrometheusMetrics(t *testing.T) { + testCases := []struct { + name string + devices []resourcev1.Device + expectedRdma float64 + expectedTotal float64 + }{ + { + name: "No devices", + devices: []resourcev1.Device{}, + expectedRdma: 0, + expectedTotal: 0, + }, + { + name: "Only RDMA devices", + devices: []resourcev1.Device{ + {Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + apis.AttrRDMA: {BoolValue: func() *bool { b := true; return &b }()}, + }}, + }, + expectedRdma: 1, + expectedTotal: 1, + }, + { + name: "Only non-RDMA devices", + devices: []resourcev1.Device{ + {Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + apis.AttrRDMA: {BoolValue: func() *bool { b := false; return &b }()}, + }}, + }, + expectedRdma: 0, + expectedTotal: 1, + }, + { + name: "Mixed RDMA and non-RDMA devices", + devices: []resourcev1.Device{ + {Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + apis.AttrRDMA: {BoolValue: func() *bool { b := true; return &b }()}, + }}, + {Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + apis.AttrRDMA: {BoolValue: func() *bool { b := true; return &b }()}, + }}, + {Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + apis.AttrRDMA: {BoolValue: func() *bool { b := false; return &b }()}, + }}, + }, + expectedRdma: 2, + expectedTotal: 3, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + publishedDevicesTotal.Reset() + np := &NetworkDriver{} + np.publishResourcesPrometheusMetrics(tc.devices) + + if got := 
testutil.ToFloat64(publishedDevicesTotal.WithLabelValues("rdma")); got != tc.expectedRdma { + t.Errorf("Expected %f for RDMA devices, got %f", tc.expectedRdma, got) + } + if got := testutil.ToFloat64(publishedDevicesTotal.WithLabelValues("total")); got != tc.expectedTotal { + t.Errorf("Expected %f for Total devices, got %f", tc.expectedTotal, got) + } + }) + } +} + +func TestPrepareResourceClaimsMetrics(t *testing.T) { + ctx := context.Background() + + t.Run("Success Case", func(t *testing.T) { + draPluginRequestsTotal.Reset() + draPluginRequestsLatencySeconds.Reset() + + np := &NetworkDriver{} + if _, err := np.PrepareResourceClaims(ctx, []*resourcev1.ResourceClaim{}); err != nil { + t.Fatalf("PrepareResourceClaims failed: %v", err) + } + + if got := testutil.ToFloat64(draPluginRequestsTotal.WithLabelValues(methodPrepareResourceClaims, statusSuccess)); got != float64(1) { + t.Errorf("Expected 1 success, got %f", got) + } + if got := testutil.ToFloat64(draPluginRequestsTotal.WithLabelValues(methodPrepareResourceClaims, statusFailed)); got != float64(0) { + t.Errorf("Expected 0 failures, got %f", got) + } + + expected := ` + # HELP dranet_driver_dra_plugin_requests_latency_seconds DRA plugin request latency in seconds. 
+ # TYPE dranet_driver_dra_plugin_requests_latency_seconds histogram + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="0.005"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="0.01"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="0.025"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="0.05"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="0.1"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="0.25"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="0.5"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="1"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="2.5"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="5"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="10"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="PrepareResourceClaims",le="+Inf"} 1 + ` + if err := testutil.CollectAndCompare(draPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_dra_plugin_requests_latency_seconds_bucket"); err != nil { + t.Fatalf("CollectAndCompare failed: %v", err) + } + }) + + t.Run("Failure Case", func(t *testing.T) { + draPluginRequestsTotal.Reset() + draPluginRequestsLatencySeconds.Reset() + + np := &NetworkDriver{ + netdb: inventory.New(), + driverName: "test.driver", + } + + claims := []*resourcev1.ResourceClaim{ + { + ObjectMeta: metav1.ObjectMeta{UID: "claim-uid-1"}, + Status: resourcev1.ResourceClaimStatus{ + ReservedFor: []resourcev1.ResourceClaimConsumerReference{ + {APIGroup: "", Resource: "pods", Name: "test-pod", UID: 
"pod-uid-1"}, + }, + Allocation: &resourcev1.AllocationResult{ + Devices: resourcev1.DeviceAllocationResult{ + Results: []resourcev1.DeviceRequestAllocationResult{ + {Driver: "test.driver", Device: "device-does-not-exist"}, + }, + }, + }, + }, + }, + } + + res, err := np.PrepareResourceClaims(ctx, claims) + if err != nil { + t.Fatalf("PrepareResourceClaims failed: %v", err) + } + if res["claim-uid-1"].Err == nil { + t.Errorf("Expected an error for claim-uid-1, but got none") + } + + if got := testutil.ToFloat64(draPluginRequestsTotal.WithLabelValues(methodPrepareResourceClaims, statusSuccess)); got != float64(0) { + t.Errorf("Expected 0 successes, got %f", got) + } + if got := testutil.ToFloat64(draPluginRequestsTotal.WithLabelValues(methodPrepareResourceClaims, statusFailed)); got != float64(1) { + t.Errorf("Expected 1 failure, got %f", got) + } + + if count := testutil.CollectAndCount(draPluginRequestsLatencySeconds); count != 1 { + t.Errorf("Expected 1 latency metric, got %d", count) + } + }) +} + +func TestUnprepareResourceClaimsMetrics(t *testing.T) { + ctx := context.Background() + + t.Run("Success Case", func(t *testing.T) { + draPluginRequestsTotal.Reset() + draPluginRequestsLatencySeconds.Reset() + + np := &NetworkDriver{ + podConfigStore: NewPodConfigStore(), + } + claimName := types.NamespacedName{Name: "test-claim", Namespace: "test-ns"} + np.podConfigStore.Set("pod-uid-1", "device-a", PodConfig{Claim: claimName}) + + claims := []kubeletplugin.NamespacedObject{ + {NamespacedName: claimName, UID: "claim-uid-1"}, + } + + if _, err := np.UnprepareResourceClaims(ctx, claims); err != nil { + t.Fatalf("UnprepareResourceClaims failed: %v", err) + } + + // Verify the claim was removed from the store + if _, ok := np.podConfigStore.GetPodConfigs("pod-uid-1"); ok { + t.Errorf("Pod config should have been removed, but was found") + } + + if got := testutil.ToFloat64(draPluginRequestsTotal.WithLabelValues(methodUnprepareResourceClaims, statusSuccess)); got != 
float64(1) { + t.Errorf("Expected 1 success, got %f", got) + } + if got := testutil.ToFloat64(draPluginRequestsTotal.WithLabelValues(methodUnprepareResourceClaims, statusFailed)); got != float64(0) { + t.Errorf("Expected 0 failures, got %f", got) + } + + expected := ` + # HELP dranet_driver_dra_plugin_requests_latency_seconds DRA plugin request latency in seconds. + # TYPE dranet_driver_dra_plugin_requests_latency_seconds histogram + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="0.005"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="0.01"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="0.025"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="0.05"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="0.1"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="0.25"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="0.5"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="1"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="2.5"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="5"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="10"} 1 + dranet_driver_dra_plugin_requests_latency_seconds_bucket{method="UnprepareResourceClaims",le="+Inf"} 1 + ` + if err := testutil.CollectAndCompare(draPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_dra_plugin_requests_latency_seconds_bucket"); err != nil { + t.Fatalf("CollectAndCompare failed: %v", err) + } + }) +} diff --git a/pkg/driver/metrics.go b/pkg/driver/metrics.go index 
c7311196..b1f68501 100644 --- a/pkg/driver/metrics.go +++ b/pkg/driver/metrics.go @@ -22,19 +22,62 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +const ( + statusSuccess = "success" + statusFailed = "failed" +) + +const ( + methodPrepareResourceClaims = "PrepareResourceClaims" + methodUnprepareResourceClaims = "UnprepareResourceClaims" + methodRunPodSandbox = "RunPodSandbox" + methodStopPodSandbox = "StopPodSandbox" + methodRemovePodSandbox = "RemovePodSandbox" + methodCreateContainer = "CreateContainer" +) + var registerMetricsOnce sync.Once func registerMetrics() { registerMetricsOnce.Do(func() { - prometheus.MustRegister(nodePrepareRequestsTotal) + prometheus.MustRegister(draPluginRequestsTotal) + prometheus.MustRegister(draPluginRequestsLatencySeconds) + prometheus.MustRegister(nriPluginRequestsTotal) + prometheus.MustRegister(nriPluginRequestsLatencySeconds) + prometheus.MustRegister(publishedDevicesTotal) }) } var ( - nodePrepareRequestsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + draPluginRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "dranet", Subsystem: "driver", - Name: "node_prepare_requests_total", - Help: "Total number of NodePrepareResources requests received.", - }) + Name: "dra_plugin_requests_total", + Help: "Total number of DRA plugin requests.", + }, []string{"method", "status"}) + draPluginRequestsLatencySeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "dranet", + Subsystem: "driver", + Name: "dra_plugin_requests_latency_seconds", + Help: "DRA plugin request latency in seconds.", + }, []string{"method"}) + nriPluginRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "dranet", + Subsystem: "driver", + Name: "nri_plugin_requests_total", + Help: "Total number of NRI plugin requests.", + }, []string{"method", "status"}) + nriPluginRequestsLatencySeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "dranet", + 
Subsystem: "driver", + Name: "nri_plugin_requests_latency_seconds", + Help: "NRI plugin request latency in seconds.", + }, []string{"method"}) + publishedDevicesTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "dranet", + Subsystem: "driver", + Name: "published_devices_total", + Help: "Total number of published devices.", + }, []string{"feature"}) ) + diff --git a/pkg/driver/nri_hooks.go b/pkg/driver/nri_hooks.go index 3c095ef1..26aafac4 100644 --- a/pkg/driver/nri_hooks.go +++ b/pkg/driver/nri_hooks.go @@ -58,8 +58,24 @@ func (np *NetworkDriver) Synchronize(_ context.Context, pods []*api.PodSandbox, } // CreateContainer handles container creation requests. -func (np *NetworkDriver) CreateContainer(_ context.Context, pod *api.PodSandbox, ctr *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) { +func (np *NetworkDriver) CreateContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) { klog.V(2).Infof("CreateContainer Pod %s/%s UID %s Container %s", pod.Namespace, pod.Name, pod.Uid, ctr.Name) + start := time.Now() + defer func() { + nriPluginRequestsLatencySeconds.WithLabelValues(methodCreateContainer).Observe(time.Since(start).Seconds()) + }() + + adjust, update, err := np.createContainer(ctx, pod, ctr) + + if err != nil { + nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusFailed).Inc() + } else { + nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusSuccess).Inc() + } + return adjust, update, err +} + +func (np *NetworkDriver) createContainer(_ context.Context, pod *api.PodSandbox, ctr *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) { podConfig, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid())) if !ok { return nil, nil, nil @@ -91,7 +107,18 @@ func (np *NetworkDriver) RunPodSandbox(ctx context.Context, pod *api.PodSandbox) start := time.Now() defer func() { 
klog.V(2).Infof("RunPodSandbox Pod %s/%s UID %s took %v", pod.Namespace, pod.Name, pod.Uid, time.Since(start)) + nriPluginRequestsLatencySeconds.WithLabelValues(methodRunPodSandbox).Observe(time.Since(start).Seconds()) }() + err := np.runPodSandbox(ctx, pod) + if err != nil { + nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusFailed).Inc() + } else { + nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusSuccess).Inc() + } + return err +} + +func (np *NetworkDriver) runPodSandbox(ctx context.Context, pod *api.PodSandbox) error { // get the devices associated to this Pod podConfig, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid())) if !ok { @@ -230,10 +257,22 @@ func (np *NetworkDriver) StopPodSandbox(ctx context.Context, pod *api.PodSandbox klog.V(2).Infof("StopPodSandbox Pod %s/%s UID %s", pod.Namespace, pod.Name, pod.Uid) start := time.Now() defer func() { - np.netdb.RemovePodNetNs(podKey(pod)) klog.V(2).Infof("StopPodSandbox Pod %s/%s UID %s took %v", pod.Namespace, pod.Name, pod.Uid, time.Since(start)) + nriPluginRequestsLatencySeconds.WithLabelValues(methodStopPodSandbox).Observe(time.Since(start).Seconds()) }() + err := np.stopPodSandbox(ctx, pod) + if err != nil { + nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusFailed).Inc() + } else { + nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusSuccess).Inc() + } + return err +} +func (np *NetworkDriver) stopPodSandbox(ctx context.Context, pod *api.PodSandbox) error { + defer func() { + np.netdb.RemovePodNetNs(podKey(pod)) + }() // get the devices associated to this Pod podConfig, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid())) if !ok { @@ -251,7 +290,6 @@ func (np *NetworkDriver) StopPodSandbox(ctx context.Context, pod *api.PodSandbox return nil } } - for deviceName, config := range podConfig { if err := nsDetachNetdev(ns, config.NetworkInterfaceConfigInPod.Interface.Name, 
config.NetworkInterfaceConfigInHost.Interface.Name); err != nil { klog.Infof("fail to return network device %s : %v", deviceName, err) @@ -268,7 +306,13 @@ func (np *NetworkDriver) StopPodSandbox(ctx context.Context, pod *api.PodSandbox func (np *NetworkDriver) RemovePodSandbox(_ context.Context, pod *api.PodSandbox) error { klog.V(2).Infof("RemovePodSandbox Pod %s/%s UID %s", pod.Namespace, pod.Name, pod.Uid) + start := time.Now() + defer func() { + nriPluginRequestsLatencySeconds.WithLabelValues(methodRemovePodSandbox).Observe(time.Since(start).Seconds()) + }() + np.netdb.RemovePodNetNs(podKey(pod)) + nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, statusSuccess).Inc() return nil } diff --git a/pkg/driver/nri_hooks_test.go b/pkg/driver/nri_hooks_test.go index 0467e462..2378a351 100644 --- a/pkg/driver/nri_hooks_test.go +++ b/pkg/driver/nri_hooks_test.go @@ -18,9 +18,12 @@ package driver import ( "context" + "strings" "testing" "github.com/containerd/nri/pkg/api" + "github.com/google/dranet/pkg/inventory" + "github.com/prometheus/client_golang/prometheus/testutil" "k8s.io/apimachinery/pkg/types" ) @@ -29,8 +32,9 @@ func TestCreateContainerNoDuplicateDevices(t *testing.T) { podConfigStore: NewPodConfigStore(), } - podUID := types.UID("test-pod") - pod := &api.PodSandbox{ + + podUID := types.UID("test-pod") + pod := &api.PodSandbox{ Uid: string(podUID), Name: "test-pod", Namespace: "test-ns", @@ -43,7 +47,8 @@ func TestCreateContainerNoDuplicateDevices(t *testing.T) { rdmaDevChars := []LinuxDevice{ {Path: "/dev/infiniband/uverbs0", Type: "c", Major: 231, Minor: 192}, } - podConfig := PodConfig{ + + podConfig := PodConfig{ RDMADevice: RDMAConfig{ DevChars: rdmaDevChars, }, @@ -60,3 +65,301 @@ func TestCreateContainerNoDuplicateDevices(t *testing.T) { t.Errorf("CreateContainer should not adjust the same device multiple times\n%v", adjust.Linux.Devices) } } + +func TestCreateContainerMetrics(t *testing.T) { + testCases := []struct { + name string +
podConfigStore *PodConfigStore + expectSuccess bool + }{ + { + name: "Success", + podConfigStore: NewPodConfigStore(), + expectSuccess: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + nriPluginRequestsTotal.Reset() + nriPluginRequestsLatencySeconds.Reset() + np := &NetworkDriver{ + podConfigStore: tc.podConfigStore, + } + + podUID := types.UID("test-pod") + pod := &api.PodSandbox{ + Uid: string(podUID), + Name: "test-pod", + Namespace: "test-ns", + } + ctr := &api.Container{ + Name: "test-container", + } + + np.CreateContainer(context.Background(), pod, ctr) + expected := ` + # HELP dranet_driver_nri_plugin_requests_latency_seconds NRI plugin request latency in seconds. + # TYPE dranet_driver_nri_plugin_requests_latency_seconds histogram + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.005"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.01"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.025"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.05"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.1"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.25"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="1"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="2.5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="10"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="+Inf"} 1 + ` + if err :=
testutil.CollectAndCompare(nriPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_nri_plugin_requests_latency_seconds_bucket"); err != nil { + t.Fatalf("CollectAndCompare failed: %v", err) + } + if tc.expectSuccess { + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusSuccess)); got != float64(1) { + t.Errorf("Expected 1 success, got %f", got) + } + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusFailed)); got != float64(0) { + t.Errorf("Expected 0 failures, got %f", got) + } + } else { + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusSuccess)); got != float64(0) { + t.Errorf("Expected 0 successes, got %f", got) + } + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusFailed)); got != float64(1) { + t.Errorf("Expected 1 failure, got %f", got) + } + } + }) + } +} + +func TestRunPodSandboxMetrics(t *testing.T) { + podUID := types.UID("test-pod") + podUIDHostNetwork := types.UID("test-pod-host-network") + + testCases := []struct { + name string + podConfigStore *PodConfigStore + pod *api.PodSandbox + expectSuccess bool + }{ + { + name: "Success", + podConfigStore: NewPodConfigStore(), + pod: &api.PodSandbox{ + Uid: string(podUID), + Name: "test-pod", + Namespace: "test-ns", + Linux: &api.LinuxPodSandbox{ + Namespaces: []*api.LinuxNamespace{ + { + Type: "network", + Path: "/var/run/netns/test", + }, + }, + }, + }, + expectSuccess: true, + }, + { + name: "Failure - Host Network", + podConfigStore: NewPodConfigStore(), + pod: &api.PodSandbox{ + Uid: string(podUIDHostNetwork), + Name: "test-pod-host-network", + Namespace: "test-ns", + Linux: &api.LinuxPodSandbox{}, // No network namespace + }, + expectSuccess: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + nriPluginRequestsTotal.Reset() + 
nriPluginRequestsLatencySeconds.Reset() + np := &NetworkDriver{ + podConfigStore: tc.podConfigStore, + netdb: inventory.New(), + } + + // For the failure case, a pod config must exist. + if !tc.expectSuccess { + tc.podConfigStore.Set(podUIDHostNetwork, "eth0", PodConfig{}) + } + + np.RunPodSandbox(context.Background(), tc.pod) + expected := ` + # HELP dranet_driver_nri_plugin_requests_latency_seconds NRI plugin request latency in seconds. + # TYPE dranet_driver_nri_plugin_requests_latency_seconds histogram + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="0.005"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="0.01"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="0.025"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="0.05"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="0.1"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="0.25"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="0.5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="1"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="2.5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="10"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="+Inf"} 1 + ` + if err := testutil.CollectAndCompare(nriPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_nri_plugin_requests_latency_seconds_bucket"); err != nil { + t.Fatalf("CollectAndCompare failed: %v", err) + } + if tc.expectSuccess { + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, 
statusSuccess)); got != float64(1) { + t.Errorf("Expected 1 success, got %f", got) + } + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusFailed)); got != float64(0) { + t.Errorf("Expected 0 failures, got %f", got) + } + } else { + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusSuccess)); got != float64(0) { + t.Errorf("Expected 0 successes, got %f", got) + } + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusFailed)); got != float64(1) { + t.Errorf("Expected 1 failure, got %f", got) + } + } + }) + } +} + +func TestStopPodSandboxMetrics(t *testing.T) { + testCases := []struct { + name string + podConfigStore *PodConfigStore + expectSuccess bool + }{ + { + name: "Success", + podConfigStore: NewPodConfigStore(), + expectSuccess: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + nriPluginRequestsTotal.Reset() + nriPluginRequestsLatencySeconds.Reset() + np := &NetworkDriver{ + podConfigStore: tc.podConfigStore, + netdb: inventory.New(), + } + podUID := types.UID("test-pod") + pod := &api.PodSandbox{ + Uid: string(podUID), + Name: "test-pod", + Namespace: "test-ns", + } + + np.StopPodSandbox(context.Background(), pod) + expected := ` + # HELP dranet_driver_nri_plugin_requests_latency_seconds NRI plugin request latency in seconds. 
+ # TYPE dranet_driver_nri_plugin_requests_latency_seconds histogram + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.005"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.01"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.025"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.05"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.1"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.25"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="1"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="2.5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="10"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="+Inf"} 1 + ` + if err := testutil.CollectAndCompare(nriPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_nri_plugin_requests_latency_seconds_bucket"); err != nil { + t.Fatalf("CollectAndCompare failed: %v", err) + } + if tc.expectSuccess { + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusSuccess)); got != float64(1) { + t.Errorf("Expected 1 success, got %f", got) + } + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusFailed)); got != float64(0) { + t.Errorf("Expected 0 failures, got %f", got) + } + } else { + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusSuccess)); got != float64(0) { + t.Errorf("Expected 0 successes, 
got %f", got) + } + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusFailed)); got != float64(1) { + t.Errorf("Expected 1 failure, got %f", got) + } + } + }) + } +} + +func TestRemovePodSandboxMetrics(t *testing.T) { + testCases := []struct { + name string + podConfigStore *PodConfigStore + expectSuccess bool + }{ + { + name: "Success", + podConfigStore: NewPodConfigStore(), + expectSuccess: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + nriPluginRequestsTotal.Reset() + nriPluginRequestsLatencySeconds.Reset() + np := &NetworkDriver{ + podConfigStore: tc.podConfigStore, + netdb: inventory.New(), + } + podUID := types.UID("test-pod") + pod := &api.PodSandbox{ + Uid: string(podUID), + Name: "test-pod", + Namespace: "test-ns", + } + + np.RemovePodSandbox(context.Background(), pod) + expected := ` + # HELP dranet_driver_nri_plugin_requests_latency_seconds NRI plugin request latency in seconds. + # TYPE dranet_driver_nri_plugin_requests_latency_seconds histogram + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.005"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.01"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.025"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.05"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.1"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.25"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="1"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="2.5"} 1 + 
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="5"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="10"} 1 + dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="+Inf"} 1 + ` + if err := testutil.CollectAndCompare(nriPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_nri_plugin_requests_latency_seconds_bucket"); err != nil { + t.Fatalf("CollectAndCompare failed: %v", err) + } + if tc.expectSuccess { + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, statusSuccess)); got != float64(1) { + t.Errorf("Expected 1 success, got %f", got) + } + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, statusFailed)); got != float64(0) { + t.Errorf("Expected 0 failures, got %f", got) + } + } else { + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, statusSuccess)); got != float64(0) { + t.Errorf("Expected 0 successes, got %f", got) + } + if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, statusFailed)); got != float64(1) { + t.Errorf("Expected 1 failure, got %f", got) + } + } + }) + } +}