Skip to content
This repository was archived by the owner on May 6, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pkg/driver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
// Values for the "status" label attached to the NRI plugin request metrics.
const (
// statusSuccess marks a request whose hook ran and returned no error.
statusSuccess = "success"
// statusFailed marks a request whose hook ran and returned an error.
statusFailed = "failed"
// statusNoop marks a request the driver had nothing to do for; the hooks
// start with this value and keep it when the pod has no stored configuration.
statusNoop = "noop"
)

const (
Expand Down Expand Up @@ -73,7 +74,7 @@ var (
Subsystem: "driver",
Name: "nri_plugin_requests_latency_seconds",
Help: "NRI plugin request latency in seconds.",
}, []string{"method"})
}, []string{"method", "status"})
publishedDevicesTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "dranet",
Subsystem: "driver",
Expand Down
87 changes: 52 additions & 35 deletions pkg/driver/nri_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,25 +61,25 @@ func (np *NetworkDriver) Synchronize(_ context.Context, pods []*api.PodSandbox,
// CreateContainer handles the NRI CreateContainer request for a container of
// the given pod and records request metrics for it.
//
// The request counter and the latency histogram are both updated exactly once,
// from the deferred function, using the final status:
//   - "noop" when the pod has no configuration in the pod config store,
//   - "failed" when createContainer returns an error,
//   - "success" otherwise.
//
// It returns whatever createContainer returns, or all nils for a noop request.
func (np *NetworkDriver) CreateContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) {
	klog.V(2).Infof("CreateContainer Pod %s/%s UID %s Container %s", pod.Namespace, pod.Name, pod.Uid, ctr.Name)
	start := time.Now()
	// Stays "noop" unless the pod turns out to have configuration below.
	status := statusNoop
	defer func() {
		// Both metrics carry the (method, status) label pair, so they must be
		// emitted after the status is known.
		nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, status).Inc()
		nriPluginRequestsLatencySeconds.WithLabelValues(methodCreateContainer, status).Observe(time.Since(start).Seconds())
	}()

	podConfig, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid()))
	if !ok {
		return nil, nil, nil
	}

	adjust, update, err := np.createContainer(ctx, pod, ctr, podConfig)
	if err != nil {
		status = statusFailed
	} else {
		status = statusSuccess
	}
	return adjust, update, err
}

func (np *NetworkDriver) createContainer(_ context.Context, pod *api.PodSandbox, ctr *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) {
podConfig, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid()))
if !ok {
return nil, nil, nil
}
func (np *NetworkDriver) createContainer(_ context.Context, pod *api.PodSandbox, _ *api.Container, podConfig map[string]PodConfig) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) {
// Containers only cares about the RDMA char devices
devPaths := set.Set[string]{}
adjust := &api.ContainerAdjustment{}
Expand All @@ -105,25 +105,27 @@ func (np *NetworkDriver) createContainer(_ context.Context, pod *api.PodSandbox,
func (np *NetworkDriver) RunPodSandbox(ctx context.Context, pod *api.PodSandbox) error {
klog.V(2).Infof("RunPodSandbox Pod %s/%s UID %s", pod.Namespace, pod.Name, pod.Uid)
start := time.Now()
status := statusNoop
defer func() {
nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, status).Inc()
klog.V(2).Infof("RunPodSandbox Pod %s/%s UID %s took %v", pod.Namespace, pod.Name, pod.Uid, time.Since(start))
nriPluginRequestsLatencySeconds.WithLabelValues(methodRunPodSandbox).Observe(time.Since(start).Seconds())
}()
err := np.runPodSandbox(ctx, pod)
if err != nil {
nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusFailed).Inc()
} else {
nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusSuccess).Inc()
}
return err
}
nriPluginRequestsLatencySeconds.WithLabelValues(methodRunPodSandbox, status).Observe(time.Since(start).Seconds())
Comment thread
gauravkghildiyal marked this conversation as resolved.

func (np *NetworkDriver) runPodSandbox(ctx context.Context, pod *api.PodSandbox) error {
}()
// get the devices associated to this Pod
podConfig, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid()))
if !ok {
return nil
}
err := np.runPodSandbox(ctx, pod, podConfig)
if err != nil {
status = statusFailed
} else {
status = statusSuccess
}
return err
}
func (np *NetworkDriver) runPodSandbox(_ context.Context, pod *api.PodSandbox, podConfig map[string]PodConfig) error {
// get the pod network namespace
ns := getNetworkNamespace(pod)
// host network pods can not allocate network devices because it impact the host
Expand Down Expand Up @@ -264,28 +266,30 @@ func (np *NetworkDriver) runPodSandbox(ctx context.Context, pod *api.PodSandbox)
// StopPodSandbox handles the NRI StopPodSandbox request for the given pod and
// records request metrics for it.
//
// The request counter and the latency histogram are both updated exactly once,
// from the deferred function, using the final status:
//   - "noop" when the pod has no configuration in the pod config store,
//   - "failed" when stopPodSandbox returns an error,
//   - "success" otherwise.
//
// It returns the error from stopPodSandbox, or nil for a noop request.
func (np *NetworkDriver) StopPodSandbox(ctx context.Context, pod *api.PodSandbox) error {
	klog.V(2).Infof("StopPodSandbox Pod %s/%s UID %s", pod.Namespace, pod.Name, pod.Uid)
	start := time.Now()
	// Stays "noop" unless the pod turns out to have configuration below.
	status := statusNoop
	defer func() {
		nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, status).Inc()
		klog.V(2).Infof("StopPodSandbox Pod %s/%s UID %s took %v", pod.Namespace, pod.Name, pod.Uid, time.Since(start))
		nriPluginRequestsLatencySeconds.WithLabelValues(methodStopPodSandbox, status).Observe(time.Since(start).Seconds())
	}()

	// get the devices associated to this Pod
	podConfig, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid()))
	if !ok {
		return nil
	}

	err := np.stopPodSandbox(ctx, pod, podConfig)
	if err != nil {
		status = statusFailed
	} else {
		status = statusSuccess
	}
	return err
}

func (np *NetworkDriver) stopPodSandbox(ctx context.Context, pod *api.PodSandbox) error {
func (np *NetworkDriver) stopPodSandbox(_ context.Context, pod *api.PodSandbox, podConfig map[string]PodConfig) error {
defer func() {
np.netdb.RemovePodNetNs(podKey(pod))
}()
// get the devices associated to this Pod
podConfig, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid()))
if !ok {
return nil
}
// get the pod network namespace
ns := getNetworkNamespace(pod)
if ns == "" {
Expand All @@ -312,15 +316,28 @@ func (np *NetworkDriver) stopPodSandbox(ctx context.Context, pod *api.PodSandbox
return nil
}

func (np *NetworkDriver) RemovePodSandbox(_ context.Context, pod *api.PodSandbox) error {
func (np *NetworkDriver) RemovePodSandbox(ctx context.Context, pod *api.PodSandbox) error {
klog.V(2).Infof("RemovePodSandbox Pod %s/%s UID %s", pod.Namespace, pod.Name, pod.Uid)
start := time.Now()
status := statusNoop
defer func() {
nriPluginRequestsLatencySeconds.WithLabelValues(methodRemovePodSandbox).Observe(time.Since(start).Seconds())
nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, status).Inc()
nriPluginRequestsLatencySeconds.WithLabelValues(methodRemovePodSandbox, status).Observe(time.Since(start).Seconds())
}()
if _, ok := np.podConfigStore.GetPodConfigs(types.UID(pod.GetUid())); !ok {
return nil
}
err := np.removePodSandbox(ctx, pod)
if err != nil {
status = statusFailed
} else {
status = statusSuccess
}
return err
}

// removePodSandbox drops the pod's network namespace entry from the driver's
// netdb and always returns nil.
//
// It deliberately does not touch nriPluginRequestsTotal: RemovePodSandbox
// already increments that counter once per request from its deferred
// function, so incrementing it here as well would count each removal twice.
func (np *NetworkDriver) removePodSandbox(_ context.Context, pod *api.PodSandbox) error {
	np.netdb.RemovePodNetNs(podKey(pod))
	return nil
}

Expand Down
99 changes: 52 additions & 47 deletions pkg/driver/nri_hooks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,8 @@ func TestCreateContainerNoDuplicateDevices(t *testing.T) {
podConfigStore: NewPodConfigStore(),
}


podUID := types.UID("test-pod")
pod := &api.PodSandbox{
podUID := types.UID("test-pod")
pod := &api.PodSandbox{
Uid: string(podUID),
Name: "test-pod",
Namespace: "test-ns",
Expand All @@ -48,7 +47,7 @@ pod := &api.PodSandbox{
{Path: "/dev/infiniband/uverbs0", Type: "c", Major: 231, Minor: 192},
}

podConfig := PodConfig{
podConfig := PodConfig{
RDMADevice: RDMAConfig{
DevChars: rdmaDevChars,
},
Expand Down Expand Up @@ -85,10 +84,11 @@ func TestCreateContainerMetrics(t *testing.T) {
nriPluginRequestsLatencySeconds.Reset()
np := &NetworkDriver{
podConfigStore: tc.podConfigStore,
netdb: inventory.New(),
}
podUID := types.UID("test-pod")
pod := &api.PodSandbox{

podUID := types.UID("test-pod")
pod := &api.PodSandbox{
Uid: string(podUID),
Name: "test-pod",
Namespace: "test-ns",
Expand All @@ -101,24 +101,24 @@ pod := &api.PodSandbox{
expected := `
# HELP dranet_driver_nri_plugin_requests_latency_seconds NRI plugin request latency in seconds.
# TYPE dranet_driver_nri_plugin_requests_latency_seconds histogram
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.005"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.01"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.025"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.05"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.25"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="0.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="2.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="10"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",le="+Inf"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="0.005"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="0.01"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="0.025"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="0.05"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="0.1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="0.25"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="0.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="2.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="10"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="CreateContainer",status="noop",le="+Inf"} 1
`
if err := testutil.CollectAndCompare(nriPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_nri_plugin_requests_latency_seconds_bucket"); err != nil {
t.Fatalf("CollectAndCompare failed: %v", err)
}
if tc.expectSuccess {
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusSuccess)); got != float64(1) {
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusNoop)); got != float64(1) {
t.Errorf("Expected 1 success, got %f", got)
}
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodCreateContainer, statusFailed)); got != float64(0) {
Expand Down Expand Up @@ -192,6 +192,10 @@ func TestRunPodSandboxMetrics(t *testing.T) {
}

np.RunPodSandbox(context.Background(), tc.pod)
status := statusSuccess
if !tc.expectSuccess {
status = statusFailed
}
expected := `
# HELP dranet_driver_nri_plugin_requests_latency_seconds NRI plugin request latency in seconds.
# TYPE dranet_driver_nri_plugin_requests_latency_seconds histogram
Expand All @@ -208,11 +212,12 @@ func TestRunPodSandboxMetrics(t *testing.T) {
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="10"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RunPodSandbox",le="+Inf"} 1
`
expected = strings.Replace(expected, `method="RunPodSandbox"`, `method="RunPodSandbox",status="`+status+`"`, -1)
if err := testutil.CollectAndCompare(nriPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_nri_plugin_requests_latency_seconds_bucket"); err != nil {
t.Fatalf("CollectAndCompare failed: %v", err)
}
if tc.expectSuccess {
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusSuccess)); got != float64(1) {
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusNoop)); got != float64(1) {
t.Errorf("Expected 1 success, got %f", got)
}
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRunPodSandbox, statusFailed)); got != float64(0) {
Expand Down Expand Up @@ -262,24 +267,24 @@ func TestStopPodSandboxMetrics(t *testing.T) {
expected := `
# HELP dranet_driver_nri_plugin_requests_latency_seconds NRI plugin request latency in seconds.
# TYPE dranet_driver_nri_plugin_requests_latency_seconds histogram
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.005"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.01"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.025"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.05"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.25"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="0.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="2.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="10"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",le="+Inf"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="0.005"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="0.01"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="0.025"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="0.05"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="0.1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="0.25"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="0.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="2.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="10"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="StopPodSandbox",status="success",le="+Inf"} 1
`
if err := testutil.CollectAndCompare(nriPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_nri_plugin_requests_latency_seconds_bucket"); err != nil {
t.Fatalf("CollectAndCompare failed: %v", err)
}
if tc.expectSuccess {
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusSuccess)); got != float64(1) {
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusNoop)); got != float64(1) {
t.Errorf("Expected 1 success, got %f", got)
}
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodStopPodSandbox, statusFailed)); got != float64(0) {
Expand Down Expand Up @@ -329,24 +334,24 @@ func TestRemovePodSandboxMetrics(t *testing.T) {
expected := `
# HELP dranet_driver_nri_plugin_requests_latency_seconds NRI plugin request latency in seconds.
# TYPE dranet_driver_nri_plugin_requests_latency_seconds histogram
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.005"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.01"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.025"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.05"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.25"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="0.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="2.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="10"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",le="+Inf"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="0.005"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="0.01"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="0.025"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="0.05"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="0.1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="0.25"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="0.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="1"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="2.5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="5"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="10"} 1
dranet_driver_nri_plugin_requests_latency_seconds_bucket{method="RemovePodSandbox",status="success",le="+Inf"} 1
`
if err := testutil.CollectAndCompare(nriPluginRequestsLatencySeconds, strings.NewReader(expected), "dranet_driver_nri_plugin_requests_latency_seconds_bucket"); err != nil {
t.Fatalf("CollectAndCompare failed: %v", err)
}
if tc.expectSuccess {
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, statusSuccess)); got != float64(1) {
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, statusNoop)); got != float64(1) {
t.Errorf("Expected 1 success, got %f", got)
}
if got := testutil.ToFloat64(nriPluginRequestsTotal.WithLabelValues(methodRemovePodSandbox, statusFailed)); got != float64(0) {
Expand Down
Loading