From 2b64ef86fe489d797c9647a219141372ca3e09e0 Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Mon, 14 Jul 2025 21:14:37 -0700 Subject: [PATCH 1/3] fix: Trap SIGTERM for graceful shutdown --- cmd/dranet/app.go | 26 ++++++++++---------------- pkg/driver/driver.go | 6 ++---- pkg/driver/nri_hooks.go | 4 ++++ 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/cmd/dranet/app.go b/cmd/dranet/app.go index 299a869b..58a64832 100644 --- a/cmd/dranet/app.go +++ b/cmd/dranet/app.go @@ -26,14 +26,13 @@ import ( "reflect" "runtime/debug" "sync/atomic" + "syscall" "github.com/google/cel-go/cel" "github.com/google/cel-go/ext" "github.com/google/dranet/pkg/driver" "github.com/prometheus/client_golang/prometheus/promhttp" - "golang.org/x/sys/unix" - resourcev1beta1 "k8s.io/api/resource/v1beta1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -119,17 +118,11 @@ func main() { klog.Fatalf("can not obtain the node name, use the hostname-override flag if you want to set it to a specific value: %v", err) } - // trap Ctrl+C and call cancel on the context - ctx := context.Background() - ctx, cancel := context.WithCancel(ctx) + ctx, cancel := context.WithCancel(context.Background()) - // Enable signal handler - signalCh := make(chan os.Signal, 2) - defer func() { - close(signalCh) - cancel() - }() - signal.Notify(signalCh, os.Interrupt, unix.SIGINT) + // Trap signals for graceful shutdown. + signalCh := make(chan os.Signal, 1) + signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM) opts := []driver.Option{} if celExpression != "" { @@ -156,16 +149,17 @@ func main() { if err != nil { klog.Fatalf("driver failed to start: %v", err) } - defer dranet.Stop() + defer dranet.Stop() // Gracefully shutdown at the end. + ready.Store(true) klog.Info("driver started") select { - case <-signalCh: - klog.Infof("Exiting: received signal") + case sig := <-signalCh: + klog.Infof("Received shutdown signal: %q. 
Initiating graceful shutdown...", sig) cancel() case <-ctx.Done(): - klog.Infof("Exiting: context cancelled") + klog.Info("Context cancelled. Initiating graceful shutdown...") } } diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index 19176cc9..aa8ca4e6 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -178,10 +178,8 @@ func Start(ctx context.Context, driverName string, kubeClient kubernetes.Interfa } func (np *NetworkDriver) Stop() { + // Stop NRI Plugin (it's expected that it returns when fully stopped). np.nriPlugin.Stop() + // Stop DRA Plugin (returns only after it has fully stopped). np.draPlugin.Stop() } - -func (np *NetworkDriver) Shutdown(_ context.Context) { - klog.Info("Runtime shutting down...") -} diff --git a/pkg/driver/nri_hooks.go b/pkg/driver/nri_hooks.go index 16631005..8f21cc3e 100644 --- a/pkg/driver/nri_hooks.go +++ b/pkg/driver/nri_hooks.go @@ -276,6 +276,10 @@ func (np *NetworkDriver) RemovePodSandbox(_ context.Context, pod *api.PodSandbox return nil } +func (np *NetworkDriver) Shutdown(_ context.Context) { + klog.Info("Runtime shutting down...") +} + func getNetworkNamespace(pod *api.PodSandbox) string { // get the pod network namespace for _, namespace := range pod.Linux.GetNamespaces() { From 5bb732ca4539766bb9b2b7143742d44d8548b0f1 Mon Sep 17 00:00:00 2001 From: Gaurav Ghildiyal Date: Mon, 14 Jul 2025 21:15:05 -0700 Subject: [PATCH 2/3] feat: Add e2e test for verifying graceful shutdown --- tests/e2e.bats | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/e2e.bats b/tests/e2e.bats index f472cb05..36363ebb 100644 --- a/tests/e2e.bats +++ b/tests/e2e.bats @@ -295,3 +295,51 @@ load 'test_helper/bats-assert/load' kubectl delete -f "$BATS_TEST_DIRNAME"/../examples/repeatresourceclaimtemplate.yaml } + +@test "driver should gracefully shutdown when terminated" { + # node1 will be labeled such that it stops running the dranet pod. 
+ node1=$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[0].metadata.name}') + kubectl label node "${node1}" e2e-test-do-not-schedule=true + # node2 will continue to run the dranet pod. + node2=$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[1].metadata.name}') + + # Add affinity to only schedule on nodes without the + # "e2e-test-do-not-schedule" label. This allows the pods on the specific node + # to be deleted (and prevents automatic recreation on it) + kubectl patch daemonset dranet -n kube-system --type='merge' --patch-file=<(cat < Date: Tue, 15 Jul 2025 08:42:12 -0700 Subject: [PATCH 3/3] Mount BPF filesystem and make it shareable --- tests/setup_suite.bash | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/setup_suite.bash b/tests/setup_suite.bash index e61914d2..148cef1f 100644 --- a/tests/setup_suite.bash +++ b/tests/setup_suite.bash @@ -20,6 +20,13 @@ function setup_suite { kind load docker-image "$IMAGE_NAME":test --name "$CLUSTER_NAME" + # Mount the BPF filesystem on each Kind node and mark the mount as shared. + NODES=$(kind get nodes --name ${CLUSTER_NAME}) + for node in $NODES; do + docker exec "$node" mount -t bpf bpffs /sys/fs/bpf + docker exec "$node" mount --make-shared /sys/fs/bpf + done + _install=$(sed s#"$IMAGE_NAME".*#"$IMAGE_NAME":test# < "$BATS_TEST_DIRNAME"/../install.yaml) printf '%s' "${_install}" | kubectl apply -f - kubectl wait --for=condition=ready pods --namespace=kube-system -l k8s-app=dranet