diff --git a/cmd/dranet/app.go b/cmd/dranet/app.go index 299a869b..58a64832 100644 --- a/cmd/dranet/app.go +++ b/cmd/dranet/app.go @@ -26,14 +26,13 @@ import ( "reflect" "runtime/debug" "sync/atomic" + "syscall" "github.com/google/cel-go/cel" "github.com/google/cel-go/ext" "github.com/google/dranet/pkg/driver" "github.com/prometheus/client_golang/prometheus/promhttp" - "golang.org/x/sys/unix" - resourcev1beta1 "k8s.io/api/resource/v1beta1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -119,17 +118,11 @@ func main() { klog.Fatalf("can not obtain the node name, use the hostname-override flag if you want to set it to a specific value: %v", err) } - // trap Ctrl+C and call cancel on the context - ctx := context.Background() - ctx, cancel := context.WithCancel(ctx) + ctx, cancel := context.WithCancel(context.Background()) - // Enable signal handler - signalCh := make(chan os.Signal, 2) - defer func() { - close(signalCh) - cancel() - }() - signal.Notify(signalCh, os.Interrupt, unix.SIGINT) + // Trap signals for graceful shutdown. + signalCh := make(chan os.Signal, 1) + signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM) opts := []driver.Option{} if celExpression != "" { @@ -156,16 +149,17 @@ func main() { if err != nil { klog.Fatalf("driver failed to start: %v", err) } - defer dranet.Stop() + defer dranet.Stop() // Gracefully shutdown at the end. + ready.Store(true) klog.Info("driver started") select { - case <-signalCh: - klog.Infof("Exiting: received signal") + case sig := <-signalCh: + klog.Infof("Received shutdown signal: %q. Initiating graceful shutdown...", sig) cancel() case <-ctx.Done(): - klog.Infof("Exiting: context cancelled") + klog.Info("Context cancelled. 
Initiating graceful shutdown...") } } diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index 19176cc9..aa8ca4e6 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -178,10 +178,8 @@ func Start(ctx context.Context, driverName string, kubeClient kubernetes.Interfa } func (np *NetworkDriver) Stop() { + // Stop NRI Plugin (it's expected that it returns when fully stopped). np.nriPlugin.Stop() + // Stop DRA Plugin (returns only after it has fully stopped). np.draPlugin.Stop() } - -func (np *NetworkDriver) Shutdown(_ context.Context) { - klog.Info("Runtime shutting down...") -} diff --git a/pkg/driver/nri_hooks.go b/pkg/driver/nri_hooks.go index 16631005..8f21cc3e 100644 --- a/pkg/driver/nri_hooks.go +++ b/pkg/driver/nri_hooks.go @@ -276,6 +276,10 @@ func (np *NetworkDriver) RemovePodSandbox(_ context.Context, pod *api.PodSandbox return nil } +func (np *NetworkDriver) Shutdown(_ context.Context) { + klog.Info("Runtime shutting down...") +} + func getNetworkNamespace(pod *api.PodSandbox) string { // get the pod network namespace for _, namespace := range pod.Linux.GetNamespaces() { diff --git a/tests/e2e.bats b/tests/e2e.bats index f472cb05..36363ebb 100644 --- a/tests/e2e.bats +++ b/tests/e2e.bats @@ -295,3 +295,51 @@ load 'test_helper/bats-assert/load' kubectl delete -f "$BATS_TEST_DIRNAME"/../examples/repeatresourceclaimtemplate.yaml } + +@test "driver should gracefully shutdown when terminated" { + # node1 will be labeled such that it stops running the dranet pod. + node1=$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[0].metadata.name}') + kubectl label node "${node1}" e2e-test-do-not-schedule=true + # node2 will continue to run the dranet pod. + node2=$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[1].metadata.name}') + + # Add affinity to only schedule on nodes without the + # "e2e-test-do-not-schedule" label.
This allows the pods on the specific node + # to be deleted (and prevents automatic recreation on it) + kubectl patch daemonset dranet -n kube-system --type='merge' --patch-file=<(cat <