Skip to content
This repository was archived by the owner on May 6, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 10 additions & 16 deletions cmd/dranet/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,13 @@ import (
"reflect"
"runtime/debug"
"sync/atomic"
"syscall"

"github.com/google/cel-go/cel"
"github.com/google/cel-go/ext"
"github.com/google/dranet/pkg/driver"
"github.com/prometheus/client_golang/prometheus/promhttp"

"golang.org/x/sys/unix"

resourcev1beta1 "k8s.io/api/resource/v1beta1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -119,17 +118,11 @@ func main() {
klog.Fatalf("can not obtain the node name, use the hostname-override flag if you want to set it to a specific value: %v", err)
}

// trap Ctrl+C and call cancel on the context
ctx := context.Background()
ctx, cancel := context.WithCancel(ctx)
ctx, cancel := context.WithCancel(context.Background())

// Enable signal handler
signalCh := make(chan os.Signal, 2)
defer func() {
close(signalCh)
cancel()
}()
signal.Notify(signalCh, os.Interrupt, unix.SIGINT)
// Trap signals for graceful shutdown.
signalCh := make(chan os.Signal, 1)
signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM)

opts := []driver.Option{}
if celExpression != "" {
Expand All @@ -156,16 +149,17 @@ func main() {
if err != nil {
klog.Fatalf("driver failed to start: %v", err)
}
defer dranet.Stop()
defer dranet.Stop() // Gracefully shutdown at the end.

ready.Store(true)
klog.Info("driver started")

select {
case <-signalCh:
klog.Infof("Exiting: received signal")
case sig := <-signalCh:
klog.Infof("Received shutdown signal: %q. Initiating graceful shutdown...", sig)
cancel()
Comment thread
gauravkghildiyal marked this conversation as resolved.
case <-ctx.Done():
klog.Infof("Exiting: context cancelled")
klog.Info("Context cancelled. Initiating graceful shutdown...")
}
}

Expand Down
6 changes: 2 additions & 4 deletions pkg/driver/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,8 @@ func Start(ctx context.Context, driverName string, kubeClient kubernetes.Interfa
}

// Stop gracefully shuts down the driver by stopping both of its plugins.
// The NRI plugin is stopped before the DRA plugin; each call blocks until
// that plugin has fully stopped, so Stop returns only after a complete
// shutdown.
func (np *NetworkDriver) Stop() {
	// Stop NRI Plugin (it's expected that it returns when fully stopped).
	np.nriPlugin.Stop()
	// Stop DRA Plugin (returns only after it has fully stopped).
	np.draPlugin.Stop()
}

// Shutdown logs that the container runtime is shutting down.
// NOTE(review): the context argument is ignored; presumably this is a
// runtime-lifecycle callback — confirm against the plugin interface it
// satisfies.
func (np *NetworkDriver) Shutdown(_ context.Context) {
	klog.Info("Runtime shutting down...")
}
4 changes: 4 additions & 0 deletions pkg/driver/nri_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,10 @@ func (np *NetworkDriver) RemovePodSandbox(_ context.Context, pod *api.PodSandbox
return nil
}

func (np *NetworkDriver) Shutdown(_ context.Context) {
klog.Info("Runtime shutting down...")
Comment thread
gauravkghildiyal marked this conversation as resolved.
}

func getNetworkNamespace(pod *api.PodSandbox) string {
// get the pod network namespace
for _, namespace := range pod.Linux.GetNamespaces() {
Expand Down
48 changes: 48 additions & 0 deletions tests/e2e.bats
Original file line number Diff line number Diff line change
Expand Up @@ -295,3 +295,51 @@ load 'test_helper/bats-assert/load'

kubectl delete -f "$BATS_TEST_DIRNAME"/../examples/repeatresourceclaimtemplate.yaml
}

# Verifies that terminating the dranet pod triggers a graceful shutdown:
# after the driver is evicted from a node, its DRA plugin unix sockets must
# have been removed from that node, while an untouched node keeps them.
@test "driver should gracefully shutdown when terminated" {
  # node1 will be labeled such that it stops running the dranet pod.
  node1=$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[0].metadata.name}')
  kubectl label node "${node1}" e2e-test-do-not-schedule=true
  # node 2 will continue to run the dranet pod.
  node2=$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[1].metadata.name}')

  # Add affinity to only schedule on nodes without the
  # "e2e-test-do-not-schedule" label. This allows the pods on the specific node
  # to be deleted (and prevents automatic recreation on it)
  kubectl patch daemonset dranet -n kube-system --type='merge' --patch-file=<(cat <<EOF
spec:
  template:
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: e2e-test-do-not-schedule
                operator: DoesNotExist
EOF
)
  # Wait until the DaemonSet rollout finishes, i.e. the pod on node1 is gone.
  kubectl rollout status ds/dranet --namespace=kube-system

  # After graceful shutdown of the driver from node1, the DRA plugin socket
  # files should have been deleted.
  run docker exec "${node1}" test -S /var/lib/kubelet/plugins/dra.net/dra.sock
  assert_failure
  run docker exec "${node1}" test -S /var/lib/kubelet/plugins_registry/dra.net-reg.sock
  assert_failure

  # For comparison, node2 should have the files present since the dranet pod is
  # still running on it.
  docker exec "${node2}" test -S /var/lib/kubelet/plugins/dra.net/dra.sock
  docker exec "${node2}" test -S /var/lib/kubelet/plugins_registry/dra.net-reg.sock

  # Remove affinity from DraNet DaemonSet to revert it back to original
  kubectl patch daemonset dranet -n kube-system --type='merge' --patch-file=<(cat <<EOF
spec:
  template:
    spec:
      affinity:
EOF
)
  kubectl rollout status ds/dranet --namespace=kube-system
}
7 changes: 7 additions & 0 deletions tests/setup_suite.bash
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ function setup_suite {

kind load docker-image "$IMAGE_NAME":test --name "$CLUSTER_NAME"

# Creating BPF and cgroup mounts on the Kind nodes.
NODES=$(kind get nodes --name ${CLUSTER_NAME})
for node in $NODES; do
docker exec "$node" mount -t bpf bpffs /sys/fs/bpf
docker exec "$node" mount --make-shared /sys/fs/bpf
done

_install=$(sed s#"$IMAGE_NAME".*#"$IMAGE_NAME":test# < "$BATS_TEST_DIRNAME"/../install.yaml)
printf '%s' "${_install}" | kubectl apply -f -
kubectl wait --for=condition=ready pods --namespace=kube-system -l k8s-app=dranet
Expand Down
Loading