tetratelabs · codefromthecrypt · Jul 18, 2025 · Jul 18, 2025
@@ -1,4 +1,4 @@
-// Copyright 2025 Tetrate
+// Copyright func-e contributors
 // SPDX-License-Identifier: Apache-2.0
 
 package api
@@ -25,3 +25,8 @@ func TestRun_InvalidConfig(t *testing.T) {
 func TestRun_StaticFile(t *testing.T) {
 	e2e.TestRun_StaticFile(context.Background(), t, fakeFuncEFactory{})
 }
+
+func TestRun_CtrlCs(t *testing.T) {
+	// This doesn't call ctrl-c, rather cancels the context multiple times
+	e2e.TestRun_CtrlCs(context.Background(), t, fakeFuncEFactory{})
+}
@@ -1,4 +1,4 @@
-// Copyright 2025 Tetrate
+// Copyright func-e contributors
 // SPDX-License-Identifier: Apache-2.0
 
 package api
@@ -48,8 +48,10 @@ type fakeFuncE struct {
 
 // Interrupt cancels the context created in Run as we don't want to actually interrupt the calling test!
 func (f *fakeFuncE) Interrupt(context.Context) error {
-	f.cancelFunc()
-	f.cancelFunc = nil
+	if f.cancelFunc != nil {
+		f.cancelFunc()
+		// Don't set to nil in case interrupt is called multiple times (ctrl+c twice)
+	}
 	return nil
 }
 

@@ -1,4 +1,4 @@
-// Copyright 2025 Tetrate
+// Copyright func-e contributors
 // SPDX-License-Identifier: Apache-2.0
 
 package e2e
@@ -25,3 +25,11 @@ func TestRun_InvalidConfig(t *testing.T) {
 func TestRun_StaticFile(t *testing.T) {
 	e2e.TestRun_StaticFile(context.Background(), t, funcEFactory{})
 }
+
+func TestRun_CtrlCs(t *testing.T) {
+	e2e.TestRun_CtrlCs(context.Background(), t, funcEFactory{})
+}
+
+func TestRun_Kill9(t *testing.T) {
+	e2e.TestRun_Kill9(context.Background(), t, funcEFactory{})
+}
@@ -11,9 +11,12 @@ import (
 
 // interrupt attempts to interrupt the process. It doesn't necessarily kill it.
 func interrupt(p *os.Process) error {
-	// Send SIGINT to the child PID directly
-	if err := p.Signal(syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) {
-		return err
+	// Send SIGINT to the process group (negative PID) to ensure all child processes receive it
+	if err := syscall.Kill(-p.Pid, syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) {
+		// Fallback to sending to the process directly if group signal fails
+		if err := p.Signal(syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) {
+			return err
+		}
 	}
 	return nil
 }

@@ -5,6 +5,7 @@ package envoy
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -15,13 +16,13 @@ import (
 )
 
 // RegisterShutdownHook registers the passed functions to be run after Envoy has started
-// and just before func-e instructs Envoy to exit
+// and just before func-e instructs Envoy to exit.
 func (r *Runtime) RegisterShutdownHook(f func(context.Context) error) {
 	r.shutdownHooks = append(r.shutdownHooks, f)
 }
 
 func (r *Runtime) handleShutdown() {
-	// Ensure the SIGINT forwards to Envoy even if a shutdown hook panics
+	// Ensure the SIGINT forwards to Envoy even if a shutdown hook panics.
 	defer func() {
 		r.interruptEnvoy()
 		if r.cmd != nil && r.cmd.Process != nil {
@@ -30,23 +31,42 @@ func (r *Runtime) handleShutdown() {
 	}()
 
 	deadline := time.Now().Add(shutdownTimeout)
-	timeout, cancel := context.WithDeadline(context.Background(), deadline)
+	ctx, cancel := context.WithDeadline(context.Background(), deadline)
 	defer cancel()
 
 	fmt.Fprintf(r.Out, "invoking shutdown hooks with deadline %s\n", deadline.Format(dateFormat)) //nolint:errcheck
 
-	// Run each hook in parallel, logging each error
+	var (
+		mu   sync.Mutex
+		errs []error
+	)
+
+	// Run each hook in parallel, collecting errors.
 	var wg sync.WaitGroup
 	wg.Add(len(r.shutdownHooks))
 	for _, f := range r.shutdownHooks {
 		go func(f func(context.Context) error) {
 			defer wg.Done()
-			if err := f(timeout); err != nil {
-				fmt.Fprintf(r.Out, "failed shutdown hook: %s\n", err) //nolint:errcheck
+			defer func() {
+				if p := recover(); p != nil {
+					mu.Lock()
+					errs = append(errs, fmt.Errorf("panic in shutdown hook: %v", p))
+					mu.Unlock()
+				}
+			}()
+			if err := f(ctx); err != nil && !errors.Is(err, context.Canceled) {
+				// Don't collect if context was cancelled (ctrl+c twice scenario).
+				mu.Lock()
+				errs = append(errs, fmt.Errorf("failed shutdown hook: %w", err))
+				mu.Unlock()
 			}
 		}(f)
 	}
 	wg.Wait()
+
+	if len(errs) > 0 {
+		fmt.Fprintf(r.Out, "shutdown errors: %v\n", errors.Join(errs...)) //nolint:errcheck
+	}
 }
 
 func (r *Runtime) interruptEnvoy() {
@@ -58,10 +78,12 @@ func (r *Runtime) interruptEnvoy() {
 func (r *Runtime) archiveRunDir() error {
 	// Ensure logs are closed before we try to archive them.
 	if r.OutFile != nil {
-		r.OutFile.Close() //nolint
+		_ = r.OutFile.Sync()
+		_ = r.OutFile.Close()
 	}
 	if r.ErrFile != nil {
-		r.ErrFile.Close() //nolint
+		_ = r.ErrFile.Sync()
+		_ = r.ErrFile.Close()
 	}
 	if r.o.DontArchiveRunDir {
 		return nil

@@ -0,0 +1,199 @@
+// Copyright func-e contributors
+// SPDX-License-Identifier: Apache-2.0
+
+package envoy
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"syscall"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+var (
+	// testProcessTimeout is how long we wait for a process to exit in tests.
+	// Set to 40% of shutdownTimeout since the process should die quickly after
+	// handleShutdown completes (which already waited up to shutdownTimeout).
+	testProcessTimeout = shutdownTimeout * 2 / 5
+
+	// testSlowHookDuration simulates a hook that would exceed shutdownTimeout.
+	// Set to 6x shutdownTimeout to ensure it clearly exceeds the timeout.
+	testSlowHookDuration = shutdownTimeout * 6
+
+	// testShutdownBuffer is extra time beyond shutdownTimeout to account for
+	// goroutine scheduling and process termination overhead.
+	// Set to 40% of shutdownTimeout to be proportional but reasonable.
+	testShutdownBuffer = shutdownTimeout * 2 / 5
+)
+
+// TestHandleShutdown_PanicInHook tests that a panic in a shutdown hook
+// doesn't prevent Envoy from being terminated
+func TestHandleShutdown_PanicInHook(t *testing.T) {
+	// Create a test command that sleeps (simulating Envoy)
+	cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds())))
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+
+	require.NoError(t, cmd.Start())
+
+	r := &Runtime{
+		cmd: cmd,
+		Out: os.Stdout,
+		logf: func(format string, args ...interface{}) {
+			t.Logf(format, args...)
+		},
+	}
+
+	// Register a hook that panics
+	r.RegisterShutdownHook(func(ctx context.Context) error {
+		panic("shutdown hook panic!")
+	})
+
+	// Register another hook to verify execution continues
+	hookExecuted := false
+	r.RegisterShutdownHook(func(ctx context.Context) error {
+		hookExecuted = true
+		return nil
+	})
+
+	// Call handleShutdown - it should recover from panic and still kill the process
+	r.handleShutdown()
+
+	// Wait for process to exit
+	done := make(chan struct{})
+	go func() {
+		_ = cmd.Wait()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		// Process exited as expected
+	case <-time.After(testProcessTimeout):
+		t.Fatal("Process didn't exit within timeout")
+	}
+
+	require.True(t, hookExecuted, "Other hooks should still execute despite panic")
+}
+
+// TestHandleShutdown_MultipleHooksPanic tests that multiple panicking hooks
+// don't prevent termination
+func TestHandleShutdown_MultipleHooksPanic(t *testing.T) {
+	cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds())))
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+
+	require.NoError(t, cmd.Start())
+
+	r := &Runtime{
+		cmd: cmd,
+		Out: os.Stdout,
+		logf: func(format string, args ...interface{}) {
+			t.Logf(format, args...)
+		},
+	}
+
+	// Register multiple panicking hooks
+	for i := 0; i < 3; i++ {
+		idx := i
+		r.RegisterShutdownHook(func(ctx context.Context) error {
+			panic(fmt.Sprintf("hook %d panic!", idx))
+		})
+	}
+
+	// Call handleShutdown
+	r.handleShutdown()
+
+	// Wait for process to exit
+	done := make(chan struct{})
+	go func() {
+		_ = cmd.Wait()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		// Process exited as expected
+	case <-time.After(testProcessTimeout):
+		t.Fatal("Process didn't exit within timeout despite multiple panics")
+	}
+}
+
+// TestHandleShutdown_SlowHook tests that slow hooks don't prevent termination
+func TestHandleShutdown_SlowHook(t *testing.T) {
+	cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds())))
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+
+	require.NoError(t, cmd.Start())
+
+	r := &Runtime{
+		cmd: cmd,
+		Out: os.Stdout,
+		logf: func(format string, args ...interface{}) {
+			t.Logf(format, args...)
+		},
+	}
+
+	// Register a slow hook
+	r.RegisterShutdownHook(func(ctx context.Context) error {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(testSlowHookDuration):
+			return nil
+		}
+	})
+
+	start := time.Now()
+	r.handleShutdown()
+	elapsed := time.Since(start)
+
+	// Should complete within shutdown timeout + buffer for overhead
+	require.Less(t, elapsed, shutdownTimeout+testShutdownBuffer, "Shutdown should respect timeout")
+
+	// Wait for process to exit
+	done := make(chan struct{})
+	go func() {
+		_ = cmd.Wait()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		// Process exited as expected
+	case <-time.After(testProcessTimeout):
+		t.Fatal("Process didn't exit within timeout even with slow hooks")
+	}
+}
+
+// TestEnsureProcessDone tests that EnsureProcessDone kills the process
+func TestEnsureProcessDone(t *testing.T) {
+	cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds())))
+	require.NoError(t, cmd.Start())
+
+	// Ensure process is running
+	require.NoError(t, cmd.Process.Signal(syscall.Signal(0)))
+
+	// Call ensureProcessDone
+	err := ensureProcessDone(cmd.Process)
+	require.NoError(t, err)
+
+	// Wait for process to exit
+	done := make(chan struct{})
+	go func() {
+		_ = cmd.Wait()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		// Process exited as expected
+	case <-time.After(testProcessTimeout / 2):
+		// Use half of testProcessTimeout since EnsureProcessDone sends SIGKILL
+		// which should terminate the process immediately
+		t.Fatal("Process didn't exit after EnsureProcessDone")
+	}
+}