diff --git a/api/func-e_run_test.go b/api/func-e_run_test.go index b9add01c..3cec2f47 100644 --- a/api/func-e_run_test.go +++ b/api/func-e_run_test.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package api @@ -25,3 +25,8 @@ func TestRun_InvalidConfig(t *testing.T) { func TestRun_StaticFile(t *testing.T) { e2e.TestRun_StaticFile(context.Background(), t, fakeFuncEFactory{}) } + +func TestRun_CtrlCs(t *testing.T) { + // This doesn't call ctrl-c, rather cancels the context multiple times + e2e.TestRun_CtrlCs(context.Background(), t, fakeFuncEFactory{}) +} diff --git a/api/func-e_test.go b/api/func-e_test.go index 4864f2a7..9d4e1137 100644 --- a/api/func-e_test.go +++ b/api/func-e_test.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package api @@ -48,8 +48,10 @@ type fakeFuncE struct { // Interrupt cancels the context created in Run as we don't want to actually interrupt the calling test! func (f *fakeFuncE) Interrupt(context.Context) error { - f.cancelFunc() - f.cancelFunc = nil + if f.cancelFunc != nil { + f.cancelFunc() + // Don't set to nil in case interrupt is called multiple times (ctrl+c twice) + } return nil } diff --git a/e2e/func-e_run_test.go b/e2e/func-e_run_test.go index 07a55dbc..6fc2da08 100644 --- a/e2e/func-e_run_test.go +++ b/e2e/func-e_run_test.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package e2e @@ -25,3 +25,11 @@ func TestRun_InvalidConfig(t *testing.T) { func TestRun_StaticFile(t *testing.T) { e2e.TestRun_StaticFile(context.Background(), t, funcEFactory{}) } + +func TestRun_CtrlCs(t *testing.T) { + e2e.TestRun_CtrlCs(context.Background(), t, funcEFactory{}) +} + +func TestRun_Kill9(t *testing.T) { + e2e.TestRun_Kill9(context.Background(), t, funcEFactory{}) +} diff --git a/internal/envoy/proc_helpers.go b/internal/envoy/proc_helpers.go index 311af92e..97351db5 100644 --- a/internal/envoy/proc_helpers.go +++ b/internal/envoy/proc_helpers.go @@ -11,9 +11,12 @@ import ( // interrupt attempts to interrupt the process. It doesn't necessarily kill it. func interrupt(p *os.Process) error { - // Send SIGINT to the child PID directly - if err := p.Signal(syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) { - return err + // Send SIGINT to the process group (negative PID) to ensure all child processes receive it + if err := syscall.Kill(-p.Pid, syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) { + // Fallback to sending to the process directly if group signal fails + if err := p.Signal(syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) { + return err + } } return nil } diff --git a/internal/envoy/shutdown.go b/internal/envoy/shutdown.go index 90a30007..4262b1c2 100644 --- a/internal/envoy/shutdown.go +++ b/internal/envoy/shutdown.go @@ -5,6 +5,7 @@ package envoy import ( "context" + "errors" "fmt" "os" "path/filepath" @@ -15,13 +16,13 @@ import ( ) // RegisterShutdownHook registers the passed functions to be run after Envoy has started -// and just before func-e instructs Envoy to exit +// and just before func-e instructs Envoy to exit. func (r *Runtime) RegisterShutdownHook(f func(context.Context) error) { r.shutdownHooks = append(r.shutdownHooks, f) } func (r *Runtime) handleShutdown() { - // Ensure the SIGINT forwards to Envoy even if a shutdown hook panics + // Ensure the SIGINT forwards to Envoy even if a shutdown hook panics. defer func() { r.interruptEnvoy() if r.cmd != nil && r.cmd.Process != nil { @@ -30,23 +31,42 @@ func (r *Runtime) handleShutdown() { }() deadline := time.Now().Add(shutdownTimeout) - timeout, cancel := context.WithDeadline(context.Background(), deadline) + ctx, cancel := context.WithDeadline(context.Background(), deadline) defer cancel() fmt.Fprintf(r.Out, "invoking shutdown hooks with deadline %s\n", deadline.Format(dateFormat)) //nolint:errcheck - // Run each hook in parallel, logging each error + var ( + mu sync.Mutex + errs []error + ) + + // Run each hook in parallel, collecting errors. var wg sync.WaitGroup wg.Add(len(r.shutdownHooks)) for _, f := range r.shutdownHooks { go func(f func(context.Context) error) { defer wg.Done() - if err := f(timeout); err != nil { - fmt.Fprintf(r.Out, "failed shutdown hook: %s\n", err) //nolint:errcheck + defer func() { + if p := recover(); p != nil { + mu.Lock() + errs = append(errs, fmt.Errorf("panic in shutdown hook: %v", p)) + mu.Unlock() + } + }() + if err := f(ctx); err != nil && !errors.Is(err, context.Canceled) { + // Don't collect if context was cancelled (ctrl+c twice scenario). + mu.Lock() + errs = append(errs, fmt.Errorf("failed shutdown hook: %w", err)) + mu.Unlock() } }(f) } wg.Wait() + + if len(errs) > 0 { + fmt.Fprintf(r.Out, "shutdown errors: %v\n", errors.Join(errs...)) //nolint:errcheck + } } func (r *Runtime) interruptEnvoy() { @@ -58,10 +78,12 @@ func (r *Runtime) interruptEnvoy() { func (r *Runtime) archiveRunDir() error { // Ensure logs are closed before we try to archive them. if r.OutFile != nil { - r.OutFile.Close() //nolint + _ = r.OutFile.Sync() + _ = r.OutFile.Close() } if r.ErrFile != nil { - r.ErrFile.Close() //nolint + _ = r.ErrFile.Sync() + _ = r.ErrFile.Close() } if r.o.DontArchiveRunDir { return nil diff --git a/internal/envoy/shutdown_test.go b/internal/envoy/shutdown_test.go new file mode 100644 index 00000000..68abeebf --- /dev/null +++ b/internal/envoy/shutdown_test.go @@ -0,0 +1,199 @@ +// Copyright func-e contributors +// SPDX-License-Identifier: Apache-2.0 + +package envoy + +import ( + "context" + "fmt" + "os" + "os/exec" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +var ( + // testProcessTimeout is how long we wait for a process to exit in tests. + // Set to 40% of shutdownTimeout since the process should die quickly after + // handleShutdown completes (which already waited up to shutdownTimeout). + testProcessTimeout = shutdownTimeout * 2 / 5 + + // testSlowHookDuration simulates a hook that would exceed shutdownTimeout. + // Set to 6x shutdownTimeout to ensure it clearly exceeds the timeout. + testSlowHookDuration = shutdownTimeout * 6 + + // testShutdownBuffer is extra time beyond shutdownTimeout to account for + // goroutine scheduling and process termination overhead. + // Set to 40% of shutdownTimeout to be proportional but reasonable. + testShutdownBuffer = shutdownTimeout * 2 / 5 +) + +// TestHandleShutdown_PanicInHook tests that a panic in a shutdown hook +// doesn't prevent Envoy from being terminated +func TestHandleShutdown_PanicInHook(t *testing.T) { + // Create a test command that sleeps (simulating Envoy) + cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds()))) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + require.NoError(t, cmd.Start()) + + r := &Runtime{ + cmd: cmd, + Out: os.Stdout, + logf: func(format string, args ...interface{}) { + t.Logf(format, args...) + }, + } + + // Register a hook that panics + r.RegisterShutdownHook(func(ctx context.Context) error { + panic("shutdown hook panic!") + }) + + // Register another hook to verify execution continues + hookExecuted := false + r.RegisterShutdownHook(func(ctx context.Context) error { + hookExecuted = true + return nil + }) + + // Call handleShutdown - it should recover from panic and still kill the process + r.handleShutdown() + + // Wait for process to exit + done := make(chan struct{}) + go func() { + _ = cmd.Wait() + close(done) + }() + + select { + case <-done: + // Process exited as expected + case <-time.After(testProcessTimeout): + t.Fatal("Process didn't exit within timeout") + } + + require.True(t, hookExecuted, "Other hooks should still execute despite panic") +} + +// TestHandleShutdown_MultipleHooksPanic tests that multiple panicking hooks +// don't prevent termination +func TestHandleShutdown_MultipleHooksPanic(t *testing.T) { + cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds()))) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + require.NoError(t, cmd.Start()) + + r := &Runtime{ + cmd: cmd, + Out: os.Stdout, + logf: func(format string, args ...interface{}) { + t.Logf(format, args...) + }, + } + + // Register multiple panicking hooks + for i := 0; i < 3; i++ { + idx := i + r.RegisterShutdownHook(func(ctx context.Context) error { + panic(fmt.Sprintf("hook %d panic!", idx)) + }) + } + + // Call handleShutdown + r.handleShutdown() + + // Wait for process to exit + done := make(chan struct{}) + go func() { + _ = cmd.Wait() + close(done) + }() + + select { + case <-done: + // Process exited as expected + case <-time.After(testProcessTimeout): + t.Fatal("Process didn't exit within timeout despite multiple panics") + } +} + +// TestHandleShutdown_SlowHook tests that slow hooks don't prevent termination +func TestHandleShutdown_SlowHook(t *testing.T) { + cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds()))) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + require.NoError(t, cmd.Start()) + + r := &Runtime{ + cmd: cmd, + Out: os.Stdout, + logf: func(format string, args ...interface{}) { + t.Logf(format, args...) + }, + } + + // Register a slow hook + r.RegisterShutdownHook(func(ctx context.Context) error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(testSlowHookDuration): + return nil + } + }) + + start := time.Now() + r.handleShutdown() + elapsed := time.Since(start) + + // Should complete within shutdown timeout + buffer for overhead + require.Less(t, elapsed, shutdownTimeout+testShutdownBuffer, "Shutdown should respect timeout") + + // Wait for process to exit + done := make(chan struct{}) + go func() { + _ = cmd.Wait() + close(done) + }() + + select { + case <-done: + // Process exited as expected + case <-time.After(testProcessTimeout): + t.Fatal("Process didn't exit within timeout even with slow hooks") + } +} + +// TestEnsureProcessDone tests that EnsureProcessDone kills the process +func TestEnsureProcessDone(t *testing.T) { + cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds()))) + require.NoError(t, cmd.Start()) + + // Ensure process is running + require.NoError(t, cmd.Process.Signal(syscall.Signal(0))) + + // Call ensureProcessDone + err := ensureProcessDone(cmd.Process) + require.NoError(t, err) + + // Wait for process to exit + done := make(chan struct{}) + go func() { + _ = cmd.Wait() + close(done) + }() + + select { + case <-done: + // Process exited as expected + case <-time.After(testProcessTimeout / 2): + // Use half of testProcessTimeout since EnsureProcessDone sends SIGKILL + // which should terminate the process immediately + t.Fatal("Process didn't exit after EnsureProcessDone") + } +} diff --git a/internal/test/e2e/testrun.go b/internal/test/e2e/testrun.go index b70e4664..632a94b4 100644 --- a/internal/test/e2e/testrun.go +++ b/internal/test/e2e/testrun.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package e2e @@ -7,11 +7,15 @@ import ( "bufio" "context" _ "embed" + "errors" "io" "os" "path/filepath" + "runtime" "strings" + "syscall" "testing" + "time" "github.com/shirou/gopsutil/v4/process" "github.com/stretchr/testify/require" @@ -30,6 +34,8 @@ type RunTestOptions struct { ExpectFail bool TestFunc RunTestFunc Args []string + // ExpectKilled indicates the test will kill func-e (e.g., with SIGKILL) + ExpectKilled bool } // Tests are implemented individually rather than as table-driven tests to facilitate @@ -117,6 +123,66 @@ func TestRun_InvalidConfig(ctx context.Context, t *testing.T, factory FuncEFacto }) } +// TestRun_CtrlCs tests the "Ctrl+C twice" behavior where the first interrupt +// starts shutdown hooks and the second interrupt forces immediate exit. +func TestRun_CtrlCs(ctx context.Context, t *testing.T, factory FuncEFactory) { + executeRunTest(ctx, t, factory, RunTestOptions{ + ExpectFail: false, + Args: []string{"--config-yaml", "admin: {address: {socket_address: {address: '127.0.0.1', port_value: 0}}}"}, + TestFunc: func(ctx context.Context, runDir string, envoyPid int32, interruptFuncE func(context.Context) error, adminClient *AdminClient) { + // First interrupt should start shutdown hooks + require.NoError(t, interruptFuncE(ctx)) + + // Send 5 more interrupts to ensure no special casing + for i := 0; i < 5; i++ { + require.NoError(t, interruptFuncE(ctx)) + } + }, + }) +} + +// TestRun_Kill9 tests that when func-e is killed with SIGKILL, envoy behavior differs by OS. +// On Darwin: envoy becomes orphaned (limitation without Pdeathsig) +// On Linux: envoy should die with func-e due to process group signaling +func TestRun_Kill9(ctx context.Context, t *testing.T, factory FuncEFactory) { + executeRunTest(ctx, t, factory, RunTestOptions{ + ExpectKilled: true, + Args: []string{"--config-yaml", "admin: {address: {socket_address: {address: '127.0.0.1', port_value: 0}}}"}, + TestFunc: func(ctx context.Context, runDir string, envoyPid int32, interruptFuncE func(context.Context) error, adminClient *AdminClient) { + envoyProcess, err := process.NewProcess(envoyPid) + require.NoError(t, err) + + funcE, err := envoyProcess.Parent() + require.NoError(t, err) + + funcEPid := funcE.Pid + funcEProcess, err := os.FindProcess(int(funcEPid)) + require.NoError(t, err) + + // kill -9 func-e process + err = funcEProcess.Signal(syscall.SIGKILL) + require.NoError(t, err) + + // Wait a moment for processes to react + time.Sleep(200 * time.Millisecond) + + // Check if envoy is still running + isRunning, _ := envoyProcess.IsRunning() + + // Until we have a technically challenging, race safe bidirectional + // pipe implementation, we can't guarantee kill -9 of func-e will + // propagate to envoy on Darwin, even if normal kill will. + if runtime.GOOS == "darwin" { + require.True(t, isRunning) + // Clean up the orphaned process + _ = envoyProcess.Kill() + } else { + require.False(t, isRunning) + } + }, + }) +} + // setupTestFiles writes the given files to the current directory for test setup. func setupTestFiles(t *testing.T, files map[string][]byte) { for name, content := range files { @@ -217,7 +283,9 @@ func executeRunTest(ctx context.Context, t *testing.T, factory FuncEFactory, opt if !opts.ExpectFail { require.True(t, envoyPid != 0, "expected Envoy to start") - require.NoError(t, runErr) + if !opts.ExpectKilled { + require.NoError(t, runErr) + } } else { require.True(t, envoyPid == 0, "expected Envoy not to start") require.Error(t, runErr) @@ -225,7 +293,7 @@ func executeRunTest(ctx context.Context, t *testing.T, factory FuncEFactory, opt } // After shutdown, the Envoy process should not exist. Otherwise, there's a leak issue. - if envoyPid != 0 { + if envoyPid != 0 && !opts.ExpectKilled { _, err = process.NewProcessWithContext(ctx, envoyPid) require.Error(t, err, "expected Envoy process to be gone after shutdown") } @@ -234,7 +302,12 @@ func executeRunTest(ctx context.Context, t *testing.T, factory FuncEFactory, opt // runTestAndInterruptFuncE runs the test function and ensures func-e is interrupted afterward. func runTestAndInterruptFuncE(ctx context.Context, t *testing.T, funcE FuncE, runDir string, envoyPid int32, callback func(context.Context, string, int32, func(context.Context) error, *AdminClient)) { defer func() { - require.NoError(t, funcE.Interrupt(ctx)) + if err := funcE.Interrupt(ctx); err != nil { + // Only fail if it's not an expected error from killing the process + if !errors.Is(err, os.ErrProcessDone) && !errors.Is(err, syscall.ESRCH) { + require.NoError(t, err) + } + } if ctx.Err() != nil { t.Logf("context canceled during interrupt: %v", ctx.Err()) }