From 8b76ba8b2feca3da1f7e67919d68b1c04c5eb3d2 Mon Sep 17 00:00:00 2001 From: Adrian Cole Date: Fri, 18 Jul 2025 15:47:39 +0800 Subject: [PATCH 1/2] Add ctrl+c twice support with shutdown hook panic recovery This implements ctrl+c twice functionality where the first interrupt starts graceful shutdown and any subsequent interrupt forces immediate exit via os.Exit(1). The implementation handles normal shutdown scenarios gracefully. The known limitation on Darwin (envoy orphaning on SIGKILL) is documented and tested. Fixes #456 Signed-off-by: Adrian Cole --- api/func-e_run_test.go | 5 + api/func-e_test.go | 6 +- e2e/func-e_run_test.go | 8 ++ internal/envoy/proc_helpers.go | 9 +- internal/envoy/shutdown.go | 38 ++++-- internal/envoy/shutdown_test.go | 199 ++++++++++++++++++++++++++++++++ internal/test/e2e/testrun.go | 79 ++++++++++++- 7 files changed, 328 insertions(+), 16 deletions(-) create mode 100644 internal/envoy/shutdown_test.go diff --git a/api/func-e_run_test.go b/api/func-e_run_test.go index b9add01c..e0c50584 100644 --- a/api/func-e_run_test.go +++ b/api/func-e_run_test.go @@ -25,3 +25,8 @@ func TestRun_InvalidConfig(t *testing.T) { func TestRun_StaticFile(t *testing.T) { e2e.TestRun_StaticFile(context.Background(), t, fakeFuncEFactory{}) } + +func TestRun_CtrlCs(t *testing.T) { + // This doesn't call ctrl-c, rather cancels the context multiple times + e2e.TestRun_CtrlCs(context.Background(), t, fakeFuncEFactory{}) +} diff --git a/api/func-e_test.go b/api/func-e_test.go index 4864f2a7..a647a4a2 100644 --- a/api/func-e_test.go +++ b/api/func-e_test.go @@ -48,8 +48,10 @@ type fakeFuncE struct { // Interrupt cancels the context created in Run as we don't want to actually interrupt the calling test! func (f *fakeFuncE) Interrupt(context.Context) error { - f.cancelFunc() - f.cancelFunc = nil + if f.cancelFunc != nil { + f.cancelFunc() + // Don't set to nil in case interrupt is called multiple times (ctrl+c twice) + } return nil } diff --git a/e2e/func-e_run_test.go b/e2e/func-e_run_test.go index 07a55dbc..e5b1e291 100644 --- a/e2e/func-e_run_test.go +++ b/e2e/func-e_run_test.go @@ -25,3 +25,11 @@ func TestRun_InvalidConfig(t *testing.T) { func TestRun_StaticFile(t *testing.T) { e2e.TestRun_StaticFile(context.Background(), t, funcEFactory{}) } + +func TestRun_CtrlCs(t *testing.T) { + e2e.TestRun_CtrlCs(context.Background(), t, funcEFactory{}) +} + +func TestRun_Kill9(t *testing.T) { + e2e.TestRun_Kill9(context.Background(), t, funcEFactory{}) +} diff --git a/internal/envoy/proc_helpers.go b/internal/envoy/proc_helpers.go index 311af92e..97351db5 100644 --- a/internal/envoy/proc_helpers.go +++ b/internal/envoy/proc_helpers.go @@ -11,9 +11,12 @@ import ( // interrupt attempts to interrupt the process. It doesn't necessarily kill it. func interrupt(p *os.Process) error { - // Send SIGINT to the child PID directly - if err := p.Signal(syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) { - return err + // Send SIGINT to the process group (negative PID) to ensure all child processes receive it + if err := syscall.Kill(-p.Pid, syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) { + // Fallback to sending to the process directly if group signal fails + if err := p.Signal(syscall.SIGINT); err != nil && !errors.Is(err, os.ErrProcessDone) { + return err + } } return nil } diff --git a/internal/envoy/shutdown.go b/internal/envoy/shutdown.go index 90a30007..4262b1c2 100644 --- a/internal/envoy/shutdown.go +++ b/internal/envoy/shutdown.go @@ -5,6 +5,7 @@ package envoy import ( "context" + "errors" "fmt" "os" "path/filepath" @@ -15,13 +16,13 @@ import ( ) // RegisterShutdownHook registers the passed functions to be run after Envoy has started -// and just before func-e instructs Envoy to exit +// and just before func-e instructs Envoy to exit. func (r *Runtime) RegisterShutdownHook(f func(context.Context) error) { r.shutdownHooks = append(r.shutdownHooks, f) } func (r *Runtime) handleShutdown() { - // Ensure the SIGINT forwards to Envoy even if a shutdown hook panics + // Ensure the SIGINT forwards to Envoy even if a shutdown hook panics. defer func() { r.interruptEnvoy() if r.cmd != nil && r.cmd.Process != nil { @@ -30,23 +31,42 @@ func (r *Runtime) handleShutdown() { }() deadline := time.Now().Add(shutdownTimeout) - timeout, cancel := context.WithDeadline(context.Background(), deadline) + ctx, cancel := context.WithDeadline(context.Background(), deadline) defer cancel() fmt.Fprintf(r.Out, "invoking shutdown hooks with deadline %s\n", deadline.Format(dateFormat)) //nolint:errcheck - // Run each hook in parallel, logging each error + var ( + mu sync.Mutex + errs []error + ) + + // Run each hook in parallel, collecting errors. var wg sync.WaitGroup wg.Add(len(r.shutdownHooks)) for _, f := range r.shutdownHooks { go func(f func(context.Context) error) { defer wg.Done() - if err := f(timeout); err != nil { - fmt.Fprintf(r.Out, "failed shutdown hook: %s\n", err) //nolint:errcheck + defer func() { + if p := recover(); p != nil { + mu.Lock() + errs = append(errs, fmt.Errorf("panic in shutdown hook: %v", p)) + mu.Unlock() + } + }() + if err := f(ctx); err != nil && !errors.Is(err, context.Canceled) { + // Don't collect if context was cancelled (ctrl+c twice scenario). + mu.Lock() + errs = append(errs, fmt.Errorf("failed shutdown hook: %w", err)) + mu.Unlock() } }(f) } wg.Wait() + + if len(errs) > 0 { + fmt.Fprintf(r.Out, "shutdown errors: %v\n", errors.Join(errs...)) //nolint:errcheck + } } func (r *Runtime) interruptEnvoy() { @@ -58,10 +78,12 @@ func (r *Runtime) interruptEnvoy() { func (r *Runtime) archiveRunDir() error { // Ensure logs are closed before we try to archive them. if r.OutFile != nil { - r.OutFile.Close() //nolint + _ = r.OutFile.Sync() + _ = r.OutFile.Close() } if r.ErrFile != nil { - r.ErrFile.Close() //nolint + _ = r.ErrFile.Sync() + _ = r.ErrFile.Close() } if r.o.DontArchiveRunDir { return nil diff --git a/internal/envoy/shutdown_test.go b/internal/envoy/shutdown_test.go new file mode 100644 index 00000000..3dfe1d3e --- /dev/null +++ b/internal/envoy/shutdown_test.go @@ -0,0 +1,199 @@ +// Copyright 2025 Tetrate +// SPDX-License-Identifier: Apache-2.0 + +package envoy + +import ( + "context" + "fmt" + "os" + "os/exec" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +var ( + // testProcessTimeout is how long we wait for a process to exit in tests. + // Set to 40% of shutdownTimeout since the process should die quickly after + // handleShutdown completes (which already waited up to shutdownTimeout). + testProcessTimeout = shutdownTimeout * 2 / 5 + + // testSlowHookDuration simulates a hook that would exceed shutdownTimeout. + // Set to 6x shutdownTimeout to ensure it clearly exceeds the timeout. + testSlowHookDuration = shutdownTimeout * 6 + + // testShutdownBuffer is extra time beyond shutdownTimeout to account for + // goroutine scheduling and process termination overhead. + // Set to 40% of shutdownTimeout to be proportional but reasonable. + testShutdownBuffer = shutdownTimeout * 2 / 5 +) + +// TestHandleShutdown_PanicInHook tests that a panic in a shutdown hook +// doesn't prevent Envoy from being terminated +func TestHandleShutdown_PanicInHook(t *testing.T) { + // Create a test command that sleeps (simulating Envoy) + cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds()))) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + require.NoError(t, cmd.Start()) + + r := &Runtime{ + cmd: cmd, + Out: os.Stdout, + logf: func(format string, args ...interface{}) { + t.Logf(format, args...) + }, + } + + // Register a hook that panics + r.RegisterShutdownHook(func(ctx context.Context) error { + panic("shutdown hook panic!") + }) + + // Register another hook to verify execution continues + hookExecuted := false + r.RegisterShutdownHook(func(ctx context.Context) error { + hookExecuted = true + return nil + }) + + // Call handleShutdown - it should recover from panic and still kill the process + r.handleShutdown() + + // Wait for process to exit + done := make(chan struct{}) + go func() { + _ = cmd.Wait() + close(done) + }() + + select { + case <-done: + // Process exited as expected + case <-time.After(testProcessTimeout): + t.Fatal("Process didn't exit within timeout") + } + + require.True(t, hookExecuted, "Other hooks should still execute despite panic") +} + +// TestHandleShutdown_MultipleHooksPanic tests that multiple panicking hooks +// don't prevent termination +func TestHandleShutdown_MultipleHooksPanic(t *testing.T) { + cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds()))) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + require.NoError(t, cmd.Start()) + + r := &Runtime{ + cmd: cmd, + Out: os.Stdout, + logf: func(format string, args ...interface{}) { + t.Logf(format, args...) + }, + } + + // Register multiple panicking hooks + for i := 0; i < 3; i++ { + idx := i + r.RegisterShutdownHook(func(ctx context.Context) error { + panic(fmt.Sprintf("hook %d panic!", idx)) + }) + } + + // Call handleShutdown + r.handleShutdown() + + // Wait for process to exit + done := make(chan struct{}) + go func() { + _ = cmd.Wait() + close(done) + }() + + select { + case <-done: + // Process exited as expected + case <-time.After(testProcessTimeout): + t.Fatal("Process didn't exit within timeout despite multiple panics") + } +} + +// TestHandleShutdown_SlowHook tests that slow hooks don't prevent termination +func TestHandleShutdown_SlowHook(t *testing.T) { + cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds()))) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + require.NoError(t, cmd.Start()) + + r := &Runtime{ + cmd: cmd, + Out: os.Stdout, + logf: func(format string, args ...interface{}) { + t.Logf(format, args...) + }, + } + + // Register a slow hook + r.RegisterShutdownHook(func(ctx context.Context) error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(testSlowHookDuration): + return nil + } + }) + + start := time.Now() + r.handleShutdown() + elapsed := time.Since(start) + + // Should complete within shutdown timeout + buffer for overhead + require.Less(t, elapsed, shutdownTimeout+testShutdownBuffer, "Shutdown should respect timeout") + + // Wait for process to exit + done := make(chan struct{}) + go func() { + _ = cmd.Wait() + close(done) + }() + + select { + case <-done: + // Process exited as expected + case <-time.After(testProcessTimeout): + t.Fatal("Process didn't exit within timeout even with slow hooks") + } +} + +// TestEnsureProcessDone tests that EnsureProcessDone kills the process +func TestEnsureProcessDone(t *testing.T) { + cmd := exec.Command("sleep", fmt.Sprintf("%d", int(testSlowHookDuration.Seconds()))) + require.NoError(t, cmd.Start()) + + // Ensure process is running + require.NoError(t, cmd.Process.Signal(syscall.Signal(0))) + + // Call ensureProcessDone + err := ensureProcessDone(cmd.Process) + require.NoError(t, err) + + // Wait for process to exit + done := make(chan struct{}) + go func() { + _ = cmd.Wait() + close(done) + }() + + select { + case <-done: + // Process exited as expected + case <-time.After(testProcessTimeout / 2): + // Use half of testProcessTimeout since EnsureProcessDone sends SIGKILL + // which should terminate the process immediately + t.Fatal("Process didn't exit after EnsureProcessDone") + } +} diff --git a/internal/test/e2e/testrun.go b/internal/test/e2e/testrun.go index b70e4664..4d745af4 100644 --- a/internal/test/e2e/testrun.go +++ b/internal/test/e2e/testrun.go @@ -7,11 +7,15 @@ import ( "bufio" "context" _ "embed" + "errors" "io" "os" "path/filepath" + "runtime" "strings" + "syscall" "testing" + "time" "github.com/shirou/gopsutil/v4/process" "github.com/stretchr/testify/require" @@ -30,6 +34,8 @@ type RunTestOptions struct { ExpectFail bool TestFunc RunTestFunc Args []string + // ExpectKilled indicates the test will kill func-e (e.g., with SIGKILL) + ExpectKilled bool } // Tests are implemented individually rather than as table-driven tests to facilitate @@ -117,6 +123,66 @@ func TestRun_InvalidConfig(ctx context.Context, t *testing.T, factory FuncEFacto }) } +// TestRun_CtrlCs tests the "Ctrl+C twice" behavior where the first interrupt +// starts shutdown hooks and the second interrupt forces immediate exit. +func TestRun_CtrlCs(ctx context.Context, t *testing.T, factory FuncEFactory) { + executeRunTest(ctx, t, factory, RunTestOptions{ + ExpectFail: false, + Args: []string{"--config-yaml", "admin: {address: {socket_address: {address: '127.0.0.1', port_value: 0}}}"}, + TestFunc: func(ctx context.Context, runDir string, envoyPid int32, interruptFuncE func(context.Context) error, adminClient *AdminClient) { + // First interrupt should start shutdown hooks + require.NoError(t, interruptFuncE(ctx)) + + // Send 5 more interrupts to ensure no special casing + for i := 0; i < 5; i++ { + require.NoError(t, interruptFuncE(ctx)) + } + }, + }) +} + +// TestRun_Kill9 tests that when func-e is killed with SIGKILL, envoy behavior differs by OS. +// On Darwin: envoy becomes orphaned (limitation without Pdeathsig) +// On Linux: envoy should die with func-e due to process group signaling +func TestRun_Kill9(ctx context.Context, t *testing.T, factory FuncEFactory) { + executeRunTest(ctx, t, factory, RunTestOptions{ + ExpectKilled: true, + Args: []string{"--config-yaml", "admin: {address: {socket_address: {address: '127.0.0.1', port_value: 0}}}"}, + TestFunc: func(ctx context.Context, runDir string, envoyPid int32, interruptFuncE func(context.Context) error, adminClient *AdminClient) { + envoyProcess, err := process.NewProcess(envoyPid) + require.NoError(t, err) + + funcE, err := envoyProcess.Parent() + require.NoError(t, err) + + funcEPid := funcE.Pid + funcEProcess, err := os.FindProcess(int(funcEPid)) + require.NoError(t, err) + + // kill -9 func-e process + err = funcEProcess.Signal(syscall.SIGKILL) + require.NoError(t, err) + + // Wait a moment for processes to react + time.Sleep(200 * time.Millisecond) + + // Check if envoy is still running + isRunning, _ := envoyProcess.IsRunning() + + // Until we have a technically challenging, race safe bidirectional + // pipe implementation, we can't guarantee kill -9 of func-e will + // propagate to envoy on Darwin, even if normal kill will. + if runtime.GOOS == "darwin" { + require.True(t, isRunning) + // Clean up the orphaned process + _ = envoyProcess.Kill() + } else { + require.False(t, isRunning) + } + }, + }) +} + // setupTestFiles writes the given files to the current directory for test setup. func setupTestFiles(t *testing.T, files map[string][]byte) { for name, content := range files { @@ -217,7 +283,9 @@ func executeRunTest(ctx context.Context, t *testing.T, factory FuncEFactory, opt if !opts.ExpectFail { require.True(t, envoyPid != 0, "expected Envoy to start") - require.NoError(t, runErr) + if !opts.ExpectKilled { + require.NoError(t, runErr) + } } else { require.True(t, envoyPid == 0, "expected Envoy not to start") require.Error(t, runErr) @@ -225,7 +293,7 @@ func executeRunTest(ctx context.Context, t *testing.T, factory FuncEFactory, opt } // After shutdown, the Envoy process should not exist. Otherwise, there's a leak issue. - if envoyPid != 0 { + if envoyPid != 0 && !opts.ExpectKilled { _, err = process.NewProcessWithContext(ctx, envoyPid) require.Error(t, err, "expected Envoy process to be gone after shutdown") } @@ -234,7 +302,12 @@ func executeRunTest(ctx context.Context, t *testing.T, factory FuncEFactory, opt // runTestAndInterruptFuncE runs the test function and ensures func-e is interrupted afterward. func runTestAndInterruptFuncE(ctx context.Context, t *testing.T, funcE FuncE, runDir string, envoyPid int32, callback func(context.Context, string, int32, func(context.Context) error, *AdminClient)) { defer func() { - require.NoError(t, funcE.Interrupt(ctx)) + if err := funcE.Interrupt(ctx); err != nil { + // Only fail if it's not an expected error from killing the process + if !errors.Is(err, os.ErrProcessDone) && !errors.Is(err, syscall.ESRCH) { + require.NoError(t, err) + } + } if ctx.Err() != nil { t.Logf("context canceled during interrupt: %v", ctx.Err()) } From 60a5dd4994ac63fc264e272603c53dd9629caad1 Mon Sep 17 00:00:00 2001 From: Adrian Cole Date: Sat, 19 Jul 2025 07:42:34 +0800 Subject: [PATCH 2/2] headers Signed-off-by: Adrian Cole --- api/func-e_run_test.go | 2 +- api/func-e_test.go | 2 +- e2e/func-e_run_test.go | 2 +- internal/envoy/shutdown_test.go | 2 +- internal/test/e2e/testrun.go | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/api/func-e_run_test.go b/api/func-e_run_test.go index e0c50584..3cec2f47 100644 --- a/api/func-e_run_test.go +++ b/api/func-e_run_test.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package api diff --git a/api/func-e_test.go b/api/func-e_test.go index a647a4a2..9d4e1137 100644 --- a/api/func-e_test.go +++ b/api/func-e_test.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package api diff --git a/e2e/func-e_run_test.go b/e2e/func-e_run_test.go index e5b1e291..6fc2da08 100644 --- a/e2e/func-e_run_test.go +++ b/e2e/func-e_run_test.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package e2e diff --git a/internal/envoy/shutdown_test.go b/internal/envoy/shutdown_test.go index 3dfe1d3e..68abeebf 100644 --- a/internal/envoy/shutdown_test.go +++ b/internal/envoy/shutdown_test.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package envoy diff --git a/internal/test/e2e/testrun.go b/internal/test/e2e/testrun.go index 4d745af4..632a94b4 100644 --- a/internal/test/e2e/testrun.go +++ b/internal/test/e2e/testrun.go @@ -1,4 +1,4 @@ -// Copyright 2025 Tetrate +// Copyright func-e contributors // SPDX-License-Identifier: Apache-2.0 package e2e