diff --git a/internal/job/checkout.go b/internal/job/checkout.go index 280397dd51..8dbb5b29b2 100644 --- a/internal/job/checkout.go +++ b/internal/job/checkout.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "io/fs" "os" "path/filepath" "slices" @@ -50,28 +51,28 @@ func (e *Executor) configureHTTPSInsteadOfSSH(ctx context.Context) error { ).Run(ctx, shell.ShowPrompt(false)) } -func (e *Executor) removeCheckoutDir() error { +func (e *Executor) removeCheckoutDir(ctx context.Context) error { checkoutPath, _ := e.shell.Env.Get("BUILDKITE_BUILD_CHECKOUT_PATH") // on windows, sometimes removing large dirs can fail for various reasons // for instance having files open // see https://github.com/golang/go/issues/20841 - for range 10 { + return roko.NewRetrier( + roko.WithMaxAttempts(10), + roko.WithStrategy(roko.Constant(10*time.Second)), + ).DoWithContext(ctx, func(r *roko.Retrier) error { e.shell.Commentf("Removing %s", checkoutPath) - if err := os.RemoveAll(checkoutPath); err != nil { - e.shell.Errorf("Failed to remove \"%s\" (%s)", checkoutPath, err) - } else { - if _, err := os.Stat(checkoutPath); os.IsNotExist(err) { - return nil - } else { - e.shell.Errorf("Failed to remove %s", checkoutPath) - } - } - e.shell.Commentf("Waiting 10 seconds") - <-time.After(time.Second * 10) - } - return fmt.Errorf("failed to remove %s", checkoutPath) + if err := hardRemoveAll(checkoutPath); err != nil { + e.shell.Errorf("Failed to remove %q (%s)", checkoutPath, err) + return err + } + if _, err := os.Stat(checkoutPath); err == nil || !errors.Is(err, fs.ErrNotExist) { + e.shell.Errorf("Failed to remove %q", checkoutPath) + return errors.New("checkout path still exists") + } + return nil + }) } func (e *Executor) createCheckoutDir() error { @@ -79,18 +80,16 @@ func (e *Executor) createCheckoutDir() error { if !osutil.FileExists(checkoutPath) { e.shell.Commentf("Creating \"%s\"", checkoutPath) - // Actual file permissions will be reduced by umask, and won't be 0o777 unless the user has manually changed the umask to 000 - if err := os.MkdirAll(checkoutPath, 0o777); err != nil { + // Actual file permissions will be reduced by umask, and won't be 0777 + // unless the user has manually changed the umask to 000 + if err := os.MkdirAll(checkoutPath, 0777); err != nil { return err } } if e.shell.Getwd() != checkoutPath { - if err := e.shell.Chdir(checkoutPath); err != nil { - return err - } + return e.shell.Chdir(checkoutPath) } - return nil } @@ -112,7 +111,7 @@ func (e *Executor) CheckoutPhase(ctx context.Context) error { // Remove the checkout directory if BUILDKITE_CLEAN_CHECKOUT is present if e.CleanCheckout { e.shell.Headerf("Cleaning pipeline checkout") - if err = e.removeCheckoutDir(); err != nil { + if err = e.removeCheckoutDir(ctx); err != nil { return err } } @@ -202,7 +201,7 @@ func (e *Executor) CheckoutPhase(ctx context.Context) error { // Checkout can fail because of corrupted files in the checkout which can leave the agent in a state where it // keeps failing. This removes the checkout dir, which means the next checkout will be a lot slower (clone vs // fetch), but hopefully will allow the agent to self-heal - if err := e.removeCheckoutDir(); err != nil { + if err := e.removeCheckoutDir(ctx); err != nil { e.shell.Warningf("Failed to remove checkout dir while cleaning up after a checkout error: %v", err) } @@ -221,7 +220,7 @@ func (e *Executor) CheckoutPhase(ctx context.Context) error { e.shell.Warningf("Checkout failed! %s (%s)", err, r) // If it's some kind of error that we don't know about, clean the checkout dir just to be safe - if err := e.removeCheckoutDir(); err != nil { + if err := e.removeCheckoutDir(ctx); err != nil { e.shell.Warningf("Failed to remove checkout dir while cleaning up after a checkout error: %v", err) } diff --git a/internal/job/remove_all_nonunix.go b/internal/job/remove_all_nonunix.go new file mode 100644 index 0000000000..4b1de3fa6e --- /dev/null +++ b/internal/job/remove_all_nonunix.go @@ -0,0 +1,10 @@ +//go:build !unix + +package job + +import "os" + +// hardRemoveAll only does more than os.RemoveAll on Unix-likes. +func hardRemoveAll(path string) error { + return os.RemoveAll(path) +} diff --git a/internal/job/remove_all_test.go b/internal/job/remove_all_test.go new file mode 100644 index 0000000000..db9acc40fd --- /dev/null +++ b/internal/job/remove_all_test.go @@ -0,0 +1,44 @@ +//go:build unix + +package job + +import ( + "errors" + "io/fs" + "os" + "path/filepath" + "testing" +) + +func TestHardRemoveAll(t *testing.T) { + container, err := os.MkdirTemp("", "TestHardRemoveAll") + if err != nil { + t.Fatalf("os.MkdirTemp(TestHardRemoveAll) error = %v", err) + } + t.Cleanup(func() { os.RemoveAll(container) }) // lol but if hardRemoveAll doesn't work... + + dirA := filepath.Join(container, "a") + dirB := filepath.Join(dirA, "b") + fileC := filepath.Join(dirB, "c") + if err := os.MkdirAll(dirB, 0o777); err != nil { + t.Fatalf("os.MkdirAll(c%q, 0o777) = %v", dirB, err) + } + if err := os.WriteFile(fileC, []byte("hello!\n"), 0o664); err != nil { + t.Fatalf("os.WriteFile(%q, hello!, 0o664) = %v", fileC, err) + } + + // break directory perms + if err := os.Chmod(dirB, 0o666); err != nil { + t.Fatalf("os.Chmod(%q, 0o666) = %v", dirB, err) + } + if err := os.Chmod(dirA, 0o444); err != nil { + t.Fatalf("os.Chmod(%q, 0o444) = %v", dirA, err) + } + + if err := hardRemoveAll(dirA); err != nil { + t.Errorf("hardRemoveAll(%q) = %v", dirA, err) + } + if _, err := os.Stat(dirA); !errors.Is(err, fs.ErrNotExist) { + t.Errorf("os.Stat(%q) = %v, want %v", dirA, err, fs.ErrNotExist) + } +} diff --git a/internal/job/remove_all_unix.go b/internal/job/remove_all_unix.go new file mode 100644 index 0000000000..c2e46d85ee --- /dev/null +++ b/internal/job/remove_all_unix.go @@ -0,0 +1,57 @@ +//go:build unix + +package job + +import ( + "errors" + "os" + "path/filepath" + + "golang.org/x/sys/unix" +) + +// hardRemoveAll tries very hard to remove all items from the directory at path. +// In addition to calling os.RemoveAll, it fixes missing +x bits on directories. +func hardRemoveAll(path string) error { + // Only try to fix the permissions on 10000 directories. + // Any more and we could be here all day. + for range 10000 { + err := os.RemoveAll(path) + if err == nil { // If os.RemoveAll worked, then exit early. + return nil + } + // os.RemoveAll documents its only non-nil error as *os.PathError. + pathErr, ok := err.(*os.PathError) + if !ok { + return err + } + + // Did we not have permission to open something within a directory? + if pathErr.Err != unix.EACCES { + return err + } + dir := filepath.Dir(pathErr.Path) + + // Check that the EACCES was caused by mode on the directory. + // (Note that this is a TOCTOU race, but we're not changing + // owner uid/gid, and if something else is concurrently writing + // files they can probably chmod +wx their files themselves) + di, statErr := os.Lstat(dir) + if statErr != nil { + return statErr + } + if !di.IsDir() { + return err + } + if unix.Faccessat(0, dir, unix.W_OK|unix.X_OK, unix.AT_EACCESS) != unix.EACCES { + // Some other failure? + return err + } + // Try to fix it with chmod +x dir + if err := os.Chmod(dir, 0o777); err != nil { + return err + } + // Now retry os.RemoveAll. + } + return errors.New("too many inaccessible directories") +}