diff --git a/pkg/coretag/coretag.go b/pkg/coretag/coretag.go index 98e1485a97..9574b23617 100644 --- a/pkg/coretag/coretag.go +++ b/pkg/coretag/coretag.go @@ -41,6 +41,7 @@ func Enable() error { } // GetAllCoreTags returns the core tag of all the threads in the thread group. +// PID 0 means the current PID. func GetAllCoreTags(pid int) ([]uint64, error) { // prctl(PR_SCHED_CORE_GET, PR_SCHED_CORE_SCOPE_THREAD_GROUP, ...) is not supported // in linux. So instead we get all threads from /proc//task and get all the @@ -75,9 +76,14 @@ func GetAllCoreTags(pid int) ([]uint64, error) { } // getTids returns set of tids as reported by /proc//task. +// PID 0 means the current PID. func getTids(pid int) (map[int]struct{}, error) { tids := make(map[int]struct{}) - files, err := os.ReadDir("/proc/" + strconv.Itoa(pid) + "/task") + path := "/proc/self/task" + if pid != 0 { + path = fmt.Sprintf("/proc/%d/task", pid) + } + files, err := os.ReadDir(path) if err != nil { return nil, err } diff --git a/pkg/coretag/coretag_test.go b/pkg/coretag/coretag_test.go index 1930716c47..a9458499f2 100644 --- a/pkg/coretag/coretag_test.go +++ b/pkg/coretag/coretag_test.go @@ -16,6 +16,7 @@ package coretag import ( "os" + "reflect" "testing" "gvisor.dev/gvisor/pkg/hostos" @@ -36,11 +37,19 @@ func TestEnable(t *testing.T) { t.Fatalf("Enable() got error %v, wanted nil", err) } - coreTags, err := GetAllCoreTags(os.Getpid()) + pid := os.Getpid() + coreTags, err := GetAllCoreTags(pid) if err != nil { t.Fatalf("GetAllCoreTags() got error %v, wanted nil", err) } if len(coreTags) != 1 { t.Fatalf("Got coreTags %v, wanted len(coreTags)=1", coreTags) } + coreTagsSelf, err := GetAllCoreTags(0) + if err != nil { + t.Fatalf("GetAllCoreTags(0) got error %v, wanted nil", err) + } + if !reflect.DeepEqual(coreTags, coreTagsSelf) { + t.Fatalf("Got different coreTags for PID %d vs self: %v vs %v", pid, coreTags, coreTagsSelf) + } } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 
ab99ee5c2e..1fb54321a4 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -245,7 +245,7 @@ func TestStartSignal(t *testing.T) { func TestHostnetWithRawSockets(t *testing.T) { // Drop CAP_NET_RAW from effective capabilities, if we have it. pid := os.Getpid() - caps, err := capability.NewPid2(os.Getpid()) + caps, err := capability.NewPid2(0) if err != nil { t.Fatalf("error getting capabilities for pid %d: %v", pid, err) } diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index d736b80c51..4b0f3b188a 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -471,7 +471,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomma // Verify that all sentry threads are properly core tagged, and log // current core tag. - coreTags, err := coretag.GetAllCoreTags(os.Getpid()) + coreTags, err := coretag.GetAllCoreTags(0) if err != nil { util.Fatalf("Failed read current core tags: %v", err) } diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go index 00f0cf0a58..3de0b20125 100644 --- a/runsc/cmd/chroot.go +++ b/runsc/cmd/chroot.go @@ -82,6 +82,73 @@ func copyFile(dst, src string) error { return err } +// setupMinimalProcfs creates a minimal procfs-like tree at `${chroot}/proc`. +func setupMinimalProcfs(chroot string) error { + // We can't always directly mount procfs because it may be obstructed + // by submounts within it. See https://gvisor.dev/issue/10944. + // All we really need from procfs is /proc/self and a few kernel + // parameter files, which are typically not obstructed. + // So we create a tmpfs at /proc and manually copy the kernel parameter + // files into it. Then, to get /proc/self, we mount either a new + // instance of procfs (if possible), or a recursive bind mount of the + // procfs we do have access to (which still contains the obstructed + // submounts but /proc/self is not obstructed), and we symlink + // our /proc/self to the one in that mount. 
+ // + // Why not try to mount the new procfs instance at /proc directly? + // Because that would cause the set of files at /proc to differ + // between the "new procfs instance" case and the "recursive bind + // mount" case. Thus, this could introduce a bug whereby gVisor starts + // to depend on a /proc file that is present in one case but not the + // other, without decent test coverage to catch it. + procRoot := filepath.Join(chroot, "/proc") + if err := os.Mkdir(procRoot, 0755); err != nil { + return fmt.Errorf("error creating /proc in chroot: %v", err) + } + if err := specutils.SafeMount("runsc-proc", procRoot, "tmpfs", + unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC, "", "/proc"); err != nil { + return fmt.Errorf("error mounting tmpfs in /proc: %v", err) + } + flags := uint32(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY) + procSubmountDir := "sandbox-proc" + if newProcfsErr := mountInChroot(chroot, "proc", "/proc/"+procSubmountDir, "proc", flags); newProcfsErr != nil { + log.Debugf("Unable to mount a new instance of the procfs file system at %q (%v); trying a recursive bind mount instead.", filepath.Join(procRoot, procSubmountDir), newProcfsErr) + procSubmountDir = "host-proc" + if bindErr := mountInChroot(chroot, "/proc", "/proc/"+procSubmountDir, "bind", + unix.MS_BIND|unix.MS_REC|flags); bindErr != nil { + return fmt.Errorf("error recursively bind-mounting proc at %q (%w) after also failing to mount a new procfs instance there (%v)", filepath.Join(procRoot, procSubmountDir), bindErr, newProcfsErr) + } + log.Debugf("Successfully mounted a recursive bind mount of procfs at %q; continuing.", filepath.Join(procRoot, procSubmountDir)) + } + // Create needed directories. + for _, d := range []string{ + "/proc/sys", + "/proc/sys/kernel", + "/proc/sys/vm", + } { + if err := os.Mkdir(filepath.Join(chroot, d), 0755); err != nil { + return fmt.Errorf("error creating directory %q: %v", filepath.Join(chroot, d), err) + } + } + // Copy needed files. 
+ for _, f := range []string{ + "/proc/sys/vm/mmap_min_addr", + "/proc/sys/kernel/cap_last_cap", + } { + if err := copyFile(filepath.Join(chroot, f), f); err != nil { + return fmt.Errorf("failed to copy %q -> %q: %w", f, filepath.Join(chroot, f), err) + } + } + // Create symlink for /proc/self. + if err := os.Symlink(procSubmountDir+"/self", filepath.Join(procRoot, "self")); err != nil { + return fmt.Errorf("error creating symlink %q -> %q: %w", filepath.Join(procRoot, "self"), procSubmountDir+"/self", err) + } + if err := os.Chmod(procRoot, 0o111); err != nil { + return fmt.Errorf("error chmodding %q: %v", procRoot, err) + } + return nil +} + // setUpChroot creates an empty directory with runsc mounted at /runsc and proc // mounted at /proc. func setUpChroot(spec *specs.Spec, conf *config.Config) error { @@ -109,9 +176,8 @@ func setUpChroot(spec *specs.Spec, conf *config.Config) error { log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err) } - flags := uint32(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY) - if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil { - return fmt.Errorf("error mounting proc in chroot: %v", err) + if err := setupMinimalProcfs(chroot); err != nil { + return fmt.Errorf("error setting up minimal procfs in chroot %q: %v", chroot, err) } if err := tpuProxyUpdateChroot("/", chroot, spec, conf); err != nil { diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index abbfa6350b..bd3754b08a 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -214,7 +214,7 @@ func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { // HasCapabilities returns true if the user has all capabilities in 'cs'. 
func HasCapabilities(cs ...capability.Cap) bool { - caps, err := capability.NewPid2(os.Getpid()) + caps, err := capability.NewPid2(0) if err != nil { return false } diff --git a/test/e2e/runtime_in_docker_test.go b/test/e2e/runtime_in_docker_test.go index 97c7e5aa96..9924c2c1d2 100644 --- a/test/e2e/runtime_in_docker_test.go +++ b/test/e2e/runtime_in_docker_test.go @@ -74,18 +74,6 @@ func (test testVariant) run(ctx context.Context, logger testutil.Logger, runscPa ReadOnly: false, }) } - // Mount an unobstructed view of procfs at /proc2 so that the runtime - // can mount a fresh procfs. - // TODO(gvisor.dev/issue/10944): Remove this once issue is fixed. - opts.Mounts = append(opts.Mounts, mount.Mount{ - Type: mount.TypeBind, - Source: "/proc", - Target: "/proc2", - ReadOnly: false, - BindOptions: &mount.BindOptions{ - NonRecursive: true, - }, - }) const wantMessage = "It became a jumble of words, a litany, almost a kind of glossolalia." args := []string{ "/runtime",