diff --git a/go.mod b/go.mod index ffea0a1a..a8e825bd 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/containerd/cgroups go 1.12 require ( + github.com/cilium/ebpf v0.0.0-20191113100448-d9fb101ca1fb github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e github.com/docker/go-units v0.4.0 github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e @@ -11,5 +12,5 @@ require ( github.com/pkg/errors v0.8.1 github.com/sirupsen/logrus v1.4.2 github.com/urfave/cli v1.22.1 - golang.org/x/sys v0.0.0-20190514135907-3a4b5fb9f71f + golang.org/x/sys v0.0.0-20191112214154-59a1497f0cea ) diff --git a/go.sum b/go.sum index dabe1896..50a93ef3 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,6 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/cilium/ebpf v0.0.0-20191113100448-d9fb101ca1fb h1:bQ0NJ9dAB8vsw7ffajBDX/7Wr64BdLWeJkYY36UkeRY= +github.com/cilium/ebpf v0.0.0-20191113100448-d9fb101ca1fb/go.mod h1:MA5e5Lr8slmEg9bt0VpxxWqJlO4iwu3FBdHUzV7wQVg= github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e h1:Wf6HqHfScWJN9/ZjdUKyjop4mf3Qdd+1TvvltAvM3m8= github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= @@ -33,8 +35,10 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/urfave/cli v1.22.1 h1:+mkCCcOFKPnCmVYVcURKps1Xe+3zP90gSYGNfRkjoIY= github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190514135907-3a4b5fb9f71f h1:Xab8gg26GrI/x3RNdVhVkHHM1XLyGeRBEvz4Q5x4YW8= -golang.org/x/sys v0.0.0-20190514135907-3a4b5fb9f71f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7 
h1:HmbHVPwrPEKPGLAcHSrMe6+hqSUlvZU0rab6x5EXfGU= +golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191112214154-59a1497f0cea h1:Mz1TMnfJDRJLk8S8OPCoJYgrsp/Se/2TBre2+vwX128= +golang.org/x/sys v0.0.0-20191112214154-59a1497f0cea/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/v2/devicefilter.go b/v2/devicefilter.go new file mode 100644 index 00000000..20abcf1d --- /dev/null +++ b/v2/devicefilter.go @@ -0,0 +1,201 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+package v2
+
+import (
+	"fmt"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"math"
+
+	"github.com/cilium/ebpf/asm"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+/*
+Devicefilter contains eBPF device filter program
+
+The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
+
+Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
+agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
+
+This particular Go implementation is based on the runc version:
+https://github.com/opencontainers/runc/blob/master/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
+*/
+const (
+	// license string format is same as kernel MODULE_LICENSE macro
+	license = "Apache"
+)
+
+// DeviceFilter returns eBPF device filter program and its license string.
+// Entries are compiled from the last element of devices to the first, so a later
+// entry is checked before (and therefore takes precedence over) an earlier one.
+func DeviceFilter(devices []specs.LinuxDeviceCgroup) (asm.Instructions, string, error) {
+	p := &program{}
+	p.init()
+	for i := len(devices) - 1; i >= 0; i-- {
+		if err := p.appendDevice(devices[i]); err != nil {
+			return nil, "", err
+		}
+	}
+	insts, err := p.finalize()
+	return insts, license, err
+}
+
+// program accumulates the filter instructions, one labelled block per device entry.
+type program struct {
+	insts       asm.Instructions
+	hasWildCard bool // set once an entry without any condition was appended; later entries are ignored
+	blockID     int  // number of the next "block-N" label; set to -1 when finalize appends the default block
+}
+
+// init emits the prologue that loads the fields of the context at R1 into R2-R5.
+func (p *program) init() {
+	// struct bpf_cgroup_dev_ctx { u32 access_type; u32 major; u32 minor; }
+	// https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
+	p.insts = append(p.insts,
+		// R2 <- type (lower 16 bit of u32 access_type at R1[0])
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Half),
+		// R3 <- access (upper 16 bit of u32 access_type at R1[0])
+		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
+		// RSh: bitwise shift right
+		asm.RSh.Imm32(asm.R3, 16),
+		// R4 <- major (u32 major at R1[4])
+		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word),
+		// R5 <- minor (u32 minor at R1[8])
+		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word),
+	)
+}
+
+// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
+// It appends one block that returns dev.Allow on a match and jumps to the next block otherwise.
+func (p *program) appendDevice(dev specs.LinuxDeviceCgroup) error {
+	if p.blockID < 0 {
+		return errors.New("the program is finalized")
+	}
+	if p.hasWildCard {
+		// All entries after wildcard entry are ignored
+		return nil
+	}
+
+	bpfType := int32(-1)
+	hasType := true
+	switch dev.Type {
+	case "c":
+		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
+	case "b":
+		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
+	case "a", "": // an unset type means "all" in the OCI runtime-spec
+		hasType = false
+	default:
+		// anything else is not a valid OCI device type
+		return errors.Errorf("invalid DeviceType %q", dev.Type)
+	}
+	// Unset (nil) major/minor are treated like -1, i.e. they match any device number.
+	major, minor := int64(-1), int64(-1)
+	if dev.Major != nil {
+		major = *dev.Major
+	}
+	if dev.Minor != nil {
+		minor = *dev.Minor
+	}
+	if major > math.MaxUint32 {
+		return errors.Errorf("invalid major %d", major)
+	}
+	if minor > math.MaxUint32 {
+		return errors.Errorf("invalid minor %d", minor)
+	}
+	hasMajor, hasMinor := major >= 0, minor >= 0
+	bpfAccess := int32(0)
+	for _, r := range dev.Access {
+		switch r {
+		case 'r':
+			bpfAccess |= unix.BPF_DEVCG_ACC_READ
+		case 'w':
+			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
+		case 'm':
+			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
+		default:
+			return errors.Errorf("unknown device access %q", r)
+		}
+	}
+	// If the access is rwm, skip the check.
+	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
+
+	blockSym := fmt.Sprintf("block-%d", p.blockID)
+	nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1)
+	prevBlockLastIdx := len(p.insts) - 1
+	if hasType {
+		// if (R2 != bpfType) goto next
+		p.insts = append(p.insts, asm.JNE.Imm(asm.R2, bpfType, nextBlockSym))
+	}
+	if hasAccess {
+		// NOTE(review): any overlap between the requested bits (R3) and bpfAccess counts as a match,
+		// so a request carrying read+write bits matches an "r"-only entry. Confirm this is intended.
+		p.insts = append(p.insts,
+			// if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next
+			asm.Mov.Reg32(asm.R1, asm.R3),
+			asm.And.Imm32(asm.R1, bpfAccess),
+			asm.JEq.Imm(asm.R1, 0, nextBlockSym),
+		)
+	}
+	if hasMajor {
+		// if (R4 != major) goto next
+		p.insts = append(p.insts, asm.JNE.Imm(asm.R4, int32(major), nextBlockSym))
+	}
+	if hasMinor {
+		// if (R5 != minor) goto next
+		p.insts = append(p.insts, asm.JNE.Imm(asm.R5, int32(minor), nextBlockSym))
+	}
+	// An entry without any condition matches everything (p.hasWildCard is always false here).
+	p.hasWildCard = !hasType && !hasAccess && !hasMajor && !hasMinor
+	p.insts = append(p.insts, acceptBlock(dev.Allow)...)
+	// set blockSym to the first instruction we added in this iteration
+	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
+	p.blockID++
+	return nil
+}
+
+// finalize returns the program, first appending a default "return 0" block unless a wildcard entry already ends it.
+func (p *program) finalize() (asm.Instructions, error) {
+	if p.hasWildCard {
+		// acceptBlock with asm.Return() is already inserted
+		return p.insts, nil
+	}
+	blockSym := fmt.Sprintf("block-%d", p.blockID)
+	p.insts = append(p.insts,
+		// R0 <- 0
+		asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
+		asm.Return(),
+	)
+	p.blockID = -1
+	return p.insts, nil
+}
+
+// acceptBlock returns the two instructions that set R0 to 1 (accept) or 0 (reject) and exit the program.
+func acceptBlock(accept bool) asm.Instructions {
+	v := int32(0)
+	if accept {
+		v = 1
+	}
+	return []asm.Instruction{
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v),
+		asm.Return(),
+	}
+}
diff --git a/v2/devicefilter_test.go b/v2/devicefilter_test.go
new file mode 100644
index 00000000..dfc732e0
--- /dev/null
+++ b/v2/devicefilter_test.go
@@ -0,0 +1,184 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package v2
+
+import (
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"strings"
+	"testing"
+)
+
+// hash normalizes an instruction dump for comparison: every line is trimmed, and
+// blank lines as well as lines starting with the comm prefix are dropped.
+func hash(s, comm string) string {
+	var kept []string
+	for _, line := range strings.Split(s, "\n") {
+		if line = strings.TrimSpace(line); line != "" && !strings.HasPrefix(line, comm) {
+			kept = append(kept, line)
+		}
+	}
+	return strings.Join(kept, "\n")
+}
+
+// testDeviceFilter compiles devices and, if expectedStr is non-empty, checks the normalized dump against it.
+func testDeviceFilter(t testing.TB, devices []specs.LinuxDeviceCgroup, expectedStr string) {
+	insts, _, err := DeviceFilter(devices)
+	if err != nil {
+		t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices)
+	}
+	dump := insts.String()
+	t.Logf("%s: devices: %+v\n%s", t.Name(), devices, dump)
+	if expectedStr == "" {
+		return
+	}
+	if want, got := hash(expectedStr, "//"), hash(dump, "//"); want != got {
+		t.Fatalf("expected:\n%q\ngot\n%q", want, got)
+	}
+}
+
+func TestDeviceFilter_Nil(t *testing.T) {
+	expected := `
+// load parameters into registers
+        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        2: RSh32Imm dst: r3 imm: 16
+        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject)
+        5: Mov32Imm dst: r0 imm: 0
+        6: Exit
+	`
+	testDeviceFilter(t, nil, expected)
+}
+
+func TestDeviceFilter_Privileged(t *testing.T) {
+	devices := []specs.LinuxDeviceCgroup{
+		{
+			Type:   "a",
+			Major:  pointerInt64(-1),
+			Minor:  pointerInt64(-1),
+			Access: "rwm",
+			Allow:  true,
+		},
+	}
+	expected := `
+// load parameters into registers
+        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        2: RSh32Imm dst: r3 imm: 16
+        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 1 (accept)
+        5: Mov32Imm dst: r0 imm: 1
+        6: Exit
+	`
+	testDeviceFilter(t, devices, expected)
+}
+
+func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) {
+	devices := []specs.LinuxDeviceCgroup{
+		{
+			Type:   "a",
+			Major:  pointerInt64(-1),
+			Minor:  pointerInt64(-1),
+			Access: "rwm",
+			Allow:  true,
+		},
+		{
+			Type:   "b",
+			Major:  pointerInt64(8),
+			Minor:  pointerInt64(0),
+			Access: "rwm",
+			Allow:  false,
+		},
+	}
+	expected := `
+// load parameters into registers
+        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        2: RSh32Imm dst: r3 imm: 16
+        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject) if type==b && major == 8 && minor == 0
+        5: JNEImm dst: r2 off: -1 imm: 1
+        6: JNEImm dst: r4 off: -1 imm: 8
+        7: JNEImm dst: r5 off: -1 imm: 0
+        8: Mov32Imm dst: r0 imm: 0
+        9: Exit
+block-1:
+// return 1 (accept)
+        10: Mov32Imm dst: r0 imm: 1
+        11: Exit
+`
+	testDeviceFilter(t, devices, expected)
+}
+
+func TestDeviceFilter_Weird(t *testing.T) {
+	devices := []specs.LinuxDeviceCgroup{
+		{
+			Type:   "b",
+			Major:  pointerInt64(8),
+			Minor:  pointerInt64(1),
+			Access: "rwm",
+			Allow:  false,
+		},
+		{
+			Type:   "a",
+			Major:  pointerInt64(-1),
+			Minor:  pointerInt64(-1),
+			Access: "rwm",
+			Allow:  true,
+		},
+		{
+			Type:   "b",
+			Major:  pointerInt64(8),
+			Minor:  pointerInt64(2),
+			Access: "rwm",
+			Allow:  false,
+		},
+	}
+	// 8/1 is allowed, 8/2 is not allowed.
+	// This conforms to runc v1.0.0-rc.9 (cgroup1) behavior.
+	expected := `
+// load parameters into registers
+        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        2: RSh32Imm dst: r3 imm: 16
+        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject) if type==b && major == 8 && minor == 2
+        5: JNEImm dst: r2 off: -1 imm: 1
+        6: JNEImm dst: r4 off: -1 imm: 8
+        7: JNEImm dst: r5 off: -1 imm: 2
+        8: Mov32Imm dst: r0 imm: 0
+        9: Exit
+block-1:
+// return 1 (accept)
+        10: Mov32Imm dst: r0 imm: 1
+        11: Exit
+`
+	testDeviceFilter(t, devices, expected)
+}
+
+// pointerInt64 returns a pointer to a copy of v, for filling the optional Major/Minor fields.
+func pointerInt64(v int64) *int64 {
+	return &v
+}
diff --git a/v2/ebpf.go b/v2/ebpf.go
new file mode 100644
index 00000000..b02dc2d4
--- /dev/null
+++ b/v2/ebpf.go
@@ -0,0 +1,83 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package v2
+
+import (
+	"github.com/cilium/ebpf"
+	"github.com/cilium/ebpf/asm"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory.
+//
+// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
+//
+// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
+//
+// On success the returned function detaches the program again; on failure it is a no-op.
+func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) {
+	nilCloser := func() error { return nil }
+	prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
+		Type:         ebpf.CGroupDevice,
+		Instructions: insts,
+		License:      license,
+	})
+	if err != nil {
+		return nilCloser, err
+	}
+	if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
+		_ = prog.Close() // best effort: do not leak the loaded program when attaching fails
+		return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
+	}
+	closer := func() error {
+		if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
+			return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
+		}
+		return nil
+	}
+	return closer, nil
+}
+
+// isRWM reports whether cgroupPermissions contains each of 'r', 'w' and 'm' at least once.
+// Any other character is ignored.
+func isRWM(cgroupPermissions string) bool {
+	var r, w, m bool
+	for _, rn := range cgroupPermissions {
+		switch rn {
+		case 'r':
+			r = true
+		case 'w':
+			w = true
+		case 'm':
+			m = true
+		}
+	}
+	return r && w && m
+}
+
+// canSkipEBPFError reports whether every entry of res.Devices is a deny rule with rwm access; the logic is from runc:
+// https://github.com/opencontainers/runc/blob/master/libcontainer/cgroups/fs/devices_v2.go#L44
+func canSkipEBPFError(res *specs.LinuxResources) bool {
+	for _, dev := range res.Devices {
+		if dev.Allow || !isRWM(dev.Access) {
+			return false
+		}
+	}
+	return true
+}
diff --git a/v2/manager.go b/v2/manager.go
index ff2f0a1a..889ad452 100644
--- a/v2/manager.go
+++ b/v2/manager.go
@@ -19,6 +19,7 @@ package v2
 import (
 	"bufio"
 	"fmt"
+	"github.com/opencontainers/runtime-spec/specs-go"
 	"io/ioutil"
 	"math"
 	"os"
@@ -478,3 +479,27 @@ func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
 		}
 	}
 }
+
+// SetDevice compiles res.Devices into an eBPF device filter and attaches it to the
+// cgroup directory at path.
+//
+// A failure to load or attach the program is ignored when canSkipEBPFError(res) is true.
+func (r *Resources) SetDevice(path string, res *specs.LinuxResources) error {
+	insts, license, err := DeviceFilter(res.Devices)
+	if err != nil {
+		return err
+	}
+	dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY, 0600)
+	if err != nil {
+		return errors.Wrapf(err, "cannot get dir FD for %s", path)
+	}
+	defer unix.Close(dirFD)
+	// NOTE(review): the detach function returned below is discarded, so this method offers no
+	// way to detach the filter afterwards. Confirm this is intended.
+	if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
+		if !canSkipEBPFError(res) {
+			return err
+		}
+	}
+	return nil
+}