diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4618b2e818..ecf661074e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -85,3 +85,29 @@ jobs: uvmboot.exe zapdir.exe ncproxy.exe + + build_gcs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + + - name: Install go + uses: actions/setup-go@v2 + with: + go-version: '^1.15.0' + + - name: Pull busybox image + run: docker pull busybox + + - name: Run Busybox Container + run: docker run --name base_image_container busybox + + - name: Export container to tar file + run: | + docker export base_image_container | gzip > base.tar.gz + + - name: Build And Test + run: | + BASE=./base.tar.gz + make all test diff --git a/.gitignore b/.gitignore index aec9bd4bb0..54ed6f06c9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,38 @@ +# Binaries for programs and plugins *.exe -.idea -.vscode +*.dll +*.so +*.dylib + +# Ignore vscode setting files +.vscode/ + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 +.glide/ + +# Ignore gcs bin directory +service/bin/ +service/pkg/ + +*.img +*.vhd +*.tar.gz + +# Make stuff +.rootfs-done +bin/* +rootfs/* +*.o +/build/ + +deps/* +out/* + +.idea/ +.vscode/ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..cbc448ddda --- /dev/null +++ b/Makefile @@ -0,0 +1,87 @@ +BASE:=base.tar.gz + +GO:=go +GO_FLAGS:=-ldflags "-s -w" # strip Go binaries +CGO_ENABLED:=0 +GOMODVENDOR:= + +CFLAGS:=-O2 -Wall +LDFLAGS:=-static -s # strip C binaries + +GO_FLAGS_EXTRA:= +ifeq "$(GOMODVENDOR)" "1" +GO_FLAGS_EXTRA += -mod=vendor +endif +GO_BUILD:=CGO_ENABLED=$(CGO_ENABLED) $(GO) build $(GO_FLAGS) $(GO_FLAGS_EXTRA) + +SRCROOT=$(dir $(abspath $(firstword $(MAKEFILE_LIST)))) + +# The link aliases for gcstools +GCS_TOOLS=\ + generichook + 
+.PHONY: all always rootfs test + +all: out/initrd.img out/rootfs.tar.gz + +clean: + find -name '*.o' -print0 | xargs -0 -r rm + rm -rf bin deps rootfs out + +test: + cd $(SRCROOT) && go test ./internal/guest/... + +out/delta.tar.gz: bin/init bin/vsockexec bin/cmd/gcs bin/cmd/gcstools Makefile + @mkdir -p out + rm -rf rootfs + mkdir -p rootfs/bin/ + cp bin/init rootfs/ + cp bin/vsockexec rootfs/bin/ + cp bin/cmd/gcs rootfs/bin/ + cp bin/cmd/gcstools rootfs/bin/ + for tool in $(GCS_TOOLS); do ln -s gcstools rootfs/bin/$$tool; done + git -C $(SRCROOT) rev-parse HEAD > rootfs/gcs.commit && \ + git -C $(SRCROOT) rev-parse --abbrev-ref HEAD > rootfs/gcs.branch + tar -zcf $@ -C rootfs . + rm -rf rootfs + +out/rootfs.tar.gz: out/initrd.img + rm -rf rootfs-conv + mkdir rootfs-conv + gunzip -c out/initrd.img | (cd rootfs-conv && cpio -imd) + tar -zcf $@ -C rootfs-conv . + rm -rf rootfs-conv + +out/initrd.img: $(BASE) out/delta.tar.gz $(SRCROOT)/hack/catcpio.sh + $(SRCROOT)/hack/catcpio.sh "$(BASE)" out/delta.tar.gz > out/initrd.img.uncompressed + gzip -c out/initrd.img.uncompressed > $@ + rm out/initrd.img.uncompressed + +-include deps/cmd/gcs.gomake +-include deps/cmd/gcstools.gomake + +# Implicit rule for includes that define Go targets. 
+%.gomake: $(SRCROOT)/Makefile + @mkdir -p $(dir $@) + @/bin/echo $(@:deps/%.gomake=bin/%): $(SRCROOT)/hack/gomakedeps.sh > $@.new + @/bin/echo -e '\t@mkdir -p $$(dir $$@) $(dir $@)' >> $@.new + @/bin/echo -e '\t$$(GO_BUILD) -o $$@.new $$(SRCROOT)/$$(@:bin/%=%)' >> $@.new + @/bin/echo -e '\tGO="$(GO)" $$(SRCROOT)/hack/gomakedeps.sh $$@ $$(SRCROOT)/$$(@:bin/%=%) $$(GO_FLAGS) $$(GO_FLAGS_EXTRA) > $(@:%.gomake=%.godeps).new' >> $@.new + @/bin/echo -e '\tmv $(@:%.gomake=%.godeps).new $(@:%.gomake=%.godeps)' >> $@.new + @/bin/echo -e '\tmv $$@.new $$@' >> $@.new + @/bin/echo -e '-include $(@:%.gomake=%.godeps)' >> $@.new + mv $@.new $@ + +VPATH=$(SRCROOT) + +bin/vsockexec: vsockexec/vsockexec.o vsockexec/vsock.o + @mkdir -p bin + $(CC) $(LDFLAGS) -o $@ $^ + +bin/init: init/init.o vsockexec/vsock.o + @mkdir -p bin + $(CC) $(LDFLAGS) -o $@ $^ + +%.o: %.c + @mkdir -p $(dir $@) + $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< \ No newline at end of file diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go new file mode 100644 index 0000000000..74676671a6 --- /dev/null +++ b/cmd/gcs/main.go @@ -0,0 +1,245 @@ +// +build linux + +package main + +import ( + "flag" + "fmt" + "io" + "io/ioutil" + "os" + "syscall" + "time" + + "github.com/Microsoft/hcsshim/internal/guest/bridge" + "github.com/Microsoft/hcsshim/internal/guest/kmsg" + "github.com/Microsoft/hcsshim/internal/guest/runtime/hcsv2" + "github.com/Microsoft/hcsshim/internal/guest/runtime/runc" + "github.com/Microsoft/hcsshim/internal/guest/transport" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/containerd/cgroups" + cgroupstats "github.com/containerd/cgroups/stats/v1" + oci "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "go.opencensus.io/trace" +) + +func memoryLogFormat(metrics *cgroupstats.Metrics) logrus.Fields { + return logrus.Fields{ + "memoryUsage": metrics.Memory.Usage.Usage, + "memoryUsageMax": metrics.Memory.Usage.Max, + "memoryUsageLimit": metrics.Memory.Usage.Limit, + 
"swapUsage": metrics.Memory.Swap.Usage, + "swapUsageMax": metrics.Memory.Swap.Max, + "swapUsageLimit": metrics.Memory.Swap.Limit, + "kernelUsage": metrics.Memory.Kernel.Usage, + "kernelUsageMax": metrics.Memory.Kernel.Max, + "kernelUsageLimit": metrics.Memory.Kernel.Limit, + } +} + +func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, threshold int64, cg cgroups.Cgroup) { + // Buffer must be >= 8 bytes for eventfd reads + // http://man7.org/linux/man-pages/man2/eventfd.2.html + count := 0 + buf := make([]byte, 8) + for { + if _, err := efdFile.Read(buf); err != nil { + logrus.WithError(err).WithField("cgroup", cgName).Error("failed to read from eventfd") + return + } + + // Sometimes an event is sent during cgroup teardown, but does not indicate that the + // threshold was actually crossed. In the teardown case the cgroup.event_control file + // won't exist anymore, so check that to determine if we should ignore this event. + _, err := os.Lstat(fmt.Sprintf("/sys/fs/cgroup/memory%s/cgroup.event_control", cgName)) + if os.IsNotExist(err) { + return + } + + count++ + msg := "memory usage for cgroup exceeded threshold" + entry := logrus.WithFields(logrus.Fields{ + "gcsStartTime": startTime, + "time": time.Now(), + "cgroup": cgName, + "thresholdBytes": threshold, + "count": count, + }) + // Sleep for one second in case there is a series of allocations slightly after + // reaching threshold. + time.Sleep(time.Second) + metrics, err := cg.Stat(cgroups.IgnoreNotExist) + if err != nil { + // Don't return on Stat err as it will return an error if + // any of the cgroup subsystems Stat calls failed for any reason. 
+ // We still want to log if we hit the cgroup threshold/limit + entry.WithError(err).Error(msg) + } else { + entry.WithFields(memoryLogFormat(metrics)).Warn(msg) + } + } +} + +func main() { + startTime := time.Now() + logLevel := flag.String("loglevel", "debug", "Logging Level: debug, info, warning, error, fatal, panic.") + kmsgLogLevel := flag.Uint("kmsgLogLevel", uint(kmsg.Warning), "Log all kmsg entries with a priority less than or equal to the supplied level.") + logFile := flag.String("logfile", "", "Logging Target: An optional file name/path. Omit for console output.") + logFormat := flag.String("log-format", "text", "Logging Format: text or json") + useInOutErr := flag.Bool("use-inouterr", false, "If true use stdin/stdout for bridge communication and stderr for logging") + v4 := flag.Bool("v4", false, "enable the v4 protocol support and v2 schema") + rootMemReserveBytes := flag.Uint64("root-mem-reserve-bytes", 75*1024*1024, "the amount of memory reserved for the orchestration, the rest will be assigned to containers") + gcsMemLimitBytes := flag.Uint64("gcs-mem-limit-bytes", 50*1024*1024, "the maximum amount of memory the gcs can use") + + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "\nUsage of %s:\n", os.Args[0]) + flag.PrintDefaults() + fmt.Fprintf(os.Stderr, "Examples:\n") + fmt.Fprintf(os.Stderr, " %s -loglevel=debug -logfile=/run/gcs/gcs.log\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " %s -loglevel=info -logfile=stdout\n", os.Args[0]) + } + + flag.Parse() + + // If v4 enable opencensus + if *v4 { + trace.ApplyConfig(trace.Config{DefaultSampler: trace.AlwaysSample()}) + trace.RegisterExporter(&oc.LogrusExporter{}) + } + + // Use a file instead of stdout + if *logFile != "" { + logFileHandle, err := os.OpenFile(*logFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600) + if err != nil { + logrus.WithFields(logrus.Fields{ + "path": *logFile, + logrus.ErrorKey: err, + }).Fatal("failed to create log file") + } + logrus.SetOutput(logFileHandle) + } + + switch 
*logFormat { + case "text": + // retain logrus's default. + case "json": + logrus.SetFormatter(&logrus.JSONFormatter{ + TimestampFormat: time.RFC3339Nano, // include ns for accurate comparisons on the host + }) + default: + logrus.WithFields(logrus.Fields{ + "log-format": *logFormat, + }).Fatal("unknown log-format") + } + + level, err := logrus.ParseLevel(*logLevel) + if err != nil { + logrus.Fatal(err) + } + + logrus.SetLevel(level) + + baseLogPath := "/run/gcs/c" + + logrus.Info("GCS started") + + // Continuously log /dev/kmsg + go kmsg.ReadForever(kmsg.LogLevel(*kmsgLogLevel)) + + tport := &transport.VsockTransport{} + rtime, err := runc.NewRuntime(baseLogPath) + if err != nil { + logrus.WithError(err).Fatal("failed to initialize new runc runtime") + } + mux := bridge.NewBridgeMux() + b := bridge.Bridge{ + Handler: mux, + EnableV4: *v4, + } + h := hcsv2.NewHost(rtime, tport) + b.AssignHandlers(mux, h) + + var bridgeIn io.ReadCloser + var bridgeOut io.WriteCloser + if *useInOutErr { + bridgeIn = os.Stdin + bridgeOut = os.Stdout + } else { + const commandPort uint32 = 0x40000000 + bridgeCon, err := tport.Dial(commandPort) + if err != nil { + logrus.WithFields(logrus.Fields{ + "port": commandPort, + logrus.ErrorKey: err, + }).Fatal("failed to dial host vsock connection") + } + bridgeIn = bridgeCon + bridgeOut = bridgeCon + } + + // Setup the UVM cgroups to protect against a workload taking all available + // memory and causing the GCS to malfunction we create two cgroups: gcs, + // containers. + // + + // Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy + // support. This needs to be set before we create any cgroups as the write + // will fail otherwise. + if err := ioutil.WriteFile("/sys/fs/cgroup/memory/memory.use_hierarchy", []byte("1"), 0644); err != nil { + logrus.WithError(err).Fatal("failed to enable hierarchy support for root cgroup") + } + + // The containers cgroup is limited only by {Totalram - 75 MB + // (reservation)}. 
+ // + // The gcs cgroup is not limited but an event will get logged if memory + // usage exceeds 50 MB. + sinfo := syscall.Sysinfo_t{} + if err := syscall.Sysinfo(&sinfo); err != nil { + logrus.WithError(err).Fatal("failed to get sys info") + } + containersLimit := int64(sinfo.Totalram - *rootMemReserveBytes) + containersControl, err := cgroups.New(cgroups.V1, cgroups.StaticPath("/containers"), &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &containersLimit, + }, + }) + if err != nil { + logrus.WithError(err).Fatal("failed to create containers cgroup") + } + defer containersControl.Delete() + + gcsControl, err := cgroups.New(cgroups.V1, cgroups.StaticPath("/gcs"), &oci.LinuxResources{}) + if err != nil { + logrus.WithError(err).Fatal("failed to create gcs cgroup") + } + defer gcsControl.Delete() + if err := gcsControl.Add(cgroups.Process{Pid: os.Getpid()}); err != nil { + logrus.WithError(err).Fatal("failed add gcs pid to gcs cgroup") + } + + event := cgroups.MemoryThresholdEvent(*gcsMemLimitBytes, false) + gefd, err := gcsControl.RegisterMemoryEvent(event) + if err != nil { + logrus.WithError(err).Fatal("failed to register memory threshold for gcs cgroup") + } + gefdFile := os.NewFile(gefd, "gefd") + defer gefdFile.Close() + + oom, err := containersControl.OOMEventFD() + if err != nil { + logrus.WithError(err).Fatal("failed to retrieve the container cgroups oom eventfd") + } + oomFile := os.NewFile(oom, "cefd") + defer oomFile.Close() + + go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) + go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl) + err = b.ListenAndServe(bridgeIn, bridgeOut) + if err != nil { + logrus.WithFields(logrus.Fields{ + logrus.ErrorKey: err, + }).Fatal("failed to serve gcs service") + } +} diff --git a/cmd/gcstools/commoncli/common.go b/cmd/gcstools/commoncli/common.go new file mode 100644 index 0000000000..9359c1eeba --- /dev/null +++ 
b/cmd/gcstools/commoncli/common.go @@ -0,0 +1,40 @@ +package commoncli + +import ( + "flag" + "fmt" + "os" + "path/filepath" + + "github.com/sirupsen/logrus" +) + +// SetFlagsForLogging sets the command line flags for logging. +func SetFlagsForLogging() []*string { + basename := filepath.Base(os.Args[0]) + ".log" + logFile := flag.String("logfile", filepath.Join("/tmp", basename), "logging file location") + logLevel := flag.String("loglevel", "debug", "Logging Level: debug, info, warning, error, fatal, panic.") + return []*string{logFile, logLevel} +} + +// SetupLogging creates the logger from the command line parameters. +func SetupLogging(args ...*string) error { + if len(args) < 1 { + return fmt.Errorf("Invalid log params") + } + level, err := logrus.ParseLevel(*args[1]) + if err != nil { + logrus.Fatal(err) + return err + } + logrus.SetLevel(level) + + filename := *args[0] + outputTarget, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600) + if err != nil { + return err + } + + logrus.SetOutput(outputTarget) + return nil +} diff --git a/cmd/gcstools/generichook.go b/cmd/gcstools/generichook.go new file mode 100644 index 0000000000..0ffa57d6b2 --- /dev/null +++ b/cmd/gcstools/generichook.go @@ -0,0 +1,144 @@ +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "text/template" + + "github.com/Microsoft/hcsshim/cmd/gcstools/generichook" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +func runGenericHook() error { + state, err := loadHookState(os.Stdin) + if err != nil { + return err + } + + var ( + tctx = newTemplateContext(state) + args = []string(os.Args[1:]) + env = os.Environ() + ) + + parsedArgs, err := render(args, tctx) + if err != nil { + return err + } + parsedEnv, err := render(env, tctx) + if err != nil { + return err + } + + hookCmd := exec.Command(parsedArgs[0], parsedArgs[1:]...) 
+ hookCmd.Env = parsedEnv + + out, err := hookCmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to run nvidia cli tool with: %v, %v", string(out), err) + } + + return nil +} + +func logDebugFile(debugFilePath string) { + contents, err := ioutil.ReadFile(debugFilePath) + if err != nil { + logrus.Errorf("failed to read debug file at %s: %v", debugFilePath, err) + return + } + numBytesInContents := len(contents) + + // since we forward logs on windows to etw, limit log size to 8KB to avoid issues + maxLogSize := 8000 + startBytes := 0 + i := 0 + for startBytes < numBytesInContents { + bytesLeft := len(contents[startBytes:]) + chunkSize := maxLogSize + if bytesLeft < maxLogSize { + chunkSize = bytesLeft + } + stopBytes := startBytes + chunkSize + output := string(contents[startBytes:stopBytes]) + logrus.WithField("output", output).Infof("%s debug part %d", debugFilePath, i) + i += 1 + startBytes += chunkSize + } + +} + +func genericHookMain() { + if err := runGenericHook(); err != nil { + logrus.Errorf("error in generic hook: %s", err) + debugFileToRead := os.Getenv(generichook.LogDebugFileEnvKey) + if debugFileToRead != "" { + logDebugFile(debugFileToRead) + } + os.Exit(1) + } + os.Exit(0) +} + +// the below functions are based on containerd oci-hook.go and are used for +// injecting runtime values into an oci hook's command +func loadHookState(r io.Reader) (*specs.State, error) { + var s *specs.State + if err := json.NewDecoder(r).Decode(&s); err != nil { + return nil, err + } + return s, nil +} + +func newTemplateContext(state *specs.State) *templateContext { + t := &templateContext{ + state: state, + } + t.funcs = template.FuncMap{ + "id": t.id, + "pid": t.pid, + "annotation": t.annotation, + } + return t +} + +type templateContext struct { + state *specs.State + funcs template.FuncMap +} + +func (t *templateContext) id() string { + return t.state.ID +} + +func (t *templateContext) pid() int { + return t.state.Pid +} + +func (t *templateContext) 
annotation(k string) string { + return t.state.Annotations[k] +} + +func render(templateList []string, tctx *templateContext) ([]string, error) { + buf := bytes.NewBuffer(nil) + for i, s := range templateList { + buf.Reset() + + t, err := template.New("generic-hook").Funcs(tctx.funcs).Parse(s) + if err != nil { + return nil, err + } + if err := t.Execute(buf, tctx); err != nil { + return nil, err + } + templateList[i] = buf.String() + } + buf.Reset() + return templateList, nil +} diff --git a/cmd/gcstools/generichook/defs.go b/cmd/gcstools/generichook/defs.go new file mode 100644 index 0000000000..3e01a3afeb --- /dev/null +++ b/cmd/gcstools/generichook/defs.go @@ -0,0 +1,6 @@ +package generichook + +// LogDebugFileEnvKey represents an optional environment variable key to be used to +// determine what file containing the ran spec hook's debug contents the `generichook` +// binary should read and log to logrus on spec hook error. +const LogDebugFileEnvKey = "GENERIC_HOOK_LOG_DEBUG_FILE" diff --git a/cmd/gcstools/main.go b/cmd/gcstools/main.go new file mode 100644 index 0000000000..bf88c764f0 --- /dev/null +++ b/cmd/gcstools/main.go @@ -0,0 +1,26 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" +) + +var commands = map[string]func(){ + "generichook": genericHookMain, +} + +func main() { + cmd := filepath.Base(os.Args[0]) + mainFunc := commands[cmd] + if mainFunc == nil { + fmt.Fprintf(os.Stderr, "unknown command: %s\n", cmd) + fmt.Fprintf(os.Stderr, "known commands:\n") + for k := range commands { + fmt.Fprintf(os.Stderr, "\t%s\n", k) + } + os.Exit(127) + } + + mainFunc() +} diff --git a/go.mod b/go.mod index 21640a6b99..fe1e7958ad 100644 --- a/go.mod +++ b/go.mod @@ -11,10 +11,15 @@ require ( github.com/containerd/ttrpc v1.0.2 github.com/containerd/typeurl v1.0.2 github.com/gogo/protobuf v1.3.2 + github.com/linuxkit/virtsock v0.0.0-20201010232012-f8cee7dfc7a3 + github.com/mattn/go-shellwords v1.0.6 + github.com/opencontainers/runc v1.0.0-rc93 
github.com/opencontainers/runtime-spec v1.0.3-0.20200929063507-e6143ca7d51d github.com/pkg/errors v0.9.1 github.com/sirupsen/logrus v1.7.0 github.com/urfave/cli v1.22.2 + github.com/vishvananda/netlink v1.1.1-0.20201029203352-d40f9887b852 + github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae go.opencensus.io v0.22.3 golang.org/x/sync v0.0.0-20201207232520-09787c993a3a golang.org/x/sys v0.0.0-20210324051608-47abb6519492 diff --git a/go.sum b/go.sum index b7bede13fc..56925418aa 100644 --- a/go.sum +++ b/go.sum @@ -190,8 +190,10 @@ github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd v0.0.0-20161114122254-48702e0da86b/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e h1:Wf6HqHfScWJN9/ZjdUKyjop4mf3Qdd+1TvvltAvM3m8= github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd/v22 v22.0.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= +github.com/coreos/go-systemd/v22 v22.1.0 h1:kq/SbG2BCKLkDKkjQf5OWwKWUKj1lgs3lFI4PxnR5lg= github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= @@ -219,6 +221,7 @@ github.com/docker/go-events v0.0.0-20170721190031-9461782956ad/go.mod h1:Uw6Uezg github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA= github.com/docker/go-metrics v0.0.0-20180209012529-399ea8c73916/go.mod 
h1:/u0gXw0Gay3ceNrsHubL3BtdOL2fHf93USgMTe0W5dI= github.com/docker/go-metrics v0.0.1/go.mod h1:cG1hvH2utMXtqgqqYE9plW6lDxS3/5ayHzueweSI3Vw= +github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1/go.mod h1:cyGadeNEkKy96OOhEzfZl+yxihPEzKnqJwvfuSUqbZE= github.com/docker/spdystream v0.0.0-20160310174837-449fdfce4d96/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM= @@ -260,7 +263,9 @@ github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/godbus/dbus v0.0.0-20151105175453-c7fdd8b5cd55/go.mod h1:/YcGZj5zSblfDWMMoOzV4fas9FZnQYTkDnsGvmh2Grw= github.com/godbus/dbus v0.0.0-20180201030542-885f9cc04c9c/go.mod h1:/YcGZj5zSblfDWMMoOzV4fas9FZnQYTkDnsGvmh2Grw= +github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e h1:BWhy2j3IXJhjCbC68FptL43tDKIq8FladmaTs3Xs7Z8= github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4= +github.com/godbus/dbus/v5 v5.0.3 h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/googleapis v1.2.0/go.mod h1:Njal3psf3qN6dwBtQfUmBZh2ybovJ0tlu3o/AC7HYjU= github.com/gogo/googleapis v1.4.0/go.mod h1:5YRNX2z1oM5gXdAkurHa942MDgEJyk02w4OecKY87+c= @@ -381,6 +386,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/linuxkit/virtsock v0.0.0-20201010232012-f8cee7dfc7a3 h1:jUp75lepDg0phMUJBCmvaeFDldD2N3S1lBuPwUTszio= 
+github.com/linuxkit/virtsock v0.0.0-20201010232012-f8cee7dfc7a3/go.mod h1:3r6x7q95whyfWQpmGZTu3gk3v2YkMi05HEzl7Tf7YEo= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= @@ -390,6 +397,8 @@ github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaO github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/mattn/go-shellwords v1.0.3/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= +github.com/mattn/go-shellwords v1.0.6 h1:9Jok5pILi5S1MnDirGVTufYGtksUs/V2BWUP3ZkeUUI= +github.com/mattn/go-shellwords v1.0.6/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/miekg/pkcs11 v1.0.3/go.mod h1:XsNlhZGX73bx86s2hdc/FuaLm2CPZJemRLMA+WTFxgs= @@ -440,6 +449,7 @@ github.com/opencontainers/runc v0.0.0-20190115041553-12f6a991201f/go.mod h1:qT5X github.com/opencontainers/runc v0.1.1/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U= github.com/opencontainers/runc v1.0.0-rc8.0.20190926000215-3e425f80a8c9/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U= github.com/opencontainers/runc v1.0.0-rc9/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U= +github.com/opencontainers/runc v1.0.0-rc93 h1:x2UMpOOVf3kQ8arv/EsDGwim8PTNqzL1/EYDr/+scOM= github.com/opencontainers/runc v1.0.0-rc93/go.mod h1:3NOsor4w32B2tC0Zbl8Knk4Wg84SM2ImC1fxBuqJ/H0= github.com/opencontainers/runtime-spec 
v0.1.2-0.20190507144316-5b71a03e2700/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.0.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= @@ -550,9 +560,11 @@ github.com/urfave/cli v1.22.2 h1:gsqYFH8bb9ekPA12kRo0hfjngWQjkJPlN9R0N78BoUo= github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/vishvananda/netlink v0.0.0-20181108222139-023a6dafdcdf/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk= github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= +github.com/vishvananda/netlink v1.1.1-0.20201029203352-d40f9887b852 h1:cPXZWzzG0NllBLdjWoD1nDfaqu98YMv+OneaKc8sPOA= github.com/vishvananda/netlink v1.1.1-0.20201029203352-d40f9887b852/go.mod h1:twkDnbuQxJYemMlGd4JFIcuhgX83tXhKS2B/PRMpOho= github.com/vishvananda/netns v0.0.0-20180720170159-13995c7128cc/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI= github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= +github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae h1:4hwBBUfQCFe3Cym0ZtKyq7L16eZUtYKs+BaHDN6mAns= github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0= github.com/willf/bitset v1.1.11-0.20200630133818-d5bec3311243/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4= github.com/willf/bitset v1.1.11/go.mod h1:83CECat5yLh5zVOf4P1ErAgKA5UDvKtgyUABdr3+MjI= diff --git a/hack/catcpio.sh b/hack/catcpio.sh new file mode 100755 index 0000000000..b5ed3298af --- /dev/null +++ b/hack/catcpio.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e + +dir="`mktemp -d`" +trap 'rm -rf "$dir"' EXIT + +for file; do + if ! 
[ -f "$file" ]; then + echo file not found: "$file" + exit 1 + fi + case `file -bz "$file"` in + "ASCII cpio archive"*"(gzip compressed data"*) + gunzip -c "$file" | (cd "$dir" && cpio -iumd) ;; + "ASCII cpio archive"*) + cat "$file" | (cd "$dir" && cpio -iumd) ;; + *) + tar -xf "$file" -C "$dir" ;; + esac +done +cd "$dir" && find . | cpio --create --format=newc -R 0:0 diff --git a/hack/gomakedeps.sh b/hack/gomakedeps.sh new file mode 100755 index 0000000000..730a6eaf6c --- /dev/null +++ b/hack/gomakedeps.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# Creates Makefile rules that enumerate all the input files +# for the Go package provided on the command line. + +set -e + +GO="${GO:-go}" +target_bin="$1" +target_pkg="$2" +shift 2 +fmt=$(cat < +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../vsockexec/vsock.h" + +// musl-gcc doesn't use headers in /usr/include, so it can't find +// linux/random.h which is where RNDADDENTROPY is defined. We only need this +// single definition from linux/random.h, so we just duplicate it here as a +// workaround. +#define RNDADDENTROPY _IOW( 'R', 0x03, int [2] ) + +#define DEFAULT_PATH_ENV "PATH=/sbin:/usr/sbin:/bin:/usr/bin" + +const char *const default_envp[] = { + DEFAULT_PATH_ENV, + NULL, +}; + +// When nothing is passed, default to the LCOWv1 behavior. 
+const char *const default_argv[] = { "/bin/gcs", "-loglevel", "debug", "-logfile=/run/gcs/gcs.log" }; +const char *const default_shell = "/bin/sh"; + +struct Mount { + const char *source, *target, *type; + unsigned long flags; + const void *data; +}; + +struct Mkdir { + const char *path; + mode_t mode; +}; + +struct Mknod { + const char *path; + mode_t mode; + int major, minor; +}; + +struct Symlink { + const char *linkpath, *target; +}; + +enum OpType { + OpMount, + OpMkdir, + OpMknod, + OpSymlink, +}; + +struct InitOp { + enum OpType op; + union { + struct Mount mount; + struct Mkdir mkdir; + struct Mknod mknod; + struct Symlink symlink; + }; +}; + +const struct InitOp ops[] = { + // mount /proc (which should already exist) + { OpMount, .mount = { "proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC } }, + + // add symlinks in /dev (which is already mounted) + { OpSymlink, .symlink = { "/dev/fd", "/proc/self/fd" } }, + { OpSymlink, .symlink = { "/dev/stdin", "/proc/self/fd/0" } }, + { OpSymlink, .symlink = { "/dev/stdout", "/proc/self/fd/1" } }, + { OpSymlink, .symlink = { "/dev/stderr", "/proc/self/fd/2" } }, + + // mount tmpfs on /run and /tmp (which should already exist) + { OpMount, .mount = { "tmpfs", "/run", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } }, + { OpMount, .mount = { "tmpfs", "/tmp", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } }, + + // mount shm and devpts + { OpMkdir, .mkdir = { "/dev/shm", 0755 } }, + { OpMount, .mount = { "shm", "/dev/shm", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } }, + { OpMkdir, .mkdir = { "/dev/pts", 0755 } }, + { OpMount, .mount = { "devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC } }, + + // mount /sys (which should already exist) + { OpMount, .mount = { "sysfs", "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } }, + { OpMount, .mount = { "cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } }, +}; + +void warn(const char *msg) { + int error = errno; + 
perror(msg); + errno = error; +} + +void warn2(const char *msg1, const char *msg2) { + int error = errno; + fputs(msg1, stderr); + fputs(": ", stderr); + errno = error; + warn(msg2); +} + +_Noreturn void dien() { + exit(errno); +} + +_Noreturn void die(const char *msg) { + warn(msg); + dien(); +} + +_Noreturn void die2(const char *msg1, const char *msg2) { + warn2(msg1, msg2); + dien(); +} + +void init_rlimit() { + // Set the hard limit for number of open fds much larger. The kernel sets + // a limit of 4096 for historical reasons, and this limit is too low for + // some software. According to the systemd developers, there is no downside + // to a large hard limit in modern Linux kernels. + // + // Retain the small soft limit of 1024 for appcompat. + struct rlimit rlim = { + .rlim_cur = 1024, + .rlim_max = 1024 * 1024, + }; + if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { + die("setrlimit(RLIMIT_NOFILE)"); + } +} + +void init_dev() { + if (mount("dev", "/dev", "devtmpfs", MS_NOSUID | MS_NOEXEC, NULL) < 0) { + warn2("mount", "/dev"); + // /dev will be already mounted if devtmpfs.mount = 1 on the kernel + // command line or CONFIG_DEVTMPFS_MOUNT is set. Do not consider this + // an error. 
+ if (errno != EBUSY) { + dien(); + } + } +} + +void init_fs(const struct InitOp *ops, size_t count) { + for (size_t i = 0; i < count; i++) { + switch (ops[i].op) { + case OpMount: { + const struct Mount *m = &ops[i].mount; + if (mount(m->source, m->target, m->type, m->flags, m->data) < 0) { + die2("mount", m->target); + } + break; + } + case OpMkdir: { + const struct Mkdir *m = &ops[i].mkdir; + if (mkdir(m->path, m->mode) < 0) { + warn2("mkdir", m->path); + if (errno != EEXIST) { + dien(); + } + } + break; + } + case OpMknod: { + const struct Mknod *n = &ops[i].mknod; + if (mknod(n->path, n->mode, makedev(n->major, n->minor)) < 0) { + warn2("mknod", n->path); + if (errno != EEXIST) { + dien(); + } + } + break; + } + case OpSymlink: { + const struct Symlink *sl = &ops[i].symlink; + if (symlink(sl->target, sl->linkpath) < 0) { + warn2("symlink", sl->linkpath); + if (errno != EEXIST) { + dien(); + } + } + break; + } + } + } +} + +void init_cgroups() { + const char *fpath = "/proc/cgroups"; + FILE *f = fopen(fpath, "r"); + if (f == NULL) { + die2("fopen", fpath); + } + // Skip the first line. + for (;;) { + char c = fgetc(f); + if (c == EOF || c == '\n') { + break; + } + } + for (;;) { + static const char base_path[] = "/sys/fs/cgroup/"; + char path[sizeof(base_path) - 1 + 64]; + char* name = path + sizeof(base_path) - 1; + int hier, groups, enabled; + int r = fscanf(f, "%64s %d %d %d\n", name, &hier, &groups, &enabled); + if (r == EOF) { + break; + } + if (r != 4) { + errno = errno ? 
: EINVAL; + die2("fscanf", fpath); + } + if (enabled) { + memcpy(path, base_path, sizeof(base_path) - 1); + if (mkdir(path, 0755) < 0) { + die2("mkdir", path); + } + if (mount(name, path, "cgroup", MS_NODEV | MS_NOSUID | MS_NOEXEC, name) < 0) { + die2("mount", path); + } + } + } + fclose(f); +} + +void init_network(const char *iface, int domain) { + int s = socket(domain, SOCK_DGRAM, IPPROTO_IP); + if (s < 0) { + if (errno == EAFNOSUPPORT) { + return; + } + die("socket"); + } + + struct ifreq request = {0}; + strncpy(request.ifr_name, iface, sizeof(request.ifr_name)); + if (ioctl(s, SIOCGIFFLAGS, &request) < 0) { + die2("ioctl(SIOCGIFFLAGS)", iface); + } + + request.ifr_flags |= IFF_UP | IFF_RUNNING; + if (ioctl(s, SIOCSIFFLAGS, &request) < 0) { + die2("ioctl(SIOCSIFFLAGS)", iface); + } + + close(s); +} + +// inject boot-time entropy after reading it from a vsock port +void init_entropy(int port) { + int s = openvsock(VMADDR_CID_HOST, port); + if (s < 0) { + die("openvsock entropy"); + } + + int e = open("/dev/random", O_RDWR); + if (e < 0) { + die("open /dev/random"); + } + + struct { + int entropy_count; + int buf_size; + char buf[4096]; + } buf; + + for (;;) { + ssize_t n = read(s, buf.buf, sizeof(buf.buf)); + if (n < 0) { + die("read entropy"); + } + + if (n == 0) { + break; + } + + buf.entropy_count = n * 8; // in bits + buf.buf_size = n; // in bytes + if (ioctl(e, RNDADDENTROPY, &buf) < 0) { + die("ioctl(RNDADDENTROPY)"); + } + } + + close(s); + close(e); +} + +pid_t launch(int argc, char **argv) { + int pid = fork(); + if (pid != 0) { + if (pid < 0) { + die("fork"); + } + + return pid; + } + + // Unblock signals before execing. + sigset_t set; + sigfillset(&set); + sigprocmask(SIG_UNBLOCK, &set, 0); + + // Create a session and process group. + setsid(); + setpgid(0, 0); + + // Terminate the arguments and exec. 
+ char **argvn = alloca(sizeof(argv[0]) * (argc + 1)); + memcpy(argvn, argv, sizeof(argv[0]) * argc); + argvn[argc] = NULL; + if (putenv(DEFAULT_PATH_ENV)) { // Specify the PATH used for execvpe + die("putenv"); + } + execvpe(argvn[0], argvn, (char**)default_envp); + die2("execvpe", argvn[0]); +} + +int reap_until(pid_t until_pid) { + for (;;) { + int status; + pid_t pid = wait(&status); + if (pid < 0) { + die("wait"); + } + + if (pid == until_pid) { + // The initial child process died. Pass through the exit status. + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != 0) { + fputs("child exited with error\n", stderr); + } + return WEXITSTATUS(status); + } + fputs("child exited by signal: ", stderr); + fputs(strsignal(WTERMSIG(status)), stderr); + fputs("\n", stderr); + return 128 + WTERMSIG(status); + } + } +} + +int main(int argc, char **argv) { + char *debug_shell = NULL; + int entropy_port = 0; + if (argc <= 1) { + argv = (char **)default_argv; + argc = sizeof(default_argv) / sizeof(default_argv[0]); + optind = 0; + debug_shell = (char*)default_shell; + } else { + for (int opt; (opt = getopt(argc, argv, "+d:e:")) >= 0; ) { + switch (opt) { + case 'd': + debug_shell = optarg; + break; + + case 'e': + entropy_port = atoi(optarg); + if (entropy_port == 0) { + fputs("invalid entropy port\n", stderr); + exit(1); + } + + break; + + default: + exit(1); + } + } + } + + char **child_argv = argv + optind; + int child_argc = argc - optind; + + // Block all signals in init. SIGCHLD will still cause wait() to return. + sigset_t set; + sigfillset(&set); + sigprocmask(SIG_BLOCK, &set, 0); + + init_rlimit(); + init_dev(); + init_fs(ops, sizeof(ops) / sizeof(ops[0])); + init_cgroups(); + init_network("lo", AF_INET); + init_network("lo", AF_INET6); + if (entropy_port != 0) { + init_entropy(entropy_port); + } + + pid_t pid = launch(child_argc, child_argv); + if (debug_shell != NULL) { + // The debug shell takes over as the primary child. 
+ pid = launch(1, &debug_shell); + } + + // Reap until the initial child process dies. + return reap_until(pid); +} diff --git a/internal/debug/dumpstacks.go b/internal/debug/dumpstacks.go new file mode 100644 index 0000000000..60a04dc31e --- /dev/null +++ b/internal/debug/dumpstacks.go @@ -0,0 +1,15 @@ +package debug + +import "runtime" + +func DumpStacks() string { + buf := make([]byte, 4096) + for { + buf = buf[:runtime.Stack(buf, true)] + if len(buf) < cap(buf) { + break + } + buf = make([]byte, 2*len(buf)) + } + return string(buf) +} diff --git a/internal/guest/bridge/bridge.go b/internal/guest/bridge/bridge.go new file mode 100644 index 0000000000..5d414adf13 --- /dev/null +++ b/internal/guest/bridge/bridge.go @@ -0,0 +1,460 @@ +// +build linux + +// Package bridge defines the bridge struct, which implements the control loop +// and functions of the GCS's bridge client. +package bridge + +import ( + "context" + "encoding/base64" + "encoding/binary" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "os" + "strconv" + "sync" + "sync/atomic" + "time" + + "github.com/Microsoft/hcsshim/internal/guest/gcserr" + "github.com/Microsoft/hcsshim/internal/guest/prot" + "github.com/Microsoft/hcsshim/internal/guest/runtime/hcsv2" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "go.opencensus.io/trace" + "go.opencensus.io/trace/tracestate" +) + +// UnknownMessage represents the default handler logic for an unmatched request +// type sent from the bridge. +func UnknownMessage(r *Request) (RequestResponse, error) { + return nil, gcserr.WrapHresult(errors.Errorf("bridge: function not supported, header type: %v", r.Header.Type), gcserr.HrNotImpl) +} + +// UnknownMessageHandler creates a default HandlerFunc out of the +// UnknownMessage handler logic. +func UnknownMessageHandler() Handler { + return HandlerFunc(UnknownMessage) +} + +// Handler responds to a bridge request. 
+type Handler interface { + ServeMsg(*Request) (RequestResponse, error) +} + +// HandlerFunc is an adapter to use functions as handlers. +type HandlerFunc func(*Request) (RequestResponse, error) + +// ServeMsg calls f(w, r). +func (f HandlerFunc) ServeMsg(r *Request) (RequestResponse, error) { + return f(r) +} + +// Mux is a protocol multiplexer for request response pairs +// following the bridge protocol. +type Mux struct { + mu sync.Mutex + m map[prot.MessageIdentifier]map[prot.ProtocolVersion]Handler +} + +// NewBridgeMux creates a default bridge multiplexer. +func NewBridgeMux() *Mux { + return &Mux{m: make(map[prot.MessageIdentifier]map[prot.ProtocolVersion]Handler)} +} + +// Handle registers the handler for the given message id and protocol version. +func (mux *Mux) Handle(id prot.MessageIdentifier, ver prot.ProtocolVersion, handler Handler) { + mux.mu.Lock() + defer mux.mu.Unlock() + + if handler == nil { + panic("bridge: nil handler") + } + + if _, ok := mux.m[id]; !ok { + mux.m[id] = make(map[prot.ProtocolVersion]Handler) + } + + if _, ok := mux.m[id][ver]; ok { + logrus.WithFields(logrus.Fields{ + "message-type": id.String(), + "protocol-version": ver, + }).Warn("opengcs::bridge - overwriting bridge handler") + } + + mux.m[id][ver] = handler +} + +// HandleFunc registers the handler function for the given message id and protocol version. +func (mux *Mux) HandleFunc(id prot.MessageIdentifier, ver prot.ProtocolVersion, handler func(*Request) (RequestResponse, error)) { + if handler == nil { + panic("bridge: nil handler func") + } + + mux.Handle(id, ver, HandlerFunc(handler)) +} + +// Handler returns the handler to use for the given request type. 
+func (mux *Mux) Handler(r *Request) Handler { + mux.mu.Lock() + defer mux.mu.Unlock() + + if r == nil { + panic("bridge: nil request to handler") + } + + var m map[prot.ProtocolVersion]Handler + var ok bool + if m, ok = mux.m[r.Header.Type]; !ok { + return UnknownMessageHandler() + } + + var h Handler + if h, ok = m[r.Version]; !ok { + return UnknownMessageHandler() + } + + return h +} + +// ServeMsg dispatches the request to the handler whose +// type matches the request type. +func (mux *Mux) ServeMsg(r *Request) (RequestResponse, error) { + h := mux.Handler(r) + return h.ServeMsg(r) +} + +// Request is the bridge request that has been sent. +type Request struct { + // Context is the request context received from the bridge. + Context context.Context + // Header is the wire format message header that preceeded the message for + // this request. + Header *prot.MessageHeader + // ContainerID is the id of the container that this message cooresponds to. + ContainerID string + // ActivityID is the id of the specific activity for this request. + ActivityID string + // Message is the portion of the request that follows the `Header`. This is + // a json encoded string that MUST contain `prot.MessageBase`. + Message []byte + // Version is the version of the protocol that `Header` and `Message` were + // sent in. + Version prot.ProtocolVersion +} + +// RequestResponse is the base response for any bridge message request. +type RequestResponse interface { + Base() *prot.MessageResponseBase +} + +type bridgeResponse struct { + // ctx is the context created on request read + ctx context.Context + header *prot.MessageHeader + response interface{} +} + +// Bridge defines the bridge client in the GCS. It acts in many ways analogous +// to go's `http` package and multiplexer. +// +// It has two fundamentally different dispatch options: +// +// 1. 
Request/Response where using the `Handler` a request +// of a given type will be dispatched to the apprpriate handler +// and an appropriate response will respond to exactly that request that +// caused the dispatch. +// +// 2. `PublishNotification` where a notification that was not initiated +// by a request from any client can be written to the bridge at any time +// in any order. +type Bridge struct { + // Handler to invoke when messages are received. + Handler Handler + // EnableV4 enables the v4+ bridge and the schema v2+ interfaces. + EnableV4 bool + + // responseChan is the response channel used for both request/response + // and publish notification workflows. + responseChan chan bridgeResponse + + hostState *hcsv2.Host + + quitChan chan bool + // hasQuitPending when != 0 will cause no more requests to be Read. + hasQuitPending uint32 + + protVer prot.ProtocolVersion +} + +// AssignHandlers creates and assigns the appropriate bridge +// events to be listen for and intercepted on `mux` before forwarding +// to `gcs` for handling. +func (b *Bridge) AssignHandlers(mux *Mux, host *hcsv2.Host) { + b.hostState = host + + // These are PvInvalid because they will be called previous to any protocol + // negotiation so they respond only when the protocols are not known. 
+ if b.EnableV4 { + mux.HandleFunc(prot.ComputeSystemNegotiateProtocolV1, prot.PvInvalid, b.negotiateProtocolV2) + } + + if b.EnableV4 { + // v4 specific handlers + mux.HandleFunc(prot.ComputeSystemStartV1, prot.PvV4, b.startContainerV2) + mux.HandleFunc(prot.ComputeSystemCreateV1, prot.PvV4, b.createContainerV2) + mux.HandleFunc(prot.ComputeSystemExecuteProcessV1, prot.PvV4, b.execProcessV2) + mux.HandleFunc(prot.ComputeSystemShutdownForcedV1, prot.PvV4, b.killContainerV2) + mux.HandleFunc(prot.ComputeSystemShutdownGracefulV1, prot.PvV4, b.shutdownContainerV2) + mux.HandleFunc(prot.ComputeSystemSignalProcessV1, prot.PvV4, b.signalProcessV2) + mux.HandleFunc(prot.ComputeSystemGetPropertiesV1, prot.PvV4, b.getPropertiesV2) + mux.HandleFunc(prot.ComputeSystemWaitForProcessV1, prot.PvV4, b.waitOnProcessV2) + mux.HandleFunc(prot.ComputeSystemResizeConsoleV1, prot.PvV4, b.resizeConsoleV2) + mux.HandleFunc(prot.ComputeSystemModifySettingsV1, prot.PvV4, b.modifySettingsV2) + mux.HandleFunc(prot.ComputeSystemDumpStacksV1, prot.PvV4, b.dumpStacksV2) + mux.HandleFunc(prot.ComputeSystemDeleteContainerStateV1, prot.PvV4, b.deleteContainerStateV2) + } +} + +// ListenAndServe connects to the bridge transport, listens for +// messages and dispatches the appropriate handlers to handle each +// event in an asynchronous manner. +func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser) error { + requestChan := make(chan *Request) + requestErrChan := make(chan error) + b.responseChan = make(chan bridgeResponse) + responseErrChan := make(chan error) + b.quitChan = make(chan bool) + + defer close(b.quitChan) + defer bridgeOut.Close() + defer close(responseErrChan) + defer close(b.responseChan) + defer close(requestChan) + defer close(requestErrChan) + defer bridgeIn.Close() + + // Receive bridge requests and schedule them to be processed. 
+ go func() { + var recverr error + for { + if atomic.LoadUint32(&b.hasQuitPending) == 0 { + header := &prot.MessageHeader{} + if err := binary.Read(bridgeIn, binary.LittleEndian, header); err != nil { + if err == io.ErrUnexpectedEOF || err == os.ErrClosed { + break + } + recverr = errors.Wrap(err, "bridge: failed reading message header") + break + } + message := make([]byte, header.Size-prot.MessageHeaderSize) + if _, err := io.ReadFull(bridgeIn, message); err != nil { + if err == io.ErrUnexpectedEOF || err == os.ErrClosed { + break + } + recverr = errors.Wrap(err, "bridge: failed reading message payload") + break + } + + base := prot.MessageBase{} + if err := json.Unmarshal(message, &base); err != nil { + // TODO: JTERRY75 - This should fail the request but right + // now we still forward to the method and let them return + // this error. Unify the JSON part previous to invoking a + // request. + } + + var ctx context.Context + var span *trace.Span + if base.OpenCensusSpanContext != nil { + sc := trace.SpanContext{} + if bytes, err := hex.DecodeString(base.OpenCensusSpanContext.TraceID); err == nil { + copy(sc.TraceID[:], bytes) + } + if bytes, err := hex.DecodeString(base.OpenCensusSpanContext.SpanID); err == nil { + copy(sc.SpanID[:], bytes) + } + sc.TraceOptions = trace.TraceOptions(base.OpenCensusSpanContext.TraceOptions) + if base.OpenCensusSpanContext.Tracestate != "" { + if bytes, err := base64.StdEncoding.DecodeString(base.OpenCensusSpanContext.Tracestate); err == nil { + var entries []tracestate.Entry + if err := json.Unmarshal(bytes, &entries); err == nil { + if ts, err := tracestate.New(nil, entries...); err == nil { + sc.Tracestate = ts + } + } + } + } + ctx, span = trace.StartSpanWithRemoteParent(context.Background(), "opengcs::bridge::request", sc) + } else { + ctx, span = trace.StartSpan(context.Background(), "opengcs::bridge::request") + } + + span.AddAttributes( + trace.Int64Attribute("message-id", int64(header.ID)), + 
trace.StringAttribute("message-type", header.Type.String()), + trace.StringAttribute("activityID", base.ActivityID), + trace.StringAttribute("cid", base.ContainerID)) + + log.G(ctx).WithField("message", string(message)).Debug("request read message") + + requestChan <- &Request{ + Context: ctx, + Header: header, + ContainerID: base.ContainerID, + ActivityID: base.ActivityID, + Message: message, + Version: b.protVer, + } + } + } + requestErrChan <- recverr + }() + // Process each bridge request async and create the response writer. + go func() { + for req := range requestChan { + go func(r *Request) { + br := bridgeResponse{ + ctx: r.Context, + header: &prot.MessageHeader{ + Type: prot.GetResponseIdentifier(r.Header.Type), + ID: r.Header.ID, + }, + } + resp, err := b.Handler.ServeMsg(r) + if resp == nil { + resp = &prot.MessageResponseBase{} + } + resp.Base().ActivityID = r.ActivityID + if err != nil { + span := trace.FromContext(r.Context) + if span != nil { + oc.SetSpanStatus(span, err) + } + setErrorForResponseBase(resp.Base(), err) + } + br.response = resp + b.responseChan <- br + }(req) + } + }() + // Process each bridge response sync. This channel is for request/response and publish workflows. 
+ go func() { + var resperr error + for resp := range b.responseChan { + responseBytes, err := json.Marshal(resp.response) + if err != nil { + resperr = errors.Wrapf(err, "bridge: failed to marshal JSON for response \"%v\"", resp.response) + break + } + resp.header.Size = uint32(len(responseBytes) + prot.MessageHeaderSize) + if err := binary.Write(bridgeOut, binary.LittleEndian, resp.header); err != nil { + resperr = errors.Wrap(err, "bridge: failed writing message header") + break + } + + if _, err := bridgeOut.Write(responseBytes); err != nil { + resperr = errors.Wrap(err, "bridge: failed writing message payload") + break + } + + s := trace.FromContext(resp.ctx) + if s != nil { + log.G(resp.ctx).WithField("message", string(responseBytes)).Debug("request write response") + s.End() + } + } + responseErrChan <- resperr + }() + + select { + case err := <-requestErrChan: + return err + case err := <-responseErrChan: + return err + case <-b.quitChan: + // The request loop needs to exit so that the teardown process begins. + // Set the request loop to stop processing new messages + atomic.StoreUint32(&b.hasQuitPending, 1) + // Wait for the request loop to process its last message. Its possible + // that if it lost the race with the hasQuitPending it could be stuck in + // a pending read from bridgeIn. Wait 2 seconds and kill the connection. + var err error + select { + case err = <-requestErrChan: + case <-time.After(time.Second * 5): + // Timeout expired first. Close the connection to unblock the read + if cerr := bridgeIn.Close(); cerr != nil { + err = errors.Wrap(cerr, "bridge: failed to close bridgeIn") + } + <-requestErrChan + } + <-responseErrChan + return err + } +} + +// PublishNotification writes a specific notification to the bridge. 
+func (b *Bridge) PublishNotification(n *prot.ContainerNotification) { + ctx, span := trace.StartSpan(context.Background(), "opengcs::bridge::PublishNotification") + span.AddAttributes(trace.StringAttribute("notification", fmt.Sprintf("%+v", n))) + // DONT defer span.End() here. Publish is odd because bridgeResponse calls + // `End` on the `ctx` after the response is sent. + + resp := bridgeResponse{ + ctx: ctx, + header: &prot.MessageHeader{ + Type: prot.ComputeSystemNotificationV1, + ID: 0, + }, + response: n, + } + b.responseChan <- resp +} + +// setErrorForResponseBase modifies the passed-in MessageResponseBase to +// contain information pertaining to the given error. +func setErrorForResponseBase(response *prot.MessageResponseBase, errForResponse error) { + errorMessage := errForResponse.Error() + stackString := "" + fileName := "" + lineNumber := -1 + functionName := "" + if stack := gcserr.BaseStackTrace(errForResponse); stack != nil { + bottomFrame := stack[0] + stackString = fmt.Sprintf("%+v", stack) + fileName = fmt.Sprintf("%s", bottomFrame) + lineNumberStr := fmt.Sprintf("%d", bottomFrame) + var err error + lineNumber, err = strconv.Atoi(lineNumberStr) + if err != nil { + logrus.WithFields(logrus.Fields{ + "line-number": lineNumberStr, + logrus.ErrorKey: err, + }).Error("opengcs::bridge::setErrorForResponseBase - failed to parse line number, using -1 instead") + lineNumber = -1 + } + functionName = fmt.Sprintf("%n", bottomFrame) + } + hresult, err := gcserr.GetHresult(errForResponse) + if err != nil { + // Default to using the generic failure HRESULT. 
+ hresult = gcserr.HrFail + } + response.Result = int32(hresult) + response.ErrorMessage = errorMessage + newRecord := prot.ErrorRecord{ + Result: int32(hresult), + Message: errorMessage, + StackTrace: stackString, + ModuleName: "gcs", + FileName: fileName, + Line: uint32(lineNumber), + FunctionName: functionName, + } + response.ErrorRecords = append(response.ErrorRecords, newRecord) +} diff --git a/internal/guest/bridge/bridge_unit_test.go b/internal/guest/bridge/bridge_unit_test.go new file mode 100644 index 0000000000..f388f8a02a --- /dev/null +++ b/internal/guest/bridge/bridge_unit_test.go @@ -0,0 +1,665 @@ +// +build linux + +package bridge + +import ( + "encoding/binary" + "encoding/json" + "io" + "io/ioutil" + "os" + "strings" + "sync" + "testing" + + "github.com/Microsoft/hcsshim/internal/guest/gcserr" + "github.com/Microsoft/hcsshim/internal/guest/prot" + "github.com/Microsoft/hcsshim/internal/guest/transport" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +func Test_Bridge_Mux_New(t *testing.T) { + m := NewBridgeMux() + if m == nil { + t.Error("Failed to create bridge mux") + } +} + +func Test_Bridge_Mux_New_Success(t *testing.T) { + m := NewBridgeMux() + if m.m == nil { + t.Error("Bridge mux map is not initialized") + } +} + +type thandler struct { + set bool + resp RequestResponse + err error +} + +func (h *thandler) ServeMsg(r *Request) (RequestResponse, error) { + h.set = true + return h.resp, h.err +} + +func TestBridgeMux_Handle_NilHandler_Panic(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Error("The code did not panic on nil handler") + } + }() + + m := NewBridgeMux() + m.Handle(prot.ComputeSystemCreateV1, prot.PvInvalid, nil) +} + +func TestBridgeMux_Handle_NilMap_Panic(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Error("The code did not panic on nil map") + } + }() + + m := &Mux{} // Caller didn't use NewBridgeMux (not supported). 
+ th := &thandler{} + m.Handle(prot.ComputeSystemCreateV1, prot.PvInvalid, th) +} + +func Test_Bridge_Mux_Handle_Succeeds(t *testing.T) { + th := &thandler{} + m := NewBridgeMux() + m.Handle(prot.ComputeSystemCreateV1, prot.PvInvalid, th) + + var verMap map[prot.ProtocolVersion]Handler + var ok bool + if verMap, ok = m.m[prot.ComputeSystemCreateV1]; !ok { + t.Error("The handler type map not successfully added.") + } + + var hOut Handler + if hOut, ok = verMap[prot.PvInvalid]; !ok { + t.Error("The handler was not successfully added.") + } + + // Is it the correct handler? + hOut.ServeMsg(nil) + + if !th.set { + t.Error("The handler added was not the same handler.") + } +} + +func TestBridgeMux_HandleFunc_NilHandleFunc_Panic(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Error("The code did not panic on nil handler") + } + }() + + m := NewBridgeMux() + m.HandleFunc(prot.ComputeSystemCreateV1, prot.PvInvalid, nil) +} + +func TestBridgeMux_HandleFunc_NilMap_Panic(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Error("The code did not panic on nil handler") + } + }() + + hIn := func(*Request) (RequestResponse, error) { + return nil, nil + } + + m := &Mux{} // Caller didn't use NewBridgeMux (not supported). + m.HandleFunc(prot.ComputeSystemCreateV1, prot.PvInvalid, hIn) +} + +func Test_Bridge_Mux_HandleFunc_Succeeds(t *testing.T) { + var set bool + hIn := func(*Request) (RequestResponse, error) { + set = true + return nil, nil + } + + m := NewBridgeMux() + m.HandleFunc(prot.ComputeSystemCreateV1, prot.PvInvalid, hIn) + + var verMap map[prot.ProtocolVersion]Handler + var ok bool + if verMap, ok = m.m[prot.ComputeSystemCreateV1]; !ok { + t.Error("The handler type map not successfully added.") + } + + var hOut Handler + if hOut, ok = verMap[prot.PvInvalid]; !ok { + t.Error("The handler was not successfully added.") + } + + // Is it the correct handler? 
+ hOut.ServeMsg(nil) + + if !set { + t.Error("The handler added was not the same handler.") + } +} + +func Test_Bridge_Mux_Handler_NilRequest_Panic(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Error("The code did not panic on nil request to handler") + } + }() + + var set bool + hIn := func(*Request) (RequestResponse, error) { + set = true + return nil, nil + } + + m := NewBridgeMux() + m.HandleFunc(prot.ComputeSystemCreateV1, prot.PvInvalid, hIn) + m.Handler(nil) + if set { + t.Fatal("should not be set on nil request") + } +} + +func verifyResponseIsDefaultHandler(t *testing.T, resp RequestResponse) { + if resp == nil { + t.Fatal("The response is nil") + } + + base := resp.Base() + if base.Result != int32(gcserr.HrNotImpl) { + t.Fatal("The default handler did not set a -1 error result.") + } + if len(base.ErrorRecords) != 1 { + t.Fatal("The default handler did not set an error record.") + } + if !strings.Contains(base.ErrorRecords[0].Message, "bridge: function not supported") { + t.Fatal("The default handler did not return the not supported message") + } +} + +func Test_Bridge_Mux_Handler_NotAdded_Default(t *testing.T) { + // Testing specifically that if we have a bridge with no handlers that + // for the incomming request we get the default handler. + + m := NewBridgeMux() + + req := &Request{ + Header: &prot.MessageHeader{ + Type: prot.ComputeSystemCreateV1, + Size: 0, + ID: prot.SequenceID(1), + }, + } + + hOut := m.Handler(req) + resp, err := hOut.ServeMsg(req) + if resp != nil { + t.Fatalf("expected nil response got: %+v", resp) + } + if err == nil { + t.Fatal("expected valid error got: nil") + } +} + +func Test_Bridge_Mux_Handler_Added_NotMatched(t *testing.T) { + // Testing specifically that if we have at least one handler of a different + // type than the incomming request that we still get the default handler + // and that the other handler does not get called. 
+ + m := NewBridgeMux() + th := &thandler{} + + // Add at least one handler for a different request type. + m.Handle(prot.ComputeSystemCreateV1, prot.PvInvalid, th) + + req := &Request{ + Header: &prot.MessageHeader{ + Type: prot.ComputeSystemExecuteProcessV1, + Size: 0, + ID: prot.SequenceID(1), + }, + } + + // Handle the request of a different type. + hOut := m.Handler(req) + respChan := make(chan bridgeResponse, 1) // We need to allocate the space because we are running ServeMsg sync. + defer close(respChan) + + resp, err := hOut.ServeMsg(req) + if resp != nil { + t.Fatalf("expected nil response got: %+v", resp) + } + if err == nil { + t.Fatal("expected valid error got: nil") + } + if th.set { + t.Error("Handler did not call the appropriate handler for a match request") + } +} + +func Test_Bridge_Mux_Handler_Success(t *testing.T) { + m := NewBridgeMux() + th := &thandler{ + resp: &prot.ContainerCreateResponse{}, + } + + m.Handle(prot.ComputeSystemCreateV1, prot.PvInvalid, th) + + req := &Request{ + Header: &prot.MessageHeader{ + Type: prot.ComputeSystemCreateV1, + Size: 0, + ID: prot.SequenceID(1), + }, + } + + hOut := m.Handler(req) + respChan := make(chan bridgeResponse, 1) // We need to allocate the space because we are running ServeMsg sync. + defer close(respChan) + + resp, err := hOut.ServeMsg(req) + if resp == nil { + t.Fatal("expected valid response got: nil") + } + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } + if !th.set { + t.Error("Handler did not call the appropriate handler for a match request") + } +} + +func Test_Bridge_Mux_ServeMsg_NotAdded_Default(t *testing.T) { + // Testing specifically that if we have a bridge with no handlers that + // calling ServeMsg we get the default handler. 
+ + m := NewBridgeMux() + + req := &Request{ + Header: &prot.MessageHeader{ + Type: prot.ComputeSystemCreateV1, + Size: 0, + ID: prot.SequenceID(1), + }, + } + + respChan := make(chan bridgeResponse, 1) // We need to allocate the space because we are running ServeMsg sync. + defer close(respChan) + + resp, err := m.ServeMsg(req) + if resp != nil { + t.Fatalf("expected nil response, got: %+v", resp) + } + if err == nil { + t.Fatal("expected error got: nil") + } +} + +func Test_Bridge_Mux_ServeMsg_Added_NotMatched(t *testing.T) { + // Testing specifically that if we have at least one handler of a different + // type than the incomming request that calling ServeMsg we get the default + // handler. + + m := NewBridgeMux() + th := &thandler{} + + // Add at least one handler for a different request type. + m.Handle(prot.ComputeSystemCreateV1, prot.PvInvalid, th) + + req := &Request{ + Header: &prot.MessageHeader{ + Type: prot.ComputeSystemExecuteProcessV1, + Size: 0, + ID: prot.SequenceID(1), + }, + } + + // Handle the request of a different type. + respChan := make(chan bridgeResponse, 1) // We need to allocate the space because we are running ServeMsg sync. + defer close(respChan) + + resp, err := m.ServeMsg(req) + if resp != nil { + t.Fatalf("expected nil response, got: %+v", resp) + } + if err == nil { + t.Fatal("expected error got: nil") + } + if th.set { + t.Error("Handler did not call the appropriate handler for a match request") + } +} + +func Test_Bridge_Mux_ServeMsg_Success(t *testing.T) { + m := NewBridgeMux() + th := &thandler{ + resp: &prot.ContainerCreateResponse{}, + } + + m.Handle(prot.ComputeSystemCreateV1, prot.PvInvalid, th) + + req := &Request{ + Header: &prot.MessageHeader{ + Type: prot.ComputeSystemCreateV1, + Size: 0, + ID: prot.SequenceID(1), + }, + } + + respChan := make(chan bridgeResponse, 1) // We need to allocate the space because we are running ServeMsg sync. 
+ defer close(respChan) + + resp, err := m.ServeMsg(req) + if resp == nil { + t.Fatal("expected valid response got: nil") + } + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } + if !th.set { + t.Error("Handler did not call the appropriate handler for a match request") + } +} + +type errorTransport struct { + e error +} + +func (e *errorTransport) Dial(_ uint32) (transport.Connection, error) { + return nil, e.e +} + +func serverSend(conn io.Writer, messageType prot.MessageIdentifier, messageID prot.SequenceID, i interface{}) error { + body := make([]byte, 0) + if i != nil { + var err error + body, err = json.Marshal(i) + if err != nil { + return errors.Wrap(err, "Failed to json marshal to server.") + } + } + + header := prot.MessageHeader{ + Type: messageType, + ID: messageID, + Size: uint32(len(body) + prot.MessageHeaderSize), + } + + // Send the header. + if err := binary.Write(conn, binary.LittleEndian, header); err != nil { + return errors.Wrap(err, "bridge_test: failed to write message header") + } + // Send the body. + if _, err := conn.Write(body); err != nil { + return errors.Wrap(err, "bridge_test: failed to write the message body") + } + return nil +} + +func serverRead(conn io.Reader) (*prot.MessageHeader, []byte, error) { + header := &prot.MessageHeader{} + // Read the header. + if err := binary.Read(conn, binary.LittleEndian, header); err != nil { + return nil, nil, errors.Wrap(err, "bridge_test: failed to read message header") + } + message := make([]byte, header.Size-prot.MessageHeaderSize) + // Read the body. 
+ if _, err := io.ReadFull(conn, message); err != nil { + return nil, nil, errors.Wrap(err, "bridge_test: failed to read the message body") + } + + return header, message, nil +} + +type loopbackConnection struct { + // Format is client-read, server-write, server-read, client-write + pipes [4]*os.File +} + +func (lc *loopbackConnection) close() { + for i := 3; i >= 0; i-- { + lc.pipes[i].Close() + } +} + +func (lc *loopbackConnection) CRead() io.ReadCloser { + return lc.pipes[0] +} + +func (lc *loopbackConnection) CWrite() io.WriteCloser { + return lc.pipes[3] +} + +func (lc *loopbackConnection) SRead() io.ReadCloser { + return lc.pipes[2] +} + +func (lc *loopbackConnection) SWrite() io.WriteCloser { + return lc.pipes[1] +} + +func newLoopbackConnection() *loopbackConnection { + l := new(loopbackConnection) + l.pipes[0], l.pipes[1], _ = os.Pipe() + l.pipes[2], l.pipes[3], _ = os.Pipe() + return l +} + +func Test_Bridge_ListenAndServe_UnknownMessageHandler_Success(t *testing.T) { + // Turn off logging so as not to spam output. 
+ logrus.SetOutput(ioutil.Discard) + + lc := newLoopbackConnection() + defer lc.close() + + b := &Bridge{ + Handler: UnknownMessageHandler(), + } + + go func() { + if err := b.ListenAndServe(lc.SRead(), lc.SWrite()); err != nil { + t.Error(err) + } + }() + defer func() { + b.quitChan <- true + }() + + message := &prot.ContainerResizeConsole{ + MessageBase: prot.MessageBase{ + ContainerID: "01234567-89ab-cdef-0123-456789abcdef", + ActivityID: "00000000-0000-0000-0000-000000000001", + }, + } + if err := serverSend(lc.CWrite(), prot.ComputeSystemResizeConsoleV1, prot.SequenceID(1), message); err != nil { + t.Error("Failed to send message to server") + return + } + header, body, err := serverRead(lc.CRead()) + if err != nil { + t.Error("Failed to read message response from server") + return + } + response := &prot.MessageResponseBase{} + if err := json.Unmarshal(body, response); err != nil { + t.Error("Failed to unmarshal response body from server") + return + } + + // Verify + if header.Type != prot.ComputeSystemResponseResizeConsoleV1 { + t.Error("Response header was not resize console response.") + } + if header.ID != prot.SequenceID(1) { + t.Error("Response header had wrong sequence id") + } + verifyResponseIsDefaultHandler(t, response) + if response.ActivityID != message.ActivityID { + t.Fatal("Response had invalid activity id") + } +} + +func Test_Bridge_ListenAndServe_CorrectHandler_Success(t *testing.T) { + // Turn off logging so as not to spam output. + logrus.SetOutput(ioutil.Discard) + + lc := newLoopbackConnection() + defer lc.close() + + mux := NewBridgeMux() + message := &prot.ContainerResizeConsole{ + MessageBase: prot.MessageBase{ + ContainerID: "01234567-89ab-cdef-0123-456789abcdef", + ActivityID: "00000000-0000-0000-0000-000000000010", + }, + } + resizeFn := func(r *Request) (RequestResponse, error) { + // Verify the request is as expected. 
+ if r.Header.Type != prot.ComputeSystemResizeConsoleV1 { + return nil, errors.New("bridge_test: wrong request type") + } + if r.Header.ID != prot.SequenceID(1) { + return nil, errors.New("bridge_test: wrong sequence id") + } + + rBody := prot.ContainerResizeConsole{} + + if err := json.Unmarshal(r.Message, &rBody); err != nil { + return nil, errors.New("failed to unmarshal body") + } + if message.ContainerID != rBody.ContainerID { + return nil, errors.New("containerID of source and handler func not equal") + } + + return &prot.MessageResponseBase{ + Result: 1, + ActivityID: rBody.ActivityID, + }, nil + } + mux.HandleFunc(prot.ComputeSystemResizeConsoleV1, prot.PvV4, resizeFn) + b := &Bridge{ + Handler: mux, + protVer: prot.PvV4, + } + + go func() { + if err := b.ListenAndServe(lc.SRead(), lc.SWrite()); err != nil { + t.Error(err) + } + }() + defer func() { + b.quitChan <- true + }() + + if err := serverSend(lc.CWrite(), prot.ComputeSystemResizeConsoleV1, prot.SequenceID(1), message); err != nil { + t.Error("Failed to send message to server") + return + } + header, body, err := serverRead(lc.CRead()) + if err != nil { + t.Error("Failed to read message response from server") + return + } + response := &prot.MessageResponseBase{} + if err := json.Unmarshal(body, response); err != nil { + t.Error("Failed to unmarshal response body from server") + return + } + // Verify. + if header.Type != prot.ComputeSystemResponseResizeConsoleV1 { + t.Error("response header was not resize console response.") + } + if header.ID != prot.SequenceID(1) { + t.Error("response header had wrong sequence id") + } + if response.ActivityID != message.ActivityID { + t.Error("response body did not have same activity id") + } + if response.Result != 1 { + t.Error("response result was not 1 as expected") + } +} + +func Test_Bridge_ListenAndServe_HandlersAreAsync_Success(t *testing.T) { + // Turn off logging so as not to spam output. 
+ logrus.SetOutput(ioutil.Discard) + + lc := newLoopbackConnection() + defer lc.close() + + mux := NewBridgeMux() + + orderWg := sync.WaitGroup{} + orderWg.Add(1) + + firstFn := func(r *Request) (RequestResponse, error) { + // Wait for the second request to come in. + orderWg.Wait() + return &prot.MessageResponseBase{ + Result: 1, + }, nil + } + secondFn := func(r *Request) (RequestResponse, error) { + defer orderWg.Done() + return &prot.MessageResponseBase{ + Result: 10, + }, nil + } + mux.HandleFunc(prot.ComputeSystemResizeConsoleV1, prot.PvV4, firstFn) + mux.HandleFunc(prot.ComputeSystemModifySettingsV1, prot.PvV4, secondFn) + + b := &Bridge{ + Handler: mux, + protVer: prot.PvV4, + } + + go func() { + if err := b.ListenAndServe(lc.SRead(), lc.SWrite()); err != nil { + t.Error(err) + } + }() + defer func() { + b.quitChan <- true + }() + + if err := serverSend(lc.CWrite(), prot.ComputeSystemResizeConsoleV1, prot.SequenceID(0), nil); err != nil { + t.Error("Failed to send first message to server") + return + } + if err := serverSend(lc.CWrite(), prot.ComputeSystemModifySettingsV1, prot.SequenceID(1), nil); err != nil { + t.Error("Failed to send second message to server") + return + } + + headerFirst, _, errFirst := serverRead(lc.CRead()) + if errFirst != nil { + t.Error("Failed to read first response from server") + return + } + headerSecond, _, errSecond := serverRead(lc.CRead()) + if errSecond != nil { + t.Error("Failed to read first response from server") + return + } + // headerFirst should match the 2nd request. + if headerFirst.Type != prot.ComputeSystemResponseModifySettingsV1 { + t.Error("Incorrect response type for 2nd request") + } + if headerFirst.ID != prot.SequenceID(1) { + t.Error("Incorrect response order for 2nd request") + } + // headerSecond should match the 1st request. 
+ if headerSecond.Type != prot.ComputeSystemResponseResizeConsoleV1 { + t.Error("Incorrect response for 1st request") + } + if headerSecond.ID != prot.SequenceID(0) { + t.Error("Incorrect response order for 1st request") + } +} diff --git a/internal/guest/bridge/bridge_v2.go b/internal/guest/bridge/bridge_v2.go new file mode 100644 index 0000000000..460579b524 --- /dev/null +++ b/internal/guest/bridge/bridge_v2.go @@ -0,0 +1,525 @@ +// +build linux + +package bridge + +import ( + "context" + "encoding/json" + "syscall" + "time" + + "github.com/Microsoft/hcsshim/internal/debug" + "github.com/Microsoft/hcsshim/internal/guest/commonutils" + "github.com/Microsoft/hcsshim/internal/guest/gcserr" + "github.com/Microsoft/hcsshim/internal/guest/prot" + "github.com/Microsoft/hcsshim/internal/guest/runtime/hcsv2" + "github.com/Microsoft/hcsshim/internal/guest/stdio" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "go.opencensus.io/trace" + "golang.org/x/sys/unix" +) + +// The capabilities of this GCS. +var capabilities = prot.GcsCapabilities{ + SendHostCreateMessage: false, + SendHostStartMessage: false, + HVSocketConfigOnStartup: false, + SupportedSchemaVersions: []prot.SchemaVersion{ + { + Major: 2, + Minor: 1, + }, + }, + RuntimeOsType: prot.OsTypeLinux, + GuestDefinedCapabilities: prot.GcsGuestCapabilities{ + NamespaceAddRequestSupported: true, + SignalProcessSupported: true, + DumpStacksSupported: true, + DeleteContainerStateSupported: true, + }, +} + +// negotiateProtocolV2 was introduced in v4 so will not be called with a minimum +// lower than that. 
+func (b *Bridge) negotiateProtocolV2(r *Request) (_ RequestResponse, err error) { + _, span := trace.StartSpan(r.Context, "opengcs::bridge::negotiateProtocolV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + var request prot.NegotiateProtocol + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + if request.MaximumVersion < uint32(prot.PvV4) || uint32(prot.PvMax) < request.MinimumVersion { + return nil, gcserr.NewHresultError(gcserr.HrVmcomputeUnsupportedProtocolVersion) + } + + min := func(x, y uint32) uint32 { + if x < y { + return x + } + return y + } + + major := min(uint32(prot.PvMax), request.MaximumVersion) + + // Set our protocol selected version before return. + b.protVer = prot.ProtocolVersion(major) + + return &prot.NegotiateProtocolResponse{ + Version: major, + Capabilities: capabilities, + }, nil +} + +// createContainerV2 creates a container based on the settings passed in `r`. 
+// +// This is allowed only for protocol version 4+, schema version 2.1+ +func (b *Bridge) createContainerV2(r *Request) (_ RequestResponse, err error) { + ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::createContainerV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + var request prot.ContainerCreate + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + var settingsV2 prot.VMHostedContainerSettingsV2 + if err := commonutils.UnmarshalJSONWithHresult([]byte(request.ContainerConfig), &settingsV2); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON for ContainerConfig \"%s\"", request.ContainerConfig) + } + + if settingsV2.SchemaVersion.Cmp(prot.SchemaVersion{Major: 2, Minor: 1}) < 0 { + return nil, gcserr.WrapHresult( + errors.Errorf("invalid schema version: %v", settingsV2.SchemaVersion), + gcserr.HrVmcomputeInvalidJSON) + } + + c, err := b.hostState.CreateContainer(ctx, request.ContainerID, &settingsV2) + if err != nil { + return nil, err + } + waitFn := func() prot.NotificationType { + return c.Wait() + } + + go func() { + nt := waitFn() + notification := &prot.ContainerNotification{ + MessageBase: prot.MessageBase{ + ContainerID: request.ContainerID, + ActivityID: request.ActivityID, + }, + Type: nt, + Operation: prot.AoNone, + Result: 0, + ResultInfo: "", + } + b.PublishNotification(notification) + }() + + return &prot.ContainerCreateResponse{}, nil +} + +// startContainerV2 doesn't have a great correlation to LCOW. On Windows this is +// used to start the container silo. In Linux the container is the process so we +// wait until the exec process of the init process to actually issue the start. 
+// +// This is allowed only for protocol version 4+, schema version 2.1+ +func (b *Bridge) startContainerV2(r *Request) (_ RequestResponse, err error) { + _, span := trace.StartSpan(r.Context, "opengcs::bridge::startContainerV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + // This is just a noop, but needs to be handled so that an error isn't + // returned to the HCS. + var request prot.MessageBase + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + return &prot.MessageResponseBase{}, nil +} + +// execProcessV2 is used to execute three types of processes in the guest. +// +// 1. HostProcess. This is a process in the Host pid namespace that runs as +// root. It is signified by either `request.IsExternal` or `request.ContainerID +// == hcsv2.UVMContainerID`. +// +// 2. Container Init process. This is the init process of the created container. +// We use exec for this instead of `StartContainer` because the protocol does +// not pass in the appropriate std pipes for relaying the results until exec. +// Until this is called the container remains in the `created` state. +// +// 3. Container Exec process. This is a process that is run in the container's +// pid namespace. 
//
// This is allowed only for protocol version 4+, schema version 2.1+
func (b *Bridge) execProcessV2(r *Request) (_ RequestResponse, err error) {
	ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::execProcessV2")
	defer span.End()
	defer func() { oc.SetSpanStatus(span, err) }()
	span.AddAttributes(trace.StringAttribute("cid", r.ContainerID))

	var request prot.ContainerExecuteProcess
	if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil {
		return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message)
	}

	// The request contains a JSON string field which is equivalent to an
	// ExecuteProcessInfo struct.
	var params prot.ProcessParameters
	if err := commonutils.UnmarshalJSONWithHresult([]byte(request.Settings.ProcessParameters), &params); err != nil {
		return nil, errors.Wrapf(err, "failed to unmarshal JSON for ProcessParameters \"%s\"", request.Settings.ProcessParameters)
	}

	// Only wire up the std pipes the caller asked for; the vsock port for
	// each comes from the relay settings in the request.
	var conSettings stdio.ConnectionSettings
	if params.CreateStdInPipe {
		conSettings.StdIn = &request.Settings.VsockStdioRelaySettings.StdIn
	}
	if params.CreateStdOutPipe {
		conSettings.StdOut = &request.Settings.VsockStdioRelaySettings.StdOut
	}
	if params.CreateStdErrPipe {
		conSettings.StdErr = &request.Settings.VsockStdioRelaySettings.StdErr
	}

	// Dispatch to one of the three process types described above. Note that
	// `err` is shared: a GetContainer failure falls through to the error
	// check below rather than being handled in the else-if condition.
	var pid int
	var c *hcsv2.Container
	if params.IsExternal || request.ContainerID == hcsv2.UVMContainerID {
		pid, err = b.hostState.RunExternalProcess(ctx, params, conSettings)
	} else if c, err = b.hostState.GetContainer(request.ContainerID); err == nil {
		// We found a V2 container. Treat this as a V2 process.
		if params.OCIProcess == nil {
			// No OCI process spec: this exec starts the container's init
			// process itself.
			pid, err = c.Start(ctx, conSettings)
		} else {
			pid, err = c.ExecProcess(ctx, params.OCIProcess, conSettings)
		}
	}

	if err != nil {
		return nil, err
	}
	log.G(ctx).WithField("pid", pid).Debug("created process pid")
	return &prot.ContainerExecuteProcessResponse{
		ProcessID: uint32(pid),
	}, nil
}

// killContainerV2 is a user forced terminate of the container and all processes
// in the container. It is equivalent to sending SIGKILL to the init process and
// all exec'd processes.
//
// This is allowed only for protocol version 4+, schema version 2.1+
func (b *Bridge) killContainerV2(r *Request) (RequestResponse, error) {
	ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::killContainerV2")
	defer span.End()
	span.AddAttributes(trace.StringAttribute("cid", r.ContainerID))

	return b.signalContainerV2(ctx, span, r, unix.SIGKILL)
}

// shutdownContainerV2 is a user requested shutdown of the container and all
// processes in the container. It is equivalent to sending SIGTERM to the init
// process and all exec'd processes.
//
// This is allowed only for protocol version 4+, schema version 2.1+
func (b *Bridge) shutdownContainerV2(r *Request) (RequestResponse, error) {
	ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::shutdownContainerV2")
	defer span.End()
	span.AddAttributes(trace.StringAttribute("cid", r.ContainerID))

	return b.signalContainerV2(ctx, span, r, unix.SIGTERM)
}

// signalContainerV2 is not a handler func. This is because the actual signal is
// implied based on the message type of either `killContainerV2` or
// `shutdownContainerV2`.
+func (b *Bridge) signalContainerV2(ctx context.Context, span *trace.Span, r *Request, signal syscall.Signal) (_ RequestResponse, err error) { + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("cid", r.ContainerID), + trace.Int64Attribute("signal", int64(signal))) + + var request prot.MessageBase + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + // If this is targeting the UVM send the request to the host itself. + if request.ContainerID == hcsv2.UVMContainerID { + // We are asking to shutdown the UVM itself. + if signal != unix.SIGTERM { + log.G(ctx).Error("invalid signal for uvm") + } + // This is a destructive call. We do not respond to the HCS + b.quitChan <- true + b.hostState.Shutdown() + } else { + c, err := b.hostState.GetContainer(request.ContainerID) + if err != nil { + return nil, err + } + + err = c.Kill(ctx, signal) + if err != nil { + return nil, err + } + } + + return &prot.MessageResponseBase{}, nil +} + +func (b *Bridge) signalProcessV2(r *Request) (_ RequestResponse, err error) { + ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::signalProcessV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + var request prot.ContainerSignalProcess + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + span.AddAttributes( + trace.Int64Attribute("pid", int64(request.ProcessID)), + trace.Int64Attribute("signal", int64(request.Options.Signal))) + + c, err := b.hostState.GetContainer(request.ContainerID) + if err != nil { + return nil, err + } + + p, err := c.GetProcess(request.ProcessID) + if err != nil { + return nil, err + } + + var signal syscall.Signal + if 
request.Options.Signal == 0 { + signal = unix.SIGKILL + } else { + signal = syscall.Signal(request.Options.Signal) + } + if err := p.Kill(ctx, signal); err != nil { + return nil, err + } + + return &prot.MessageResponseBase{}, nil +} + +func (b *Bridge) getPropertiesV2(r *Request) (_ RequestResponse, err error) { + ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::getPropertiesV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + var request prot.ContainerGetProperties + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + properties := &prot.PropertiesV2{} + + var query prot.PropertyQuery + if len(request.Query) != 0 { + if err := json.Unmarshal([]byte(request.Query), &query); err != nil { + e := gcserr.WrapHresult(err, gcserr.HrVmcomputeInvalidJSON) + return nil, errors.Wrapf(e, "The query could not be unmarshaled: '%s'", query) + } + } + + if request.ContainerID == hcsv2.UVMContainerID { + return nil, errors.New("getPropertiesV2 is not supported against the UVM") + } + + c, err := b.hostState.GetContainer(request.ContainerID) + if err != nil { + return nil, err + } + + for _, requestedProperty := range query.PropertyTypes { + if requestedProperty == prot.PtProcessList { + pids, err := c.GetAllProcessPids(ctx) + if err != nil { + return nil, err + } + properties.ProcessList = make([]prot.ProcessDetails, len(pids)) + for i, pid := range pids { + properties.ProcessList[i].ProcessID = uint32(pid) + } + } else if requestedProperty == prot.PtStatistics { + cgroupMetrics, err := c.GetStats(ctx) + if err != nil { + return nil, err + } + properties.Metrics = cgroupMetrics + } + } + + propertyJSON := []byte("{}") + if properties != nil { + var err error + propertyJSON, err = json.Marshal(properties) + if err != nil { + return nil, errors.Wrapf(err, 
"failed to unmarshal JSON in message \"%+v\"", properties) + } + } + + return &prot.ContainerGetPropertiesResponse{ + Properties: string(propertyJSON), + }, nil +} + +func (b *Bridge) waitOnProcessV2(r *Request) (_ RequestResponse, err error) { + _, span := trace.StartSpan(r.Context, "opengcs::bridge::waitOnProcessV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + var request prot.ContainerWaitForProcess + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + span.AddAttributes( + trace.Int64Attribute("pid", int64(request.ProcessID)), + trace.Int64Attribute("timeout-ms", int64(request.TimeoutInMs))) + + var exitCodeChan <-chan int + var doneChan chan<- bool + + if request.ContainerID == hcsv2.UVMContainerID { + p, err := b.hostState.GetExternalProcess(int(request.ProcessID)) + if err != nil { + return nil, err + } + exitCodeChan, doneChan = p.Wait() + } else { + c, err := b.hostState.GetContainer(request.ContainerID) + if err != nil { + return nil, err + } + p, err := c.GetProcess(request.ProcessID) + if err != nil { + return nil, err + } + exitCodeChan, doneChan = p.Wait() + } + + // If we timed out or if we got the exit code. Acknowledge we no longer want to wait. 
+ defer close(doneChan) + + var tc <-chan time.Time + if request.TimeoutInMs != prot.InfiniteWaitTimeout { + t := time.NewTimer(time.Duration(request.TimeoutInMs) * time.Millisecond) + defer t.Stop() + tc = t.C + } + select { + case exitCode := <-exitCodeChan: + return &prot.ContainerWaitForProcessResponse{ + ExitCode: uint32(exitCode), + }, nil + case <-tc: + return nil, gcserr.NewHresultError(gcserr.HvVmcomputeTimeout) + } +} + +func (b *Bridge) resizeConsoleV2(r *Request) (_ RequestResponse, err error) { + ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::resizeConsoleV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + var request prot.ContainerResizeConsole + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + span.AddAttributes( + trace.Int64Attribute("pid", int64(request.ProcessID)), + trace.Int64Attribute("height", int64(request.Height)), + trace.Int64Attribute("width", int64(request.Width))) + + c, err := b.hostState.GetContainer(request.ContainerID) + if err != nil { + return nil, err + } + + p, err := c.GetProcess(request.ProcessID) + if err != nil { + return nil, err + } + + err = p.ResizeConsole(ctx, request.Height, request.Width) + if err != nil { + return nil, err + } + + return &prot.MessageResponseBase{}, nil +} + +func (b *Bridge) modifySettingsV2(r *Request) (_ RequestResponse, err error) { + ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::modifySettingsV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + request, err := prot.UnmarshalContainerModifySettings(r.Message) + if err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + err = b.hostState.ModifySettings(ctx, 
request.ContainerID, request.Request.(*prot.ModifySettingRequest)) + if err != nil { + return nil, err + } + + return &prot.MessageResponseBase{}, nil +} + +func (b *Bridge) dumpStacksV2(r *Request) (_ RequestResponse, err error) { + _, span := trace.StartSpan(r.Context, "opengcs::bridge::dumpStacksV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + stacks := debug.DumpStacks() + + return &prot.DumpStacksResponse{ + GuestStacks: stacks, + }, nil +} + +func (b *Bridge) deleteContainerStateV2(r *Request) (_ RequestResponse, err error) { + ctx, span := trace.StartSpan(r.Context, "opengcs::bridge::deleteContainerStateV2") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + span.AddAttributes(trace.StringAttribute("cid", r.ContainerID)) + + var request prot.MessageBase + if err := commonutils.UnmarshalJSONWithHresult(r.Message, &request); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) + } + + c, err := b.hostState.GetContainer(request.ContainerID) + if err != nil { + return nil, err + } + + if err := c.Delete(ctx); err != nil { + return nil, err + } + + b.hostState.RemoveContainer(request.ContainerID) + return &prot.MessageResponseBase{}, nil +} diff --git a/internal/guest/commonutils/utilities.go b/internal/guest/commonutils/utilities.go new file mode 100644 index 0000000000..adcf70e6c2 --- /dev/null +++ b/internal/guest/commonutils/utilities.go @@ -0,0 +1,26 @@ +package commonutils + +import ( + "encoding/json" + "io" + + "github.com/Microsoft/hcsshim/internal/guest/gcserr" +) + +// UnmarshalJSONWithHresult unmarshals the given data into the given interface, and +// wraps any error returned in an HRESULT error. 
+func UnmarshalJSONWithHresult(data []byte, v interface{}) error { + if err := json.Unmarshal(data, v); err != nil { + return gcserr.WrapHresult(err, gcserr.HrVmcomputeInvalidJSON) + } + return nil +} + +// DecodeJSONWithHresult decodes the JSON from the given reader into the given +// interface, and wraps any error returned in an HRESULT error. +func DecodeJSONWithHresult(r io.Reader, v interface{}) error { + if err := json.NewDecoder(r).Decode(v); err != nil { + return gcserr.WrapHresult(err, gcserr.HrVmcomputeInvalidJSON) + } + return nil +} diff --git a/internal/guest/gcserr/errors.go b/internal/guest/gcserr/errors.go new file mode 100644 index 0000000000..d0dcd2bac5 --- /dev/null +++ b/internal/guest/gcserr/errors.go @@ -0,0 +1,164 @@ +package gcserr + +import ( + "fmt" + "io" + + "github.com/pkg/errors" +) + +// Hresult is a type corresponding to the HRESULT error type used on Windows. +type Hresult int32 + +const ( + // HrNotImpl is the HRESULT for a not implemented function. + HrNotImpl = Hresult(-2147467263) // 0x80004001 + // HrFail is the HRESULT for an invocation failure. + HrFail = Hresult(-2147467259) // 0x80004005 + // HrErrNotFound is the HRESULT for an invalid process id. + HrErrNotFound = Hresult(-2147023728) // 0x80070490 + // HvVmcomputeTimeout is the HRESULT for operations that timed out. + HvVmcomputeTimeout = Hresult(-1070137079) // 0xC0370109 + // HrVmcomputeInvalidJSON is the HRESULT for failing to unmarshal a json + // string. + HrVmcomputeInvalidJSON = Hresult(-1070137075) // 0xC037010D + // HrVmcomputeSystemNotFound is the HRESULT for: + // + // A virtual machine or container with the specified identifier does not + // exist. + HrVmcomputeSystemNotFound = Hresult(-1070137074) // 0xC037010E + // HrVmcomputeSystemAlreadyExists is the HRESULT for: + // + // A virtual machine or container with the specified identifier already exists. 
+ HrVmcomputeSystemAlreadyExists = Hresult(-1070137073) // 0xC037010F + // HrVmcomputeUnsupportedProtocolVersion is the HRESULT for an invalid + // protocol version range specified at negotiation. + HrVmcomputeUnsupportedProtocolVersion = Hresult(-1070137076) // 0xC037010C + // HrVmcomputeUnknownMessage is the HRESULT for unknown message types sent + // from the HCS. + HrVmcomputeUnknownMessage = Hresult(-1070137077) // 0xC037010B +) + +// StackTracer is an interface originating (but not exported) from the +// github.com/pkg/errors package. It defines something which can return a stack +// trace. +type StackTracer interface { + StackTrace() errors.StackTrace +} + +// BaseStackTrace gets the earliest errors.StackTrace in the given error's cause +// stack. This will be the stack trace which reaches closest to the error's +// actual origin. It returns nil if no stack trace is found in the cause stack. +func BaseStackTrace(e error) errors.StackTrace { + type causer interface { + Cause() error + } + cause := e + var tracer StackTracer + for cause != nil { + serr, ok := cause.(StackTracer) + if ok { + tracer = serr + } + cerr, ok := cause.(causer) + if !ok { + break + } + cause = cerr.Cause() + } + if tracer == nil { + return nil + } + return tracer.StackTrace() +} + +type baseHresultError struct { + hresult Hresult +} + +func (e *baseHresultError) Error() string { + return fmt.Sprintf("HRESULT: 0x%x", uint32(e.Hresult())) +} +func (e *baseHresultError) Hresult() Hresult { + return e.hresult +} + +type wrappingHresultError struct { + cause error + hresult Hresult +} + +func (e *wrappingHresultError) Error() string { + return fmt.Sprintf("HRESULT 0x%x", uint32(e.Hresult())) + ": " + e.Cause().Error() +} +func (e *wrappingHresultError) Hresult() Hresult { + return e.hresult +} +func (e *wrappingHresultError) Cause() error { + return e.cause +} +func (e *wrappingHresultError) Format(s fmt.State, verb rune) { + switch verb { + case 'v': + if s.Flag('+') { + fmt.Fprintf(s, 
"%+v\n", e.Cause()) + return + } + fallthrough + case 's': + _, _ = io.WriteString(s, e.Error()) + case 'q': + fmt.Fprintf(s, "%q", e.Error()) + } +} +func (e *wrappingHresultError) StackTrace() errors.StackTrace { + type stackTracer interface { + StackTrace() errors.StackTrace + } + serr, ok := e.Cause().(stackTracer) + if !ok { + return nil + } + return serr.StackTrace() +} + +// NewHresultError produces a new error with the given HRESULT. +func NewHresultError(hresult Hresult) error { + return &baseHresultError{hresult: hresult} +} + +// WrapHresult produces a new error with the given HRESULT and wrapping the +// given error. +func WrapHresult(e error, hresult Hresult) error { + return &wrappingHresultError{ + cause: e, + hresult: hresult, + } +} + +// GetHresult interates through the error's cause stack (similiarly to how the +// Cause function in github.com/pkg/errors operates). At the first error it +// encounters which implements the Hresult() method, it return's that error's +// HRESULT. This allows errors higher up in the cause stack to shadow the +// HRESULTs of errors lower down. +func GetHresult(e error) (Hresult, error) { + type hresulter interface { + Hresult() Hresult + } + type causer interface { + Cause() error + } + cause := e + for cause != nil { + herr, ok := cause.(hresulter) + if ok { + return herr.Hresult(), nil + } + cerr, ok := cause.(causer) + if !ok { + break + } + cause = cerr.Cause() + } + return -1, errors.Errorf("no HRESULT found in cause stack for error %s", e) +} diff --git a/internal/guest/kmsg/kmsg.go b/internal/guest/kmsg/kmsg.go new file mode 100644 index 0000000000..429235bb16 --- /dev/null +++ b/internal/guest/kmsg/kmsg.go @@ -0,0 +1,149 @@ +// Package kmsg contains support for parsing Linux kernel log entries read from +// /dev/kmsg. These are the same log entries that can be read via the `dmesg` +// command. Each read from /dev/kmsg is guaranteed to return a single log entry, +// so no line-splitting is required. 
+// +// More information can be found here: +// https://www.kernel.org/doc/Documentation/ABI/testing/dev-kmsg +package kmsg + +import ( + "errors" + "os" + "strconv" + "strings" + "syscall" + + "github.com/sirupsen/logrus" +) + +var ( + // ErrInvalidFormat indicates the kmsg entry failed to parse. + ErrInvalidFormat = errors.New("invalid kmsg format") +) + +// LogLevel represents the severity/priority of a log entry in the kernels +// ring buffer. +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/kern_levels.h?id=HEAD +type LogLevel uint8 + +func (logLevel LogLevel) String() string { + return levels[logLevel] +} + +const ( + Emerg LogLevel = iota + Alert + Crit + Err + Warning + Notice + Info + Debug +) + +var levels = [...]string{ + "Emerg", + "Alert", + "Crit", + "Err", + "Warning", + "Notice", + "Info", + "Debug", +} + +// Entry is a single log entry in kmsg. +type Entry struct { + Priority LogLevel + Facility uint8 + Seq uint64 + TimeSinceBootMicro uint64 + Flags string + Message string +} + +func (ke *Entry) logFormat() logrus.Fields { + return logrus.Fields{ + "priority": ke.Priority.String(), + "facility": ke.Facility, + "seq": ke.Seq, + "timesincebootmicro": ke.TimeSinceBootMicro, + "flags": ke.Flags, + "message": ke.Message, + } +} + +// Parse takes a single kmsg log entry string and returns a struct representing +// the components of the log entry. 
+func parse(s string) (*Entry, error) { + fields := strings.SplitN(s, ";", 2) + if len(fields) < 2 { + return nil, ErrInvalidFormat + } + prefixFields := strings.SplitN(fields[0], ",", 5) + if len(prefixFields) < 4 { + return nil, ErrInvalidFormat + } + syslog, err := strconv.ParseUint(prefixFields[0], 10, 16) + if err != nil { + return nil, ErrInvalidFormat + } + seq, err := strconv.ParseUint(prefixFields[1], 10, 64) + if err != nil { + return nil, ErrInvalidFormat + } + timestamp, err := strconv.ParseUint(prefixFields[2], 10, 64) + if err != nil { + return nil, ErrInvalidFormat + } + return &Entry{ + Priority: LogLevel(syslog & 0x7), + Facility: uint8(syslog >> 3), + Seq: seq, + TimeSinceBootMicro: timestamp, + Flags: prefixFields[3], + Message: fields[1], + }, nil +} + +// ReadForever reads from /dev/kmsg forever unless /dev/kmsg cannot be opened. +// Every entry with priority <= 'logLevel' will be logged. +func ReadForever(logLevel LogLevel) { + file, err := os.Open("/dev/kmsg") + if err != nil { + logrus.WithError(err).Error("failed to open /dev/kmsg") + return + } + defer file.Close() + // Reuse buffer for entries + // Buffer size from: https://elixir.bootlin.com/linux/latest/source/include/linux/printk.h#L44 + buf := make([]byte, 8192) + for { + n, err := file.Read(buf) + if err != nil { + // "In case messages get overwritten in the circular buffer while + // the device is kept open, the next read() will return -EPIPE, + // and the seek position be updated to the next available record. + // Subsequent reads() will return available records again." 
+ if err == syscall.EPIPE { + logrus.Warn("kmsg entry overwritten; skipping entry") + continue + } + logrus.WithError(err).Error("kmsg read failure") + return + } + line := string(buf[:n]) + entry, err := parse(line) + if err != nil { + logrus.WithFields(logrus.Fields{ + logrus.ErrorKey: err, + "line": line, + }).Error("failed to parse kmsg entry") + } else { + if entry.Priority <= logLevel { + logrus.WithFields(entry.logFormat()).Info("kmsg read") + } + } + } +} diff --git a/internal/guest/network/netns.go b/internal/guest/network/netns.go new file mode 100644 index 0000000000..cfba62ef40 --- /dev/null +++ b/internal/guest/network/netns.go @@ -0,0 +1,233 @@ +// +build linux + +package network + +import ( + "context" + "fmt" + "net" + "os/exec" + "runtime" + "time" + + "github.com/Microsoft/hcsshim/internal/guest/prot" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/pkg/errors" + "github.com/vishvananda/netlink" + "github.com/vishvananda/netns" +) + +// MoveInterfaceToNS moves the adapter with interface name `ifStr` to the network namespace +// of `pid`. +func MoveInterfaceToNS(ifStr string, pid int) error { + // Get a reference to the interface and make sure it's down + link, err := netlink.LinkByName(ifStr) + if err != nil { + return errors.Wrapf(err, "netlink.LinkByName(%s) failed", ifStr) + } + if err := netlink.LinkSetDown(link); err != nil { + return errors.Wrapf(err, "netlink.LinkSetDown(%#v) failed", link) + } + + // Move the interface to the new network namespace + if err := netlink.LinkSetNsPid(link, pid); err != nil { + return errors.Wrapf(err, "netlink.SetNsPid(%#v, %d) failed", link, pid) + } + return nil +} + +// DoInNetNS is a utility to run a function `run` inside of a specific network namespace +// `ns`. This is accomplished by locking the current goroutines thread to prevent the goroutine +// from being scheduled to a new thread during execution of `run`. The threads original network namespace +// will be rejoined on exit. 
+func DoInNetNS(ns netns.NsHandle, run func() error) error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + origNs, err := netns.Get() + if err != nil { + return errors.Wrap(err, "failed to get current network namespace") + } + defer origNs.Close() + + if err := netns.Set(ns); err != nil { + return errors.Wrapf(err, "failed to set network namespace to %v", ns) + } + // Defer so we can re-enter the threads original netns on exit. + defer netns.Set(origNs) + + return run() +} + +// NetNSConfig moves a network interface into a network namespace and +// configures it. +// +// This function MUST be used in tandem with `DoInNetNS` or some other means that ensures that the goroutine +// executing this code stays on the same thread. +func NetNSConfig(ctx context.Context, ifStr string, nsPid int, adapter *prot.NetworkAdapter) error { + if ifStr == "" || nsPid == -1 || adapter == nil { + return errors.New("All three arguments must be specified") + } + + if adapter.NatEnabled { + log.G(ctx).Debugf("Configure %s in %d with: %s/%d gw=%s", ifStr, nsPid, adapter.AllocatedIPAddress, adapter.HostIPPrefixLength, adapter.HostIPAddress) + } else { + log.G(ctx).Debugf("Configure %s in %d with DHCP", ifStr, nsPid) + } + + log.G(ctx).Debug("Obtaining current namespace") + ns, err := netns.Get() + if err != nil { + return errors.Wrap(err, "netns.Get() failed") + } + defer ns.Close() + + log.G(ctx).Debugf("New network namespace from PID %d is %v", nsPid, ns) + + // Re-Get a reference to the interface (it may be a different ID in the new namespace) + log.G(ctx).Debug("Getting reference to interface") + link, err := netlink.LinkByName(ifStr) + if err != nil { + return errors.Wrapf(err, "netlink.LinkByName(%s) failed", ifStr) + } + + // User requested non-default MTU size + if adapter.EncapOverhead != 0 { + log.G(ctx).Debug("EncapOverhead non-zero, will set MTU") + mtu := link.Attrs().MTU - int(adapter.EncapOverhead) + log.G(ctx).Debugf("mtu %d", mtu) + if err = 
netlink.LinkSetMTU(link, mtu); err != nil { + return errors.Wrapf(err, "netlink.LinkSetMTU(%#v, %d) failed", link, mtu) + } + } + + // Configure the interface + if adapter.NatEnabled { + log.G(ctx).Debug("Nat enabled - configuring interface") + metric := 1 + if adapter.EnableLowMetric { + metric = 500 + } + + // Bring the interface up + if err := netlink.LinkSetUp(link); err != nil { + return errors.Wrapf(err, "netlink.LinkSetUp(%#v) failed", link) + } + // Set IP address + addr := &net.IPNet{ + IP: net.ParseIP(adapter.AllocatedIPAddress), + // TODO(rn): This assumes/hardcodes IPv4 + Mask: net.CIDRMask(int(adapter.HostIPPrefixLength), 32)} + ipAddr := &netlink.Addr{IPNet: addr, Label: ""} + if err := netlink.AddrAdd(link, ipAddr); err != nil { + return errors.Wrapf(err, "netlink.AddrAdd(%#v, %#v) failed", link, ipAddr) + } + // Set gateway + if adapter.HostIPAddress != "" { + gw := net.ParseIP(adapter.HostIPAddress) + + if !addr.Contains(gw) { + // In the case that a gw is not part of the subnet we are setting gw for, + // a new addr containing this gw address need to be added into the link to avoid getting + // unreachable error when adding this out-of-subnet gw route + log.G(ctx).Debugf("gw is outside of the subnet: Configure %s in %d with: %s/%d gw=%s\n", + ifStr, nsPid, adapter.AllocatedIPAddress, adapter.HostIPPrefixLength, adapter.HostIPAddress) + addr2 := &net.IPNet{ + IP: net.ParseIP(adapter.HostIPAddress), + Mask: net.CIDRMask(32, 32)} // This assumes/hardcodes IPv4 + ipAddr2 := &netlink.Addr{IPNet: addr2, Label: ""} + if err := netlink.AddrAdd(link, ipAddr2); err != nil { + return errors.Wrapf(err, "netlink.AddrAdd(%#v, %#v) failed", link, ipAddr2) + } + } + + if !adapter.EnableLowMetric { + route := netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + LinkIndex: link.Attrs().Index, + Gw: gw, + Priority: metric, // This is what ip route add does + } + if err := netlink.RouteAdd(&route); err != nil { + return errors.Wrapf(err, "netlink.RouteAdd(%#v) failed", 
route) + } + } else { + // add a route rule for the new interface so packets coming on this interface + // always go out the same interface + srcNet := &net.IPNet{IP: net.ParseIP(adapter.AllocatedIPAddress), Mask: net.CIDRMask(32, 32)} + rule := netlink.NewRule() + rule.Table = 101 + rule.Src = srcNet + rule.Priority = 5 + + if err := netlink.RuleAdd(rule); err != nil { + return errors.Wrapf(err, "netlink.RuleAdd(%#v) failed", rule) + } + + // add the default route in that interface specific table + route := netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + LinkIndex: link.Attrs().Index, + Gw: gw, + Table: rule.Table, + Priority: metric, + } + if err := netlink.RouteAdd(&route); err != nil { + return errors.Wrapf(err, "netlink.RouteAdd(%#v) failed", route) + } + + } + } + } else { + log.G(ctx).Debug("Execing udhcpc with timeout...") + cmd := exec.Command("udhcpc", "-q", "-i", ifStr, "-s", "/sbin/udhcpc_config.script") + + done := make(chan error) + go func() { + done <- cmd.Wait() + }() + defer close(done) + + select { + case <-time.After(30 * time.Second): + var cos string + co, err := cmd.CombinedOutput() // In case it has written something + if err != nil { + cos = string(co) + } + cmd.Process.Kill() + log.G(ctx).Debugf("udhcpc timed out [%s]", cos) + return fmt.Errorf("udhcpc timed out. 
Failed to get DHCP address: %s", cos) + case err := <-done: + var cos string + co, err := cmd.CombinedOutput() // Something should be on stderr + if err != nil { + cos = string(co) + } + if err != nil { + log.G(ctx).WithError(err).Debugf("udhcpc failed [%s]", cos) + return errors.Wrapf(err, "process failed (%s)", cos) + } + } + var cos string + co, err := cmd.CombinedOutput() + if err != nil { + cos = string(co) + } + log.G(ctx).Debugf("udhcpc succeeded: %s", cos) + } + + // Add some debug logging + curNS, _ := netns.Get() + // Refresh link attributes/state + link, _ = netlink.LinkByIndex(link.Attrs().Index) + attr := link.Attrs() + addrs, _ := netlink.AddrList(link, 0) + + log.G(ctx).Debugf("%v: %s[idx=%d,type=%s] is %v", curNS, attr.Name, attr.Index, link.Type(), attr.OperState) + for _, addr := range addrs { + log.G(ctx).Debugf(" %v", addr) + } + + return nil +} diff --git a/internal/guest/network/network.go b/internal/guest/network/network.go new file mode 100644 index 0000000000..4cb66f302b --- /dev/null +++ b/internal/guest/network/network.go @@ -0,0 +1,145 @@ +// +build linux + +package network + +import ( + "bytes" + "context" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "time" + + "github.com/Microsoft/hcsshim/internal/guest/storage/vmbus" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "go.opencensus.io/trace" +) + +// maxDNSSearches is limited to 6 in `man 5 resolv.conf` +const maxDNSSearches = 6 + +// GenerateEtcHostsContent generates a /etc/hosts file based on `hostname`. 
+func GenerateEtcHostsContent(ctx context.Context, hostname string) string { + _, span := trace.StartSpan(ctx, "network::GenerateEtcHostsContent") + defer span.End() + span.AddAttributes(trace.StringAttribute("hostname", hostname)) + + nameParts := strings.Split(hostname, ".") + buf := bytes.Buffer{} + buf.WriteString("127.0.0.1 localhost\n") + if len(nameParts) > 1 { + buf.WriteString(fmt.Sprintf("127.0.0.1 %s %s\n", hostname, nameParts[0])) + } else { + buf.WriteString(fmt.Sprintf("127.0.0.1 %s\n", hostname)) + } + buf.WriteString("\n") + buf.WriteString("# The following lines are desirable for IPv6 capable hosts\n") + buf.WriteString("::1 ip6-localhost ip6-loopback\n") + buf.WriteString("fe00::0 ip6-localnet\n") + buf.WriteString("ff00::0 ip6-mcastprefix\n") + buf.WriteString("ff02::1 ip6-allnodes\n") + buf.WriteString("ff02::2 ip6-allrouters\n") + return buf.String() +} + +// GenerateResolvConfContent generates the resolv.conf file content based on +// `searches`, `servers`, and `options`. 
+func GenerateResolvConfContent(ctx context.Context, searches, servers, options []string) (_ string, err error) { + _, span := trace.StartSpan(ctx, "network::GenerateResolvConfContent") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + span.AddAttributes( + trace.StringAttribute("searches", strings.Join(searches, ", ")), + trace.StringAttribute("servers", strings.Join(servers, ", ")), + trace.StringAttribute("options", strings.Join(options, ", "))) + + if len(searches) > maxDNSSearches { + return "", errors.Errorf("searches has more than %d domains", maxDNSSearches) + } + + content := "" + if len(searches) > 0 { + content += fmt.Sprintf("search %s\n", strings.Join(searches, " ")) + } + if len(servers) > 0 { + content += fmt.Sprintf("nameserver %s\n", strings.Join(servers, "\nnameserver ")) + } + if len(options) > 0 { + content += fmt.Sprintf("options %s\n", strings.Join(options, " ")) + } + return content, nil +} + +// MergeValues merges `first` and `second` maintaining order `first, second`. +func MergeValues(first, second []string) []string { + if len(first) == 0 { + return second + } + if len(second) == 0 { + return first + } + values := make([]string, len(first), len(first)+len(second)) + copy(values, first) + for _, v := range second { + found := false + for i := 0; i < len(values); i++ { + if v == values[i] { + found = true + break + } + } + if !found { + values = append(values, v) + } + } + return values +} + +// InstanceIDToName converts from the given instance ID (a GUID generated on the +// Windows host) to its corresponding interface name (e.g. "eth0"). +// +// Will retry the operation until `ctx` is exceeded or canceled. 
+func InstanceIDToName(ctx context.Context, id string) (_ string, err error) { + ctx, span := trace.StartSpan(ctx, "network::InstanceIDToName") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + id = strings.ToLower(id) + span.AddAttributes(trace.StringAttribute("adapterInstanceID", id)) + + vmBusSubPath := filepath.Join(id, "net") + devicePath, err := vmbus.WaitForDevicePath(ctx, vmBusSubPath) + + var deviceDirs []os.FileInfo + for { + deviceDirs, err = ioutil.ReadDir(devicePath) + if err != nil { + if os.IsNotExist(err) { + select { + case <-ctx.Done(): + return "", errors.Wrap(ctx.Err(), "timed out waiting for net adapter") + default: + time.Sleep(10 * time.Millisecond) + continue + } + } else { + return "", errors.Wrapf(err, "failed to read vmbus network device from /sys filesystem for adapter %s", id) + } + } + break + } + if len(deviceDirs) == 0 { + return "", errors.Errorf("no interface name found for adapter %s", id) + } + if len(deviceDirs) > 1 { + return "", errors.Errorf("multiple interface names found for adapter %s", id) + } + ifname := deviceDirs[0].Name() + log.G(ctx).WithField("ifname", ifname).Debug("resolved ifname") + return ifname, nil +} diff --git a/internal/guest/network/network_test.go b/internal/guest/network/network_test.go new file mode 100644 index 0000000000..08c3781f67 --- /dev/null +++ b/internal/guest/network/network_test.go @@ -0,0 +1,167 @@ +// +build linux + +package network + +import ( + "context" + "testing" +) + +func Test_GenerateResolvConfContent(t *testing.T) { + type testcase struct { + name string + + searches []string + servers []string + options []string + + expectedContent string + expectErr bool + } + testcases := []*testcase{ + { + name: "Empty", + }, + { + name: "MaxSearches", + searches: []string{"1", "2", "3", "4", "5", "6", "7"}, + expectErr: true, + }, + { + name: "ValidSearches", + searches: []string{"a.com", "b.com"}, + expectedContent: "search a.com b.com\n", + }, + { + name: 
"ValidServers", + servers: []string{"8.8.8.8", "8.8.4.4"}, + expectedContent: "nameserver 8.8.8.8\nnameserver 8.8.4.4\n", + }, + { + name: "ValidOptions", + options: []string{"timeout:30", "inet6"}, + expectedContent: "options timeout:30 inet6\n", + }, + { + name: "All", + searches: []string{"a.com", "b.com"}, + servers: []string{"8.8.8.8", "8.8.4.4"}, + options: []string{"timeout:30", "inet6"}, + expectedContent: "search a.com b.com\nnameserver 8.8.8.8\nnameserver 8.8.4.4\noptions timeout:30 inet6\n", + }, + } + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + c, err := GenerateResolvConfContent(context.Background(), tc.searches, tc.servers, tc.options) + if tc.expectErr && err == nil { + t.Fatal("expected err got nil") + } else if !tc.expectErr && err != nil { + t.Fatalf("expected no error got %v:", err) + } + + if c != tc.expectedContent { + t.Fatalf("expected content: %q got: %q", tc.expectedContent, c) + } + }) + } +} + +func Test_MergeValues(t *testing.T) { + type testcase struct { + name string + + first []string + second []string + + expected []string + } + testcases := []*testcase{ + { + name: "BothEmpty", + }, + { + name: "FirstEmpty", + second: []string{"a", "b"}, + expected: []string{"a", "b"}, + }, + { + name: "SecondEmpty", + first: []string{"a", "b"}, + expected: []string{"a", "b"}, + }, + { + name: "AllUnique", + first: []string{"a", "c", "d"}, + second: []string{"b", "e"}, + expected: []string{"a", "c", "d", "b", "e"}, + }, + { + name: "NonUnique", + first: []string{"a", "c", "d"}, + second: []string{"a", "b", "c", "d"}, + expected: []string{"a", "c", "d", "b"}, + }, + } + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + m := MergeValues(tc.first, tc.second) + if len(m) != len(tc.expected) { + t.Fatalf("expected %d entries got: %d", len(tc.expected), len(m)) + } + for i := 0; i < len(tc.expected); i++ { + if tc.expected[i] != m[i] { + t.Logf("%v :: %v", tc.expected, m) + t.Fatalf("expected value: %q at 
index: %d got: %q", tc.expected[i], i, m[i]) + } + } + }) + } +} + +func Test_GenerateEtcHostsContent(t *testing.T) { + type testcase struct { + name string + + hostname string + + expectedContent string + } + testcases := []*testcase{ + { + name: "Net BIOS Name", + hostname: "Test", + expectedContent: `127.0.0.1 localhost +127.0.0.1 Test + +# The following lines are desirable for IPv6 capable hosts +::1 ip6-localhost ip6-loopback +fe00::0 ip6-localnet +ff00::0 ip6-mcastprefix +ff02::1 ip6-allnodes +ff02::2 ip6-allrouters +`, + }, + { + name: "FQDN", + hostname: "test.rules.domain.com", + expectedContent: `127.0.0.1 localhost +127.0.0.1 test.rules.domain.com test + +# The following lines are desirable for IPv6 capable hosts +::1 ip6-localhost ip6-loopback +fe00::0 ip6-localnet +ff00::0 ip6-mcastprefix +ff02::1 ip6-allnodes +ff02::2 ip6-allrouters +`, + }, + } + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + c := GenerateEtcHostsContent(context.Background(), tc.hostname) + if c != tc.expectedContent { + t.Fatalf("expected content: %q got: %q", tc.expectedContent, c) + } + }) + } +} diff --git a/internal/guest/prot/protocol.go b/internal/guest/prot/protocol.go new file mode 100644 index 0000000000..b12554eb06 --- /dev/null +++ b/internal/guest/prot/protocol.go @@ -0,0 +1,933 @@ +// Package prot defines any structures used in the communication between the HCS +// and the GCS. Some of these structures are also used outside the bridge as +// good ways of packaging parameters to core calls. +package prot + +import ( + "encoding/json" + "strconv" + + "github.com/Microsoft/hcsshim/internal/guest/commonutils" + v1 "github.com/containerd/cgroups/stats/v1" + oci "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" +) + +//////////// Code for the Message Header //////////// +// Message Identifiers as present in the message header are subdivided into +// various pieces of information. 
+// +// +---+----+-----+----+ +// | T | CC | III | VV | +// +---+----+-----+----+ +// +// T - 4 Bits Type +// CC - 8 Bits Category +// III - 12 Bits Message Id +// VV - 8 Bits Version + +//nolint:deadcode,varcheck +const ( + messageTypeMask = 0xF0000000 + messageCategoryMask = 0x0FF00000 + messageIDMask = 0x000FFF00 + messageVersionMask = 0x000000FF + messageIDShift = 8 + messageVersionShift = 0 +) + +// MessageType is the type of the message. +type MessageType uint32 + +const ( + // MtNone is the default MessageType. + MtNone = 0 + // MtRequest is the MessageType when a request is recieved. + MtRequest = 0x10000000 + // MtResponse is the MessageType used to send a response. + MtResponse = 0x20000000 + // MtNotification is the MessageType used to send a notification not + // initiated by a request. + MtNotification = 0x30000000 +) + +// MessageCategory allows splitting the identifier namespace to easily route +// similar messages for common processing. +type MessageCategory uint32 + +const ( + // McNone is the default category. + McNone = 0 + // McComputeSystem is the category to define message types for compute + // systems. + McComputeSystem = 0x00100000 +) + +// GetResponseIdentifier returns the response version of the given request +// identifier. So, for example, an input of ComputeSystemCreateV1 would result +// in an output of ComputeSystemResponseCreateV1. +func GetResponseIdentifier(identifier MessageIdentifier) MessageIdentifier { + return MessageIdentifier(MtResponse | (uint32(identifier) & ^uint32(messageTypeMask))) +} + +// MessageIdentifier describes the Type field of a MessageHeader struct. +type MessageIdentifier uint32 + +const ( + // MiNone is the unknown identifier. + MiNone = 0 + + // ComputeSystemCreateV1 is the create container request. + ComputeSystemCreateV1 = 0x10100101 + // ComputeSystemStartV1 is the start container request. 
+ ComputeSystemStartV1 = 0x10100201 + // ComputeSystemShutdownGracefulV1 is the graceful shutdown container + // request. + ComputeSystemShutdownGracefulV1 = 0x10100301 + // ComputeSystemShutdownForcedV1 is the forceful shutdown container request. + ComputeSystemShutdownForcedV1 = 0x10100401 + // ComputeSystemExecuteProcessV1 is the execute process request. + ComputeSystemExecuteProcessV1 = 0x10100501 + // ComputeSystemWaitForProcessV1 is the wait for process exit request. + ComputeSystemWaitForProcessV1 = 0x10100601 + // ComputeSystemSignalProcessV1 is the signal process request. + ComputeSystemSignalProcessV1 = 0x10100701 + // ComputeSystemResizeConsoleV1 is the resize console tty request. + ComputeSystemResizeConsoleV1 = 0x10100801 + // ComputeSystemGetPropertiesV1 is the list process properties request. + ComputeSystemGetPropertiesV1 = 0x10100901 + // ComputeSystemModifySettingsV1 is the modify container request. + ComputeSystemModifySettingsV1 = 0x10100a01 + // ComputeSystemNegotiateProtocolV1 is the protocol negotiation request. + ComputeSystemNegotiateProtocolV1 = 0x10100b01 + // ComputeSystemDumpStacksV1 is the dump stack request + ComputeSystemDumpStacksV1 = 0x10100c01 + // ComputeSystemDeleteContainerStateV1 is the delete container request. + ComputeSystemDeleteContainerStateV1 = 0x10100d01 + + // ComputeSystemResponseCreateV1 is the create container response. + ComputeSystemResponseCreateV1 = 0x20100101 + // ComputeSystemResponseStartV1 is the start container response. + ComputeSystemResponseStartV1 = 0x20100201 + // ComputeSystemResponseShutdownGracefulV1 is the graceful shutdown + // container response. + ComputeSystemResponseShutdownGracefulV1 = 0x20100301 + // ComputeSystemResponseShutdownForcedV1 is the forceful shutdown container + // response. + ComputeSystemResponseShutdownForcedV1 = 0x20100401 + // ComputeSystemResponseExecuteProcessV1 is the execute process response. 
+ ComputeSystemResponseExecuteProcessV1 = 0x20100501 + // ComputeSystemResponseWaitForProcessV1 is the wait for process exit + // response. + ComputeSystemResponseWaitForProcessV1 = 0x20100601 + // ComputeSystemResponseSignalProcessV1 is the signal process response. + ComputeSystemResponseSignalProcessV1 = 0x20100701 + // ComputeSystemResponseResizeConsoleV1 is the resize console tty response. + ComputeSystemResponseResizeConsoleV1 = 0x20100801 + // ComputeSystemResponseGetPropertiesV1 is the list process properties + // response. + ComputeSystemResponseGetPropertiesV1 = 0x20100901 + // ComputeSystemResponseModifySettingsV1 is the modify container response. + ComputeSystemResponseModifySettingsV1 = 0x20100a01 + // ComputeSystemResponseNegotiateProtocolV1 is the protocol negotiation + // response. + ComputeSystemResponseNegotiateProtocolV1 = 0x20100b01 + // ComputeSystemResponseDumpStacksV1 is the dump stack response + ComputeSystemResponseDumpStacksV1 = 0x20100c01 + + // ComputeSystemNotificationV1 is the notification identifier. + ComputeSystemNotificationV1 = 0x30100101 +) + +// String returns the string representation of the message identifer. 
+func (mi MessageIdentifier) String() string { + switch mi { + case MiNone: + return "None" + case ComputeSystemCreateV1: + return "ComputeSystemCreateV1" + case ComputeSystemStartV1: + return "ComputeSystemStartV1" + case ComputeSystemShutdownGracefulV1: + return "ComputeSystemShutdownGracefulV1" + case ComputeSystemShutdownForcedV1: + return "ComputeSystemShutdownForcedV1" + case ComputeSystemExecuteProcessV1: + return "ComputeSystemExecuteProcessV1" + case ComputeSystemWaitForProcessV1: + return "ComputeSystemWaitForProcessV1" + case ComputeSystemSignalProcessV1: + return "ComputeSystemSignalProcessV1" + case ComputeSystemResizeConsoleV1: + return "ComputeSystemResizeConsoleV1" + case ComputeSystemGetPropertiesV1: + return "ComputeSystemGetPropertiesV1" + case ComputeSystemModifySettingsV1: + return "ComputeSystemModifySettingsV1" + case ComputeSystemNegotiateProtocolV1: + return "ComputeSystemNegotiateProtocolV1" + case ComputeSystemDumpStacksV1: + return "ComputeSystemDumpStacksV1" + case ComputeSystemDeleteContainerStateV1: + return "ComputeSystemDeleteContainerStateV1" + case ComputeSystemResponseCreateV1: + return "ComputeSystemResponseCreateV1" + case ComputeSystemResponseStartV1: + return "ComputeSystemResponseStartV1" + case ComputeSystemResponseShutdownGracefulV1: + return "ComputeSystemResponseShutdownGracefulV1" + case ComputeSystemResponseShutdownForcedV1: + return "ComputeSystemResponseShutdownForcedV1" + case ComputeSystemResponseExecuteProcessV1: + return "ComputeSystemResponseExecuteProcessV1" + case ComputeSystemResponseWaitForProcessV1: + return "ComputeSystemResponseWaitForProcessV1" + case ComputeSystemResponseSignalProcessV1: + return "ComputeSystemResponseSignalProcessV1" + case ComputeSystemResponseResizeConsoleV1: + return "ComputeSystemResponseResizeConsoleV1" + case ComputeSystemResponseGetPropertiesV1: + return "ComputeSystemResponseGetPropertiesV1" + case ComputeSystemResponseModifySettingsV1: + return 
"ComputeSystemResponseModifySettingsV1" + case ComputeSystemResponseNegotiateProtocolV1: + return "ComputeSystemResponseNegotiateProtocolV1" + case ComputeSystemResponseDumpStacksV1: + return "ComputeSystemResponseDumpStacksV1" + case ComputeSystemNotificationV1: + return "ComputeSystemNotificationV1" + default: + return strconv.FormatUint(uint64(mi), 10) + } +} + +// SequenceID is used to correlate requests and responses. +type SequenceID uint64 + +// MessageHeader is the common header present in all communications messages. +type MessageHeader struct { + Type MessageIdentifier + Size uint32 + ID SequenceID +} + +// MessageHeaderSize is the size in bytes of the MessageHeader struct. +const MessageHeaderSize = 16 + +///////////////////////////////////////////////////// + +// ProtocolVersion is a type for the seclected HCS<->GCS protocol version of +// messages +type ProtocolVersion uint32 + +// Protocol versions. +const ( + PvInvalid ProtocolVersion = 0 + PvV4 ProtocolVersion = 4 + PvMax ProtocolVersion = PvV4 +) + +// ProtocolSupport specifies the protocol versions to be used for HCS-GCS +// communication. +type ProtocolSupport struct { + MinimumVersion string `json:",omitempty"` + MaximumVersion string `json:",omitempty"` + MinimumProtocolVersion uint32 + MaximumProtocolVersion uint32 +} + +// OsType defines the operating system type identifer of the guest hosting the +// GCS. +type OsType string + +// OsTypeLinux is the OS type the HCS expects for a Linux GCS +const OsTypeLinux OsType = "Linux" + +// GcsCapabilities specifies the abilities and scenarios supported by this GCS. +type GcsCapabilities struct { + // True if a create message should be sent for the hosting system itself. + SendHostCreateMessage bool `json:",omitempty"` + // True if a start message should be sent for the hosting system itself. If + // SendHostCreateMessage is false, a start message will not be sent either. 
+ SendHostStartMessage bool `json:",omitempty"` + // True if an HVSocket ModifySettings request should be sent immediately + // after the create/start messages are sent (if they're sent at all). This + // ModifySettings request would be to configure the local and parent + // Hyper-V socket addresses of the VM, and would have a RequestType of + // Update. + HVSocketConfigOnStartup bool `json:"HvSocketConfigOnStartup,omitempty"` + SupportedSchemaVersions []SchemaVersion `json:",omitempty"` + RuntimeOsType OsType `json:",omitempty"` + // GuestDefinedCapabilities define any JSON object that will be directly + // passed to a client of the HCS. This can be useful to pass runtime + // specific capabilities not tied to the platform itself. + GuestDefinedCapabilities GcsGuestCapabilities `json:",omitempty"` +} + +// GcsGuestCapabilities represents the customized guest capabilities supported +// by this GCS. +type GcsGuestCapabilities struct { + NamespaceAddRequestSupported bool `json:",omitempty"` + SignalProcessSupported bool `json:",omitempty"` + DumpStacksSupported bool `json:",omitempty"` + DeleteContainerStateSupported bool `json:",omitempty"` +} + +// ocspancontext is the internal JSON representation of the OpenCensus +// `trace.SpanContext` for fowarding to a GCS that supports it. +type ocspancontext struct { + // TraceID is the `hex` encoded string of the OpenCensus + // `SpanContext.TraceID` to propagate to the guest. + TraceID string `json:",omitempty"` + // SpanID is the `hex` encoded string of the OpenCensus `SpanContext.SpanID` + // to propagate to the guest. + SpanID string `json:",omitempty"` + + // TraceOptions is the OpenCensus `SpanContext.TraceOptions` passed through + // to propagate to the guest. + TraceOptions uint32 `json:",omitempty"` + + // Tracestate is the `base64` encoded string of marshaling the OpenCensus + // `SpanContext.TraceState.Entries()` to JSON. 
+ // + // If `SpanContext.Tracestate == nil || + // len(SpanContext.Tracestate.Entries()) == 0` this will be `""`. + Tracestate string `json:",omitempty"` +} + +// MessageBase is the base type embedded in all messages sent from the HCS to +// the GCS, as well as ContainerNotification which is sent from GCS to HCS. +type MessageBase struct { + ContainerID string `json:"ContainerId"` + ActivityID string `json:"ActivityId"` + + // OpenCensusSpanContext is the encoded OpenCensus `trace.SpanContext` if + // set when making the request. + // + // NOTE: This is not a part of the protocol but because its a JSON protocol + // adding fields is a non-breaking change. If the guest supports it this is + // just additive context. + OpenCensusSpanContext *ocspancontext `json:"ocsc,omitempty"` +} + +// NegotiateProtocol is the message from the HCS used to determine the protocol +// version that will be used for future communication. +type NegotiateProtocol struct { + MessageBase + MinimumVersion uint32 + MaximumVersion uint32 +} + +// ContainerCreate is the message from the HCS specifying to create a container +// in the utility VM. This message won't actually create a Linux container +// inside the utility VM, but will set up the infrustructure needed to start one +// once the container's initial process is executed. +type ContainerCreate struct { + MessageBase + ContainerConfig string + SupportedVersions ProtocolSupport `json:",omitempty"` +} + +// NotificationType defines a type of notification to be sent back to the HCS. 
// NotificationType values are string enums (not integers) because they are
// carried verbatim in the JSON protocol to the HCS.
type NotificationType string

const (
	// NtNone indicates nothing to be sent back to the HCS
	NtNone = NotificationType("None")
	// NtGracefulExit indicates a graceful exit notification to be sent back to
	// the HCS
	NtGracefulExit = NotificationType("GracefulExit")
	// NtForcedExit indicates a forced exit notification to be sent back to the
	// HCS
	NtForcedExit = NotificationType("ForcedExit")
	// NtUnexpectedExit indicates an unexpected exit notification to be sent
	// back to the HCS
	NtUnexpectedExit = NotificationType("UnexpectedExit")
	// NtReboot indicates a reboot notification to be sent back to the HCS
	NtReboot = NotificationType("Reboot")
	// NtConstructed indicates a constructed notification to be sent back to the
	// HCS
	NtConstructed = NotificationType("Constructed")
	// NtStarted indicates a started notification to be sent back to the HCS
	NtStarted = NotificationType("Started")
	// NtPaused indicates a paused notification to be sent back to the HCS
	NtPaused = NotificationType("Paused")
	// NtUnknown indicates an unknown notification to be sent back to the HCS
	NtUnknown = NotificationType("Unknown")
)

// ActiveOperation defines an operation to be associated with a notification
// sent back to the HCS.
// ActiveOperation values are string enums carried verbatim in the JSON
// protocol to the HCS.
type ActiveOperation string

const (
	// AoNone indicates no active operation
	AoNone = ActiveOperation("None")
	// AoConstruct indicates a construct active operation
	AoConstruct = ActiveOperation("Construct")
	// AoStart indicates a start active operation
	AoStart = ActiveOperation("Start")
	// AoPause indicates a pause active operation
	AoPause = ActiveOperation("Pause")
	// AoResume indicates a resume active operation
	AoResume = ActiveOperation("Resume")
	// AoShutdown indicates a shutdown active operation
	AoShutdown = ActiveOperation("Shutdown")
	// AoTerminate indicates a terminate active operation
	AoTerminate = ActiveOperation("Terminate")
)

// ContainerNotification is a message sent from the GCS to the HCS to indicate
// some kind of event. At the moment, it is only used for container exit
// notifications.
type ContainerNotification struct {
	MessageBase
	Type       NotificationType
	Operation  ActiveOperation
	Result     int32
	ResultInfo string `json:",omitempty"`
}

// ExecuteProcessVsockStdioRelaySettings defines the port numbers for each
// stdio socket for a process.
type ExecuteProcessVsockStdioRelaySettings struct {
	// A port value of 0 (omitted from JSON) means that stdio stream is not
	// relayed over vsock.
	StdIn  uint32 `json:",omitempty"`
	StdOut uint32 `json:",omitempty"`
	StdErr uint32 `json:",omitempty"`
}

// ExecuteProcessSettings defines the settings for a single process to be
// executed either inside or outside the container namespace.
type ExecuteProcessSettings struct {
	// ProcessParameters is itself a JSON-encoded document (double-encoded in
	// the overall message).
	ProcessParameters       string
	VsockStdioRelaySettings ExecuteProcessVsockStdioRelaySettings
}

// ContainerExecuteProcess is the message from the HCS specifying to execute a
// process either inside or outside the container namespace.
type ContainerExecuteProcess struct {
	MessageBase
	Settings ExecuteProcessSettings
}

// ContainerResizeConsole is the message from the HCS specifying to change the
// console size for the given process.
type ContainerResizeConsole struct {
	MessageBase
	ProcessID uint32 `json:"ProcessId"`
	// Height and Width are the new tty dimensions in character cells.
	Height uint16
	Width  uint16
}

// ContainerWaitForProcess is the message from the HCS specifying to wait until
// the given process exits. After receiving this message, the corresponding
// response should not be sent until the process has exited.
type ContainerWaitForProcess struct {
	MessageBase
	ProcessID uint32 `json:"ProcessId"`
	// TimeoutInMs may be InfiniteWaitTimeout to wait without bound.
	TimeoutInMs uint32
}

// InfiniteWaitTimeout is the value for ContainerWaitForProcess.TimeoutInMs that
// indicates that no timeout should be in effect.
const InfiniteWaitTimeout = 0xffffffff

// ContainerSignalProcess is the message from the HCS specifying to send a
// signal to the given process.
type ContainerSignalProcess struct {
	MessageBase
	ProcessID uint32               `json:"ProcessId"`
	Options   SignalProcessOptions `json:",omitempty"`
}

// ContainerGetProperties is the message from the HCS requesting certain
// properties of the container, such as a list of its processes.
type ContainerGetProperties struct {
	MessageBase
	Query string
}

// PropertyType is the type of property, such as memory or virtual disk, which
// is to be modified for the container.
+type PropertyType string + +const ( + // PtMemory is the property type for memory + PtMemory = PropertyType("Memory") + // PtCPUGroup is the property type for CPU group + PtCPUGroup = PropertyType("CpuGroup") + // PtStatistics is the property type for statistics + PtStatistics = PropertyType("Statistics") + // PtProcessList is the property type for a process list + PtProcessList = PropertyType("ProcessList") + // PtPendingUpdates is the property type for determining if there are + // pending updates + PtPendingUpdates = PropertyType("PendingUpdates") + // PtTerminateOnLastHandleClosed is the property type for exiting when the + // last handle is closed + PtTerminateOnLastHandleClosed = PropertyType("TerminateOnLastHandleClosed") + // PtMappedDirectory is the property type for mapped directories + PtMappedDirectory = PropertyType("MappedDirectory") + // PtSystemGUID is the property type for the system GUID + PtSystemGUID = PropertyType("SystemGUID") + // PtNetwork is the property type for networking + PtNetwork = PropertyType("Network") + // PtMappedPipe is the property type for mapped pipes + PtMappedPipe = PropertyType("MappedPipe") + // PtMappedVirtualDisk is the property type for mapped virtual disks + PtMappedVirtualDisk = PropertyType("MappedVirtualDisk") +) + +// RequestType is the type of operation to perform on a given property type. +type RequestType string + +const ( + // RtAdd is the "Add" request type of operation + RtAdd = RequestType("Add") + // RtRemove is the "Remove" request type of operation + RtRemove = RequestType("Remove") + // RtUpdate is the "Update" request type of operation + RtUpdate = RequestType("Update") +) + +// ResourceModificationRequestResponse details a container resource which should +// be modified, how, and with what parameters. 
+type ResourceModificationRequestResponse struct { + ResourceType PropertyType + RequestType RequestType `json:",omitempty"` + Settings interface{} `json:",omitempty"` +} + +// ModifyResourceType is the type of resource, such as memory or virtual disk, +// which is to be modified for the container. This is the V2 schema equivalent +// of PropertyType. +type ModifyResourceType string + +const ( + // MrtMappedDirectory is the modify resource type for mapped directories + MrtMappedDirectory = ModifyResourceType("MappedDirectory") + // MrtMappedVirtualDisk is the modify resource type for mapped virtual + // disks + MrtMappedVirtualDisk = ModifyResourceType("MappedVirtualDisk") + // MrtCombinedLayers is the modify resource type for combined layers + MrtCombinedLayers = ModifyResourceType("CombinedLayers") + // MrtVPMemDevice is the modify resource type for VPMem devices + MrtVPMemDevice = ModifyResourceType("VPMemDevice") + // MrtNetwork is the modify resource type for the `NetworkAdapterV2` device. + MrtNetwork = ModifyResourceType("Network") + // MrtVPCIDevice is the modify resource type for vpci devices + MrtVPCIDevice = ModifyResourceType("VPCIDevice") + // MrtContainerConstraints is the modify resource type for updating container constraints + MrtContainerConstraints = ModifyResourceType("ContainerConstraints") +) + +// ModifyRequestType is the type of operation to perform on a given modify +// resource type. This is the V2 schema equivalent of RequestType. +type ModifyRequestType string + +const ( + // MreqtAdd is the "Add" modify request type + MreqtAdd = ModifyRequestType("Add") + // MreqtRemove is the "Remove" modify request type + MreqtRemove = ModifyRequestType("Remove") + // MreqtUpdate is the "Update" modify request type + MreqtUpdate = ModifyRequestType("Update") +) + +// ModifySettingRequest details a container resource which should be modified, +// how, and with what parameters. This is the V2 schema equivalent of +// ResourceModificationRequestResponse. 
+type ModifySettingRequest struct { + ResourceType ModifyResourceType `json:",omitempty"` + RequestType ModifyRequestType `json:",omitempty"` + Settings interface{} `json:",omitempty"` +} + +// ContainerModifySettings is the message from the HCS specifying how a certain +// container resource should be modified. +type ContainerModifySettings struct { + MessageBase + Request interface{} +} + +// UnmarshalContainerModifySettings unmarshals the given bytes into a +// ContainerModifySettings message. This function is required because properties +// such as `Settings` can be of many types identified by the `ResourceType` and +// require dynamic unmarshalling. +func UnmarshalContainerModifySettings(b []byte) (*ContainerModifySettings, error) { + // Unmarshal the message. + var request ContainerModifySettings + var requestRawSettings json.RawMessage + request.Request = &requestRawSettings + if err := commonutils.UnmarshalJSONWithHresult(b, &request); err != nil { + return nil, errors.Wrap(err, "failed to unmarshal ContainerModifySettings") + } + + var msr ModifySettingRequest + var msrRawSettings json.RawMessage + msr.Settings = &msrRawSettings + if err := commonutils.UnmarshalJSONWithHresult(requestRawSettings, &msr); err != nil { + return &request, errors.Wrap(err, "failed to unmarshal request.Settings as ModifySettingRequest") + } + + if msr.RequestType == "" { + msr.RequestType = MreqtAdd + } + + // Fill in the ResourceType-specific fields. 
+ switch msr.ResourceType { + case MrtMappedVirtualDisk: + mvd := &MappedVirtualDiskV2{} + if err := commonutils.UnmarshalJSONWithHresult(msrRawSettings, mvd); err != nil { + return &request, errors.Wrap(err, "failed to unmarshal settings as MappedVirtualDiskV2") + } + msr.Settings = mvd + case MrtMappedDirectory: + md := &MappedDirectoryV2{} + if err := commonutils.UnmarshalJSONWithHresult(msrRawSettings, md); err != nil { + return &request, errors.Wrap(err, "failed to unmarshal settings as MappedDirectoryV2") + } + msr.Settings = md + case MrtVPMemDevice: + vpd := &MappedVPMemDeviceV2{} + if err := commonutils.UnmarshalJSONWithHresult(msrRawSettings, vpd); err != nil { + return &request, errors.Wrap(err, "failed to unmarshal hosted settings as MappedVPMemDeviceV2") + } + msr.Settings = vpd + case MrtCombinedLayers: + cl := &CombinedLayersV2{} + if err := commonutils.UnmarshalJSONWithHresult(msrRawSettings, cl); err != nil { + return &request, errors.Wrap(err, "failed to unmarshal settings as CombinedLayersV2") + } + msr.Settings = cl + case MrtNetwork: + na := &NetworkAdapterV2{} + if err := commonutils.UnmarshalJSONWithHresult(msrRawSettings, na); err != nil { + return &request, errors.Wrap(err, "failed to unmarshal settings as NetworkAdapterV2") + } + msr.Settings = na + case MrtVPCIDevice: + vd := &MappedVPCIDeviceV2{} + if err := commonutils.UnmarshalJSONWithHresult(msrRawSettings, vd); err != nil { + return &request, errors.Wrap(err, "failed to unmarshal settings as MappedVPCIDeviceV2") + } + msr.Settings = vd + case MrtContainerConstraints: + cc := &ContainerConstraintsV2{} + if err := commonutils.UnmarshalJSONWithHresult(msrRawSettings, cc); err != nil { + return &request, errors.Wrap(err, "failed to unmarshal settings as ContainerConstraintsV2") + } + msr.Settings = cc + default: + return &request, errors.Errorf("invalid ResourceType '%s'", msr.ResourceType) + } + request.Request = &msr + return &request, nil +} + +// ErrorRecord represents a single error 
to be reported back to the HCS. It
+// allows for specifying information about the source of the error, as well as
+// an error message and stack trace.
+type ErrorRecord struct {
+	Result       int32
+	Message      string
+	StackTrace   string `json:",omitempty"`
+	ModuleName   string
+	FileName     string
+	Line         uint32
+	FunctionName string `json:",omitempty"`
+}
+
+// MessageResponseBase is the base type embedded in all messages sent from the
+// GCS to the HCS except for ContainerNotification.
+type MessageResponseBase struct {
+	Result       int32
+	ActivityID   string        `json:"ActivityId,omitempty"`
+	ErrorMessage string        `json:",omitempty"` // Only used by hcsshim external bridge
+	ErrorRecords []ErrorRecord `json:",omitempty"`
+}
+
+// Base returns the response base by reference.
+func (mrp *MessageResponseBase) Base() *MessageResponseBase {
+	return mrp
+}
+
+// NegotiateProtocolResponse is the message to the HCS responding to a
+// NegotiateProtocol message. It specifies the preferred protocol version and
+// available capabilities of the GCS.
+type NegotiateProtocolResponse struct {
+	MessageResponseBase
+	Version      uint32
+	Capabilities GcsCapabilities
+}
+
+type DumpStacksResponse struct {
+	MessageResponseBase
+	GuestStacks string
+}
+
+// ContainerCreateResponse is the message to the HCS responding to a
+// ContainerCreate message. It serves a protocol negotiation function as well
+// for protocol versions 3 and lower, returning protocol version information to
+// the HCS.
+type ContainerCreateResponse struct {
+	MessageResponseBase
+	SelectedVersion         string `json:",omitempty"`
+	SelectedProtocolVersion uint32
+}
+
+// ContainerExecuteProcessResponse is the message to the HCS responding to a
+// ContainerExecuteProcess message. It provides back the process's pid. 
+type ContainerExecuteProcessResponse struct { + MessageResponseBase + ProcessID uint32 `json:"ProcessId"` +} + +// ContainerWaitForProcessResponse is the message to the HCS responding to a +// ContainerWaitForProcess message. It is only sent when the process has exited. +type ContainerWaitForProcessResponse struct { + MessageResponseBase + ExitCode uint32 +} + +// ContainerGetPropertiesResponse is the message to the HCS responding to a +// ContainerGetProperties message. It contains a string representing the +// properties requested. +type ContainerGetPropertiesResponse struct { + MessageResponseBase + Properties string +} + +/* types added on to the current official protocol types */ + +// Layer represents a filesystem layer for a container. +type Layer struct { + // Path is in this case the identifier (such as the SCSI number) of the + // layer device. + Path string +} + +// CombinedLayersV2 is a modify type that corresponds to MrtCombinedLayers +// request. +type CombinedLayersV2 struct { + Layers []Layer `json:",omitempty"` + ScratchPath string `json:",omitempty"` + ContainerRootPath string +} + +// NetworkAdapter represents a network interface and its associated +// configuration. +type NetworkAdapter struct { + AdapterInstanceID string `json:"AdapterInstanceId"` + FirewallEnabled bool + NatEnabled bool + MacAddress string `json:",omitempty"` + AllocatedIPAddress string `json:"AllocatedIpAddress,omitempty"` + HostIPAddress string `json:"HostIpAddress,omitempty"` + HostIPPrefixLength uint8 `json:"HostIpPrefixLength,omitempty"` + HostDNSServerList string `json:"HostDnsServerList,omitempty"` + HostDNSSuffix string `json:"HostDnsSuffix,omitempty"` + EnableLowMetric bool `json:",omitempty"` + EncapOverhead uint16 `json:",omitempty"` +} + +// NetworkAdapterV2 represents a network interface and its associated +// configuration in a namespace. 
+type NetworkAdapterV2 struct { + NamespaceID string `json:",omitempty"` + ID string `json:",omitempty"` + MacAddress string `json:",omitempty"` + IPAddress string `json:",omitempty"` + PrefixLength uint8 `json:",omitempty"` + GatewayAddress string `json:",omitempty"` + DNSSuffix string `json:",omitempty"` + DNSServerList string `json:",omitempty"` + EnableLowMetric bool `json:",omitempty"` + EncapOverhead uint16 `json:",omitempty"` +} + +// MappedVirtualDisk represents a disk on the host which is mapped into a +// directory in the guest. +type MappedVirtualDisk struct { + ContainerPath string + Lun uint8 `json:",omitempty"` + CreateInUtilityVM bool `json:",omitempty"` + ReadOnly bool `json:",omitempty"` + AttachOnly bool `json:",omitempty"` +} + +// MappedVirtualDiskV2 represents a disk on the host which is mapped into a +// directory in the guest in the V2 schema. +type MappedVirtualDiskV2 struct { + MountPath string `json:",omitempty"` + Lun uint8 `json:",omitempty"` + Controller uint8 `json:",omitempty"` + ReadOnly bool `json:",omitempty"` + Options []string `json:",omitempty"` +} + +// MappedDirectory represents a directory on the host which is mapped to a +// directory on the guest through a technology such as Plan9. +type MappedDirectory struct { + ContainerPath string + CreateInUtilityVM bool `json:",omitempty"` + ReadOnly bool `json:",omitempty"` + Port uint32 `json:",omitempty"` +} + +// MappedDirectoryV2 represents a directory on the host which is mapped to a +// directory on the guest through Plan9 in the V2 schema. 
+type MappedDirectoryV2 struct { + MountPath string `json:",omitempty"` + Port uint32 `json:",omitempty"` + ShareName string `json:",omitempty"` + ReadOnly bool `json:",omitempty"` +} + +// DeviceMappingInfo represents a mapped device on a given VPMem +type DeviceMappingInfo struct { + DeviceOffsetInBytes int64 `json:",omitempty"` + DeviceSizeInBytes int64 `json:",omitempty"` +} + +// DeviceVerityInfo represents dm-verity information of a given data device. +// The assumption is that the hash device is the same as data device with +// verity data appended in the end. +type DeviceVerityInfo struct { + Ext4SizeInBytes int64 + Version int + Algorithm string + SuperBlock bool + RootDigest string + Salt string + BlockSize int +} + +// MappedVPMemDeviceV2 represents a VPMem device that is mapped into a guest +// path in the V2 schema. +type MappedVPMemDeviceV2 struct { + DeviceNumber uint32 `json:",omitempty"` + MountPath string `json:",omitempty"` + // MappingInfo is used when multiple devices are mapped onto a single VPMem device + MappingInfo *DeviceMappingInfo `json:",omitempty"` + VerityInfo *DeviceVerityInfo `json:",omitempty"` +} + +type MappedVPCIDeviceV2 struct { + VMBusGUID string `json:",omitempty"` +} + +type ContainerConstraintsV2 struct { + Windows oci.WindowsResources `json:",omitempty"` + Linux oci.LinuxResources `json:",omitempty"` +} + +// VMHostedContainerSettings is the set of settings used to specify the initial +// configuration of a container. +type VMHostedContainerSettings struct { + Layers []Layer + // SandboxDataPath is in this case the identifier (such as the SCSI number) + // of the sandbox device. + SandboxDataPath string + MappedVirtualDisks []MappedVirtualDisk + MappedDirectories []MappedDirectory + NetworkAdapters []NetworkAdapter `json:",omitempty"` +} + +// SchemaVersion defines the version of the schema that should be deserialized. 
+type SchemaVersion struct { + Major uint32 `json:",omitempty"` + Minor uint32 `json:",omitempty"` +} + +// Cmp compares s and v and returns: +// +// -1 if s < v +// 0 if s == v +// 1 if s > v +func (s *SchemaVersion) Cmp(v SchemaVersion) int { + if s.Major == v.Major { + if s.Minor == v.Minor { + return 0 + } else if s.Minor < v.Minor { + return -1 + } + return 1 + } else if s.Major < v.Major { + return -1 + } + return 1 +} + +// VMHostedContainerSettingsV2 defines the portion of the +// ContainerCreate.ContainerConfig that is sent via a V2 call. This correlates +// to the 'HostedSystem' on the HCS side but rather than sending the 'Container' +// field the Linux GCS accepts an oci.Spec directly. +type VMHostedContainerSettingsV2 struct { + SchemaVersion SchemaVersion + OCIBundlePath string `json:"OciBundlePath,omitempty"` + OCISpecification *oci.Spec `json:"OciSpecification,omitempty"` +} + +// ProcessParameters represents any process which may be started in the utility +// VM. This covers three cases: +// 1.) It is an external process, i.e. a process running inside the utility VM +// but not inside any container. In this case, don't specify the +// OCISpecification field, but specify all other fields. +// 2.) It is the first process in a container. In this case, specify only the +// OCISpecification field, and not the other fields. +// 3.) It is a container process, but not the first process in that container. +// In this case, don't specify the OCISpecification field, but specify all +// other fields. This is the same as if it were an external process. +type ProcessParameters struct { + // CommandLine is a space separated list of command line parameters. For + // example, the command which sleeps for 100 seconds would be represented by + // the CommandLine string "sleep 100". + CommandLine string `json:",omitempty"` + // CommandArgs is a list of strings representing the command to execute. If + // it is not empty, it will be used by the GCS. 
If it is empty, CommandLine + // will be used instead. + CommandArgs []string `json:",omitempty"` + WorkingDirectory string `json:",omitempty"` + Environment map[string]string `json:",omitempty"` + EmulateConsole bool `json:",omitempty"` + CreateStdInPipe bool `json:",omitempty"` + CreateStdOutPipe bool `json:",omitempty"` + CreateStdErrPipe bool `json:",omitempty"` + // If IsExternal is false, the process will be created inside a container. + // If true, it will be created external to any container. The latter is + // useful if, for example, you want to start up a shell in the utility VM + // for debugging/diagnostic purposes. + IsExternal bool `json:"CreateInUtilityVM,omitempty"` + // If this is the first process created for this container, this field must + // be specified. Otherwise, it must be left blank and the other fields must + // be specified. + OCISpecification *oci.Spec `json:"OciSpecification,omitempty"` + + OCIProcess *oci.Process `json:"OciProcess,omitempty"` +} + +// SignalProcessOptions represents the options for signaling a process. +type SignalProcessOptions struct { + Signal int32 +} + +// ProcessDetails represents information about a given process. +type ProcessDetails struct { + ProcessID uint32 `json:"ProcessId"` +} + +// PropertyQuery is a query to specify which properties are requested. +type PropertyQuery struct { + PropertyTypes []PropertyType `json:",omitempty"` +} + +// Properties represents the properties of a compute system. 
+type Properties struct { + ProcessList []ProcessDetails `json:",omitempty"` +} + +type PropertiesV2 struct { + ProcessList []ProcessDetails `json:"ProcessList,omitempty"` + Metrics *v1.Metrics `json:"LCOWMetrics,omitempty"` +} diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go new file mode 100644 index 0000000000..117ada881d --- /dev/null +++ b/internal/guest/runtime/hcsv2/container.go @@ -0,0 +1,178 @@ +// +build linux + +package hcsv2 + +import ( + "context" + "sync" + "syscall" + + "github.com/Microsoft/hcsshim/internal/guest/gcserr" + "github.com/Microsoft/hcsshim/internal/guest/prot" + "github.com/Microsoft/hcsshim/internal/guest/runtime" + "github.com/Microsoft/hcsshim/internal/guest/stdio" + "github.com/Microsoft/hcsshim/internal/guest/storage" + "github.com/Microsoft/hcsshim/internal/guest/transport" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/containerd/cgroups" + v1 "github.com/containerd/cgroups/stats/v1" + oci "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "go.opencensus.io/trace" +) + +type Container struct { + id string + vsock transport.Transport + + spec *oci.Spec + isSandbox bool + + container runtime.Container + initProcess *containerProcess + + etL sync.Mutex + exitType prot.NotificationType + + processesMutex sync.Mutex + processes map[uint32]*containerProcess +} + +func (c *Container) Start(ctx context.Context, conSettings stdio.ConnectionSettings) (int, error) { + stdioSet, err := stdio.Connect(c.vsock, conSettings) + if err != nil { + return -1, err + } + if c.initProcess.spec.Terminal { + ttyr := c.container.Tty() + ttyr.ReplaceConnectionSet(stdioSet) + ttyr.Start() + } else { + pr := c.container.PipeRelay() + pr.ReplaceConnectionSet(stdioSet) + pr.CloseUnusedPipes() + pr.Start() + } + err = c.container.Start() + if err != nil { + stdioSet.Close() + } + return int(c.initProcess.pid), err +} + +func (c *Container) ExecProcess(ctx context.Context, 
process *oci.Process, conSettings stdio.ConnectionSettings) (int, error) {
+	stdioSet, err := stdio.Connect(c.vsock, conSettings)
+	if err != nil {
+		return -1, err
+	}
+
+	p, err := c.container.ExecProcess(process, stdioSet)
+	if err != nil {
+		stdioSet.Close()
+		return -1, err
+	}
+
+	pid := p.Pid()
+	c.processesMutex.Lock()
+	c.processes[uint32(pid)] = newProcess(c, process, p, uint32(pid), false)
+	c.processesMutex.Unlock()
+	return pid, nil
+}
+
+// GetProcess returns the Process with the matching 'pid'. If the 'pid' does
+// not exist returns error.
+func (c *Container) GetProcess(pid uint32) (Process, error) {
+	if c.initProcess.pid == pid {
+		return c.initProcess, nil
+	}
+
+	c.processesMutex.Lock()
+	defer c.processesMutex.Unlock()
+
+	p, ok := c.processes[pid]
+	if !ok {
+		return nil, gcserr.NewHresultError(gcserr.HrErrNotFound)
+	}
+	return p, nil
+}
+
+// GetAllProcessPids returns all process pids in the container namespace.
+func (c *Container) GetAllProcessPids(ctx context.Context) ([]int, error) {
+	state, err := c.container.GetAllProcesses()
+	if err != nil {
+		return nil, err
+	}
+	pids := make([]int, len(state))
+	for i, s := range state {
+		pids[i] = s.Pid
+	}
+	return pids, nil
+}
+
+// Kill sends 'signal' to the container process.
+func (c *Container) Kill(ctx context.Context, signal syscall.Signal) error {
+	err := c.container.Kill(signal)
+	if err != nil {
+		return err
+	}
+	c.setExitType(signal)
+	return nil
+}
+
+func (c *Container) Delete(ctx context.Context) error {
+	if c.isSandbox {
+		// remove user mounts in sandbox container
+		if err := storage.UnmountAllInPath(ctx, getSandboxMountsDir(c.id), true); err != nil {
+			log.G(ctx).WithError(err).Error("failed to unmount sandbox mounts")
+		}
+	}
+	return c.container.Delete()
+}
+
+func (c *Container) Update(ctx context.Context, resources interface{}) error {
+	return c.container.Update(resources)
+}
+
+// Wait waits for the container's init process to exit. 
+func (c *Container) Wait() prot.NotificationType { + _, span := trace.StartSpan(context.Background(), "opengcs::Container::Wait") + defer span.End() + span.AddAttributes(trace.StringAttribute("cid", c.id)) + + c.initProcess.writersWg.Wait() + c.etL.Lock() + defer c.etL.Unlock() + return c.exitType +} + +// setExitType sets `c.exitType` to the appropriate value based on `signal` if +// `signal` will take down the container. +func (c *Container) setExitType(signal syscall.Signal) { + c.etL.Lock() + defer c.etL.Unlock() + + if signal == syscall.SIGTERM { + c.exitType = prot.NtGracefulExit + } else if signal == syscall.SIGKILL { + c.exitType = prot.NtForcedExit + } +} + +// GetStats returns the cgroup metrics for the container. +func (c *Container) GetStats(ctx context.Context) (*v1.Metrics, error) { + _, span := trace.StartSpan(ctx, "opengcs::Container::GetStats") + defer span.End() + span.AddAttributes(trace.StringAttribute("cid", c.id)) + + cgroupPath := c.spec.Linux.CgroupsPath + cg, err := cgroups.Load(cgroups.V1, cgroups.StaticPath(cgroupPath)) + if err != nil { + return nil, errors.Errorf("failed to get container stats for %v: %v", c.id, err) + } + + return cg.Stat(cgroups.IgnoreNotExist) +} + +func (c *Container) modifyContainerConstraints(ctx context.Context, rt prot.ModifyRequestType, cc *prot.ContainerConstraintsV2) (err error) { + return c.Update(ctx, cc.Linux) +} diff --git a/internal/guest/runtime/hcsv2/network.go b/internal/guest/runtime/hcsv2/network.go new file mode 100644 index 0000000000..5d1c1d3395 --- /dev/null +++ b/internal/guest/runtime/hcsv2/network.go @@ -0,0 +1,281 @@ +// +build linux + +package hcsv2 + +import ( + "context" + "fmt" + "strings" + "sync" + "time" + + "github.com/Microsoft/hcsshim/internal/guest/gcserr" + "github.com/Microsoft/hcsshim/internal/guest/network" + "github.com/Microsoft/hcsshim/internal/guest/prot" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "github.com/vishvananda/netns" + 
"go.opencensus.io/trace"
+)
+
+var (
+	// namespaceSync protects access to `namespaces`.
+	namespaceSync sync.Mutex
+	// namespaces is the set of `in-memory` namespace adapters known to the GCS.
+	// These may or may not be assigned to a container as there is support for
+	// pre-Add and post-Add.
+	namespaces map[string]*namespace
+
+	networkInstanceIDToName = network.InstanceIDToName
+)
+
+func init() {
+	namespaces = make(map[string]*namespace)
+}
+
+// getNetworkNamespace returns the namespace found by `id`. If the namespace
+// does not exist returns `gcserr.HrErrNotFound`.
+func getNetworkNamespace(id string) (*namespace, error) {
+	id = strings.ToLower(id)
+
+	namespaceSync.Lock()
+	defer namespaceSync.Unlock()
+
+	ns, ok := namespaces[id]
+	if !ok {
+		return nil, gcserr.WrapHresult(errors.Errorf("namespace '%s' not found", id), gcserr.HrErrNotFound)
+	}
+	return ns, nil
+}
+
+// getOrAddNetworkNamespace returns the namespace found by `id` or creates a new
+// one and assigns `id`.
+func getOrAddNetworkNamespace(id string) *namespace {
+	id = strings.ToLower(id)
+
+	namespaceSync.Lock()
+	defer namespaceSync.Unlock()
+
+	ns, ok := namespaces[id]
+	if !ok {
+		ns = &namespace{
+			id: id,
+		}
+		namespaces[id] = ns
+	}
+	return ns
+}
+
+// removeNetworkNamespace removes the in-memory `namespace` found by `id`.
+func removeNetworkNamespace(ctx context.Context, id string) (err error) {
+	_, span := trace.StartSpan(ctx, "hcsv2::removeNetworkNamespace")
+	defer span.End()
+	defer func() { oc.SetSpanStatus(span, err) }()
+
+	id = strings.ToLower(id)
+	span.AddAttributes(trace.StringAttribute("id", id))
+
+	namespaceSync.Lock()
+	defer namespaceSync.Unlock()
+
+	ns, ok := namespaces[id]
+	if ok {
+		ns.m.Lock()
+		defer ns.m.Unlock()
+		if len(ns.nics) > 0 {
+			return errors.Errorf("network namespace '%s' contains adapters", id)
+		}
+		delete(namespaces, id)
+	}
+
+	return nil
+}
+
+// namespace struct maps all vNIC's to the namespace ID used by the HNS. 
+type namespace struct { + id string + + m sync.Mutex + pid int + nics []*nicInNamespace +} + +// ID is the id of the network namespace +func (n *namespace) ID() string { + return n.id +} + +// AssignContainerPid assigns `pid` to `n` but does NOT move any previously +// assigned adapters into this namespace. The caller MUST call `Sync()` to +// complete this operation. +func (n *namespace) AssignContainerPid(ctx context.Context, pid int) (err error) { + _, span := trace.StartSpan(ctx, "namespace::AssignContainerPid") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("namespace", n.id), + trace.Int64Attribute("pid", int64(pid))) + + n.m.Lock() + defer n.m.Unlock() + + if n.pid != 0 { + return errors.Errorf("previously assigned container pid: %d", n.pid) + } + + n.pid = pid + return nil +} + +// Adapters returns a copy of the adapters assigned to `n` at the time of the +// call. +func (n *namespace) Adapters() []*prot.NetworkAdapterV2 { + n.m.Lock() + defer n.m.Unlock() + + adps := make([]*prot.NetworkAdapterV2, len(n.nics)) + for i, nin := range n.nics { + adps[i] = nin.adapter + } + return adps +} + +// AddAdapter adds `adp` to `n` but does NOT move the adapter into the network +// namespace assigned to `n`. A user must call `Sync()` to complete this +// operation. 
+func (n *namespace) AddAdapter(ctx context.Context, adp *prot.NetworkAdapterV2) (err error) { + ctx, span := trace.StartSpan(ctx, "namespace::AddAdapter") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("namespace", n.id), + trace.StringAttribute("adapter", fmt.Sprintf("%+v", adp))) + + n.m.Lock() + defer n.m.Unlock() + + for _, nic := range n.nics { + if strings.EqualFold(nic.adapter.ID, adp.ID) { + return errors.Errorf("adapter with id: '%s' already present in namespace", adp.ID) + } + } + + resolveCtx, cancel := context.WithTimeout(ctx, time.Second*5) + defer cancel() + ifname, err := networkInstanceIDToName(resolveCtx, adp.ID) + if err != nil { + return err + } + n.nics = append(n.nics, &nicInNamespace{ + adapter: adp, + ifname: ifname, + }) + return nil +} + +// RemoveAdapter removes the adapter matching `id` from `n`. If `id` is not +// found returns no error. +func (n *namespace) RemoveAdapter(ctx context.Context, id string) (err error) { + _, span := trace.StartSpan(ctx, "namespace::RemoveAdapter") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("namespace", n.id), + trace.StringAttribute("adapterID", id)) + + n.m.Lock() + defer n.m.Unlock() + + // TODO: do we need to remove anything guestside from a sandbox namespace? + + i := -1 + for j, nic := range n.nics { + if strings.EqualFold(nic.adapter.ID, id) { + i = j + break + } + } + if i > -1 { + n.nics = append(n.nics[:i], n.nics[i+1:]...) + } + return nil +} + +// Sync moves all adapters to the network namespace of `n` if assigned. 
+func (n *namespace) Sync(ctx context.Context) (err error) { + ctx, span := trace.StartSpan(ctx, "namespace::Sync") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("namespace", n.id)) + + n.m.Lock() + defer n.m.Unlock() + + if n.pid != 0 { + for i, a := range n.nics { + // Lower the metric for anything but the first adapter + // TODO: remove when we correctly support assigning metrics to the default GWs + if i > 0 { + a.adapter.EnableLowMetric = true + } + err = a.assignToPid(ctx, n.pid) + if err != nil { + return err + } + } + } + return nil +} + +// nicInNamespace represents a single network adapter that has been added to the +// guest and its mapping to the linux `ifname`. +type nicInNamespace struct { + // adapter captures the network settings when the nic was added + adapter *prot.NetworkAdapterV2 + // ifname is the interface name resolved for this adapter + ifname string + // assignedPid will be `0` for any nic in this namespace that has not been + // moved into a specific pid network namespace. + assignedPid int +} + +// assignToPid assigns `nin.adapter`, represented by `nin.ifname` to `pid`. 
+func (nin *nicInNamespace) assignToPid(ctx context.Context, pid int) (err error) { + ctx, span := trace.StartSpan(ctx, "nicInNamespace::assignToPid") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("adapterID", nin.adapter.ID), + trace.StringAttribute("ifname", nin.ifname), + trace.Int64Attribute("pid", int64(pid))) + + v1Adapter := &prot.NetworkAdapter{ + NatEnabled: nin.adapter.IPAddress != "", + AllocatedIPAddress: nin.adapter.IPAddress, + HostIPAddress: nin.adapter.GatewayAddress, + HostIPPrefixLength: nin.adapter.PrefixLength, + EnableLowMetric: nin.adapter.EnableLowMetric, + EncapOverhead: nin.adapter.EncapOverhead, + } + + if err := network.MoveInterfaceToNS(nin.ifname, pid); err != nil { + return errors.Wrapf(err, "failed to move interface %s to network namespace", nin.ifname) + } + + // Get a reference to the new network namespace + ns, err := netns.GetFromPid(pid) + if err != nil { + return errors.Wrapf(err, "netns.GetFromPid(%d) failed", pid) + } + defer ns.Close() + + netNSCfg := func() error { + return network.NetNSConfig(ctx, nin.ifname, pid, v1Adapter) + } + + if err := network.DoInNetNS(ns, netNSCfg); err != nil { + return errors.Wrapf(err, "failed to configure adapter aid: %s, if id: %s", nin.adapter.ID, nin.ifname) + } + nin.assignedPid = pid + return nil +} diff --git a/internal/guest/runtime/hcsv2/network_test.go b/internal/guest/runtime/hcsv2/network_test.go new file mode 100644 index 0000000000..90f24aadd2 --- /dev/null +++ b/internal/guest/runtime/hcsv2/network_test.go @@ -0,0 +1,119 @@ +// +build linux + +package hcsv2 + +import ( + "context" + "testing" + + "github.com/Microsoft/hcsshim/internal/guest/prot" +) + +func Test_getNetworkNamespace_NotExist(t *testing.T) { + defer func() { + err := removeNetworkNamespace(context.Background(), t.Name()) + if err != nil { + t.Errorf("failed to remove ns with error: %v", err) + } + }() + + ns, err := getNetworkNamespace(t.Name()) + 
if err == nil { + t.Fatal("expected error got nil") + } + if ns != nil { + t.Fatalf("namespace should be nil, got: %+v", ns) + } +} + +func Test_getNetworkNamespace_PreviousExist(t *testing.T) { + defer func() { + err := removeNetworkNamespace(context.Background(), t.Name()) + if err != nil { + t.Errorf("failed to remove ns with error: %v", err) + } + }() + + ns1 := getOrAddNetworkNamespace(t.Name()) + if ns1 == nil { + t.Fatal("namespace ns1 should not be nil") + } + ns2, err := getNetworkNamespace(t.Name()) + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } + if ns1 != ns2 { + t.Fatalf("ns1 %+v != ns2 %+v", ns1, ns2) + } +} + +func Test_getOrAddNetworkNamespace_NotExist(t *testing.T) { + defer func() { + err := removeNetworkNamespace(context.Background(), t.Name()) + if err != nil { + t.Errorf("failed to remove ns with error: %v", err) + } + }() + + ns := getOrAddNetworkNamespace(t.Name()) + if ns == nil { + t.Fatalf("namespace should not be nil") + } +} + +func Test_getOrAddNetworkNamespace_PreviousExist(t *testing.T) { + defer func() { + err := removeNetworkNamespace(context.Background(), t.Name()) + if err != nil { + t.Errorf("failed to remove ns with error: %v", err) + } + }() + + ns1 := getOrAddNetworkNamespace(t.Name()) + ns2 := getOrAddNetworkNamespace(t.Name()) + if ns1 != ns2 { + t.Fatalf("ns1 %+v != ns2 %+v", ns1, ns2) + } +} + +func Test_removeNetworkNamespace_NotExist(t *testing.T) { + err := removeNetworkNamespace(context.Background(), t.Name()) + if err != nil { + t.Fatalf("failed to remove non-existing ns with error: %v", err) + } +} + +func Test_removeNetworkNamespace_HasAdapters(t *testing.T) { + defer func() { + err := removeNetworkNamespace(context.Background(), t.Name()) + if err != nil { + t.Errorf("failed to remove ns with error: %v", err) + } + }() + nsOld := networkInstanceIDToName + defer func() { + networkInstanceIDToName = nsOld + }() + + ns := getOrAddNetworkNamespace(t.Name()) + + networkInstanceIDToName = func(ctx 
context.Context, id string) (string, error) {
+		return "/dev/sdz", nil
+	}
+	err := ns.AddAdapter(context.Background(), &prot.NetworkAdapterV2{ID: "test"})
+	if err != nil {
+		t.Fatalf("failed to add adapter: %v", err)
+	}
+	err = removeNetworkNamespace(context.Background(), t.Name())
+	if err == nil {
+		t.Fatal("should have failed to delete namespace with adapters")
+	}
+	err = ns.RemoveAdapter(context.Background(), "test")
+	if err != nil {
+		t.Fatalf("failed to remove adapter: %v", err)
+	}
+	err = removeNetworkNamespace(context.Background(), t.Name())
+	if err != nil {
+		t.Fatalf("should not have failed to delete empty namepace got: %v", err)
+	}
+}
diff --git a/internal/guest/runtime/hcsv2/nvidia_utils.go b/internal/guest/runtime/hcsv2/nvidia_utils.go
new file mode 100644
index 0000000000..09931368d2
--- /dev/null
+++ b/internal/guest/runtime/hcsv2/nvidia_utils.go
@@ -0,0 +1,105 @@
+// +build linux
+
+package hcsv2
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+
+	"github.com/Microsoft/hcsshim/cmd/gcstools/generichook"
+	"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
+	oci "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/pkg/errors"
+)
+
+// path that the shim mounts the nvidia gpu vhd to in the uvm
+// this MUST match the path mapped to in the shim
+const lcowNvidiaMountPath = "/run/nvidia"
+
+// annotation to find the gpu capabilities on the container spec
+// must match the hcsshim annotation string for gpu capabilities
+const annotationContainerGPUCapabilities = "io.microsoft.container.gpu.capabilities"
+const nvidiaDebugFilePath = "/nvidia-container.log"
+
+const nvidiaToolBinary = "nvidia-container-cli"
+
+// TODO katiewasnothere: prestart hooks will be deprecated, this needs to be moved to a createRuntime hook
+// described here: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
+// addNvidiaDevicePreHook builds the arguments for
nvidia-container-cli and creates the prestart hook +func addNvidiaDevicePreHook(ctx context.Context, spec *oci.Spec) error { + genericHookBinary := "generichook" + genericHookPath, err := exec.LookPath(genericHookBinary) + if err != nil { + return errors.Wrapf(err, "failed to find %s for container device support", genericHookBinary) + } + + debugOption := fmt.Sprintf("--debug=%s", nvidiaDebugFilePath) + + // TODO katiewasnothere: right now both host and container ldconfig do not work as expected for nvidia-container-cli + // ldconfig needs to be run in the container to setup the correct symlinks to the library files nvidia-container-cli + // maps into the container + args := []string{ + genericHookPath, + nvidiaToolBinary, + debugOption, + "--load-kmods", + "--no-pivot", + "configure", + "--ldconfig=@/sbin/ldconfig", + } + if capabilities, ok := spec.Annotations[annotationContainerGPUCapabilities]; ok { + caps := strings.Split(capabilities, ",") + for _, c := range caps { + args = append(args, fmt.Sprintf("--%s", c)) + } + } + + for _, d := range spec.Windows.Devices { + switch d.IDType { + case "gpu": + busLocation, err := pci.FindDeviceBusLocationFromVMBusGUID(ctx, d.ID) + if err != nil { + return errors.Wrapf(err, "failed to find nvidia gpu bus location") + } + args = append(args, fmt.Sprintf("--device=%s", busLocation)) + } + } + + // add template for pid argument to be injected later by the generic hook binary + args = append(args, "--no-cgroups", "--pid={{pid}}", spec.Root.Path) + + if spec.Hooks == nil { + spec.Hooks = &oci.Hooks{} + } + + hookLogDebugFileEnvOpt := fmt.Sprintf("%s=%s", generichook.LogDebugFileEnvKey, nvidiaDebugFilePath) + hookEnv := append(updateEnvWithNvidiaVariables(), hookLogDebugFileEnvOpt) + nvidiaHook := oci.Hook{ + Path: genericHookPath, + Args: args, + Env: hookEnv, + } + + spec.Hooks.Prestart = append(spec.Hooks.Prestart, nvidiaHook) + return nil +} + +// updateEnvWithNvidiaVariables creates an env with the nvidia gpu vhd in PATH 
and insecure mode set
+func updateEnvWithNvidiaVariables() []string {
+	pathPrefix := "PATH="
+	nvidiaBin := fmt.Sprintf("%s/bin", lcowNvidiaMountPath)
+	env := os.Environ()
+	for i, v := range env {
+		if strings.HasPrefix(v, pathPrefix) {
+			newPath := fmt.Sprintf("%s:%s", v, nvidiaBin)
+			env[i] = newPath
+		}
+	}
+	// NVC_INSECURE_MODE allows us to run nvidia-container-cli without seccomp
+	// we don't currently use seccomp in the uvm, so avoid using it here for now as well
+	env = append(env, "NVC_INSECURE_MODE=1")
+	return env
+}
diff --git a/internal/guest/runtime/hcsv2/process.go b/internal/guest/runtime/hcsv2/process.go
new file mode 100644
index 0000000000..c4343525c9
--- /dev/null
+++ b/internal/guest/runtime/hcsv2/process.go
@@ -0,0 +1,310 @@
+// +build linux
+
+package hcsv2
+
+import (
+	"context"
+	"fmt"
+	"os/exec"
+	"sync"
+	"syscall"
+
+	"github.com/Microsoft/hcsshim/internal/guest/gcserr"
+	"github.com/Microsoft/hcsshim/internal/guest/runtime"
+	"github.com/Microsoft/hcsshim/internal/guest/stdio"
+	"github.com/Microsoft/hcsshim/internal/log"
+	oci "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"go.opencensus.io/trace"
+)
+
+type Process interface {
+	// Kill sends `signal` to the process.
+	//
+	// If the process has already exited returns `gcserr.HrErrNotFound` by contract.
+	Kill(ctx context.Context, signal syscall.Signal) error
+	// Pid returns the process id of the process.
+	Pid() int
+	// ResizeConsole resizes the tty to `height`x`width` for the process.
+	ResizeConsole(ctx context.Context, height, width uint16) error
+	// Wait returns a channel that can be used to wait for the process to exit
+	// and gather the exit code. The second channel must be signaled from the
+	// caller when the caller has completed its use of this call to Wait.
+	Wait() (<-chan int, chan<- bool)
+}
+
+// containerProcess is a struct that defines the lifetime and operations associated with
+// an oci.Process.
+type containerProcess struct {
+	// c is the owning container
+	c    *Container
+	spec *oci.Process
+	// cid is the container id that owns this process.
+	cid string
+
+	process runtime.Process
+	pid     uint32
+	// init is `true` if this is the container process itself
+	init bool
+
+	// This is only valid post the exitWg
+	exitCode int
+	// exitWg is marked as done as soon as the underlying
+	// (runtime.Process).Wait() call returns, and exitCode has been updated.
+	exitWg sync.WaitGroup
+
+	// Used to allow addition/removal to the writersWg after an initial wait has
+	// already been issued. It is not safe to call Add/Done without holding this
+	// lock.
+	writersSyncRoot sync.Mutex
+	// Used to track the number of writers that need to finish
+	// before the process can be marked for cleanup.
+	writersWg sync.WaitGroup
+	// Used to track the 1st caller to the writersWg that successfully
+	// acknowledges it wrote the exit response.
+	writersCalled bool
+}
+
+// newProcess returns a containerProcess struct that has been initialized with
+// an outstanding wait for process exit, and post exit an outstanding wait for
+// process cleanup to release all resources once at least 1 waiter has
+// successfully written the exit response.
+func newProcess(c *Container, spec *oci.Process, process runtime.Process, pid uint32, init bool) *containerProcess { + p := &containerProcess{ + c: c, + spec: spec, + process: process, + init: init, + cid: c.id, + pid: pid, + } + p.exitWg.Add(1) + p.writersWg.Add(1) + go func() { + ctx, span := trace.StartSpan(context.Background(), "newProcess::waitBackground") + defer span.End() + span.AddAttributes( + trace.StringAttribute("cid", p.cid), + trace.Int64Attribute("pid", int64(p.pid))) + + // Wait for the process to exit + exitCode, err := p.process.Wait() + if err != nil { + log.G(ctx).WithError(err).Error("failed to wait for runc process") + } + p.exitCode = exitCode + log.G(ctx).WithField("exitCode", p.exitCode).Debug("process exited") + + // Free any process waiters + p.exitWg.Done() + + // Schedule the removal of this process object from the map once at + // least one waiter has read the result + go func() { + p.writersWg.Wait() + // cleanup the process state + if derr := p.process.Delete(); derr != nil { + log.G(ctx).WithFields(logrus.Fields{ + "cid": p.cid, + "pid": p.pid, + }).Debugf("process cleanup error: %s", derr) + } + c.processesMutex.Lock() + + _, span := trace.StartSpan(context.Background(), "newProcess::waitBackground::waitAllWaiters") + defer span.End() + span.AddAttributes( + trace.StringAttribute("cid", p.cid), + trace.Int64Attribute("pid", int64(p.pid))) + + delete(c.processes, p.pid) + c.processesMutex.Unlock() + }() + }() + return p +} + +// Kill sends 'signal' to the process. +// +// If the process has already exited returns `gcserr.HrErrNotFound` by contract. 
+func (p *containerProcess) Kill(ctx context.Context, signal syscall.Signal) error { + if err := syscall.Kill(int(p.pid), signal); err != nil { + if err == syscall.ESRCH { + return gcserr.NewHresultError(gcserr.HrErrNotFound) + } + return err + } + + if p.init { + p.c.setExitType(signal) + } + + return nil +} + +func (p *containerProcess) Pid() int { + return int(p.pid) +} + +// ResizeConsole resizes the tty to `height`x`width` for the process. +func (p *containerProcess) ResizeConsole(ctx context.Context, height, width uint16) error { + tty := p.process.Tty() + if tty == nil { + return fmt.Errorf("pid: %d, is not a tty and cannot be resized", p.pid) + } + return tty.ResizeConsole(height, width) +} + +// Wait returns a channel that can be used to wait for the process to exit and +// gather the exit code. The second channel must be signaled from the caller +// when the caller has completed its use of this call to Wait. +func (p *containerProcess) Wait() (<-chan int, chan<- bool) { + ctx, span := trace.StartSpan(context.Background(), "opengcs::containerProcess::Wait") + span.AddAttributes( + trace.StringAttribute("cid", p.cid), + trace.Int64Attribute("pid", int64(p.pid))) + + exitCodeChan := make(chan int, 1) + doneChan := make(chan bool) + + // Increment our waiters for this waiter + p.writersSyncRoot.Lock() + p.writersWg.Add(1) + p.writersSyncRoot.Unlock() + + go func() { + bgExitCodeChan := make(chan int, 1) + go func() { + p.exitWg.Wait() + bgExitCodeChan <- p.exitCode + }() + + // Wait for the exit code or the caller to stop waiting. + select { + case exitCode := <-bgExitCodeChan: + exitCodeChan <- exitCode + + // The caller got the exit code. Wait for them to tell us they have + // issued the write + select { + case <-doneChan: + p.writersSyncRoot.Lock() + // Decrement this waiter + log.G(ctx).Debug("wait completed, releasing wait count") + + p.writersWg.Done() + if !p.writersCalled { + // We have at least 1 response for the exit code for this + // process. 
Decrement the release waiter that will free the + // process resources when the writersWg hits 0 + log.G(ctx).Debug("first wait completed, releasing first wait count") + + p.writersCalled = true + p.writersWg.Done() + } + p.writersSyncRoot.Unlock() + span.End() + } + + case <-doneChan: + // In this case the caller timed out before the process exited. Just + // decrement the waiter but since no exit code we just deal with our + // waiter. + p.writersSyncRoot.Lock() + log.G(ctx).Debug("wait canceled before exit, releasing wait count") + + p.writersWg.Done() + p.writersSyncRoot.Unlock() + span.End() + } + }() + return exitCodeChan, doneChan +} + +func newExternalProcess(ctx context.Context, cmd *exec.Cmd, tty *stdio.TtyRelay, onRemove func(pid int)) (*externalProcess, error) { + ep := &externalProcess{ + cmd: cmd, + tty: tty, + waitBlock: make(chan struct{}), + remove: onRemove, + } + if err := cmd.Start(); err != nil { + return nil, errors.Wrap(err, "failed to call Start for external process") + } + if tty != nil { + tty.Start() + } + go func() { + cmd.Wait() + ep.exitCode = cmd.ProcessState.ExitCode() + log.G(ctx).WithFields(logrus.Fields{ + "pid": cmd.Process.Pid, + "exitCode": ep.exitCode, + }).Debug("external process exited") + if ep.tty != nil { + ep.tty.Wait() + } + close(ep.waitBlock) + }() + return ep, nil +} + +type externalProcess struct { + cmd *exec.Cmd + tty *stdio.TtyRelay + + waitBlock chan struct{} + exitCode int + + removeOnce sync.Once + remove func(pid int) +} + +func (ep *externalProcess) Kill(ctx context.Context, signal syscall.Signal) error { + if err := syscall.Kill(int(ep.cmd.Process.Pid), signal); err != nil { + if err == syscall.ESRCH { + return gcserr.NewHresultError(gcserr.HrErrNotFound) + } + return err + } + return nil +} + +func (ep *externalProcess) Pid() int { + return ep.cmd.Process.Pid +} + +func (ep *externalProcess) ResizeConsole(ctx context.Context, height, width uint16) error { + if ep.tty == nil { + return fmt.Errorf("pid: %d, 
is not a tty and cannot be resized", ep.cmd.Process.Pid) + } + return ep.tty.ResizeConsole(height, width) +} + +func (ep *externalProcess) Wait() (<-chan int, chan<- bool) { + _, span := trace.StartSpan(context.Background(), "opengcs::externalProcess::Wait") + span.AddAttributes(trace.Int64Attribute("pid", int64(ep.cmd.Process.Pid))) + + exitCodeChan := make(chan int, 1) + doneChan := make(chan bool) + + go func() { + defer close(exitCodeChan) + + // Wait for the exit code or the caller to stop waiting. + select { + case <-ep.waitBlock: + // Process exited send the exit code and wait for caller to close. + exitCodeChan <- ep.exitCode + <-doneChan + // At least one waiter was successful, remove this external process. + ep.removeOnce.Do(func() { + ep.remove(ep.cmd.Process.Pid) + }) + case <-doneChan: + // Caller closed early, do nothing. + } + }() + return exitCodeChan, doneChan +} diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go new file mode 100644 index 0000000000..887f823bd7 --- /dev/null +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -0,0 +1,113 @@ +// +build linux + +package hcsv2 + +import ( + "context" + "io/ioutil" + "os" + "path/filepath" + "strings" + + "github.com/Microsoft/hcsshim/internal/guest/network" + "github.com/Microsoft/hcsshim/internal/oc" + oci "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "go.opencensus.io/trace" +) + +func getSandboxRootDir(id string) string { + return filepath.Join("/run/gcs/c", id) +} + +func getSandboxMountsDir(id string) string { + return filepath.Join(getSandboxRootDir(id), "sandboxMounts") +} + +func getSandboxHostnamePath(id string) string { + return filepath.Join(getSandboxRootDir(id), "hostname") +} + +func getSandboxHostsPath(id string) string { + return filepath.Join(getSandboxRootDir(id), "hosts") +} + +func getSandboxResolvPath(id string) string { + return filepath.Join(getSandboxRootDir(id), 
"resolv.conf") +} + +func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) { + ctx, span := trace.StartSpan(ctx, "hcsv2::setupSandboxContainerSpec") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", id)) + + // Generate the sandbox root dir + rootDir := getSandboxRootDir(id) + if err := os.MkdirAll(rootDir, 0755); err != nil { + return errors.Wrapf(err, "failed to create sandbox root directory %q", rootDir) + } + + // Write the hostname + hostname := spec.Hostname + if hostname == "" { + var err error + hostname, err = os.Hostname() + if err != nil { + return errors.Wrap(err, "failed to get hostname") + } + } + + sandboxHostnamePath := getSandboxHostnamePath(id) + if err := ioutil.WriteFile(sandboxHostnamePath, []byte(hostname+"\n"), 0644); err != nil { + return errors.Wrapf(err, "failed to write hostname to %q", sandboxHostnamePath) + } + + // Write the hosts + sandboxHostsContent := network.GenerateEtcHostsContent(ctx, hostname) + sandboxHostsPath := getSandboxHostsPath(id) + if err := ioutil.WriteFile(sandboxHostsPath, []byte(sandboxHostsContent), 0644); err != nil { + return errors.Wrapf(err, "failed to write sandbox hosts to %q", sandboxHostsPath) + } + + // Write resolv.conf + ns, err := getNetworkNamespace(getNetworkNamespaceID(spec)) + if err != nil { + return err + } + var searches, servers []string + for _, n := range ns.Adapters() { + if len(n.DNSSuffix) > 0 { + searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ",")) + } + if len(n.DNSServerList) > 0 { + servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ",")) + } + } + resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil) + if err != nil { + return errors.Wrap(err, "failed to generate sandbox resolv.conf content") + } + sandboxResolvPath := getSandboxResolvPath(id) + if err := ioutil.WriteFile(sandboxResolvPath, 
[]byte(resolvContent), 0644); err != nil {
+		return errors.Wrap(err, "failed to write sandbox resolv.conf")
+	}
+
+	if userstr, ok := spec.Annotations["io.microsoft.lcow.userstr"]; ok {
+		if err := setUserStr(spec, userstr); err != nil {
+			return err
+		}
+	}
+
+	// TODO: JTERRY75 /dev/shm is not properly set up for LCOW I believe. CRI
+	// also has a concept of a sandbox/shm file when the IPC NamespaceMode !=
+	// NODE.
+
+	// Force the parent cgroup into our /containers root
+	spec.Linux.CgroupsPath = "/containers/" + id
+
+	// Clear the windows section as we don't want to forward to runc
+	spec.Windows = nil
+
+	return nil
+}
diff --git a/internal/guest/runtime/hcsv2/spec.go b/internal/guest/runtime/hcsv2/spec.go
new file mode 100644
index 0000000000..d69793fda0
--- /dev/null
+++ b/internal/guest/runtime/hcsv2/spec.go
@@ -0,0 +1,154 @@
+// +build linux
+
+package hcsv2
+
+import (
+	"fmt"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+	oci "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/pkg/errors"
+)
+
+// getNetworkNamespaceID returns the `ToLower` of
+// `spec.Windows.Network.NetworkNamespace` or `""`.
+func getNetworkNamespaceID(spec *oci.Spec) string {
+	if spec.Windows != nil &&
+		spec.Windows.Network != nil {
+		return strings.ToLower(spec.Windows.Network.NetworkNamespace)
+	}
+	return ""
+}
+
+// isRootReadonly returns `true` if the spec specifies the rootfs is readonly.
+func isRootReadonly(spec *oci.Spec) bool {
+	if spec.Root != nil {
+		return spec.Root.Readonly
+	}
+	return false
+}
+
+// isInMounts returns `true` if `target` matches a `Destination` in any of
+// `mounts`.
+func isInMounts(target string, mounts []oci.Mount) bool { + for _, m := range mounts { + if m.Destination == target { + return true + } + } + return false +} + +func setProcess(spec *oci.Spec) { + if spec.Process == nil { + spec.Process = &oci.Process{} + } +} + +// setUserStr sets `spec.Process` to the valid `userstr` based on the OCI Image Spec +// v1.0.0 `userstr`. +// +// Valid values are: user, uid, user:group, uid:gid, uid:group, user:gid. +// If uid is provided instead of the username then that value is not checked against the +// /etc/passwd file to verify if the user with given uid actually exists. +func setUserStr(spec *oci.Spec, userstr string) error { + setProcess(spec) + + parts := strings.Split(userstr, ":") + switch len(parts) { + case 1: + v, err := strconv.Atoi(parts[0]) + if err != nil { + // evaluate username to uid/gid + return setUsername(spec, userstr) + } + return setUserID(spec, int(v)) + case 2: + var ( + username, groupname string + uid, gid int + ) + v, err := strconv.Atoi(parts[0]) + if err != nil { + username = parts[0] + } else { + uid = int(v) + } + v, err = strconv.Atoi(parts[1]) + if err != nil { + groupname = parts[1] + } else { + gid = int(v) + } + if username != "" { + u, err := getUser(spec, func(u user.User) bool { + return u.Name == username + }) + if err != nil { + return errors.Wrapf(err, "failed to find user by username: %s", username) + } + uid = u.Uid + } + if groupname != "" { + g, err := getGroup(spec, func(g user.Group) bool { + return g.Name == groupname + }) + if err != nil { + return errors.Wrapf(err, "failed to find group by groupname: %s", groupname) + } + gid = g.Gid + } + spec.Process.User.UID, spec.Process.User.GID = uint32(uid), uint32(gid) + return nil + default: + return fmt.Errorf("invalid userstr: '%s'", userstr) + } +} + +func setUsername(spec *oci.Spec, username string) error { + u, err := getUser(spec, func(u user.User) bool { + return u.Name == username + }) + if err != nil { + return 
errors.Wrapf(err, "failed to find user by username: %s", username) + } + spec.Process.User.UID, spec.Process.User.GID = uint32(u.Uid), uint32(u.Gid) + return nil +} + +func setUserID(spec *oci.Spec, uid int) error { + u, err := getUser(spec, func(u user.User) bool { + return u.Uid == uid + }) + if err != nil { + spec.Process.User.UID, spec.Process.User.GID = uint32(uid), 0 + return nil + } + spec.Process.User.UID, spec.Process.User.GID = uint32(u.Uid), uint32(u.Gid) + return nil +} + +func getUser(spec *oci.Spec, filter func(user.User) bool) (user.User, error) { + users, err := user.ParsePasswdFileFilter(filepath.Join(spec.Root.Path, "/etc/passwd"), filter) + if err != nil { + return user.User{}, err + } + if len(users) != 1 { + return user.User{}, errors.Errorf("expected exactly 1 user matched '%d'", len(users)) + } + return users[0], nil +} + +func getGroup(spec *oci.Spec, filter func(user.Group) bool) (user.Group, error) { + groups, err := user.ParseGroupFileFilter(filepath.Join(spec.Root.Path, "/etc/group"), filter) + if err != nil { + return user.Group{}, err + } + if len(groups) != 1 { + return user.Group{}, errors.Errorf("expected exactly 1 group matched '%d'", len(groups)) + } + return groups[0], nil +} diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go new file mode 100644 index 0000000000..542fabe756 --- /dev/null +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -0,0 +1,135 @@ +// +build linux + +package hcsv2 + +import ( + "context" + "io/ioutil" + "os" + "path/filepath" + "strings" + + "github.com/Microsoft/hcsshim/internal/guest/network" + "github.com/Microsoft/hcsshim/internal/oc" + oci "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "go.opencensus.io/trace" +) + +func getStandaloneRootDir(id string) string { + return filepath.Join("/run/gcs/c", id) +} + +func getStandaloneHostnamePath(id string) string { + return 
filepath.Join(getStandaloneRootDir(id), "hostname") +} + +func getStandaloneHostsPath(id string) string { + return filepath.Join(getStandaloneRootDir(id), "hosts") +} + +func getStandaloneResolvPath(id string) string { + return filepath.Join(getStandaloneRootDir(id), "resolv.conf") +} + +func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) { + ctx, span := trace.StartSpan(ctx, "hcsv2::setupStandaloneContainerSpec") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", id)) + + // Generate the standalone root dir + rootDir := getStandaloneRootDir(id) + if err := os.MkdirAll(rootDir, 0755); err != nil { + return errors.Wrapf(err, "failed to create container root directory %q", rootDir) + } + + hostname := spec.Hostname + if hostname == "" { + var err error + hostname, err = os.Hostname() + if err != nil { + return errors.Wrap(err, "failed to get hostname") + } + } + + // Write the hostname + if !isInMounts("/etc/hostname", spec.Mounts) { + standaloneHostnamePath := getStandaloneHostnamePath(id) + if err := ioutil.WriteFile(standaloneHostnamePath, []byte(hostname+"\n"), 0644); err != nil { + return errors.Wrapf(err, "failed to write hostname to %q", standaloneHostnamePath) + } + + mt := oci.Mount{ + Destination: "/etc/hostname", + Type: "bind", + Source: getStandaloneHostnamePath(id), + Options: []string{"bind"}, + } + if isRootReadonly(spec) { + mt.Options = append(mt.Options, "ro") + } + spec.Mounts = append(spec.Mounts, mt) + } + + // Write the hosts + if !isInMounts("/etc/hosts", spec.Mounts) { + standaloneHostsContent := network.GenerateEtcHostsContent(ctx, hostname) + standaloneHostsPath := getStandaloneHostsPath(id) + if err := ioutil.WriteFile(standaloneHostsPath, []byte(standaloneHostsContent), 0644); err != nil { + return errors.Wrapf(err, "failed to write standalone hosts to %q", standaloneHostsPath) + } + + mt := oci.Mount{ + Destination: "/etc/hosts", 
+			Type:        "bind",
+			Source:      getStandaloneHostsPath(id),
+			Options:     []string{"bind"},
+		}
+		if isRootReadonly(spec) {
+			mt.Options = append(mt.Options, "ro")
+		}
+		spec.Mounts = append(spec.Mounts, mt)
+	}
+
+	// Write resolv.conf
+	if !isInMounts("/etc/resolv.conf", spec.Mounts) {
+		ns := getOrAddNetworkNamespace(getNetworkNamespaceID(spec))
+		var searches, servers []string
+		for _, n := range ns.Adapters() {
+			if len(n.DNSSuffix) > 0 {
+				searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ","))
+			}
+			if len(n.DNSServerList) > 0 {
+				servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ","))
+			}
+		}
+		resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil)
+		if err != nil {
+			return errors.Wrap(err, "failed to generate standalone resolv.conf content")
+		}
+		standaloneResolvPath := getStandaloneResolvPath(id)
+		if err := ioutil.WriteFile(standaloneResolvPath, []byte(resolvContent), 0644); err != nil {
+			return errors.Wrap(err, "failed to write standalone resolv.conf")
+		}
+
+		mt := oci.Mount{
+			Destination: "/etc/resolv.conf",
+			Type:        "bind",
+			Source:      getStandaloneResolvPath(id),
+			Options:     []string{"bind"},
+		}
+		if isRootReadonly(spec) {
+			mt.Options = append(mt.Options, "ro")
+		}
+		spec.Mounts = append(spec.Mounts, mt)
+	}
+
+	// Force the parent cgroup into our /containers root
+	spec.Linux.CgroupsPath = "/containers/" + id
+
+	// Clear the windows section as we don't want to forward to runc
+	spec.Windows = nil
+
+	return nil
+}
diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go
new file mode 100644
index 0000000000..1bd360ef82
--- /dev/null
+++ b/internal/guest/runtime/hcsv2/uvm.go
@@ -0,0 +1,475 @@
+// +build linux
+
+package hcsv2
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path"
+	"path/filepath"
+	"sync"
+	"syscall"
+	"time"
+
+	"github.com/Microsoft/hcsshim/internal/guest/gcserr"
+	
"github.com/Microsoft/hcsshim/internal/guest/prot"
+	"github.com/Microsoft/hcsshim/internal/guest/runtime"
+	"github.com/Microsoft/hcsshim/internal/guest/stdio"
+	"github.com/Microsoft/hcsshim/internal/guest/storage"
+	"github.com/Microsoft/hcsshim/internal/guest/storage/overlay"
+	"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
+	"github.com/Microsoft/hcsshim/internal/guest/storage/plan9"
+	"github.com/Microsoft/hcsshim/internal/guest/storage/pmem"
+	"github.com/Microsoft/hcsshim/internal/guest/storage/scsi"
+	"github.com/Microsoft/hcsshim/internal/guest/transport"
+	shellwords "github.com/mattn/go-shellwords"
+	"github.com/pkg/errors"
+)
+
+// UVMContainerID is the ContainerID that will be sent on any prot.MessageBase
+// for V2 where the specific message is targeted at the UVM itself.
+const UVMContainerID = "00000000-0000-0000-0000-000000000000"
+
+// Host is the structure tracking all UVM host state including all containers
+// and processes.
+type Host struct {
+	containersMutex sync.Mutex
+	containers      map[string]*Container
+
+	externalProcessesMutex sync.Mutex
+	externalProcesses      map[int]*externalProcess
+
+	// rtime is the Runtime interface used by the GCS core.
+ rtime runtime.Runtime + vsock transport.Transport +} + +func NewHost(rtime runtime.Runtime, vsock transport.Transport) *Host { + return &Host{ + containers: make(map[string]*Container), + externalProcesses: make(map[int]*externalProcess), + rtime: rtime, + vsock: vsock, + } +} + +func (h *Host) RemoveContainer(id string) { + h.containersMutex.Lock() + defer h.containersMutex.Unlock() + + delete(h.containers, id) +} + +func (h *Host) getContainerLocked(id string) (*Container, error) { + if c, ok := h.containers[id]; !ok { + return nil, gcserr.NewHresultError(gcserr.HrVmcomputeSystemNotFound) + } else { + return c, nil + } +} + +func (h *Host) GetContainer(id string) (*Container, error) { + h.containersMutex.Lock() + defer h.containersMutex.Unlock() + + return h.getContainerLocked(id) +} + +func setupSandboxMountsPath(id string) error { + mountPath := getSandboxMountsDir(id) + if err := os.MkdirAll(mountPath, 0755); err != nil { + return errors.Wrapf(err, "failed to create sandboxMounts dir in sandbox %v", id) + } + + return storage.MountRShared(mountPath) +} + +func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { + h.containersMutex.Lock() + defer h.containersMutex.Unlock() + + if _, ok := h.containers[id]; ok { + return nil, gcserr.NewHresultError(gcserr.HrVmcomputeSystemAlreadyExists) + } + + var namespaceID string + criType, isCRI := settings.OCISpecification.Annotations["io.kubernetes.cri.container-type"] + if isCRI { + switch criType { + case "sandbox": + // Capture namespaceID if any because setupSandboxContainerSpec clears the Windows section. 
+ namespaceID = getNetworkNamespaceID(settings.OCISpecification) + err = setupSandboxContainerSpec(ctx, id, settings.OCISpecification) + defer func() { + if err != nil { + defer os.RemoveAll(getSandboxRootDir(id)) + } + }() + err = setupSandboxMountsPath(id) + case "container": + sid, ok := settings.OCISpecification.Annotations["io.kubernetes.cri.sandbox-id"] + if !ok || sid == "" { + return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid) + } + err = setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification) + defer func() { + if err != nil { + defer os.RemoveAll(getWorkloadRootDir(id)) + } + }() + default: + err = errors.Errorf("unsupported 'io.kubernetes.cri.container-type': '%s'", criType) + } + } else { + // Capture namespaceID if any because setupStandaloneContainerSpec clears the Windows section. + namespaceID = getNetworkNamespaceID(settings.OCISpecification) + err = setupStandaloneContainerSpec(ctx, id, settings.OCISpecification) + defer func() { + if err != nil { + os.RemoveAll(getStandaloneRootDir(id)) + } + }() + } + if err != nil { + return nil, err + } + + // Create the BundlePath + if err := os.MkdirAll(settings.OCIBundlePath, 0700); err != nil { + return nil, errors.Wrapf(err, "failed to create OCIBundlePath: '%s'", settings.OCIBundlePath) + } + configFile := path.Join(settings.OCIBundlePath, "config.json") + f, err := os.Create(configFile) + if err != nil { + return nil, errors.Wrapf(err, "failed to create config.json at: '%s'", configFile) + } + defer f.Close() + writer := bufio.NewWriter(f) + if err := json.NewEncoder(writer).Encode(settings.OCISpecification); err != nil { + return nil, errors.Wrapf(err, "failed to write OCISpecification to config.json at: '%s'", configFile) + } + if err := writer.Flush(); err != nil { + return nil, errors.Wrapf(err, "failed to flush writer for config.json at: '%s'", configFile) + } + + con, err := h.rtime.CreateContainer(id, settings.OCIBundlePath, nil) + if err != nil { + 
return nil, errors.Wrapf(err, "failed to create container") + } + + c := &Container{ + id: id, + vsock: h.vsock, + spec: settings.OCISpecification, + isSandbox: criType == "sandbox", + container: con, + exitType: prot.NtUnexpectedExit, + processes: make(map[uint32]*containerProcess), + } + c.initProcess = newProcess(c, settings.OCISpecification.Process, con.(runtime.Process), uint32(c.container.Pid()), true) + + // Sandbox or standalone, move the networks to the container namespace + if criType == "sandbox" || !isCRI { + ns, err := getNetworkNamespace(namespaceID) + if isCRI && err != nil { + return nil, err + } + // standalone is not required to have a networking namespace setup + if ns != nil { + if err := ns.AssignContainerPid(ctx, c.container.Pid()); err != nil { + return nil, err + } + if err := ns.Sync(ctx); err != nil { + return nil, err + } + } + } + + h.containers[id] = c + return c, nil +} + +func (h *Host) modifyHostSettings(ctx context.Context, containerID string, settings *prot.ModifySettingRequest) error { + switch settings.ResourceType { + case prot.MrtMappedVirtualDisk: + return modifyMappedVirtualDisk(ctx, settings.RequestType, settings.Settings.(*prot.MappedVirtualDiskV2)) + case prot.MrtMappedDirectory: + return modifyMappedDirectory(ctx, h.vsock, settings.RequestType, settings.Settings.(*prot.MappedDirectoryV2)) + case prot.MrtVPMemDevice: + return modifyMappedVPMemDevice(ctx, settings.RequestType, settings.Settings.(*prot.MappedVPMemDeviceV2)) + case prot.MrtCombinedLayers: + return modifyCombinedLayers(ctx, settings.RequestType, settings.Settings.(*prot.CombinedLayersV2)) + case prot.MrtNetwork: + return modifyNetwork(ctx, settings.RequestType, settings.Settings.(*prot.NetworkAdapterV2)) + case prot.MrtVPCIDevice: + return modifyMappedVPCIDevice(ctx, settings.RequestType, settings.Settings.(*prot.MappedVPCIDeviceV2)) + case prot.MrtContainerConstraints: + c, err := h.GetContainer(containerID) + if err != nil { + return err + } + return 
c.modifyContainerConstraints(ctx, settings.RequestType, settings.Settings.(*prot.ContainerConstraintsV2)) + default: + return errors.Errorf("the ResourceType \"%s\" is not supported for UVM", settings.ResourceType) + } +} + +func (h *Host) modifyContainerSettings(ctx context.Context, containerID string, settings *prot.ModifySettingRequest) error { + c, err := h.GetContainer(containerID) + if err != nil { + return err + } + + switch settings.ResourceType { + case prot.MrtContainerConstraints: + return c.modifyContainerConstraints(ctx, settings.RequestType, settings.Settings.(*prot.ContainerConstraintsV2)) + default: + return errors.Errorf("the ResourceType \"%s\" is not supported for containers", settings.ResourceType) + } +} + +func (h *Host) ModifySettings(ctx context.Context, containerID string, settings *prot.ModifySettingRequest) error { + if containerID == UVMContainerID { + return h.modifyHostSettings(ctx, containerID, settings) + } + return h.modifyContainerSettings(ctx, containerID, settings) +} + +// Shutdown terminates this UVM. This is a destructive call and will destroy all +// state that has not been cleaned before calling this function. +func (h *Host) Shutdown() { + syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) +} + +// RunExternalProcess runs a process in the utility VM. +func (h *Host) RunExternalProcess(ctx context.Context, params prot.ProcessParameters, conSettings stdio.ConnectionSettings) (_ int, err error) { + var stdioSet *stdio.ConnectionSet + stdioSet, err = stdio.Connect(h.vsock, conSettings) + if err != nil { + return -1, err + } + defer func() { + if err != nil { + stdioSet.Close() + } + }() + + args := params.CommandArgs + if len(args) == 0 { + args, err = processParamCommandLineToOCIArgs(params.CommandLine) + if err != nil { + return -1, err + } + } + + cmd := exec.Command(args[0], args[1:]...) 
+ cmd.Dir = params.WorkingDirectory + cmd.Env = processParamEnvToOCIEnv(params.Environment) + + var relay *stdio.TtyRelay + if params.EmulateConsole { + // Allocate a console for the process. + var ( + master *os.File + consolePath string + ) + master, consolePath, err = stdio.NewConsole() + if err != nil { + return -1, errors.Wrap(err, "failed to create console for external process") + } + defer func() { + if err != nil { + master.Close() + } + }() + + var console *os.File + console, err = os.OpenFile(consolePath, os.O_RDWR|syscall.O_NOCTTY, 0777) + if err != nil { + return -1, errors.Wrap(err, "failed to open console file for external process") + } + defer console.Close() + + relay = stdio.NewTtyRelay(stdioSet, master) + cmd.Stdin = console + cmd.Stdout = console + cmd.Stderr = console + // Make the child process a session leader and adopt the pty as + // the controlling terminal. + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setsid: true, + Setctty: true, + Ctty: syscall.Stdin, + } + } else { + var fileSet *stdio.FileSet + fileSet, err = stdioSet.Files() + if err != nil { + return -1, errors.Wrap(err, "failed to set cmd stdio") + } + defer fileSet.Close() + defer stdioSet.Close() + cmd.Stdin = fileSet.In + cmd.Stdout = fileSet.Out + cmd.Stderr = fileSet.Err + } + + onRemove := func(pid int) { + h.externalProcessesMutex.Lock() + delete(h.externalProcesses, pid) + h.externalProcessesMutex.Unlock() + } + p, err := newExternalProcess(ctx, cmd, relay, onRemove) + if err != nil { + return -1, err + } + + h.externalProcessesMutex.Lock() + h.externalProcesses[p.Pid()] = p + h.externalProcessesMutex.Unlock() + return p.Pid(), nil +} + +func (h *Host) GetExternalProcess(pid int) (Process, error) { + h.externalProcessesMutex.Lock() + defer h.externalProcessesMutex.Unlock() + + p, ok := h.externalProcesses[pid] + if !ok { + return nil, gcserr.NewHresultError(gcserr.HrErrNotFound) + } + return p, nil +} + +func newInvalidRequestTypeError(rt prot.ModifyRequestType) error { + 
return errors.Errorf("the RequestType \"%s\" is not supported", rt) +} + +func modifyMappedVirtualDisk(ctx context.Context, rt prot.ModifyRequestType, mvd *prot.MappedVirtualDiskV2) (err error) { + switch rt { + case prot.MreqtAdd: + mountCtx, cancel := context.WithTimeout(ctx, time.Second*4) + defer cancel() + if mvd.MountPath != "" { + return scsi.Mount(mountCtx, mvd.Controller, mvd.Lun, mvd.MountPath, mvd.ReadOnly, mvd.Options) + } + return nil + case prot.MreqtRemove: + if mvd.MountPath != "" { + if err := storage.UnmountPath(ctx, mvd.MountPath, true); err != nil { + return err + } + } + return scsi.UnplugDevice(ctx, mvd.Controller, mvd.Lun) + default: + return newInvalidRequestTypeError(rt) + } +} + +func modifyMappedDirectory(ctx context.Context, vsock transport.Transport, rt prot.ModifyRequestType, md *prot.MappedDirectoryV2) (err error) { + switch rt { + case prot.MreqtAdd: + return plan9.Mount(ctx, vsock, md.MountPath, md.ShareName, md.Port, md.ReadOnly) + case prot.MreqtRemove: + return storage.UnmountPath(ctx, md.MountPath, true) + default: + return newInvalidRequestTypeError(rt) + } +} + +func modifyMappedVPMemDevice(ctx context.Context, rt prot.ModifyRequestType, vpd *prot.MappedVPMemDeviceV2) (err error) { + switch rt { + case prot.MreqtAdd: + return pmem.Mount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, vpd.VerityInfo) + case prot.MreqtRemove: + return pmem.Unmount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, vpd.VerityInfo) + default: + return newInvalidRequestTypeError(rt) + } +} + +func modifyMappedVPCIDevice(ctx context.Context, rt prot.ModifyRequestType, vpciDev *prot.MappedVPCIDeviceV2) error { + switch rt { + case prot.MreqtAdd: + return pci.WaitForPCIDeviceFromVMBusGUID(ctx, vpciDev.VMBusGUID) + default: + return newInvalidRequestTypeError(rt) + } +} + +func modifyCombinedLayers(ctx context.Context, rt prot.ModifyRequestType, cl *prot.CombinedLayersV2) (err error) { + switch rt { + case prot.MreqtAdd: + layerPaths := 
make([]string, len(cl.Layers)) + for i, layer := range cl.Layers { + layerPaths[i] = layer.Path + } + + var upperdirPath string + var workdirPath string + readonly := false + if cl.ScratchPath == "" { + // The user did not pass a scratch path. Mount overlay as readonly. + readonly = true + } else { + upperdirPath = filepath.Join(cl.ScratchPath, "upper") + workdirPath = filepath.Join(cl.ScratchPath, "work") + } + + return overlay.Mount(ctx, layerPaths, upperdirPath, workdirPath, cl.ContainerRootPath, readonly) + case prot.MreqtRemove: + return storage.UnmountPath(ctx, cl.ContainerRootPath, true) + default: + return newInvalidRequestTypeError(rt) + } +} + +func modifyNetwork(ctx context.Context, rt prot.ModifyRequestType, na *prot.NetworkAdapterV2) (err error) { + switch rt { + case prot.MreqtAdd: + ns := getOrAddNetworkNamespace(na.NamespaceID) + if err := ns.AddAdapter(ctx, na); err != nil { + return err + } + // This code doesnt know if the namespace was already added to the + // container or not so it must always call `Sync`. + return ns.Sync(ctx) + case prot.MreqtRemove: + ns := getOrAddNetworkNamespace(na.ID) + if err := ns.RemoveAdapter(ctx, na.ID); err != nil { + return err + } + return nil + default: + return newInvalidRequestTypeError(rt) + } +} + +// processParamCommandLineToOCIArgs converts a CommandLine field from +// ProcessParameters (a space separate argument string) into an array of string +// arguments which can be used by an oci.Process. +func processParamCommandLineToOCIArgs(commandLine string) ([]string, error) { + args, err := shellwords.Parse(commandLine) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse command line string \"%s\"", commandLine) + } + return args, nil +} + +// processParamEnvToOCIEnv converts an Environment field from ProcessParameters +// (a map from environment variable to value) into an array of environment +// variable assignments (where each is in the form "=") which +// can be used by an oci.Process. 
+func processParamEnvToOCIEnv(environment map[string]string) []string { + environmentList := make([]string, 0, len(environment)) + for k, v := range environment { + // TODO: Do we need to escape things like quotation marks in + // environment variable values? + environmentList = append(environmentList, fmt.Sprintf("%s=%s", k, v)) + } + return environmentList +} diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go new file mode 100644 index 0000000000..1aed4983c2 --- /dev/null +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -0,0 +1,234 @@ +// +build linux + +package hcsv2 + +import ( + "context" + "os" + "path/filepath" + "strings" + + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + oci "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "go.opencensus.io/trace" +) + +func getWorkloadRootDir(id string) string { + return filepath.Join("/run/gcs/c", id) +} + +func updateSandboxMounts(sbid string, spec *oci.Spec) error { + sandboxMountPrefix := "sandbox://" + for i, m := range spec.Mounts { + if strings.HasPrefix(m.Source, sandboxMountPrefix) { + mountsDir := getSandboxMountsDir(sbid) + subPath := strings.TrimPrefix(m.Source, sandboxMountPrefix) + sandboxSource := filepath.Join(mountsDir, subPath) + + if !strings.HasPrefix(sandboxSource, mountsDir) { + return errors.Errorf("mount path %v for mount %v is not within sandbox's mounts dir", sandboxSource, m.Source) + } + + spec.Mounts[i].Source = sandboxSource + + _, err := os.Stat(sandboxSource) + if os.IsNotExist(err) { + if err := os.MkdirAll(sandboxSource, 0755); err != nil { + return err + } + } + } + } + return nil +} + +func specHasGPUDevice(spec *oci.Spec) bool { + for _, d := range spec.Windows.Devices { + if d.IDType == "gpu" { + return true + } + } + return 
false +} + +func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.Spec) (err error) { + ctx, span := trace.StartSpan(ctx, "hcsv2::setupWorkloadContainerSpec") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("sandboxID", sbid), + trace.StringAttribute("cid", id)) + + // Verify no hostname + if spec.Hostname != "" { + return errors.Errorf("workload container must not change hostname: %s", spec.Hostname) + } + + // update any sandbox mounts with the sandboxMounts directory path and create files + if err = updateSandboxMounts(sbid, spec); err != nil { + return errors.Wrapf(err, "failed to update sandbox mounts for container %v in sandbox %v", id, sbid) + } + + // Add /etc/hostname if the spec did not override it. + if !isInMounts("/etc/hostname", spec.Mounts) { + mt := oci.Mount{ + Destination: "/etc/hostname", + Type: "bind", + Source: getSandboxHostnamePath(sbid), + Options: []string{"bind"}, + } + if isRootReadonly(spec) { + mt.Options = append(mt.Options, "ro") + } + spec.Mounts = append(spec.Mounts, mt) + } + + // Add /etc/hosts if the spec did not override it. + if !isInMounts("/etc/hosts", spec.Mounts) { + mt := oci.Mount{ + Destination: "/etc/hosts", + Type: "bind", + Source: getSandboxHostsPath(sbid), + Options: []string{"bind"}, + } + if isRootReadonly(spec) { + mt.Options = append(mt.Options, "ro") + } + spec.Mounts = append(spec.Mounts, mt) + } + + // Add /etc/resolv.conf if the spec did not override it. + if !isInMounts("/etc/resolv.conf", spec.Mounts) { + mt := oci.Mount{ + Destination: "/etc/resolv.conf", + Type: "bind", + Source: getSandboxResolvPath(sbid), + Options: []string{"bind"}, + } + if isRootReadonly(spec) { + mt.Options = append(mt.Options, "ro") + } + spec.Mounts = append(spec.Mounts, mt) + } + + // TODO: JTERRY75 /dev/shm is not properly setup for LCOW I believe. 
CRI + // also has a concept of a sandbox/shm file when the IPC NamespaceMode != + // NODE. + + // Check if we need to do any capability/device mappings + if spec.Annotations["io.microsoft.virtualmachine.lcow.privileged"] == "true" { + log.G(ctx).Debug("'io.microsoft.virtualmachine.lcow.privileged' set for privileged container") + + // Add all host devices + hostDevices, err := devices.HostDevices() + if err != nil { + return err + } + for _, hostDevice := range hostDevices { + addLinuxDeviceToSpec(ctx, hostDevice, spec, false) + } + + // Set the cgroup access + spec.Linux.Resources.Devices = []oci.LinuxDeviceCgroup{ + { + Allow: true, + Access: "rwm", + }, + } + } else { + tempLinuxDevices := spec.Linux.Devices + spec.Linux.Devices = []oci.LinuxDevice{} + for _, ld := range tempLinuxDevices { + hostDevice, err := devices.DeviceFromPath(ld.Path, "rwm") + if err != nil { + return err + } + addLinuxDeviceToSpec(ctx, hostDevice, spec, true) + } + } + + if userstr, ok := spec.Annotations["io.microsoft.lcow.userstr"]; ok { + if err := setUserStr(spec, userstr); err != nil { + return err + } + } + + // Force the parent cgroup into our /containers root + spec.Linux.CgroupsPath = "/containers/" + id + + if spec.Windows != nil && specHasGPUDevice(spec) { + // we only support Nvidia gpus right now + ldConfigargs := []string{"-l", "/run/nvidia/lib"} + env := updateEnvWithNvidiaVariables() + if err := addLDConfigHook(ctx, spec, ldConfigargs, env); err != nil { + return err + } + if err := addNvidiaDevicePreHook(ctx, spec); err != nil { + return err + } + } + + // Clear the windows section as we dont want to forward to runc + spec.Windows = nil + + return nil +} + +// Helper function to create an oci prestart hook to run ldconfig +func addLDConfigHook(ctx context.Context, spec *oci.Spec, args, env []string) error { + if spec.Hooks == nil { + spec.Hooks = &oci.Hooks{} + } + + ldConfigHook := oci.Hook{ + Path: "/sbin/ldconfig", + Args: args, + Env: env, + } + + spec.Hooks.Prestart 
= append(spec.Hooks.Prestart, ldConfigHook) + return nil +} + +func addLinuxDeviceToSpec(ctx context.Context, hostDevice *configs.Device, spec *oci.Spec, addCgroupDevice bool) { + rd := oci.LinuxDevice{ + Path: hostDevice.Path, + Type: string(hostDevice.Type), + Major: hostDevice.Major, + Minor: hostDevice.Minor, + UID: &hostDevice.Uid, + GID: &hostDevice.Gid, + } + if hostDevice.Major == 0 && hostDevice.Minor == 0 { + // Invalid device, most likely a symbolic link, skip it. + return + } + found := false + for i, dev := range spec.Linux.Devices { + if dev.Path == rd.Path { + found = true + spec.Linux.Devices[i] = rd + break + } + if dev.Type == rd.Type && dev.Major == rd.Major && dev.Minor == rd.Minor { + log.G(ctx).Warnf("The same type '%s', major '%d' and minor '%d', should not be used for multiple devices.", dev.Type, dev.Major, dev.Minor) + } + } + if !found { + spec.Linux.Devices = append(spec.Linux.Devices, rd) + if addCgroupDevice { + deviceCgroup := oci.LinuxDeviceCgroup{ + Allow: true, + Type: string(hostDevice.Type), + Major: &hostDevice.Major, + Minor: &hostDevice.Minor, + Access: string(hostDevice.Permissions), + } + spec.Linux.Resources.Devices = append(spec.Linux.Resources.Devices, deviceCgroup) + } + } +} diff --git a/internal/guest/runtime/runc/ioutils.go b/internal/guest/runtime/runc/ioutils.go new file mode 100644 index 0000000000..9a00cb1af8 --- /dev/null +++ b/internal/guest/runtime/runc/ioutils.go @@ -0,0 +1,104 @@ +// +build linux + +package runc + +import ( + "net" + "os" + "path/filepath" + + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// createConsoleSocket creates a unix socket in the given process directory and +// returns its path and a listener to it. This socket can then be used to +// receive the container's terminal master file descriptor. 
+func (r *runcRuntime) createConsoleSocket(processDir string) (listener *net.UnixListener, socketPath string, err error) { + socketPath = filepath.Join(processDir, "master.sock") + addr, err := net.ResolveUnixAddr("unix", socketPath) + if err != nil { + return nil, "", errors.Wrapf(err, "failed to resolve unix socket at address %s", socketPath) + } + listener, err = net.ListenUnix("unix", addr) + if err != nil { + return nil, "", errors.Wrapf(err, "failed to listen on unix socket at address %s", socketPath) + } + return listener, socketPath, nil +} + +// getMasterFromSocket blocks on the given listener's socket until a message is +// sent, then parses the file descriptor representing the terminal master out +// of the message and returns it as a file. +func (r *runcRuntime) getMasterFromSocket(listener *net.UnixListener) (master *os.File, err error) { + // Accept the listener's connection. + conn, err := listener.Accept() + if err != nil { + return nil, errors.Wrap(err, "failed to get terminal master file descriptor from socket") + } + defer conn.Close() + unixConn, ok := conn.(*net.UnixConn) + if !ok { + return nil, errors.New("connection returned from Accept was not a unix socket") + } + + const maxNameLen = 4096 + var oobSpace = unix.CmsgSpace(4) + + name := make([]byte, maxNameLen) + oob := make([]byte, oobSpace) + + // Read a message from the unix socket. This blocks until the message is + // sent. + n, oobn, _, _, err := unixConn.ReadMsgUnix(name, oob) + if err != nil { + return nil, errors.Wrap(err, "failed to read message from unix socket") + } + if n >= maxNameLen || oobn != oobSpace { + return nil, errors.Errorf("read an invalid number of bytes (n=%d oobn=%d)", n, oobn) + } + + // Truncate the data returned from the message. + name = name[:n] + oob = oob[:oobn] + + // Parse the out-of-band data in the message. 
+ messages, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse socket control message for oob %v", oob) + } + if len(messages) == 0 { + return nil, errors.New("did not receive any socket control messages") + } + if len(messages) > 1 { + return nil, errors.Errorf("received more than one socket control message: received %d", len(messages)) + } + message := messages[0] + + // Parse the file descriptor out of the out-of-band data in the message. + fds, err := unix.ParseUnixRights(&message) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse file descriptors out of message %v", message) + } + if len(fds) == 0 { + return nil, errors.New("did not receive any file descriptors") + } + if len(fds) > 1 { + return nil, errors.Errorf("received more than one file descriptor: received %d", len(fds)) + } + fd := uintptr(fds[0]) + + return os.NewFile(fd, string(name)), nil +} + +// pathExists returns true if the given path exists, false if not. +func (r *runcRuntime) pathExists(pathToCheck string) (bool, error) { + _, err := os.Stat(pathToCheck) + if err != nil { + if os.IsNotExist(err) { + return false, nil + } + return false, errors.Wrapf(err, "failed call to Stat for path %s", pathToCheck) + } + return true, nil +} diff --git a/internal/guest/runtime/runc/runc.go b/internal/guest/runtime/runc/runc.go new file mode 100644 index 0000000000..d3c9086913 --- /dev/null +++ b/internal/guest/runtime/runc/runc.go @@ -0,0 +1,735 @@ +// +build linux + +// Package runc defines an implementation of the Runtime interface which uses +// runC as the container runtime. 
+package runc + +import ( + "encoding/json" + "io/ioutil" + "net" + "os" + "path" + "path/filepath" + "strconv" + "strings" + "syscall" + + "github.com/Microsoft/hcsshim/internal/guest/commonutils" + "github.com/Microsoft/hcsshim/internal/guest/gcserr" + "github.com/Microsoft/hcsshim/internal/guest/runtime" + "github.com/Microsoft/hcsshim/internal/guest/stdio" + oci "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + containerFilesDir = "/var/run/gcsrunc" + initPidFilename = "initpid" +) + +func setSubreaper(i int) error { + return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) +} + +// runcRuntime is an implementation of the Runtime interface which uses runC as +// the container runtime. +type runcRuntime struct { + runcLogBasePath string +} + +var _ runtime.Runtime = &runcRuntime{} + +type container struct { + r *runcRuntime + id string + init *process + // ownsPidNamespace indicates whether the container's init process is also + // the init process for its pid namespace. + ownsPidNamespace bool +} + +func (c *container) ID() string { + return c.id +} + +func (c *container) Pid() int { + return c.init.Pid() +} + +func (c *container) Tty() *stdio.TtyRelay { + return c.init.ttyRelay +} + +func (c *container) PipeRelay() *stdio.PipeRelay { + return c.init.pipeRelay +} + +// process represents a process running in a container. It can either be a +// container's init process, or an exec process in a container. +type process struct { + c *container + pid int + ttyRelay *stdio.TtyRelay + pipeRelay *stdio.PipeRelay +} + +func (p *process) Pid() int { + return p.pid +} + +func (p *process) Tty() *stdio.TtyRelay { + return p.ttyRelay +} + +func (p *process) PipeRelay() *stdio.PipeRelay { + return p.pipeRelay +} + +// NewRuntime instantiates a new runcRuntime struct. 
+func NewRuntime(logBasePath string) (runtime.Runtime, error) { + + rtime := &runcRuntime{runcLogBasePath: logBasePath} + if err := rtime.initialize(); err != nil { + return nil, err + } + return rtime, nil +} + +// initialize sets up any state necessary for the runcRuntime to function. +func (r *runcRuntime) initialize() error { + paths := [2]string{containerFilesDir, r.runcLogBasePath} + for _, p := range paths { + _, err := os.Stat(p) + if err != nil { + if os.IsNotExist(err) { + if err := os.MkdirAll(p, 0700); err != nil { + return errors.Wrapf(err, "failed making runC container files directory %s", p) + } + } else { + return err + } + } + } + + return nil +} + +// CreateContainer creates a container with the given ID and the given +// bundlePath. +// bundlePath should be a path to an OCI bundle containing a config.json file +// and a rootfs for the container. +func (r *runcRuntime) CreateContainer(id string, bundlePath string, stdioSet *stdio.ConnectionSet) (c runtime.Container, err error) { + c, err = r.runCreateCommand(id, bundlePath, stdioSet) + if err != nil { + return nil, err + } + return c, nil +} + +// Start unblocks the container's init process created by the call to +// CreateContainer. +func (c *container) Start() error { + logPath := c.r.getLogPath(c.id) + args := []string{"start", c.id} + cmd := createRuncCommand(logPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + runcErr := getRuncLogError(logPath) + c.r.cleanupContainer(c.id) + return errors.Wrapf(err, "runc start failed with %v: %s", runcErr, string(out)) + } + return nil +} + +// ExecProcess executes a new process, represented as an OCI process struct, +// inside an already-running container. 
+func (c *container) ExecProcess(process *oci.Process, stdioSet *stdio.ConnectionSet) (p runtime.Process, err error) { + p, err = c.runExecCommand(process, stdioSet) + if err != nil { + return nil, err + } + return p, nil +} + +// Kill sends the specified signal to the container's init process. +func (c *container) Kill(signal syscall.Signal) error { + logPath := c.r.getLogPath(c.id) + args := []string{"kill"} + if signal == syscall.SIGTERM || signal == syscall.SIGKILL { + args = append(args, "--all") + } + args = append(args, c.id, strconv.Itoa(int(signal))) + cmd := createRuncCommand(logPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + if strings.Contains(err.Error(), "os: process already finished") || + strings.Contains(err.Error(), "container not running") || + err == syscall.ESRCH { + return gcserr.NewHresultError(gcserr.HrVmcomputeSystemNotFound) + } + + runcErr := getRuncLogError(logPath) + return errors.Wrapf(err, "unknown runc error after kill %v: %s", runcErr, string(out)) + } + return nil +} + +// Delete deletes any state created for the container by either this wrapper or +// runC itself. +func (c *container) Delete() error { + logPath := c.r.getLogPath(c.id) + args := []string{"delete", c.id} + cmd := createRuncCommand(logPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + runcErr := getRuncLogError(logPath) + return errors.Wrapf(err, "runc delete failed with %v: %s", runcErr, string(out)) + } + if err := c.r.cleanupContainer(c.id); err != nil { + return err + } + return nil +} + +// Delete deletes any state created for the process by either this wrapper or +// runC itself. +func (p *process) Delete() error { + if err := p.c.r.cleanupProcess(p.c.id, p.pid); err != nil { + return err + } + return nil +} + +// Pause suspends all processes running in the container. +func (c *container) Pause() error { + logPath := c.r.getLogPath(c.id) + args := []string{"pause", c.id} + cmd := createRuncCommand(logPath, args...) 
+ out, err := cmd.CombinedOutput() + if err != nil { + runcErr := getRuncLogError(logPath) + return errors.Wrapf(err, "runc pause failed with %v: %s", runcErr, string(out)) + } + return nil +} + +// Resume unsuspends processes running in the container. +func (c *container) Resume() error { + logPath := c.r.getLogPath(c.id) + args := []string{"resume", c.id} + cmd := createRuncCommand(logPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + runcErr := getRuncLogError(logPath) + return errors.Wrapf(err, "runc resume failed with %v: %s", runcErr, string(out)) + } + return nil +} + +// GetState returns information about the given container. +func (c *container) GetState() (*runtime.ContainerState, error) { + logPath := c.r.getLogPath(c.id) + args := []string{"state", c.id} + cmd := createRuncCommand(logPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + runcErr := getRuncLogError(logPath) + return nil, errors.Wrapf(err, "runc state failed with %v: %s", runcErr, string(out)) + } + var state runtime.ContainerState + if err := json.Unmarshal(out, &state); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal the state for container %s", c.id) + } + return &state, nil +} + +// Exists returns true if the container exists, false if it doesn't +// exist. +// It should be noted that containers that have stopped but have not been +// deleted are still considered to exist. +func (c *container) Exists() (bool, error) { + states, err := c.r.ListContainerStates() + if err != nil { + return false, err + } + // TODO: This is definitely not the most efficient way of doing this. See + // about improving it in the future. + for _, state := range states { + if state.ID == c.id { + return true, nil + } + } + return false, nil +} + +// ListContainerStates returns ContainerState structs for all existing +// containers, whether they're running or not. 
+func (r *runcRuntime) ListContainerStates() ([]runtime.ContainerState, error) { + logPath := filepath.Join(r.runcLogBasePath, "global-runc.log") + args := []string{"list", "-f", "json"} + cmd := createRuncCommand(logPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + runcErr := getRuncLogError(logPath) + return nil, errors.Wrapf(err, "runc list failed with %v: %s", runcErr, string(out)) + } + var states []runtime.ContainerState + if err := json.Unmarshal(out, &states); err != nil { + return nil, errors.Wrap(err, "failed to unmarshal the states for the container list") + } + return states, nil +} + +// GetRunningProcesses gets only the running processes associated with the given +// container. This excludes zombie processes. +func (c *container) GetRunningProcesses() ([]runtime.ContainerProcessState, error) { + pids, err := c.r.getRunningPids(c.id) + if err != nil { + return nil, err + } + + pidMap := map[int]*runtime.ContainerProcessState{} + // Initialize all processes with a pid and command, and mark correctly that + // none of them are zombies. Default CreatedByRuntime to false. + for _, pid := range pids { + command, err := c.r.getProcessCommand(pid) + if err != nil { + if errors.Is(err, unix.ENOENT) { + // process has exited between getting the running pids above + // and now, ignore error + continue + } + return nil, err + } + pidMap[pid] = &runtime.ContainerProcessState{Pid: pid, Command: command, CreatedByRuntime: false, IsZombie: false} + } + + // For each process state directory which corresponds to a running pid, set + // that the process was created by the Runtime. 
+ processDirs, err := ioutil.ReadDir(filepath.Join(containerFilesDir, c.id)) + if err != nil { + return nil, errors.Wrapf(err, "failed to read the contents of container directory %s", filepath.Join(containerFilesDir, c.id)) + } + for _, processDir := range processDirs { + if processDir.Name() != initPidFilename { + pid, err := strconv.Atoi(processDir.Name()) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse string \"%s\" as pid", processDir.Name()) + } + if _, ok := pidMap[pid]; ok { + pidMap[pid].CreatedByRuntime = true + } + } + } + + return c.r.pidMapToProcessStates(pidMap), nil +} + +// GetAllProcesses gets all processes associated with the given container, +// including both running and zombie processes. +func (c *container) GetAllProcesses() ([]runtime.ContainerProcessState, error) { + runningPids, err := c.r.getRunningPids(c.id) + if err != nil { + return nil, err + } + + logrus.WithFields(logrus.Fields{ + "cid": c.id, + "pids": runningPids, + }).Debug("running container pids") + + pidMap := map[int]*runtime.ContainerProcessState{} + // Initialize all processes with a pid and command, leaving CreatedByRuntime + // and IsZombie at the default value of false. + for _, pid := range runningPids { + command, err := c.r.getProcessCommand(pid) + if err != nil { + if errors.Is(err, unix.ENOENT) { + // process has exited between getting the running pids above + // and now, ignore error + continue + } + return nil, err + } + pidMap[pid] = &runtime.ContainerProcessState{Pid: pid, Command: command, CreatedByRuntime: false, IsZombie: false} + } + + processDirs, err := ioutil.ReadDir(filepath.Join(containerFilesDir, c.id)) + if err != nil { + return nil, errors.Wrapf(err, "failed to read the contents of container directory %s", filepath.Join(containerFilesDir, c.id)) + } + // Loop over every process state directory. Since these processes have + // process state directories, CreatedByRuntime will be true for all of them. 
+ for _, processDir := range processDirs { + if processDir.Name() != initPidFilename { + pid, err := strconv.Atoi(processDir.Name()) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse string \"%s\" into pid", processDir.Name()) + } + if c.r.processExists(pid) { + // If the process exists in /proc and is in the pidMap, it must + // be a running non-zombie. + if _, ok := pidMap[pid]; ok { + pidMap[pid].CreatedByRuntime = true + } else { + // Otherwise, since it's in /proc but not running, it must + // be a zombie. + command, err := c.r.getProcessCommand(pid) + if err != nil { + if errors.Is(err, unix.ENOENT) { + // process has exited between checking that it exists and now, ignore error + continue + } + return nil, err + } + pidMap[pid] = &runtime.ContainerProcessState{Pid: pid, Command: command, CreatedByRuntime: true, IsZombie: true} + } + } + } + } + return c.r.pidMapToProcessStates(pidMap), nil +} + +// getRunningPids gets the pids of all processes which runC recognizes as +// running. +func (r *runcRuntime) getRunningPids(id string) ([]int, error) { + logPath := r.getLogPath(id) + args := []string{"ps", "-f", "json", id} + cmd := createRuncCommand(logPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + runcErr := getRuncLogError(logPath) + return nil, errors.Wrapf(err, "runc ps failed with %v: %s", runcErr, string(out)) + } + var pids []int + if err := json.Unmarshal(out, &pids); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal pids for container %s", id) + } + return pids, nil +} + +// getProcessCommand gets the command line command and arguments for the process +// with the given pid. +func (r *runcRuntime) getProcessCommand(pid int) ([]string, error) { + // Get the contents of the process's cmdline file. This file is formatted + // with a null character after every argument. e.g. 
"ping google.com " + data, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "cmdline")) + if err != nil { + return nil, errors.Wrapf(err, "failed to read cmdline file for process %d", pid) + } + // Get rid of the \0 character at end. + cmdString := strings.TrimSuffix(string(data), "\x00") + return strings.Split(cmdString, "\x00"), nil +} + +// pidMapToProcessStates is a helper function which converts a map from pid to +// ContainerProcessState to a slice of ContainerProcessStates. +func (r *runcRuntime) pidMapToProcessStates(pidMap map[int]*runtime.ContainerProcessState) []runtime.ContainerProcessState { + processStates := make([]runtime.ContainerProcessState, len(pidMap)) + i := 0 + for _, processState := range pidMap { + processStates[i] = *processState + i++ + } + return processStates +} + +// waitOnProcess waits for the process to exit, and returns its exit code. +func (r *runcRuntime) waitOnProcess(pid int) (int, error) { + process, err := os.FindProcess(pid) + if err != nil { + return -1, errors.Wrapf(err, "failed to find process %d", pid) + } + state, err := process.Wait() + if err != nil { + return -1, errors.Wrapf(err, "failed waiting on process %d", pid) + } + + status := state.Sys().(syscall.WaitStatus) + if status.Signaled() { + return 128 + int(status.Signal()), nil + } + return status.ExitStatus(), nil +} + +func (p *process) Wait() (int, error) { + exitCode, err := p.c.r.waitOnProcess(p.pid) + + l := logrus.WithField("cid", p.c.id) + l.WithField("pid", p.pid).Debug("process wait completed") + + // If the init process for the container has exited, kill everything else in + // the container. Runc uses the devices cgroup of the container ot determine + // what other processes to kill. + // + // We don't issue the kill if the container owns its own pid namespace, + // because in that case the container kernel will kill everything in the pid + // namespace automatically (as the container init will be the pid namespace + // init). 
This prevents a potential issue where two containers share cgroups + // but have their own pid namespaces. If we didn't handle this case, runc + // would kill the processes in both containers when trying to kill + // either one of them. + if p == p.c.init && !p.c.ownsPidNamespace { + // If the init process of a pid namespace terminates, the kernel + // terminates all other processes in the namespace with SIGKILL. We + // simulate the same behavior. + if err := p.c.Kill(syscall.SIGKILL); err != nil { + l.WithError(err).Error("failed to terminate container after process wait") + } + } + + // Wait on the relay to drain any output that was already buffered. + // + // At this point, if this is the init process for the container, everything + // else in the container has been killed, so the write ends of the stdio + // relay will have been closed. + // + // If this is a container exec process instead, then it is possible the + // relay waits will hang waiting for the write ends to close. This can occur + // if the exec spawned any child processes that inherited its stdio. + // Currently we do not do anything to avoid hanging in this case, but in the + // future we could add special handling. + if p.ttyRelay != nil { + p.ttyRelay.Wait() + } + if p.pipeRelay != nil { + p.pipeRelay.Wait() + } + return exitCode, err +} + +// Wait waits on every non-init process in the container, and then performs a +// final wait on the init process. The exit code returned is the exit code +// acquired from waiting on the init process. +func (c *container) Wait() (int, error) { + processes, err := c.GetAllProcesses() + if err != nil { + return -1, err + } + for _, process := range processes { + // Only wait on non-init processes that were created with exec. + if process.Pid != c.init.pid && process.CreatedByRuntime { + // FUTURE-jstarks: Consider waiting on the child process's relays as + // well (as in p.Wait()). 
This may not matter as long as the relays + // finish "soon" after Wait() returns since HCS expects the stdio + // connections to close before container shutdown can complete. + logrus.WithFields(logrus.Fields{ + "cid": c.id, + "pid": process.Pid, + }).Debug("waiting on container exec process") + c.r.waitOnProcess(process.Pid) + } + } + exitCode, err := c.init.Wait() + if err != nil { + return -1, err + } + return exitCode, nil +} + +// runCreateCommand sets up the arguments for calling runc create. +func (r *runcRuntime) runCreateCommand(id string, bundlePath string, stdioSet *stdio.ConnectionSet) (runtime.Container, error) { + c := &container{r: r, id: id} + if err := r.makeContainerDir(id); err != nil { + return nil, err + } + // Create a temporary random directory to store the process's files. + tempProcessDir, err := ioutil.TempDir(containerFilesDir, id) + if err != nil { + return nil, err + } + + spec, err := ociSpecFromBundle(bundlePath) + if err != nil { + return nil, err + } + + // Determine if the container owns its own pid namespace or not. Per the OCI + // spec: + // - If the spec has no entry for the pid namespace, the container inherits + // the runtime namespace (container does not own). + // - If the spec has a pid namespace entry, but the path is empty, a new + // namespace will be created and used for the container (container owns). + // - If there is a pid namespace entry with a path, the container uses the + // namespace at that path (container does not own). + if spec.Linux != nil { + for _, ns := range spec.Linux.Namespaces { + if ns.Type == oci.PIDNamespace { + c.ownsPidNamespace = ns.Path == "" + } + } + } + + if spec.Process.Cwd != "/" { + cwd := path.Join(bundlePath, "rootfs", spec.Process.Cwd) + // Intentionally ignore the error. + _ = os.MkdirAll(cwd, 0755) + } + + args := []string{"create", "-b", bundlePath, "--no-pivot"} + p, err := c.startProcess(tempProcessDir, spec.Process.Terminal, stdioSet, args...) 
+ if err != nil { + return nil, err + } + + // Write pid to initpid file for container. + containerDir := r.getContainerDir(id) + if err := ioutil.WriteFile(filepath.Join(containerDir, initPidFilename), []byte(strconv.Itoa(p.pid)), 0777); err != nil { + return nil, err + } + + c.init = p + return c, nil +} + +func ociSpecFromBundle(bundlePath string) (*oci.Spec, error) { + configPath := filepath.Join(bundlePath, "config.json") + configFile, err := os.Open(configPath) + if err != nil { + return nil, errors.Wrapf(err, "failed to open bundle config at %s", configPath) + } + defer configFile.Close() + var spec *oci.Spec + if err := commonutils.DecodeJSONWithHresult(configFile, &spec); err != nil { + return nil, errors.Wrap(err, "failed to parse OCI spec") + } + return spec, nil +} + +// runExecCommand sets up the arguments for calling runc exec. +func (c *container) runExecCommand(processDef *oci.Process, stdioSet *stdio.ConnectionSet) (p runtime.Process, err error) { + // Create a temporary random directory to store the process's files. + tempProcessDir, err := ioutil.TempDir(containerFilesDir, c.id) + if err != nil { + return nil, err + } + + f, err := os.Create(filepath.Join(tempProcessDir, "process.json")) + if err != nil { + return nil, errors.Wrapf(err, "failed to create process.json file at %s", filepath.Join(tempProcessDir, "process.json")) + } + defer f.Close() + if err := json.NewEncoder(f).Encode(processDef); err != nil { + return nil, errors.Wrap(err, "failed to encode JSON into process.json file") + } + + args := []string{"exec"} + args = append(args, "-d", "--process", filepath.Join(tempProcessDir, "process.json")) + return c.startProcess(tempProcessDir, processDef.Terminal, stdioSet, args...) +} + +// startProcess performs the operations necessary to start a container process +// and properly handle its stdio. This function is used by both CreateContainer +// and ExecProcess. 
// For V2 container creation stdioSet will be nil, in this case
// it is expected that the caller starts the relay previous to calling Start on
// the container.
func (c *container) startProcess(tempProcessDir string, hasTerminal bool, stdioSet *stdio.ConnectionSet, initialArgs ...string) (p *process, err error) {
	args := initialArgs

	// NOTE(review): presumably setSubreaper wraps PR_SET_CHILD_SUBREAPER so
	// orphaned container descendants reparent to this process -- confirm at
	// its definition.
	if err := setSubreaper(1); err != nil {
		return nil, errors.Wrapf(err, "failed to set process as subreaper for process in container %s", c.id)
	}
	if err := c.r.makeLogDir(c.id); err != nil {
		return nil, err
	}

	logPath := c.r.getLogPath(c.id)
	// runc writes the child pid to this file; it is read back after cmd.Run.
	args = append(args, "--pid-file", filepath.Join(tempProcessDir, "pid"))

	var sockListener *net.UnixListener
	if hasTerminal {
		var consoleSockPath string
		sockListener, consoleSockPath, err = c.r.createConsoleSocket(tempProcessDir)
		if err != nil {
			return nil, errors.Wrapf(err, "failed to create console socket for container %s", c.id)
		}
		defer sockListener.Close()
		// runc connects back on this socket to pass us the pty master fd.
		args = append(args, "--console-socket", consoleSockPath)
	}
	args = append(args, c.id)

	cmd := createRuncCommand(logPath, args...)

	var pipeRelay *stdio.PipeRelay
	if !hasTerminal {
		// Without a terminal, stdio is relayed through anonymous pipes wired
		// into the child's stdin/stdout/stderr.
		pipeRelay, err = stdio.NewPipeRelay(stdioSet)
		if err != nil {
			return nil, errors.Wrapf(err, "failed to create a pipe relay connection set for container %s", c.id)
		}
		fileSet, err := pipeRelay.Files()
		if err != nil {
			return nil, errors.Wrapf(err, "failed to get files for connection set for container %s", c.id)
		}
		// Closing the FileSet here is fine as that end of the pipes will have
		// already been copied into the child process.
		defer fileSet.Close()
		if fileSet.In != nil {
			cmd.Stdin = fileSet.In
		}
		if fileSet.Out != nil {
			cmd.Stdout = fileSet.Out
		}
		if fileSet.Err != nil {
			cmd.Stderr = fileSet.Err
		}
	}

	if err := cmd.Run(); err != nil {
		// Pull a more descriptive cause out of the runc JSON log if present.
		runcErr := getRuncLogError(logPath)
		return nil, errors.Wrapf(err, "failed to run runc create/exec call for container %s with %v", c.id, runcErr)
	}

	var ttyRelay *stdio.TtyRelay
	if hasTerminal {
		var master *os.File
		master, err = c.r.getMasterFromSocket(sockListener)
		if err != nil {
			cmd.Process.Kill()
			return nil, errors.Wrapf(err, "failed to get pty master for process in container %s", c.id)
		}
		// Keep master open for the relay unless there is an error.
		defer func() {
			if err != nil {
				master.Close()
			}
		}()
		ttyRelay = stdio.NewTtyRelay(stdioSet, master)
	}

	// Rename the process's directory to its pid.
	pid, err := c.r.readPidFile(filepath.Join(tempProcessDir, "pid"))
	if err != nil {
		return nil, err
	}
	if err := os.Rename(tempProcessDir, c.r.getProcessDir(c.id, pid)); err != nil {
		return nil, err
	}

	// Only start the relays when the caller supplied connections (V1 path);
	// for V2 (stdioSet == nil) the caller starts the relays itself.
	if ttyRelay != nil && stdioSet != nil {
		ttyRelay.Start()
	}
	if pipeRelay != nil && stdioSet != nil {
		pipeRelay.Start()
	}
	return &process{c: c, pid: pid, ttyRelay: ttyRelay, pipeRelay: pipeRelay}, nil
}

// Update forwards a resources document to "runc update" for this container.
func (c *container) Update(resources interface{}) error {
	jsonResources, err := json.Marshal(resources)
	if err != nil {
		return err
	}
	logPath := c.r.getLogPath(c.id)
	// "-" instructs runc to read the resources JSON from stdin.
	args := []string{"update", "--resources", "-", c.id}
	cmd := createRuncCommand(logPath, args...)
+ cmd.Stdin = strings.NewReader(string(jsonResources)) + out, err := cmd.CombinedOutput() + if err != nil { + runcErr := getRuncLogError(logPath) + return errors.Wrapf(err, "runc update request %s failed with %v: %s", string(jsonResources), runcErr, string(out)) + } + return nil +} diff --git a/internal/guest/runtime/runc/utils.go b/internal/guest/runtime/runc/utils.go new file mode 100644 index 0000000000..4d09da84b3 --- /dev/null +++ b/internal/guest/runtime/runc/utils.go @@ -0,0 +1,131 @@ +// +build linux + +package runc + +import ( + "encoding/json" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" + + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +// readPidFile reads the integer pid stored in the given file. +func (r *runcRuntime) readPidFile(pidFile string) (pid int, err error) { + data, err := ioutil.ReadFile(pidFile) + if err != nil { + return -1, errors.Wrap(err, "failed reading from pid file") + } + pid, err = strconv.Atoi(string(data)) + if err != nil { + return -1, errors.Wrapf(err, "failed converting pid text \"%s\" to integer form", data) + } + return pid, nil +} + +// cleanupContainer cleans up any state left behind by the container. +func (r *runcRuntime) cleanupContainer(id string) error { + containerDir := r.getContainerDir(id) + if err := os.RemoveAll(containerDir); err != nil { + return errors.Wrapf(err, "failed removing the container directory for container %s", id) + } + return nil +} + +// cleanupProcess cleans up any state left behind by the process. +func (r *runcRuntime) cleanupProcess(id string, pid int) error { + processDir := r.getProcessDir(id, pid) + if err := os.RemoveAll(processDir); err != nil { + return errors.Wrapf(err, "failed removing the process directory for process %d in container %s", pid, id) + } + return nil +} + +// getProcessDir returns the path to the state directory of the given process. 
+func (r *runcRuntime) getProcessDir(id string, pid int) string { + containerDir := r.getContainerDir(id) + return filepath.Join(containerDir, strconv.Itoa(pid)) +} + +// getContainerDir returns the path to the state directory of the given +// container. +func (r *runcRuntime) getContainerDir(id string) string { + return filepath.Join(containerFilesDir, id) +} + +// makeContainerDir creates the state directory for the given container. +func (r *runcRuntime) makeContainerDir(id string) error { + dir := r.getContainerDir(id) + if err := os.MkdirAll(dir, os.ModeDir); err != nil { + return errors.Wrapf(err, "failed making container directory for container %s", id) + } + return nil +} + +// getLogDir gets the path to the runc logs directory. +func (r *runcRuntime) getLogDir(id string) string { + return filepath.Join(r.runcLogBasePath, id) +} + +// makeLogDir creates the runc logs directory if it doesnt exist. +func (r *runcRuntime) makeLogDir(id string) error { + dir := r.getLogDir(id) + if err := os.MkdirAll(dir, os.ModeDir); err != nil { + return errors.Wrapf(err, "failed making runc log directory for container %s", id) + } + return nil +} + +// getLogPath returns the path to the log file used by the runC wrapper. +func (r *runcRuntime) getLogPath(id string) string { + return filepath.Join(r.getLogDir(id), "runc.log") +} + +// processExists returns true if the given process exists in /proc, false if +// not. +// It should be noted that processes which have exited, but have not yet been +// waited on (i.e. zombies) are still considered to exist by this function. 
+func (r *runcRuntime) processExists(pid int) bool { + _, err := os.Stat(filepath.Join("/proc", strconv.Itoa(pid))) + return !os.IsNotExist(err) +} + +type standardLogEntry struct { + Level logrus.Level `json:"level"` + Message string `json:"msg"` + Err error `json:"error,omitempty"` +} + +func getRuncLogError(logPath string) error { + reader, err := os.OpenFile(logPath, syscall.O_RDONLY, 0644) + if err != nil { + return nil + } + defer reader.Close() + + var lastErr error + dec := json.NewDecoder(reader) + for { + entry := &standardLogEntry{} + if err := dec.Decode(entry); err != nil { + break + } + if entry.Level <= logrus.ErrorLevel { + lastErr = errors.New(entry.Message) + if entry.Err != nil { + lastErr = errors.Wrapf(lastErr, entry.Err.Error()) + } + } + } + return lastErr +} + +func createRuncCommand(logPath string, args ...string) *exec.Cmd { + args = append([]string{"--log", logPath, "--log-format", "json"}, args...) + return exec.Command("runc", args...) +} diff --git a/internal/guest/runtime/runtime.go b/internal/guest/runtime/runtime.go new file mode 100644 index 0000000000..7f4854f906 --- /dev/null +++ b/internal/guest/runtime/runtime.go @@ -0,0 +1,73 @@ +// +build linux + +// Package runtime defines the interface between the GCS and an OCI container +// runtime. +package runtime + +import ( + "io" + "syscall" + + "github.com/Microsoft/hcsshim/internal/guest/stdio" + oci "github.com/opencontainers/runtime-spec/specs-go" +) + +// ContainerState gives information about a container created by a Runtime. +type ContainerState struct { + OCIVersion string + ID string + Pid int + BundlePath string + RootfsPath string + Status string + Created string +} + +// ContainerProcessState gives information about a process created by a +// Runtime. +type ContainerProcessState struct { + Pid int + Command []string + CreatedByRuntime bool + IsZombie bool +} + +// StdioPipes contain the interfaces for reading from and writing to a +// process's stdio. 
type StdioPipes struct {
	In  io.WriteCloser
	Out io.ReadCloser
	Err io.ReadCloser
}

// Process is an interface to manipulate process state.
type Process interface {
	// Wait blocks until the process exits and returns its exit code.
	Wait() (int, error)
	// Pid returns the process's pid.
	Pid() int
	// Delete removes any state kept for the process.
	Delete() error
	// Tty returns the process's tty relay, if any.
	Tty() *stdio.TtyRelay
	// PipeRelay returns the process's pipe relay, if any.
	PipeRelay() *stdio.PipeRelay
}

// Container is an interface to manipulate container state.
type Container interface {
	Process
	// ID returns the container's id.
	ID() string
	// Exists reports whether the container is known to the runtime.
	Exists() (bool, error)
	// Start starts a created container.
	Start() error
	// ExecProcess runs an additional process in the container.
	ExecProcess(process *oci.Process, stdioSet *stdio.ConnectionSet) (p Process, err error)
	// Kill sends the given signal to the container.
	Kill(signal syscall.Signal) error
	Pause() error
	Resume() error
	// GetState returns the runtime's state information for the container.
	GetState() (*ContainerState, error)
	// GetRunningProcesses returns only currently-running processes.
	GetRunningProcesses() ([]ContainerProcessState, error)
	// GetAllProcesses returns running and zombie processes.
	GetAllProcesses() ([]ContainerProcessState, error)
	// Update applies a new resources document to the container.
	Update(resources interface{}) error
}

// Runtime is the interface defining commands over an OCI container runtime,
// such as runC.
type Runtime interface {
	CreateContainer(id string, bundlePath string, stdioSet *stdio.ConnectionSet) (c Container, err error)
	ListContainerStates() ([]ContainerState, error)
}
diff --git a/internal/guest/stdio/connection.go b/internal/guest/stdio/connection.go
new file mode 100644
index 0000000000..9458596489
--- /dev/null
+++ b/internal/guest/stdio/connection.go
@@ -0,0 +1,105 @@
// +build linux

package stdio

import (
	"os"

	"github.com/Microsoft/hcsshim/internal/guest/transport"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)

// ConnectionSettings describe the stdin, stdout, stderr ports to connect the
// transport to. A nil port specifies no connection.
+type ConnectionSettings struct { + StdIn *uint32 + StdOut *uint32 + StdErr *uint32 +} + +type logConnection struct { + con transport.Connection + port uint32 +} + +func (lc *logConnection) Read(b []byte) (int, error) { + return lc.con.Read(b) +} + +func (lc *logConnection) Write(b []byte) (int, error) { + return lc.con.Write(b) +} + +func (lc *logConnection) Close() error { + logrus.WithFields(logrus.Fields{ + "port": lc.port, + }).Debug("opengcs::logConnection::Close - closing connection") + + return lc.con.Close() +} + +func (lc *logConnection) CloseRead() error { + logrus.WithFields(logrus.Fields{ + "port": lc.port, + }).Debug("opengcs::logConnection::Close - closing read connection") + + return lc.con.CloseRead() +} + +func (lc *logConnection) CloseWrite() error { + logrus.WithFields(logrus.Fields{ + "port": lc.port, + }).Debug("opengcs::logConnection::Close - closing write connection") + + return lc.con.CloseWrite() +} + +func (lc *logConnection) File() (*os.File, error) { + return lc.con.File() +} + +var _ = (transport.Connection)(&logConnection{}) + +// Connect returns new transport.Connection instances, one for each stdio pipe +// to be used. If CreateStd*Pipe for a given pipe is false, the given Connection +// is set to nil. 
+func Connect(tport transport.Transport, settings ConnectionSettings) (_ *ConnectionSet, err error) { + connSet := &ConnectionSet{} + defer func() { + if err != nil { + connSet.Close() + } + }() + if settings.StdIn != nil { + c, err := tport.Dial(*settings.StdIn) + if err != nil { + return nil, errors.Wrap(err, "failed creating stdin Connection") + } + connSet.In = &logConnection{ + con: c, + port: *settings.StdIn, + } + } + if settings.StdOut != nil { + c, err := tport.Dial(*settings.StdOut) + if err != nil { + return nil, errors.Wrap(err, "failed creating stdout Connection") + } + connSet.Out = &logConnection{ + con: c, + port: *settings.StdOut, + } + } + if settings.StdErr != nil { + c, err := tport.Dial(*settings.StdErr) + if err != nil { + return nil, errors.Wrap(err, "failed creating stderr Connection") + } + connSet.Err = &logConnection{ + con: c, + port: *settings.StdErr, + } + } + return connSet, nil +} diff --git a/internal/guest/stdio/stdio.go b/internal/guest/stdio/stdio.go new file mode 100644 index 0000000000..0a71823f3e --- /dev/null +++ b/internal/guest/stdio/stdio.go @@ -0,0 +1,378 @@ +// +build linux + +package stdio + +import ( + "io" + "os" + "strings" + "sync" + + "github.com/Microsoft/hcsshim/internal/guest/transport" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +// ConnectionSet is a structure defining the readers and writers the Core +// implementation should forward a process's stdio through. +type ConnectionSet struct { + In, Out, Err transport.Connection +} + +// Close closes each stdio connection. 
+func (s *ConnectionSet) Close() error { + var err error + if s.In != nil { + if cerr := s.In.Close(); cerr != nil && err == nil { + err = errors.Wrap(cerr, "failed Close on stdin") + } + s.In = nil + } + if s.Out != nil { + if cerr := s.Out.Close(); cerr != nil && err == nil { + err = errors.Wrap(cerr, "failed Close on stdout") + } + s.Out = nil + } + if s.Err != nil { + if cerr := s.Err.Close(); cerr != nil && err == nil { + err = errors.Wrap(cerr, "failed Close on stderr") + } + s.Err = nil + } + return err +} + +// FileSet represents the stdio of a process. It contains os.File types for +// in, out, err. +type FileSet struct { + In, Out, Err *os.File +} + +// Close closes all the FileSet handles. +func (fs *FileSet) Close() error { + var err error + if fs.In != nil { + if cerr := fs.In.Close(); cerr != nil && err == nil { + err = errors.Wrap(cerr, "failed Close on stdin") + } + fs.In = nil + } + if fs.Out != nil { + if cerr := fs.Out.Close(); cerr != nil && err == nil { + err = errors.Wrap(cerr, "failed Close on stdout") + } + fs.Out = nil + } + if fs.Err != nil { + if cerr := fs.Err.Close(); cerr != nil && err == nil { + err = errors.Wrap(cerr, "failed Close on stderr") + } + fs.Err = nil + } + return err +} + +// Files returns a FileSet with an os.File for each connection +// in the connection set. 
+func (s *ConnectionSet) Files() (_ *FileSet, err error) { + fs := &FileSet{} + defer func() { + if err != nil { + fs.Close() + } + }() + if s.In != nil { + fs.In, err = s.In.File() + if err != nil { + return nil, errors.Wrap(err, "failed to dup stdin socket for command") + } + } + if s.Out != nil { + fs.Out, err = s.Out.File() + if err != nil { + return nil, errors.Wrap(err, "failed to dup stdout socket for command") + } + } + if s.Err != nil { + fs.Err, err = s.Err.File() + if err != nil { + return nil, errors.Wrap(err, "failed to dup stderr socket for command") + } + } + return fs, nil +} + +// NewPipeRelay returns a new pipe relay wrapping the given connection stdin, +// stdout, stderr set. If s is nil will assume al stdin, stdout, stderr pipes. +func NewPipeRelay(s *ConnectionSet) (_ *PipeRelay, err error) { + pr := &PipeRelay{s: s} + defer func() { + if err != nil { + pr.closePipes() + } + }() + + if s == nil || s.In != nil { + pr.pipes[0], pr.pipes[1], err = os.Pipe() + if err != nil { + return nil, errors.Wrap(err, "failed to create stdin pipe relay") + } + } + if s == nil || s.Out != nil { + pr.pipes[2], pr.pipes[3], err = os.Pipe() + if err != nil { + return nil, errors.Wrap(err, "failed to create stdout pipe relay") + } + } + if s == nil || s.Err != nil { + pr.pipes[4], pr.pipes[5], err = os.Pipe() + if err != nil { + return nil, errors.Wrap(err, "failed to create stderr pipe relay") + } + } + return pr, nil +} + +// PipeRelay is a relay built to expose a pipe interface +// for stdin, stdout, stderr on top of a ConnectionSet. +type PipeRelay struct { + wg sync.WaitGroup + s *ConnectionSet + // pipes format is stdin [0 read, 1 write], stdout [2 read, 3 write], stderr [4 read, 5 write]. + pipes [6]*os.File +} + +// ReplaceConnectionSet allows the caller to add a new destination set after +// creating the relay. This can only be called previous to the call to Start. 
func (pr *PipeRelay) ReplaceConnectionSet(s *ConnectionSet) {
	pr.s = s
}

// Files returns a FileSet with an os.File for each connection
// in the connection set.
func (pr *PipeRelay) Files() (*FileSet, error) {
	fs := new(FileSet)
	// Hand out the child-facing ends: read end of stdin, write ends of
	// stdout/stderr.
	if pr.s == nil || pr.s.In != nil {
		fs.In = pr.pipes[0]
	}
	if pr.s == nil || pr.s.Out != nil {
		fs.Out = pr.pipes[3]
	}
	if pr.s == nil || pr.s.Err != nil {
		fs.Err = pr.pipes[5]
	}
	return fs, nil
}

// copyAndCleanClose drains r into connection c, then performs a half-close
// handshake so the remote endpoint sees a clean EOF before the socket is torn
// down. name ("stdout"/"stderr") is used only for log context.
func copyAndCleanClose(c transport.Connection, r io.Reader, name string) {
	if n, err := io.Copy(c, r); err != nil {
		logrus.WithFields(logrus.Fields{
			logrus.ErrorKey: err,
			"bytes":         n,
			"file":          name,
		}).Error("opengcs::PipeRelay::copyAndCleanClose - error copying from pipe")
	}
	// Shut down the write end of the socket, then read a byte (which should
	// yield EOF) to wait for the other endpoint to finish reading and close
	// the connection.
	if err := c.CloseWrite(); err == nil {
		var b [1]byte
		_, err = c.Read(b[:])
		if err == nil {
			// A successful read here means the peer sent data we never
			// expected on this output-only stream.
			err = errors.New("unexpected data in socket")
		}
		if err != io.EOF {
			logrus.WithFields(logrus.Fields{
				logrus.ErrorKey: err,
				"file":          name,
			}).Error("opengcs::PipeRelay::copyAndCleanClose - error reading for clean close")
		}
	} else {
		logrus.WithFields(logrus.Fields{
			logrus.ErrorKey: err,
			"file":          name,
		}).Error("opengcs::PipeRelay::copyAndCleanClose - error shutting down socket")
	}
	if err := c.Close(); err != nil {
		logrus.WithFields(logrus.Fields{
			logrus.ErrorKey: err,
			"file":          name,
		}).Error("opengcs::PipeRelay::copyAndCleanClose - error closing socket")
	}
}

// Start starts the relay operation. The caller must call Wait to wait
// for the relay to finish and release the associated resources.
func (pr *PipeRelay) Start() {
	// One goroutine per configured stream; Wait() joins them all via pr.wg.
	if pr.s.In != nil {
		pr.wg.Add(1)
		go func() {
			// Copy from the remote stdin connection into the write end of the
			// stdin pipe until the connection reaches EOF or errors.
			if n, err := io.Copy(pr.pipes[1], pr.s.In); err != nil {
				logrus.WithFields(logrus.Fields{
					logrus.ErrorKey: err,
					"bytes":         n,
				}).Error("opengcs::PipeRelay::Start - error copying stdin to pipe")
			}
			// Close the write end so the child process sees EOF on its stdin.
			if err := pr.pipes[1].Close(); err != nil {
				logrus.WithFields(logrus.Fields{
					logrus.ErrorKey: err,
				}).Error("opengcs::PipeRelay::Start - error closing stdin write pipe")
			}
			pr.pipes[1] = nil
			pr.wg.Done()
		}()
	}
	if pr.s.Out != nil {
		pr.wg.Add(1)
		go func() {
			copyAndCleanClose(pr.s.Out, pr.pipes[2], "stdout")
			pr.wg.Done()
		}()
	}
	if pr.s.Err != nil {
		pr.wg.Add(1)
		go func() {
			copyAndCleanClose(pr.s.Err, pr.pipes[4], "stderr")
			pr.wg.Done()
		}()
	}
}

// Wait waits for the relaying to finish and closes the associated
// pipes and connections.
func (pr *PipeRelay) Wait() {
	// Close stdin so that the copying goroutine is safely unblocked; this is necessary
	// because the host expects stdin to be closed before it will report process
	// exit back to the client, and the client expects the process notification before
	// it will close its side of stdin (which io.Copy is waiting on in the copying goroutine).
	if pr.s != nil && pr.s.In != nil {
		pr.s.In.CloseRead()
	}

	pr.wg.Wait()
	pr.closePipes()
	if pr.s != nil {
		pr.s.Close()
	}
}

// CloseUnusedPipes gives the caller the ability to close any pipes that do not
// have a corresponding entry on the ConnectionSet. This is to be used in
// conjunction with NewPipeRelay where s is nil, which will open all pipes, and
// later calling ReplaceConnectionSet with the actual connections.
func (pr *PipeRelay) CloseUnusedPipes() {
	if pr.s == nil {
		// No destination set at all: nothing will ever be relayed.
		pr.closePipes()
	} else {
		// Close only the relay-side end of each stream that has no
		// destination connection; the child-side ends were handed out via
		// Files().
		if pr.s.In == nil {
			// Write end of stdin
			pr.pipes[1].Close()
		}
		if pr.s.Out == nil {
			// Read end of stdout
			pr.pipes[2].Close()
		}
		if pr.s.Err == nil {
			// Read end of stderr
			pr.pipes[4].Close()
		}
	}
}

// closePipes closes every pipe end that is still open, tolerating (and not
// logging) ends that were already closed elsewhere.
func (pr *PipeRelay) closePipes() {
	for i := 0; i < len(pr.pipes); i++ {
		if pr.pipes[i] != nil {
			if err := pr.pipes[i].Close(); err != nil {
				// "file already closed" is expected when an end was closed by
				// the stdin goroutine or CloseUnusedPipes; only log real errors.
				if !strings.Contains(err.Error(), "file already closed") {
					logrus.WithFields(logrus.Fields{
						logrus.ErrorKey: err,
					}).Error("opengcs::PipeRelay::closePipes - error closing relay pipe")
				}
			}
			pr.pipes[i] = nil
		}
	}
}

// NewTtyRelay returns a new TTY relay for a given master PTY file.
func NewTtyRelay(s *ConnectionSet, pty *os.File) *TtyRelay {
	return &TtyRelay{s: s, pty: pty}
}

// TtyRelay relays IO between a set of stdio connections and a master PTY file.
type TtyRelay struct {
	m      sync.Mutex // guards closed and pty lifetime against ResizeConsole
	closed bool
	wg     sync.WaitGroup
	s      *ConnectionSet
	pty    *os.File
}

// ReplaceConnectionSet allows the caller to add a new destination set after
// creating the relay. This can only be called previous to the call to Start.
func (r *TtyRelay) ReplaceConnectionSet(s *ConnectionSet) {
	r.s = s
}

// ResizeConsole sends the appropriate resize to a pTTY FD
func (r *TtyRelay) ResizeConsole(height, width uint16) error {
	r.m.Lock()
	defer r.m.Unlock()

	// Silently ignore resizes after Wait() has closed the pty.
	if r.closed {
		return nil
	}
	return ResizeConsole(r.pty, height, width)
}

// Start starts the relay operation. The caller must call Wait to wait
// for the relay to finish and release the associated resources.
func (r *TtyRelay) Start() {
	if r.s.In != nil {
		r.wg.Add(1)
		go func() {
			// Relay remote stdin into the pty master.
			if _, err := io.Copy(r.pty, r.s.In); err != nil {
				logrus.WithFields(logrus.Fields{
					logrus.ErrorKey: err,
				}).Error("opengcs::TtyRelay::Start - error copying stdin to pty")
			}
			r.wg.Done()
		}()
	}
	if r.s.Out != nil {
		r.wg.Add(1)
		go func() {
			// Relay pty output (both stdout and stderr are merged by the tty)
			// to the remote stdout connection.
			if _, err := io.Copy(r.s.Out, r.pty); err != nil {
				logrus.WithFields(logrus.Fields{
					logrus.ErrorKey: err,
				}).Error("opengcs::TtyRelay::Start - error copying pty to stdout")
			}
			r.wg.Done()
		}()
	}
}

// Wait waits for the relaying to finish and closes the associated
// files and connections.
func (r *TtyRelay) Wait() {
	// Close stdin so that the copying goroutine is safely unblocked; this is necessary
	// because the host expects stdin to be closed before it will report process
	// exit back to the client, and the client expects the process notification before
	// it will close its side of stdin (which io.Copy is waiting on in the copying goroutine).
	if r.s != nil && r.s.In != nil {
		r.s.In.CloseRead()
	}

	// Wait for all users of stdioSet and master to finish before closing them.
	r.wg.Wait()

	// Take the lock so ResizeConsole cannot race with closing the pty.
	r.m.Lock()
	defer r.m.Unlock()

	r.pty.Close()
	r.closed = true
	if r.s != nil {
		r.s.Close()
	}
}
diff --git a/internal/guest/stdio/tty.go b/internal/guest/stdio/tty.go
new file mode 100644
index 0000000000..4f4bc7dc11
--- /dev/null
+++ b/internal/guest/stdio/tty.go
@@ -0,0 +1,77 @@
// +build linux

package stdio

import (
	"fmt"
	"os"
	"syscall"
	"unsafe"

	"github.com/pkg/errors"
	"golang.org/x/sys/unix"
)

// NewConsole allocates a new console and returns the File for its master and
// path for its slave.
+func NewConsole() (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", errors.Wrap(err, "failed to open master pseudoterminal file") + } + console, err := ptsname(master) + if err != nil { + return nil, "", err + } + if err := unlockpt(master); err != nil { + return nil, "", err + } + // TODO: Do we need to keep this chmod call? + if err := os.Chmod(console, 0600); err != nil { + return nil, "", errors.Wrap(err, "failed to change permissions on the slave pseudoterminal file") + } + if err := os.Chown(console, 0, 0); err != nil { + return nil, "", errors.Wrap(err, "failed to change ownership on the slave pseudoterminal file") + } + return master, console, nil +} + +// ResizeConsole sends the appropriate resize to a pTTY FD +// Synchronization of pty should be handled in the callers context. +func ResizeConsole(pty *os.File, height, width uint16) error { + type consoleSize struct { + Height uint16 + Width uint16 + x uint16 + y uint16 + } + + return ioctl(pty.Fd(), uintptr(unix.TIOCSWINSZ), uintptr(unsafe.Pointer(&consoleSize{Height: height, Width: width}))) +} + +func ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + return nil +} + +// ptsname is a Go wrapper around the ptsname system call. It returns the name +// of the slave pseudoterminal device corresponding to the given master. +func ptsname(f *os.File) (string, error) { + var n int32 + if err := ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", errors.Wrap(err, "ioctl TIOCGPTN failed for ptsname") + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +// unlockpt is a Go wrapper around the unlockpt system call. It unlocks the +// slave pseudoterminal device corresponding to the given master. 
+func unlockpt(f *os.File) error { + var u int32 + if err := ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))); err != nil { + return errors.Wrap(err, "ioctl TIOCSPTLCK failed for unlockpt") + } + return nil +} diff --git a/internal/guest/storage/devicemapper/devicemapper.go b/internal/guest/storage/devicemapper/devicemapper.go new file mode 100644 index 0000000000..aecdb43c70 --- /dev/null +++ b/internal/guest/storage/devicemapper/devicemapper.go @@ -0,0 +1,290 @@ +// +build linux + +package devicemapper + +import ( + "fmt" + "os" + "path" + "unsafe" + + "golang.org/x/sys/unix" +) + +// CreateFlags modify the operation of CreateDevice +type CreateFlags int + +const ( + // CreateReadOnly specifies that the device is not writable + CreateReadOnly CreateFlags = 1 << iota +) + +const ( + _IOC_WRITE = 1 + _IOC_READ = 2 + _IOC_NRBITS = 8 + _IOC_TYPEBITS = 8 + _IOC_SIZEBITS = 14 + _IOC_DIRBITS = 2 + + _IOC_NRMASK = ((1 << _IOC_NRBITS) - 1) + _IOC_TYPEMASK = ((1 << _IOC_TYPEBITS) - 1) + _IOC_SIZEMASK = ((1 << _IOC_SIZEBITS) - 1) + _IOC_DIRMASK = ((1 << _IOC_DIRBITS) - 1) + _IOC_TYPESHIFT = (_IOC_NRBITS) + _IOC_SIZESHIFT = (_IOC_TYPESHIFT + _IOC_TYPEBITS) + _IOC_DIRSHIFT = (_IOC_SIZESHIFT + _IOC_SIZEBITS) + + _DM_IOCTL = 0xfd + _DM_IOCTL_SIZE = 312 + _DM_IOCTL_BASE = (_IOC_READ|_IOC_WRITE)<<_IOC_DIRSHIFT | _DM_IOCTL<<_IOC_TYPESHIFT | _DM_IOCTL_SIZE<<_IOC_SIZESHIFT + + _DM_READONLY_FLAG = 1 << 0 + _DM_SUSPEND_FLAG = 1 << 1 + _DM_PERSISTENT_DEV_FLAG = 1 << 3 + + BlockSize = 512 +) + +const ( + _DM_VERSION = iota + _DM_REMOVE_ALL + _DM_LIST_DEVICES + _DM_DEV_CREATE + _DM_DEV_REMOVE + _DM_DEV_RENAME + _DM_DEV_SUSPEND + _DM_DEV_STATUS + _DM_DEV_WAIT + _DM_TABLE_LOAD + _DM_TABLE_CLEAR + _DM_TABLE_DEPS + _DM_TABLE_STATUS +) + +var dmOpName = []string{ + "version", + "remove all", + "list devices", + "device create", + "device remove", + "device rename", + "device suspend", + "device status", + "device wait", + "table load", + "table clear", + "table deps", + "table 
status", +} + +type dmIoctl struct { + Version [3]uint32 + DataSize uint32 + DataStart uint32 + TargetCount uint32 + OpenCount int32 + Flags uint32 + EventNumber uint32 + _ uint32 + Dev uint64 + Name [128]byte + UUID [129]byte + _ [7]byte +} + +type targetSpec struct { + SectorStart int64 + LengthInBlocks int64 + Status int32 + Next uint32 + Type [16]byte +} + +// initIoctl initializes a device-mapper ioctl input struct with the given size +// and device name +func initIoctl(d *dmIoctl, size int, name string) { + *d = dmIoctl{ + Version: [3]uint32{4, 0, 0}, + DataSize: uint32(size), + } + copy(d.Name[:], name) +} + +type dmError struct { + Op int + Err error +} + +func (err *dmError) Error() string { + op := "" + if err.Op < len(dmOpName) { + op = dmOpName[err.Op] + } + return "device-mapper " + op + ": " + err.Err.Error() +} + +// ioctl issues the specified device-mapper ioctl +func ioctl(f *os.File, code int, data *dmIoctl) error { + _, _, errno := unix.Syscall(unix.SYS_IOCTL, f.Fd(), uintptr(code|_DM_IOCTL_BASE), uintptr(unsafe.Pointer(data))) + if errno != 0 { + return &dmError{Op: code, Err: errno} + } + return nil +} + +// openMapper opens the device-mapper control device and validates that it +// supports the required version +func openMapper() (f *os.File, err error) { + f, err = os.OpenFile("/dev/mapper/control", os.O_RDWR, 0) + if err != nil { + return nil, err + } + defer func() { + if err != nil { + f.Close() + } + }() + var d dmIoctl + initIoctl(&d, int(unsafe.Sizeof(d)), "") + err = ioctl(f, _DM_VERSION, &d) + if err != nil { + return nil, err + } + return f, nil +} + +// Target specifies a single entry in a device's target specification. +type Target struct { + Type string + SectorStart int64 + LengthInBlocks int64 + Params string +} + +// sizeof returns the size of a targetSpec needed to fit this specification. 
+func (t *Target) sizeof() int { + // include a null terminator (not sure if necessary) and round up to 8-byte + // alignment + return (int(unsafe.Sizeof(targetSpec{})) + len(t.Params) + 1 + 7) &^ 7 +} + +// LinearTarget constructs a device-mapper target that maps a portion of a block +// device at the specified offset. +// +// Example linear target table: +// 0 20971520 linear /dev/hda 384 +// | | | | | +// start | target data_dev | +// size offset +func LinearTarget(sectorStart, lengthBlocks int64, path string, deviceStart int64) Target { + return Target{ + Type: "linear", + SectorStart: sectorStart, + LengthInBlocks: lengthBlocks, + Params: fmt.Sprintf("%s %d", path, deviceStart), + } +} + +// PMemLinearTarget constructs a LinearTarget for PMem device with 0 sector start and length/device start +// expected to be in bytes rather than blocks +func PMemLinearTarget(lengthBytes int64, path string, deviceStartBytes int64) Target { + lengthInBlocks := lengthBytes / BlockSize + startInBlocks := deviceStartBytes / BlockSize + return LinearTarget(0, lengthInBlocks, path, startInBlocks) +} + +// makeTableIoctl builds an ioctl input structure with a table of the specified +// targets. +func makeTableIoctl(name string, targets []Target) *dmIoctl { + off := int(unsafe.Sizeof(dmIoctl{})) + n := off + for _, t := range targets { + n += t.sizeof() + } + b := make([]byte, n) + d := (*dmIoctl)(unsafe.Pointer(&b[0])) + initIoctl(d, n, name) + d.DataStart = uint32(off) + d.TargetCount = uint32(len(targets)) + for _, t := range targets { + spec := (*targetSpec)(unsafe.Pointer(&b[off])) + sn := t.sizeof() + spec.SectorStart = t.SectorStart + spec.LengthInBlocks = t.LengthInBlocks + spec.Next = uint32(sn) + copy(spec.Type[:], t.Type) + copy(b[off+int(unsafe.Sizeof(*spec)):], t.Params) + off += int(sn) + } + return d +} + +// CreateDevice creates a device-mapper device with the given target spec. It returns +// the path of the new device node. 
+func CreateDevice(name string, flags CreateFlags, targets []Target) (_ string, err error) {
+	f, err := openMapper()
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	var d dmIoctl
+	size := int(unsafe.Sizeof(d))
+	initIoctl(&d, size, name)
+	err = ioctl(f, _DM_DEV_CREATE, &d)
+	if err != nil {
+		return "", err
+	}
+	defer func() {
+		if err != nil {
+			removeDevice(f, name)
+		}
+	}()
+
+	dev := int(d.Dev)
+
+	di := makeTableIoctl(name, targets)
+	if flags&CreateReadOnly != 0 {
+		di.Flags |= _DM_READONLY_FLAG
+	}
+	err = ioctl(f, _DM_TABLE_LOAD, di)
+	if err != nil {
+		return "", err
+	}
+	initIoctl(&d, size, name)
+	err = ioctl(f, _DM_DEV_SUSPEND, &d)
+	if err != nil {
+		return "", err
+	}
+
+	p := path.Join("/dev/mapper", name)
+	os.Remove(p)
+	err = unix.Mknod(p, unix.S_IFBLK|0600, dev)
+	if err != nil {
+		return "", err
+	}
+
+	return p, nil
+}
+
+// RemoveDevice removes a device-mapper device and its associated device node.
+func RemoveDevice(name string) error {
+	f, err := openMapper()
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	os.Remove(path.Join("/dev/mapper", name))
+	return removeDevice(f, name)
+}
+
+func removeDevice(f *os.File, name string) error {
+	var d dmIoctl
+	initIoctl(&d, int(unsafe.Sizeof(d)), name)
+	err := ioctl(f, _DM_DEV_REMOVE, &d)
+	if err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/internal/guest/storage/devicemapper/devicemapper_test.go b/internal/guest/storage/devicemapper/devicemapper_test.go
new file mode 100644
index 0000000000..723fe1496a
--- /dev/null
+++ b/internal/guest/storage/devicemapper/devicemapper_test.go
@@ -0,0 +1,142 @@
+// +build linux
+
+package devicemapper
+
+import (
+	"flag"
+	"os"
+	"testing"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+var (
+	integration = flag.Bool("integration", false, "run integration tests")
+)
+
+func TestMain(m *testing.M) {
+	flag.Parse()
+	m.Run()
+}
+
+func validateDevice(t *testing.T, p string, sectors int64, writable bool) {
+	dev, err := 
os.OpenFile(p, os.O_RDWR|os.O_SYNC, 0) + if err != nil { + t.Fatal(err) + } + defer dev.Close() + + var size int64 + _, _, errno := unix.Syscall(unix.SYS_IOCTL, dev.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&size))) + if errno != 0 { + t.Fatal(errno) + } + if size != sectors*512 { + t.Fatalf("expected %d bytes, got %d", sectors*512, size) + } + + var b [512]byte + _, err = unix.Read(int(dev.Fd()), b[:]) + if err != unix.EIO { + t.Fatalf("expected EIO, got %s", err) + } + _, err = unix.Write(int(dev.Fd()), b[:]) + if writable { + if err != unix.EIO { + t.Fatalf("expected EIO, got %s", err) + } + } else if err != unix.EPERM { + t.Fatalf("expected EPERM, got %s", err) + } + +} + +type device struct { + Name, Path string +} + +func (d *device) Close() (err error) { + if d.Name != "" { + err = RemoveDevice(d.Name) + if err == nil { + d.Name = "" + } + } + return +} + +func createDevice(name string, flags CreateFlags, targets []Target) (*device, error) { + p, err := CreateDevice(name, flags, targets) + if err != nil { + return nil, err + } + return &device{Name: name, Path: p}, nil +} + +func TestCreateError(t *testing.T) { + if !*integration { + t.Skip() + } + d, err := createDevice("test-device", 0, []Target{ + {Type: "error", SectorStart: 0, LengthInBlocks: 1}, + {Type: "error", SectorStart: 1, LengthInBlocks: 2}, + }) + if err != nil { + t.Fatal(err) + } + defer d.Close() + validateDevice(t, d.Path, 3, true) + err = d.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestReadOnlyError(t *testing.T) { + if !*integration { + t.Skip() + } + d, err := createDevice("test-device", CreateReadOnly, []Target{ + {Type: "error", SectorStart: 0, LengthInBlocks: 1}, + {Type: "error", SectorStart: 1, LengthInBlocks: 2}, + }) + if err != nil { + t.Fatal(err) + } + defer d.Close() + validateDevice(t, d.Path, 3, false) + err = d.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestLinearError(t *testing.T) { + if !*integration { + t.Skip() + } + b, err := 
createDevice("base-device", 0, []Target{ + {Type: "error", SectorStart: 0, LengthInBlocks: 100}, + }) + if err != nil { + t.Fatal(err) + } + defer b.Close() + d, err := createDevice("linear-device", 0, []Target{ + LinearTarget(0, 50, b.Path, 50), + }) + if err != nil { + t.Fatal(err) + } + defer d.Close() + validateDevice(t, d.Path, 50, true) + err = d.Close() + if err != nil { + t.Fatal(err) + } + err = b.Close() + if err != nil { + t.Fatal(err) + } +} diff --git a/internal/guest/storage/mount.go b/internal/guest/storage/mount.go new file mode 100644 index 0000000000..9fe230136c --- /dev/null +++ b/internal/guest/storage/mount.go @@ -0,0 +1,180 @@ +// +build linux + +package storage + +import ( + "bufio" + "context" + "fmt" + "os" + "strings" + "syscall" + + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "go.opencensus.io/trace" + "golang.org/x/sys/unix" +) + +const procMountFile = "/proc/mounts" +const numProcMountFields = 6 + +// Test dependencies +var ( + osStat = os.Stat + unixUnmount = unix.Unmount + unixMount = unix.Mount + osRemoveAll = os.RemoveAll + listMounts = listMountPointsUnderPath + + flags = map[string]struct { + clear bool + flag uintptr + }{ + "acl": {false, unix.MS_POSIXACL}, + "async": {true, unix.MS_SYNCHRONOUS}, + "atime": {true, unix.MS_NOATIME}, + "bind": {false, unix.MS_BIND}, + "defaults": {false, 0}, + "dev": {true, unix.MS_NODEV}, + "diratime": {true, unix.MS_NODIRATIME}, + "dirsync": {false, unix.MS_DIRSYNC}, + "exec": {true, unix.MS_NOEXEC}, + "iversion": {false, unix.MS_I_VERSION}, + "lazytime": {false, unix.MS_LAZYTIME}, + "loud": {true, unix.MS_SILENT}, + "mand": {false, unix.MS_MANDLOCK}, + "noacl": {true, unix.MS_POSIXACL}, + "noatime": {false, unix.MS_NOATIME}, + "nodev": {false, unix.MS_NODEV}, + "nodiratime": {false, unix.MS_NODIRATIME}, + "noexec": {false, unix.MS_NOEXEC}, + "noiversion": {true, unix.MS_I_VERSION}, + "nolazytime": {true, unix.MS_LAZYTIME}, + "nomand": {true, unix.MS_MANDLOCK}, + 
"norelatime": {true, unix.MS_RELATIME}, + "nostrictatime": {true, unix.MS_STRICTATIME}, + "nosuid": {false, unix.MS_NOSUID}, + "rbind": {false, unix.MS_BIND | unix.MS_REC}, + "relatime": {false, unix.MS_RELATIME}, + "remount": {false, unix.MS_REMOUNT}, + "ro": {false, unix.MS_RDONLY}, + "rw": {true, unix.MS_RDONLY}, + "silent": {false, unix.MS_SILENT}, + "strictatime": {false, unix.MS_STRICTATIME}, + "suid": {true, unix.MS_NOSUID}, + "sync": {false, unix.MS_SYNCHRONOUS}, + } + + propagationFlags = map[string]uintptr{ + "private": unix.MS_PRIVATE, + "shared": unix.MS_SHARED, + "slave": unix.MS_SLAVE, + "unbindable": unix.MS_UNBINDABLE, + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "rshared": unix.MS_SHARED | unix.MS_REC, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + } +) + +func ParseMountOptions(options []string) (flagOpts uintptr, pgFlags []uintptr, data []string) { + for _, o := range options { + if f, exists := flags[o]; exists && f.flag != 0 { + if f.clear { + flagOpts &= ^f.flag + } else { + flagOpts |= f.flag + } + } else if f, exists := propagationFlags[o]; exists && f != 0 { + pgFlags = append(pgFlags, f) + } else { + data = append(data, o) + } + } + return +} + +// MountRShared creates a bind mountpoint and marks it as rshared +// Expected that the filepath exists before calling this function +func MountRShared(path string) error { + if path == "" { + return errors.New("Path must not be empty to mount as rshared") + } + if err := unixMount(path, path, "", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("Failed to create bind mount for %v: %v", path, err) + } + if err := unixMount(path, path, "", syscall.MS_SHARED|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("Failed to make %v rshared: %v", path, err) + } + return nil +} + +// UnmountPath unmounts the target path if it exists and is a mount path. If +// removeTarget this will remove the previously mounted folder. 
+func UnmountPath(ctx context.Context, target string, removeTarget bool) (err error) { + _, span := trace.StartSpan(ctx, "storage::UnmountPath") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + span.AddAttributes( + trace.StringAttribute("target", target), + trace.BoolAttribute("remove", removeTarget)) + + if _, err := osStat(target); err != nil { + if os.IsNotExist(err) { + return nil + } + return errors.Wrapf(err, "failed to determine if path '%s' exists", target) + } + + if err := unixUnmount(target, 0); err != nil { + // If `Unmount` returns `EINVAL` it's not mounted. Just delete the + // folder. + if err != unix.EINVAL { + return errors.Wrapf(err, "failed to unmount path '%s'", target) + } + } + if removeTarget { + return osRemoveAll(target) + } + return nil +} + +func UnmountAllInPath(ctx context.Context, path string, removeTarget bool) (err error) { + childMounts, err := listMounts(path) + if err != nil { + return err + } + + for i := len(childMounts) - 1; i >= 0; i-- { + childPath := childMounts[i] + if err := UnmountPath(ctx, childPath, removeTarget); err != nil { + return err + } + } + return nil +} + +func listMountPointsUnderPath(path string) ([]string, error) { + var mountPoints []string + f, err := os.Open(procMountFile) + if err != nil { + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + fields := strings.Split(line, " ") + if len(fields) < numProcMountFields { + continue + } + destPath := fields[1] + if strings.HasPrefix(destPath, path) { + mountPoints = append(mountPoints, destPath) + } + } + + return mountPoints, nil +} diff --git a/internal/guest/storage/mount_test.go b/internal/guest/storage/mount_test.go new file mode 100644 index 0000000000..2ddb838d69 --- /dev/null +++ b/internal/guest/storage/mount_test.go @@ -0,0 +1,212 @@ +// +build linux + +package storage + +import ( + "context" + "os" + "testing" + + "github.com/pkg/errors" + 
"golang.org/x/sys/unix" +) + +func clearTestDependencies() { + osStat = nil + unixUnmount = nil + osRemoveAll = nil +} + +func Test_Unmount_Stat_Valid_Path(t *testing.T) { + clearTestDependencies() + + expectedName := "/dev/fake" + osStat = func(name string) (os.FileInfo, error) { + if expectedName != name { + t.Errorf("expected name: %s, got: %s", expectedName, name) + return nil, errors.New("unexpected name") + } + return nil, nil + } + unixUnmount = func(target string, flags int) error { + return nil + } + err := UnmountPath(context.Background(), expectedName, false) + if err != nil { + t.Fatalf("expected nil error, got: %v", err) + } +} + +func Test_Unmount_Stat_NotExist(t *testing.T) { + clearTestDependencies() + + // Should return early + osStat = func(name string) (os.FileInfo, error) { + return nil, os.ErrNotExist + } + err := UnmountPath(context.Background(), "/dev/fake", false) + if err != nil { + t.Fatalf("expected nil error, got: %v", err) + } +} + +func Test_Unmount_Stat_OtherError_Error(t *testing.T) { + clearTestDependencies() + + expectedErr := errors.New("expected stat err") + osStat = func(name string) (os.FileInfo, error) { + return nil, expectedErr + } + err := UnmountPath(context.Background(), "/dev/fake", false) + if errors.Cause(err) != expectedErr { + t.Fatalf("expected err: %v, got: %v", expectedErr, err) + } +} + +func Test_Unmount_Valid_Target(t *testing.T) { + clearTestDependencies() + + osStat = func(name string) (os.FileInfo, error) { + return nil, nil + } + expectedTarget := "/dev/fake" + unixUnmount = func(target string, flags int) error { + if expectedTarget != target { + t.Errorf("expected target: %s, got: %s", expectedTarget, target) + return errors.New("unexpected target") + } + return nil + } + err := UnmountPath(context.Background(), expectedTarget, false) + if err != nil { + t.Fatalf("expected nil error, got: %v", err) + } +} + +func Test_Unmount_Valid_Flags(t *testing.T) { + clearTestDependencies() + + osStat = func(name 
string) (os.FileInfo, error) { + return nil, nil + } + unixUnmount = func(target string, flags int) error { + if 0 != flags { + t.Errorf("expected flags 0, got: %d", flags) + return errors.New("unexpected flags") + } + return nil + } + err := UnmountPath(context.Background(), "/fake/path", false) + if err != nil { + t.Fatalf("expected nil error, got: %v", err) + } +} + +func Test_Unmount_NotMounted(t *testing.T) { + clearTestDependencies() + + osStat = func(name string) (os.FileInfo, error) { + return nil, nil + } + unixUnmount = func(target string, flags int) error { + return unix.EINVAL + } + err := UnmountPath(context.Background(), "/dev/fake", false) + if err != nil { + t.Fatalf("expected nil error, got: %v", err) + } +} + +func Test_Unmount_OtherError(t *testing.T) { + clearTestDependencies() + + osStat = func(name string) (os.FileInfo, error) { + return nil, nil + } + expectedErr := errors.New("expected unmount error") + unixUnmount = func(target string, flags int) error { + return expectedErr + } + err := UnmountPath(context.Background(), "/dev/fake", false) + if errors.Cause(err) != expectedErr { + t.Fatalf("expected err: %v, got: %v", expectedErr, err) + } +} + +func Test_Unmount_RemoveAll_Valid_Path(t *testing.T) { + clearTestDependencies() + + osStat = func(name string) (os.FileInfo, error) { + return nil, nil + } + unixUnmount = func(target string, flags int) error { + return nil + } + expectedPath := "/fake/path" + osRemoveAll = func(path string) error { + if expectedPath != path { + t.Errorf("expected path %s, got: %s", expectedPath, path) + return errors.New("unexpected path") + } + return nil + } + err := UnmountPath(context.Background(), expectedPath, true) + if err != nil { + t.Fatalf("expected nil error, got: %v", err) + } +} + +func Test_Unmount_RemoveAll_Called(t *testing.T) { + clearTestDependencies() + + osStat = func(name string) (os.FileInfo, error) { + return nil, nil + } + unixUnmount = func(target string, flags int) error { + return nil 
+ } + removeAllCalled := false + osRemoveAll = func(path string) error { + removeAllCalled = true + return nil + } + err := UnmountPath(context.Background(), "/fake/path", true) + if err != nil { + t.Fatalf("expected nil error, got: %v", err) + } + if !removeAllCalled { + t.Fatal("expected remove to be called") + } +} + +func Test_UnmountAllInPath_Unmount_Order(t *testing.T) { + clearTestDependencies() + parent := "/fake" + child := "/fake/test" + listMounts = func(path string) ([]string, error) { + return []string{parent, child}, nil + } + + osStat = func(name string) (os.FileInfo, error) { + return nil, nil + } + + timesCalled := 0 + unixUnmount = func(target string, flags int) error { + if timesCalled == 0 && target != child { + return errors.Errorf("expected to unmount %v first, got %v", child, target) + } + timesCalled += 1 + return nil + } + + osRemoveAll = func(path string) error { + return nil + } + + err := UnmountAllInPath(context.Background(), parent, true) + + if err != nil { + t.Fatalf("expected nil error, got: %v", err) + } +} diff --git a/internal/guest/storage/overlay/overlay.go b/internal/guest/storage/overlay/overlay.go new file mode 100644 index 0000000000..07fc24eff1 --- /dev/null +++ b/internal/guest/storage/overlay/overlay.go @@ -0,0 +1,93 @@ +// +build linux + +package overlay + +import ( + "context" + "os" + "strings" + + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "go.opencensus.io/trace" + "golang.org/x/sys/unix" +) + +// Test dependencies +var ( + osMkdirAll = os.MkdirAll + osRemoveAll = os.RemoveAll + unixMount = unix.Mount +) + +// Mount creates an overlay mount with `layerPaths` at `rootfsPath`. +// +// If `upperdirPath != ""` the path will be created. On mount failure the +// created `upperdirPath` will be automatically cleaned up. +// +// If `workdirPath != ""` the path will be created. On mount failure the created +// `workdirPath` will be automatically cleaned up. +// +// Always creates `rootfsPath`. 
On mount failure the created `rootfsPath` will +// be automatically cleaned up. +func Mount(ctx context.Context, layerPaths []string, upperdirPath, workdirPath, rootfsPath string, readonly bool) (err error) { + _, span := trace.StartSpan(ctx, "overlay::Mount") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + lowerdir := strings.Join(layerPaths, ":") + span.AddAttributes( + trace.StringAttribute("layerPaths", lowerdir), + trace.StringAttribute("upperdirPath", upperdirPath), + trace.StringAttribute("workdirPath", workdirPath), + trace.StringAttribute("rootfsPath", rootfsPath), + trace.BoolAttribute("readonly", readonly)) + + if rootfsPath == "" { + return errors.New("cannot have empty rootfsPath") + } + + if readonly && (upperdirPath != "" || workdirPath != "") { + return errors.Errorf("upperdirPath: %q, and workdirPath: %q must be empty when readonly==true", upperdirPath, workdirPath) + } + + options := []string{"lowerdir=" + lowerdir} + if upperdirPath != "" { + if err := osMkdirAll(upperdirPath, 0755); err != nil { + return errors.Wrap(err, "failed to create upper directory in scratch space") + } + defer func() { + if err != nil { + osRemoveAll(upperdirPath) + } + }() + options = append(options, "upperdir="+upperdirPath) + } + if workdirPath != "" { + if err := osMkdirAll(workdirPath, 0755); err != nil { + return errors.Wrap(err, "failed to create workdir in scratch space") + } + defer func() { + if err != nil { + osRemoveAll(workdirPath) + } + }() + options = append(options, "workdir="+workdirPath) + } + if err := osMkdirAll(rootfsPath, 0755); err != nil { + return errors.Wrapf(err, "failed to create directory for container root filesystem %s", rootfsPath) + } + defer func() { + if err != nil { + osRemoveAll(rootfsPath) + } + }() + var flags uintptr + if readonly { + flags |= unix.MS_RDONLY + } + if err := unixMount("overlay", rootfsPath, "overlay", flags, strings.Join(options, ",")); err != nil { + return errors.Wrapf(err, "failed to mount 
container root filesystem using overlayfs %s", rootfsPath) + } + return nil +} diff --git a/internal/guest/storage/overlay/overlay_test.go b/internal/guest/storage/overlay/overlay_test.go new file mode 100644 index 0000000000..48363c1d41 --- /dev/null +++ b/internal/guest/storage/overlay/overlay_test.go @@ -0,0 +1,130 @@ +// +build linux + +package overlay + +import ( + "context" + "errors" + "os" + "testing" +) + +type undo struct { + osMkdirAll func(string, os.FileMode) error + osRemoveAll func(string) error + unixMount func(string, string, string, uintptr, string) error +} + +func (u *undo) Close() { + osMkdirAll = u.osMkdirAll + unixMount = u.unixMount +} + +// Captures the actual product function context and returns them on `Close()`. +// It sets the test dependencies to `nil` so that any unpredicted call to that +// function will cause the test to panic. +func captureTestMethods() *undo { + u := &undo{ + osMkdirAll: osMkdirAll, + osRemoveAll: osRemoveAll, + unixMount: unixMount, + } + osMkdirAll = nil + osRemoveAll = nil + unixMount = nil + return u +} + +func Test_Mount_Success(t *testing.T) { + undo := captureTestMethods() + defer undo.Close() + + var upperCreated, workCreated, rootCreated bool + osMkdirAll = func(path string, perm os.FileMode) error { + if perm != 0755 { + t.Errorf("os.MkdirAll at: %s, perm: %v expected perm: 0755", path, perm) + } + switch path { + case "/upper": + upperCreated = true + return nil + case "/work": + workCreated = true + return nil + case "/root": + rootCreated = true + return nil + } + return errors.New("unexpected os.MkdirAll path") + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + if source != "overlay" { + t.Errorf("expected source: 'overlay' got: %v", source) + } + if target != "/root" { + t.Errorf("expected target: '/root' got: %v", target) + } + if fstype != "overlay" { + t.Errorf("expected fstype: 'overlay' got: %v", fstype) + } + if flags != 0 { + 
t.Errorf("expected flags: '0' got: %v", flags) + } + if data != "lowerdir=/layer1:/layer2,upperdir=/upper,workdir=/work" { + t.Errorf("expected data: 'lowerdir=/layer1:/layer2,upperdir=/upper,workdir=/work' got: %v", data) + } + return nil + } + + err := Mount(context.Background(), []string{"/layer1", "/layer2"}, "/upper", "/work", "/root", false) + if err != nil { + t.Fatalf("expected no error got: %v", err) + } + if !upperCreated || !workCreated || !rootCreated { + t.Fatalf("expected all upper: %v, work: %v, root: %v to be created", upperCreated, workCreated, rootCreated) + } +} + +func Test_Mount_Readonly_Success(t *testing.T) { + undo := captureTestMethods() + defer undo.Close() + + var rootCreated bool + osMkdirAll = func(path string, perm os.FileMode) error { + if perm != 0755 { + t.Errorf("os.MkdirAll at: %s, perm: %v expected perm: 0755", path, perm) + } + switch path { + case "/root": + rootCreated = true + return nil + } + return errors.New("unexpected os.MkdirAll path") + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + if source != "overlay" { + t.Errorf("expected source: 'overlay' got: %v", source) + } + if target != "/root" { + t.Errorf("expected target: '/root' got: %v", target) + } + if fstype != "overlay" { + t.Errorf("expected fstype: 'overlay' got: %v", fstype) + } + if flags != 0 { + t.Errorf("expected flags: '0' got: %v", flags) + } + if data != "lowerdir=/layer1:/layer2" { + t.Errorf("expected data: 'lowerdir=/layer1:/layer2' got: %v", data) + } + return nil + } + + err := Mount(context.Background(), []string{"/layer1", "/layer2"}, "", "", "/root", false) + if err != nil { + t.Fatalf("expected no error got: %v", err) + } + if !rootCreated { + t.Fatal("expected root to be created") + } +} diff --git a/internal/guest/storage/pci/pci.go b/internal/guest/storage/pci/pci.go new file mode 100644 index 0000000000..e75b253186 --- /dev/null +++ b/internal/guest/storage/pci/pci.go @@ -0,0 +1,64 @@ 
+// +build linux + +package pci + +import ( + "context" + "fmt" + "path/filepath" + "strings" + + "github.com/Microsoft/hcsshim/internal/guest/storage" + "github.com/Microsoft/hcsshim/internal/guest/storage/vmbus" +) + +var storageWaitForFileMatchingPattern = storage.WaitForFileMatchingPattern +var vmbusWaitForDevicePath = vmbus.WaitForDevicePath + +// WaitForPCIDeviceFromVMBusGUID waits for bus location path of the device to be present +func WaitForPCIDeviceFromVMBusGUID(ctx context.Context, vmBusGUID string) error { + _, err := FindDeviceBusLocationFromVMBusGUID(ctx, vmBusGUID) + return err +} + +// FindDeviceBusLocationFromVMBusGUID finds device bus location by +// reading /sys/bus/vmbus/devices//... for pci specific directories +func FindDeviceBusLocationFromVMBusGUID(ctx context.Context, vmBusGUID string) (string, error) { + pciDir, err := findVMBusPCIDir(ctx, vmBusGUID) + if err != nil { + return "", err + } + + pciDeviceLocation, err := findVMBusPCIDevice(ctx, pciDir) + if err != nil { + return "", err + } + return pciDeviceLocation, nil +} + +// findVMBusPCIDir waits for the pci bus directory matching pattern +// /sys/bus/vmbus/devices//pci* to exist and returns +// the full resulting path or an error +func findVMBusPCIDir(ctx context.Context, vmBusGUID string) (string, error) { + vmBusPCIPathPattern := filepath.Join(vmBusGUID, "pci*") + return vmbusWaitForDevicePath(ctx, vmBusPCIPathPattern) +} + +// findVMBusPCIDevice waits for the pci bus location directory under the path +// returned from findVMBusPCIDir to exist and returns the pci bus location or an error +func findVMBusPCIDevice(ctx context.Context, pciDirFullPath string) (string, error) { + // trim /sys/bus/vmbus/devices//pciXXXX:XX to XXXX:XX + _, pciDirName := filepath.Split(pciDirFullPath) + busPrefix := strings.TrimPrefix(pciDirName, "pci") + + // under /sys/bus/vmbus/devices//pciXXXX:XX/ look for directory matching XXXX:XX* pattern + busPathPattern := filepath.Join(pciDirFullPath, 
fmt.Sprintf("%s*", busPrefix))
+	busFileFullPath, err := storageWaitForFileMatchingPattern(ctx, busPathPattern)
+	if err != nil {
+		return "", err
+	}
+
+	// return the resulting XXXX:XX:YY.Y pci bus location
+	_, busFile := filepath.Split(busFileFullPath)
+	return busFile, nil
+}
diff --git a/internal/guest/storage/pci/pci_test.go b/internal/guest/storage/pci/pci_test.go
new file mode 100644
index 0000000000..2c27dc4f96
--- /dev/null
+++ b/internal/guest/storage/pci/pci_test.go
@@ -0,0 +1,43 @@
+// +build linux
+
+package pci
+
+import (
+	"context"
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+func Test_WaitForPCIDeviceFromVMBusGUID_Success(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second); defer cancel()
+
+	vmBusGUID := "1111-2222-3333-4444"
+	pciDir := "pci1234:00"
+	busLocation := "1234:00:00.0"
+
+	vmbusWaitForDevicePath = func(ctx context.Context, vmbusGUIDPattern string) (string, error) {
+		vmBusPath := filepath.Join("/sys/bus/vmbus/devices", vmbusGUIDPattern)
+		vmBusDirPath, targetPattern := filepath.Split(vmBusPath)
+		if targetPattern == "pci*" {
+			return filepath.Join(vmBusDirPath, pciDir), nil
+		}
+		return "", nil
+	}
+
+	storageWaitForFileMatchingPattern = func(ctx context.Context, pattern string) (string, error) {
+		vmBusPciDirPath, targetPattern := filepath.Split(pattern)
+		if targetPattern == "1234:00*" {
+			return filepath.Join(vmBusPciDirPath, busLocation), nil
+		}
+		return "", nil
+	}
+
+	resultBusLocation, err := FindDeviceBusLocationFromVMBusGUID(ctx, vmBusGUID)
+	if err != nil {
+		t.Fatalf("expected to succeed, instead got: %v", err)
+	}
+	if resultBusLocation != busLocation {
+		t.Fatalf("result %s does not match expected result %s", resultBusLocation, busLocation)
+	}
+}
diff --git a/internal/guest/storage/plan9/plan9.go b/internal/guest/storage/plan9/plan9.go
new file mode 100644
index 0000000000..796d4685cc
--- /dev/null
+++ b/internal/guest/storage/plan9/plan9.go
@@ -0,0 +1,84 @@
+// +build linux
+
+package plan9
+
+import ( + "context" + "fmt" + "os" + "syscall" + + "github.com/Microsoft/hcsshim/internal/guest/transport" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "go.opencensus.io/trace" + "golang.org/x/sys/unix" +) + +const packetPayloadBytes = 65536 + +// Test dependencies +var ( + osMkdirAll = os.MkdirAll + osRemoveAll = os.RemoveAll + unixMount = unix.Mount +) + +// Mount dials a connection from `vsock` and mounts a Plan9 share to `target`. +// +// `target` will be created. On mount failure the created `target` will be +// automatically cleaned up. +func Mount(ctx context.Context, vsock transport.Transport, target, share string, port uint32, readonly bool) (err error) { + _, span := trace.StartSpan(ctx, "plan9::Mount") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + span.AddAttributes( + trace.StringAttribute("target", target), + trace.StringAttribute("share", share), + trace.Int64Attribute("port", int64(port)), + trace.BoolAttribute("readonly", readonly)) + + if err := osMkdirAll(target, 0700); err != nil { + return err + } + defer func() { + if err != nil { + osRemoveAll(target) + } + }() + conn, err := vsock.Dial(port) + if err != nil { + return errors.Wrapf(err, "could not connect to plan9 server for %s", target) + } + f, err := conn.File() + conn.Close() + if err != nil { + return errors.Wrapf(err, "could not get file for plan9 connection for %s", target) + } + defer f.Close() + + var mountOptions uintptr + data := fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,msize=%d", f.Fd(), f.Fd(), packetPayloadBytes) + if readonly { + mountOptions |= unix.MS_RDONLY + data += ",noload" + } + if share != "" { + data += ",aname=" + share + } + + // set socket options to maximize bandwidth + err = syscall.SetsockoptInt(int(f.Fd()), syscall.SOL_SOCKET, syscall.SO_RCVBUF, packetPayloadBytes) + if err != nil { + return errors.Wrapf(err, "failed to set sock option syscall.SO_RCVBUF to %v on fd %v", packetPayloadBytes, f.Fd()) + } + err = 
syscall.SetsockoptInt(int(f.Fd()), syscall.SOL_SOCKET, syscall.SO_SNDBUF, packetPayloadBytes) + if err != nil { + return errors.Wrapf(err, "failed to set sock option syscall.SO_SNDBUF to %v on fd %v", packetPayloadBytes, f.Fd()) + } + if err := unixMount(target, target, "9p", mountOptions, data); err != nil { + return errors.Wrapf(err, "failed to mount directory for mapped directory %s", target) + } + return nil +} diff --git a/internal/guest/storage/pmem/pmem.go b/internal/guest/storage/pmem/pmem.go new file mode 100644 index 0000000000..63a9158987 --- /dev/null +++ b/internal/guest/storage/pmem/pmem.go @@ -0,0 +1,209 @@ +// +build linux + +package pmem + +import ( + "context" + "fmt" + "github.com/Microsoft/hcsshim/internal/guest/prot" + "github.com/Microsoft/hcsshim/internal/log" + "os" + + "github.com/Microsoft/hcsshim/internal/guest/storage" + dm "github.com/Microsoft/hcsshim/internal/guest/storage/devicemapper" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "go.opencensus.io/trace" + "golang.org/x/sys/unix" +) + +// Test dependencies +var ( + osMkdirAll = os.MkdirAll + osRemoveAll = os.RemoveAll + unixMount = unix.Mount +) + +const ( + pMemFmt = "/dev/pmem%d" + linearDeviceFmt = "dm-linear-pmem%d-%d-%d" + verityDeviceFmt = "dm-verity-pmem%d-%s" +) + +// mountInternal mounts source to target via unix.Mount +func mountInternal(ctx context.Context, source, target string) (err error) { + if err := osMkdirAll(target, 0700); err != nil { + return err + } + defer func() { + if err != nil { + if err := osRemoveAll(target); err != nil { + log.G(ctx).WithError(err).Debugf("error cleaning up target: %s", target) + } + } + }() + + flags := uintptr(unix.MS_RDONLY) + if err := unixMount(source, target, "ext4", flags, "noload"); err != nil { + return errors.Wrapf(err, "failed to mount %s onto %s", source, target) + } + return nil +} + +// Mount mounts the pmem device at `/dev/pmem` to `target` in a basic scenario. 
+// If either mappingInfo or verityInfo are non-nil, the device-mapper framework is used
+// to create linear and verity targets accordingly. If both are non-nil, the linear
+// target is created first and used as the data/hash device for the verity target.
+//
+// `target` will be created. On mount failure the created `target` will be
+// automatically cleaned up.
+//
+// Note: For now the platform only supports readonly pmem that is assumed to be
+// `ext4`.
+//
+// Note: both mappingInfo and verityInfo can be non-nil at the same time; in that case
+// the linear target is created first and becomes the data/hash device for the verity target.
+func Mount(ctx context.Context, device uint32, target string, mappingInfo *prot.DeviceMappingInfo, verityInfo *prot.DeviceVerityInfo) (err error) {
+	mCtx, span := trace.StartSpan(ctx, "pmem::Mount")
+	defer span.End()
+	defer func() { oc.SetSpanStatus(span, err) }()
+
+	span.AddAttributes(
+		trace.Int64Attribute("deviceNumber", int64(device)),
+		trace.StringAttribute("target", target))
+
+	devicePath := fmt.Sprintf(pMemFmt, device)
+	// dm linear target has to be created first. when verity info is also present, the linear target becomes the data
+	// device instead of the original VPMem.
+ if mappingInfo != nil { + dmLinearName := fmt.Sprintf(linearDeviceFmt, device, mappingInfo.DeviceOffsetInBytes, mappingInfo.DeviceSizeInBytes) + if dmLinearPath, err := createDMLinearTarget(mCtx, devicePath, dmLinearName, target, mappingInfo); err != nil { + return err + } else { + devicePath = dmLinearPath + } + defer func() { + if err != nil { + if err := dm.RemoveDevice(dmLinearName); err != nil { + log.G(mCtx).WithError(err).Debugf("failed to cleanup linear target: %s", dmLinearName) + } + } + }() + } + + if verityInfo != nil { + dmVerityName := fmt.Sprintf(verityDeviceFmt, device, verityInfo.RootDigest) + if dmVerityPath, err := createDMVerityTarget(mCtx, devicePath, dmVerityName, target, verityInfo); err != nil { + return err + } else { + devicePath = dmVerityPath + } + defer func() { + if err != nil { + if err := dm.RemoveDevice(dmVerityName); err != nil { + log.G(mCtx).WithError(err).Debugf("failed to cleanup verity target: %s", dmVerityName) + } + } + }() + } + + return mountInternal(mCtx, devicePath, target) +} + +// createDMLinearTarget creates dm-linear target from a given `device` slot location and `mappingInfo` +func createDMLinearTarget(ctx context.Context, devPath, devName string, target string, mappingInfo *prot.DeviceMappingInfo) (_ string, err error) { + _, span := trace.StartSpan(ctx, "pmem::createDMLinearTarget") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + linearTarget := dm.PMemLinearTarget(mappingInfo.DeviceSizeInBytes, devPath, mappingInfo.DeviceOffsetInBytes) + + span.AddAttributes( + trace.StringAttribute("devicePath", devPath), + trace.Int64Attribute("deviceStart", mappingInfo.DeviceOffsetInBytes), + trace.Int64Attribute("sectorSize", mappingInfo.DeviceSizeInBytes), + trace.StringAttribute("target", target), + trace.StringAttribute("linearTable", fmt.Sprintf("%s: '%d %d %s'", devName, linearTarget.SectorStart, linearTarget.LengthInBlocks, linearTarget.Params))) + + devMapperPath, err := 
dm.CreateDevice(devName, dm.CreateReadOnly, []dm.Target{linearTarget}) + if err != nil { + return "", errors.Wrapf(err, "failed to create dm-linear target: pmem device: %s, offset: %d", devPath, mappingInfo.DeviceOffsetInBytes) + } + + return devMapperPath, nil +} + +// createDMVerityTarget creates a dm-verity target for a given device and mounts that target instead of the device itself +// +// verity target table +// 0 417792 verity 1 /dev/sdb /dev/sdc 4096 4096 52224 1 sha256 2aa4f7b7b6...f4952060e8 762307f4bc8...d2a6b7595d8.. +// | | | | | | | | | | | | | +// start| | | data_dev | data_block | #blocks | hash_alg root_digest salt +// size | version hash_dev | hash_offset +// target hash_block +func createDMVerityTarget(ctx context.Context, devPath, devName, target string, verityInfo *prot.DeviceVerityInfo) (_ string, err error) { + _, span := trace.StartSpan(ctx, "pmem::createDMVerityTarget") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + dmBlocks := verityInfo.Ext4SizeInBytes / dm.BlockSize + dataBlocks := verityInfo.Ext4SizeInBytes / int64(verityInfo.BlockSize) + hashOffsetBlocks := dataBlocks + if verityInfo.SuperBlock { + hashOffsetBlocks++ + } + hashes := fmt.Sprintf("%s %s %s", verityInfo.Algorithm, verityInfo.RootDigest, verityInfo.Salt) + blkInfo := fmt.Sprintf("%d %d %d %d", verityInfo.BlockSize, verityInfo.BlockSize, dataBlocks, hashOffsetBlocks) + devices := fmt.Sprintf("%s %s", devPath, devPath) + + verityTarget := dm.Target{ + SectorStart: 0, + LengthInBlocks: dmBlocks, + Type: "verity", + Params: fmt.Sprintf("%d %s %s %s", verityInfo.Version, devices, blkInfo, hashes), + } + + span.AddAttributes( + trace.StringAttribute("devicePath", devPath), + trace.StringAttribute("target", target), + trace.Int64Attribute("sectorSize", dmBlocks), + trace.StringAttribute("verityTable", verityTarget.Params)) + + mapperPath, err := dm.CreateDevice(devName, dm.CreateReadOnly, []dm.Target{verityTarget}) + if err != nil { + return "", 
errors.Wrapf(err, "failed to create dm-verity target: pmem device: %s", devPath) + } + + return mapperPath, nil +} + +// Unmount unmounts `target` and removes corresponding linear and verity targets when needed +func Unmount(ctx context.Context, devNumber uint32, target string, mappingInfo *prot.DeviceMappingInfo, verityInfo *prot.DeviceVerityInfo) (err error) { + _, span := trace.StartSpan(ctx, "pmem::Unmount") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + span.AddAttributes( + trace.Int64Attribute("device", int64(devNumber)), + trace.StringAttribute("target", target)) + + if err := storage.UnmountPath(ctx, target, true); err != nil { + return errors.Wrapf(err, "failed to unmount target: %s", target) + } + + if verityInfo != nil { + dmVerityName := fmt.Sprintf(verityDeviceFmt, devNumber, verityInfo.RootDigest) + if err := dm.RemoveDevice(dmVerityName); err != nil { + return errors.Wrapf(err, "failed to remove dm verity target: %s", dmVerityName) + } + } + + if mappingInfo != nil { + dmLinearName := fmt.Sprintf(linearDeviceFmt, devNumber, mappingInfo.DeviceOffsetInBytes, mappingInfo.DeviceSizeInBytes) + if err := dm.RemoveDevice(dmLinearName); err != nil { + return errors.Wrapf(err, "failed to remove dm linear target: %s", dmLinearName) + } + } + + return nil +} diff --git a/internal/guest/storage/pmem/pmem_test.go b/internal/guest/storage/pmem/pmem_test.go new file mode 100644 index 0000000000..35f1489e72 --- /dev/null +++ b/internal/guest/storage/pmem/pmem_test.go @@ -0,0 +1,226 @@ +// +build linux + +package pmem + +import ( + "context" + "fmt" + "os" + "testing" + + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +func clearTestDependencies() { + osMkdirAll = nil + osRemoveAll = nil + unixMount = nil +} + +func Test_Mount_Mkdir_Fails_Error(t *testing.T) { + clearTestDependencies() + + expectedErr := errors.New("mkdir : no such file or directory") + osMkdirAll = func(path string, perm os.FileMode) error { + return expectedErr + } 
+ err := Mount(context.Background(), 0, "", nil, nil) + if errors.Cause(err) != expectedErr { + t.Fatalf("expected err: %v, got: %v", expectedErr, err) + } +} + +func Test_Mount_Mkdir_ExpectedPath(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. + + target := "/fake/path" + osMkdirAll = func(path string, perm os.FileMode) error { + if path != target { + t.Errorf("expected path: %v, got: %v", target, path) + return errors.New("unexpected path") + } + return nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + // Fake the mount success + return nil + } + err := Mount(context.Background(), 0, target, nil, nil) + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } +} + +func Test_Mount_Mkdir_ExpectedPerm(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + target := "/fake/path" + osMkdirAll = func(path string, perm os.FileMode) error { + if perm != os.FileMode(0700) { + t.Errorf("expected perm: %v, got: %v", os.FileMode(0700), perm) + return errors.New("unexpected perm") + } + return nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + // Fake the mount success + return nil + } + err := Mount(context.Background(), 0, target, nil, nil) + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } +} + +func Test_Mount_Calls_RemoveAll_OnMountFailure(t *testing.T) { + clearTestDependencies() + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + target := "/fake/path" + removeAllCalled := false + osRemoveAll = func(path string) error { + removeAllCalled = true + if path != target { + t.Errorf("expected path: %v, got: %v", target, path) + return errors.New("unexpected path") + } + return nil + } + expectedErr := errors.New("unexpected mount failure") + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + // Fake the mount failure to test remove is called + return expectedErr + } + err := Mount(context.Background(), 0, target, nil, nil) + if errors.Cause(err) != expectedErr { + t.Fatalf("expected err: %v, got: %v", expectedErr, err) + } + if !removeAllCalled { + t.Fatal("expected os.RemoveAll to be called on mount failure") + } +} + +func Test_Mount_Valid_Source(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + device := uint32(20) + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + expected := fmt.Sprintf("/dev/pmem%d", device) + if source != expected { + t.Errorf("expected source: %s, got: %s", expected, source) + return errors.New("unexpected source") + } + return nil + } + err := Mount(context.Background(), device, "/fake/path", nil, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Valid_Target(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + expectedTarget := "/fake/path" + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + if expectedTarget != target { + t.Errorf("expected target: %s, got: %s", expectedTarget, target) + return errors.New("unexpected target") + } + return nil + } + err := Mount(context.Background(), 0, expectedTarget, nil, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Valid_FSType(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + expectedFSType := "ext4" + if expectedFSType != fstype { + t.Errorf("expected fstype: %s, got: %s", expectedFSType, fstype) + return errors.New("unexpected fstype") + } + return nil + } + err := Mount(context.Background(), 0, "/fake/path", nil, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Valid_Flags(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + expectedFlags := uintptr(unix.MS_RDONLY) + if expectedFlags != flags { + t.Errorf("expected flags: %v, got: %v", expectedFlags, flags) + return errors.New("unexpected flags") + } + return nil + } + err := Mount(context.Background(), 0, "/fake/path", nil, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Valid_Data(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + expectedData := "noload" + if expectedData != data { + t.Errorf("expected data: %s, got: %s", expectedData, data) + return errors.New("unexpected data") + } + return nil + } + err := Mount(context.Background(), 0, "/fake/path", nil, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} diff --git a/internal/guest/storage/scsi/scsi.go b/internal/guest/storage/scsi/scsi.go new file mode 100644 index 0000000000..5a5841015c --- /dev/null +++ b/internal/guest/storage/scsi/scsi.go @@ -0,0 +1,170 @@ +// +build linux + +package scsi + +import ( + "context" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "time" + + "github.com/Microsoft/hcsshim/internal/guest/storage" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/oc" + "github.com/pkg/errors" + "go.opencensus.io/trace" + "golang.org/x/sys/unix" +) + +// Test dependencies +var ( + osMkdirAll = os.MkdirAll + osRemoveAll = os.RemoveAll + unixMount = unix.Mount + + // controllerLunToName is stubbed to make testing `Mount` easier. + controllerLunToName = ControllerLunToName +) + +// Mount creates a mount from the SCSI device on `controller` index `lun` to +// `target` +// +// `target` will be created. On mount failure the created `target` will be +// automatically cleaned up. 
+func Mount(ctx context.Context, controller, lun uint8, target string, readonly bool, options []string) (err error) {
+	ctx, span := trace.StartSpan(ctx, "scsi::Mount")
+	defer span.End()
+	defer func() { oc.SetSpanStatus(span, err) }()
+
+	span.AddAttributes(
+		trace.Int64Attribute("controller", int64(controller)),
+		trace.Int64Attribute("lun", int64(lun)))
+
+	if err := osMkdirAll(target, 0700); err != nil {
+		return err
+	}
+	defer func() {
+		if err != nil {
+			osRemoveAll(target)
+		}
+	}()
+	source, err := controllerLunToName(ctx, controller, lun)
+	if err != nil {
+		return err
+	}
+
+	// we only care about readonly mount option when mounting the device
+	var flags uintptr
+	data := ""
+	if readonly {
+		flags |= unix.MS_RDONLY
+		data = "noload"
+	}
+	for {
+		if err := unixMount(source, target, "ext4", flags, data); err != nil {
+			// The `source` found by controllerLunToName can take some time
+			// before it's actually available under `/dev/sd*`. Retry while we
+			// wait for `source` to show up.
+			if err == unix.ENOENT {
+				select {
+				case <-ctx.Done():
+					return ctx.Err()
+				default:
+					time.Sleep(10 * time.Millisecond)
+					continue
+				}
+			}
+			return err
+		}
+		break
+	}
+
+	// remount the target to account for propagation flags
+	_, pgFlags, _ := storage.ParseMountOptions(options)
+	if len(pgFlags) != 0 {
+		for _, pg := range pgFlags {
+			if err := unixMount(target, target, "", pg, ""); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// ControllerLunToName finds the `/dev/sd*` path to the SCSI device on
+// `controller` index `lun`.
+func ControllerLunToName(ctx context.Context, controller, lun uint8) (_ string, err error) {
+	ctx, span := trace.StartSpan(ctx, "scsi::ControllerLunToName")
+	defer span.End()
+	defer func() { oc.SetSpanStatus(span, err) }()
+
+	span.AddAttributes(
+		trace.Int64Attribute("controller", int64(controller)),
+		trace.Int64Attribute("lun", int64(lun)))
+
+	scsiID := fmt.Sprintf("0:0:%d:%d", controller, lun)
+
+	// Devices matching the given SCSI code should each have a subdirectory
+	// under /sys/bus/scsi/devices/<scsiID>/block.
+	blockPath := filepath.Join("/sys/bus/scsi/devices", scsiID, "block")
+	var deviceNames []os.FileInfo
+	for {
+		deviceNames, err = ioutil.ReadDir(blockPath)
+		if err != nil && !os.IsNotExist(err) {
+			return "", err
+		}
+		if len(deviceNames) == 0 {
+			select {
+			case <-ctx.Done():
+				return "", ctx.Err()
+			default:
+				time.Sleep(time.Millisecond * 10)
+				continue
+			}
+		}
+		break
+	}
+
+	if len(deviceNames) == 0 {
+		return "", errors.Errorf("no matching device names found for SCSI ID \"%s\"", scsiID)
+	}
+	if len(deviceNames) > 1 {
+		return "", errors.Errorf("more than one block device could match SCSI ID \"%s\"", scsiID)
+	}
+
+	devicePath := filepath.Join("/dev", deviceNames[0].Name())
+	log.G(ctx).WithField("devicePath", devicePath).Debug("found device path")
+	return devicePath, nil
+}
+
+// UnplugDevice finds the SCSI device on `controller` index `lun` and issues a
+// guest initiated unplug.
+//
+// If the device is not attached returns no error.
+func UnplugDevice(ctx context.Context, controller, lun uint8) (err error) { + _, span := trace.StartSpan(ctx, "scsi::UnplugDevice") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + span.AddAttributes( + trace.Int64Attribute("controller", int64(controller)), + trace.Int64Attribute("lun", int64(lun))) + + scsiID := fmt.Sprintf("0:0:%d:%d", controller, lun) + f, err := os.OpenFile(filepath.Join("/sys/bus/scsi/devices", scsiID, "delete"), os.O_WRONLY, 0644) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + if _, err := f.Write([]byte("1\n")); err != nil { + return err + } + return nil +} diff --git a/internal/guest/storage/scsi/scsi_test.go b/internal/guest/storage/scsi/scsi_test.go new file mode 100644 index 0000000000..3969ec1885 --- /dev/null +++ b/internal/guest/storage/scsi/scsi_test.go @@ -0,0 +1,387 @@ +// +build linux + +package scsi + +import ( + "context" + "errors" + "os" + "testing" + + "golang.org/x/sys/unix" +) + +func clearTestDependencies() { + osMkdirAll = nil + osRemoveAll = nil + unixMount = nil + controllerLunToName = nil +} + +func Test_Mount_Mkdir_Fails_Error(t *testing.T) { + clearTestDependencies() + + expectedErr := errors.New("mkdir : no such file or directory") + osMkdirAll = func(path string, perm os.FileMode) error { + return expectedErr + } + err := Mount(context.Background(), 0, 0, "", false, nil) + if err != expectedErr { + t.Fatalf("expected err: %v, got: %v", expectedErr, err) + } +} + +func Test_Mount_Mkdir_ExpectedPath(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + target := "/fake/path" + osMkdirAll = func(path string, perm os.FileMode) error { + if path != target { + t.Errorf("expected path: %v, got: %v", target, path) + return errors.New("unexpected path") + } + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + // Fake the mount success + return nil + } + err := Mount(context.Background(), 0, 0, target, false, nil) + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } +} + +func Test_Mount_Mkdir_ExpectedPerm(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. + + target := "/fake/path" + osMkdirAll = func(path string, perm os.FileMode) error { + if perm != os.FileMode(0700) { + t.Errorf("expected perm: %v, got: %v", os.FileMode(0700), perm) + return errors.New("unexpected perm") + } + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + // Fake the mount success + return nil + } + err := Mount(context.Background(), 0, 0, target, false, nil) + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } +} + +func Test_Mount_ControllerLunToName_Valid_Controller(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + expectedController := uint8(2) + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + if expectedController != controller { + t.Errorf("expected controller: %v, got: %v", expectedController, controller) + return "", errors.New("unexpected controller") + } + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + // Fake the mount success + return nil + } + err := Mount(context.Background(), expectedController, 0, "/fake/path", false, nil) + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } +} + +func Test_Mount_ControllerLunToName_Valid_Lun(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + expectedLun := uint8(2) + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + if expectedLun != lun { + t.Errorf("expected lun: %v, got: %v", expectedLun, lun) + return "", errors.New("unexpected lun") + } + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + // Fake the mount success + return nil + } + err := Mount(context.Background(), 0, expectedLun, "/fake/path", false, nil) + if err != nil { + t.Fatalf("expected nil error got: %v", err) + } +} + +func Test_Mount_Calls_RemoveAll_OnControllerToLunFailure(t *testing.T) { + clearTestDependencies() + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + expectedErr := errors.New("expected controller to lun failure") + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", expectedErr + } + target := "/fake/path" + removeAllCalled := false + osRemoveAll = func(path string) error { + 
removeAllCalled = true + if path != target { + t.Errorf("expected path: %v, got: %v", target, path) + return errors.New("unexpected path") + } + return nil + } + + // NOTE: Do NOT set unixMount because the controller to lun fails. Expect it + // not to be called. + + err := Mount(context.Background(), 0, 0, target, false, nil) + if err != expectedErr { + t.Fatalf("expected err: %v, got: %v", expectedErr, err) + } + if !removeAllCalled { + t.Fatal("expected os.RemoveAll to be called on mount failure") + } +} + +func Test_Mount_Calls_RemoveAll_OnMountFailure(t *testing.T) { + clearTestDependencies() + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + target := "/fake/path" + removeAllCalled := false + osRemoveAll = func(path string) error { + removeAllCalled = true + if path != target { + t.Errorf("expected path: %v, got: %v", target, path) + return errors.New("unexpected path") + } + return nil + } + expectedErr := errors.New("unexpected mount failure") + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + // Fake the mount failure to test remove is called + return expectedErr + } + err := Mount(context.Background(), 0, 0, target, false, nil) + if err != expectedErr { + t.Fatalf("expected err: %v, got: %v", expectedErr, err) + } + if !removeAllCalled { + t.Fatal("expected os.RemoveAll to be called on mount failure") + } +} + +func Test_Mount_Valid_Source(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + expectedSource := "/dev/sdz" + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return expectedSource, nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + if expectedSource != source { + t.Errorf("expected source: %s, got: %s", expectedSource, source) + return errors.New("unexpected source") + } + return nil + } + err := Mount(context.Background(), 0, 0, "/fake/path", false, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Valid_Target(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + expectedTarget := "/fake/path" + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + if expectedTarget != target { + t.Errorf("expected target: %s, got: %s", expectedTarget, target) + return errors.New("unexpected target") + } + return nil + } + err := Mount(context.Background(), 0, 0, expectedTarget, false, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Valid_FSType(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + expectedFSType := "ext4" + if expectedFSType != fstype { + t.Errorf("expected fstype: %s, got: %s", expectedFSType, fstype) + return errors.New("unexpected fstype") + } + return nil + } + err := Mount(context.Background(), 0, 0, "/fake/path", false, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Valid_Flags(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + expectedFlags := uintptr(0) + if expectedFlags != flags { + t.Errorf("expected flags: %v, got: %v", expectedFlags, flags) + return errors.New("unexpected flags") + } + return nil + } + err := Mount(context.Background(), 0, 0, "/fake/path", false, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Readonly_Valid_Flags(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + expectedFlags := uintptr(unix.MS_RDONLY) + if expectedFlags != flags { + t.Errorf("expected flags: %v, got: %v", expectedFlags, flags) + return errors.New("unexpected flags") + } + return nil + } + err := Mount(context.Background(), 0, 0, "/fake/path", true, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Valid_Data(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. + + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + if "" != data { + t.Errorf("expected empty data, got: %s", data) + return errors.New("unexpected data") + } + return nil + } + err := Mount(context.Background(), 0, 0, "/fake/path", false, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} + +func Test_Mount_Readonly_Valid_Data(t *testing.T) { + clearTestDependencies() + + // NOTE: Do NOT set osRemoveAll because the mount succeeds. Expect it not to + // be called. 
+ + osMkdirAll = func(path string, perm os.FileMode) error { + return nil + } + controllerLunToName = func(ctx context.Context, controller, lun uint8) (string, error) { + return "", nil + } + unixMount = func(source string, target string, fstype string, flags uintptr, data string) error { + expectedData := "noload" + if expectedData != data { + t.Errorf("expected data: %s, got: %s", expectedData, data) + return errors.New("unexpected data") + } + return nil + } + err := Mount(context.Background(), 0, 0, "/fake/path", true, nil) + if err != nil { + t.Fatalf("expected nil err, got: %v", err) + } +} diff --git a/internal/guest/storage/utilities.go b/internal/guest/storage/utilities.go new file mode 100644 index 0000000000..1a4727e537 --- /dev/null +++ b/internal/guest/storage/utilities.go @@ -0,0 +1,38 @@ +// +build linux + +package storage + +import ( + "context" + "fmt" + "path/filepath" + "time" + + "github.com/pkg/errors" +) + +// export this variable so it can be mocked to aid in testing for consuming packages +var filepathglob = filepath.Glob + +// WaitForFileMatchingPattern waits for a single file that matches the given path pattern and returns the full path +// to the resulting file +func WaitForFileMatchingPattern(ctx context.Context, pattern string) (string, error) { + for { + files, err := filepathglob(pattern) + if err != nil { + return "", err + } + if len(files) == 0 { + select { + case <-ctx.Done(): + return "", errors.Wrapf(ctx.Err(), "timed out waiting for file matching pattern %s to exist", pattern) + default: + time.Sleep(time.Millisecond * 10) + continue + } + } else if len(files) > 1 { + return "", fmt.Errorf("more than one file could exist for pattern \"%s\"", pattern) + } + return files[0], nil + } +} diff --git a/internal/guest/storage/utilities_test.go b/internal/guest/storage/utilities_test.go new file mode 100644 index 0000000000..239f95f056 --- /dev/null +++ b/internal/guest/storage/utilities_test.go @@ -0,0 +1,87 @@ +// +build linux + 
+package storage
+
+import (
+	"context"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+func Test_WaitForFileMatchingPattern_Success(t *testing.T) {
+	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+
+	testDir, err := ioutil.TempDir("", "vmbus_test")
+	if err != nil {
+		t.Fatalf("unexpected error creating temp dir %v", err)
+	}
+	defer os.RemoveAll(testDir)
+
+	actualPath := filepath.Join(testDir, "path1")
+	err = os.Mkdir(actualPath, 0777)
+	if err != nil {
+		t.Fatalf("unexpected error creating test path: %v", err)
+	}
+
+	pathPattern := filepath.Join(testDir, "path*")
+	pathsToTest := []string{actualPath, pathPattern}
+	for _, p := range pathsToTest {
+		result, err := WaitForFileMatchingPattern(ctx, p)
+		if err != nil {
+			t.Fatalf("expected to find path %v but got error: %v", p, err)
+		}
+		if result != actualPath {
+			t.Fatalf("expected to return path %s, instead got %s", actualPath, result)
+		}
+	}
+}
+
+func Test_WaitForFileMatchingPattern_Multiple_Matches(t *testing.T) {
+	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+
+	testDir, err := ioutil.TempDir("", "vmbus_test")
+	if err != nil {
+		t.Fatalf("unexpected error creating temp dir %v", err)
+	}
+	defer os.RemoveAll(testDir)
+
+	actualPaths := []string{"path1", "path2"}
+	for _, p := range actualPaths {
+		fullPath := filepath.Join(testDir, p)
+		err := os.Mkdir(fullPath, 0777)
+		if err != nil {
+			t.Fatalf("unexpected error creating test path: %v", err)
+		}
+	}
+
+	pathPattern := filepath.Join(testDir, "path*")
+	_, err = WaitForFileMatchingPattern(ctx, pathPattern)
+	if err == nil {
+		t.Fatalf("expected to fail due to multiple matching files")
+	}
+}
+
+func Test_WaitForFileMatchingPattern_No_Matches(t *testing.T) {
+	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+
+	testDir, err := ioutil.TempDir("", "vmbus_test")
+	if err != nil {
+		t.Fatalf("unexpected error creating temp dir %v", err)
+	}
+	defer os.RemoveAll(testDir)
+
+	actualPath := filepath.Join(testDir, "path1")
+	err = os.Mkdir(actualPath, 0777)
+	if err != nil {
+		t.Fatalf("unexpected error creating test path: %v", err)
+	}
+
+	badTestPath := filepath.Join(testDir, "path2")
+	_, err = WaitForFileMatchingPattern(ctx, badTestPath)
+	if err == nil {
+		t.Fatalf("expected to fail due to no matching files")
+	}
+}
diff --git a/internal/guest/storage/vmbus/vmbus.go b/internal/guest/storage/vmbus/vmbus.go
new file mode 100644
index 0000000000..6ea91bbb9c
--- /dev/null
+++ b/internal/guest/storage/vmbus/vmbus.go
@@ -0,0 +1,18 @@
+// +build linux
+
+package vmbus
+
+import (
+	"context"
+	"path/filepath"
+
+	"github.com/Microsoft/hcsshim/internal/guest/storage"
+)
+
+var storageWaitForFileMatchingPattern = storage.WaitForFileMatchingPattern
+
+// WaitForDevicePath waits for the vmbus device to exist at /sys/bus/vmbus/devices/...
+func WaitForDevicePath(ctx context.Context, vmbusGUIDPattern string) (string, error) {
+	vmBusPath := filepath.Join("/sys/bus/vmbus/devices", vmbusGUIDPattern)
+	return storageWaitForFileMatchingPattern(ctx, vmBusPath)
+}
diff --git a/internal/guest/storage/vmbus/vmbus_test.go b/internal/guest/storage/vmbus/vmbus_test.go
new file mode 100644
index 0000000000..491865c21d
--- /dev/null
+++ b/internal/guest/storage/vmbus/vmbus_test.go
@@ -0,0 +1,35 @@
+// +build linux
+
+package vmbus
+
+import (
+	"context"
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+func Test_WaitForVMBusDevicePath_Success(t *testing.T) {
+	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+
+	vmBusGUID := "1111-2222-3333-4444"
+	pciDir := "pci1234:00"
+
+	storageWaitForFileMatchingPattern = func(ctx context.Context, pattern string) (string, error) {
+		vmBusDirPath, targetPattern := filepath.Split(pattern)
+		if targetPattern == "pci*" {
+			return filepath.Join(vmBusDirPath, pciDir), nil
+		}
+		return "", nil
+	}
+
+	vmBusGUIDPattern := filepath.Join(vmBusGUID, "pci*")
+	expectedResult := filepath.Join("/sys/bus/vmbus/devices", vmBusGUID, pciDir)
+	result, err := WaitForDevicePath(ctx, vmBusGUIDPattern)
+	if err != nil {
+		t.Fatalf("expected to succeed, instead got: %v", err)
+	}
+	if result != expectedResult {
+		t.Fatalf("result %s does not match expected result %s", result, expectedResult)
+	}
+}
diff --git a/internal/guest/transport/transport.go b/internal/guest/transport/transport.go
new file mode 100644
index 0000000000..0bff0299d5
--- /dev/null
+++ b/internal/guest/transport/transport.go
@@ -0,0 +1,28 @@
+// Package transport defines the interfaces describing a connection-like data
+// transport mechanism.
+package transport
+
+import (
+	"io"
+	"os"
+)
+
+// Transport is the interface defining a method of transporting data in a
+// connection-like way.
+// Examples of a Transport implementation could be:
+//  Hyper-V socket transport
+//  TCP/IP socket transport
+//  Mocked-out local transport
+type Transport interface {
+	// Dial takes a port number and returns a connected connection.
+	Dial(port uint32) (Connection, error)
+}
+
+// Connection is the interface defining a data connection, such as a socket or
+// a mocked implementation.
+type Connection interface {
+	io.ReadWriteCloser
+	CloseRead() error
+	CloseWrite() error
+	File() (*os.File, error)
+}
diff --git a/internal/guest/transport/vsock.go b/internal/guest/transport/vsock.go
new file mode 100644
index 0000000000..c731b984aa
--- /dev/null
+++ b/internal/guest/transport/vsock.go
@@ -0,0 +1,50 @@
+package transport
+
+import (
+	"fmt"
+	"syscall"
+	"time"
+
+	"github.com/linuxkit/virtsock/pkg/vsock"
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+)
+
+//nolint:deadcode,varcheck
+const (
+	vmaddrCidHost = 2
+	vmaddrCidAny  = 0xffffffff
+)
+
+// VsockTransport is an implementation of Transport which uses vsock
+// sockets.
+type VsockTransport struct{}
+
+var _ Transport = &VsockTransport{}
+
+// Dial accepts a vsock socket port number as configuration, and
+// returns a connected Connection.
+func (t *VsockTransport) Dial(port uint32) (Connection, error) { + logrus.WithFields(logrus.Fields{ + "port": port, + }).Info("opengcs::VsockTransport::Dial - vsock dial port") + + // HACK: Remove loop when vsock bugs are fixed! + // Retry 10 times because vsock.Dial can return connection time out + // due to some underlying kernel bug. + for i := 0; i < 10; i++ { + conn, err := vsock.Dial(vmaddrCidHost, port) + if err == nil { + return conn, nil + } + // If the error was ETIMEDOUT retry, otherwise fail. + cause := errors.Cause(err) + if errno, ok := cause.(syscall.Errno); ok && errno == syscall.ETIMEDOUT { + time.Sleep(100 * time.Millisecond) + continue + } else { + return nil, errors.Wrapf(err, "vsock Dial port (%d) failed", port) + } + } + return nil, fmt.Errorf("failed connecting the VsockConnection: can't connect after 10 attempts") +} diff --git a/opengcs/README.md b/opengcs/README.md new file mode 100644 index 0000000000..acc247fa90 --- /dev/null +++ b/opengcs/README.md @@ -0,0 +1,34 @@ + +# Open Guest Compute Service (opengcs) [![Build Status](https://github.com/microsoft/opengcs/actions/workflows/ci.yml/badge.svg)](https://github.com/microsoft/opengcs/actions/workflows/ci.yml) + +Open Guest Compute Service is a Linux open source project to further the development of a production quality implementation of Linux Hyper-V containers on Windows (LCOW). It's designed to run inside a custom Linux OS for supporting Linux container payload. + +# LCOW v1 (deprecated) + +The original version of `LCOW v1` was designed to run directly through `Docker` against the `HCS` (Host Compute Service) on Windows. This workflow is no longer supported by this repository however it has not been intentionally broken. If you would like to continue to use `LCOW v1` there is a branch `lcow_v1` that is the LKG branch previous to the removal of `LCOW v1` from the `master` branchline. All future efforts are focused on `LCOW v2`. 
+ +# LCOW v2 + +The primary difference between `LCOW v1` and `LCOW v2` is that `v1` was designed to hide the concept of the Utility VM. The caller created a _Linux container_ and operated on the container as if it was natively running on Windows. In the background a lightweight Utility VM was created that actually hosted the container but this was not visible and its resources not controllable via the caller. Although this works, it severely limited certain abilities such as the concept of Kubernetes pod or placing multiple LCOW containers in a single hypervisor boundary and set of resources. + +Thus `LCOW v2` was created which has two primary differences. +- The Utility VM backing the Linux containers is a first class construct. Thus it can be managed in a lifetime separate from the actual containers running in it. +- The communication from host to guest is no longer done via the platform. This means that `LCOW v2` can iterate simply by improving its host/guest protocol with no need for taking Windows updates. + +The focus of `LCOW v2` as a replacement of `LCOW v1` is through the coordination and work that has gone into [containerd/containerd](https://github.com/containerd/containerd) and its [Runtime V2](https://github.com/containerd/containerd/tree/master/runtime/v2) interface. To see our `containerd` hostside shim please look here [Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1](https://github.com/microsoft/hcsshim/tree/master/cmd/containerd-shim-runhcs-v1). + +# Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.microsoft.com. + +When you submit a pull request, a CLA-bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., label, comment). 
Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +We also ask that contributors [sign their commits](https://git-scm.com/docs/git-commit) using `git commit -s` or `git commit --signoff` to certify they either authored the work themselves or otherwise have permission to use it in this project. + +# Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. diff --git a/vendor/github.com/containerd/cgroups/.gitignore b/vendor/github.com/containerd/cgroups/.gitignore new file mode 100644 index 0000000000..3465c14cf7 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/.gitignore @@ -0,0 +1,2 @@ +example/example +cmd/cgctl/cgctl diff --git a/vendor/github.com/containerd/cgroups/Makefile b/vendor/github.com/containerd/cgroups/Makefile new file mode 100644 index 0000000000..19e6607561 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/Makefile @@ -0,0 +1,24 @@ +# Copyright The containerd Authors. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PACKAGES=$(shell go list ./... 
| grep -v /vendor/) + +all: cgutil + go build -v + +cgutil: + cd cmd/cgctl && go build -v + +proto: + protobuild --quiet ${PACKAGES} diff --git a/vendor/github.com/containerd/cgroups/Protobuild.toml b/vendor/github.com/containerd/cgroups/Protobuild.toml new file mode 100644 index 0000000000..1c4c802fe1 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/Protobuild.toml @@ -0,0 +1,46 @@ +version = "unstable" +generator = "gogoctrd" +plugins = ["grpc"] + +# Control protoc include paths. Below are usually some good defaults, but feel +# free to try it without them if it works for your project. +[includes] + # Include paths that will be added before all others. Typically, you want to + # treat the root of the project as an include, but this may not be necessary. + # before = ["."] + + # Paths that should be treated as include roots in relation to the vendor + # directory. These will be calculated with the vendor directory nearest the + # target package. + # vendored = ["github.com/gogo/protobuf"] + packages = ["github.com/gogo/protobuf"] + + # Paths that will be added untouched to the end of the includes. We use + # `/usr/local/include` to pickup the common install location of protobuf. + # This is the default. + after = ["/usr/local/include", "/usr/include"] + +# This section maps protobuf imports to Go packages. These will become +# `-M` directives in the call to the go protobuf generator. +[packages] + "gogoproto/gogo.proto" = "github.com/gogo/protobuf/gogoproto" + "google/protobuf/any.proto" = "github.com/gogo/protobuf/types" + "google/protobuf/descriptor.proto" = "github.com/gogo/protobuf/protoc-gen-gogo/descriptor" + "google/protobuf/field_mask.proto" = "github.com/gogo/protobuf/types" + "google/protobuf/timestamp.proto" = "github.com/gogo/protobuf/types" + +# Aggregrate the API descriptors to lock down API changes. 
+[[descriptors]] +prefix = "github.com/containerd/cgroups/stats/v1" +target = "stats/v1/metrics.pb.txt" +ignore_files = [ + "google/protobuf/descriptor.proto", + "gogoproto/gogo.proto" +] +[[descriptors]] +prefix = "github.com/containerd/cgroups/v2/stats" +target = "v2/stats/metrics.pb.txt" +ignore_files = [ + "google/protobuf/descriptor.proto", + "gogoproto/gogo.proto" +] diff --git a/vendor/github.com/containerd/cgroups/README.md b/vendor/github.com/containerd/cgroups/README.md new file mode 100644 index 0000000000..d4b09f3d61 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/README.md @@ -0,0 +1,149 @@ +# cgroups + +[![Build Status](https://github.com/containerd/cgroups/workflows/CI/badge.svg)](https://github.com/containerd/cgroups/actions?query=workflow%3ACI) +[![codecov](https://codecov.io/gh/containerd/cgroups/branch/master/graph/badge.svg)](https://codecov.io/gh/containerd/cgroups) +[![GoDoc](https://godoc.org/github.com/containerd/cgroups?status.svg)](https://godoc.org/github.com/containerd/cgroups) +[![Go Report Card](https://goreportcard.com/badge/github.com/containerd/cgroups)](https://goreportcard.com/report/github.com/containerd/cgroups) + +Go package for creating, managing, inspecting, and destroying cgroups. +The resources format for settings on the cgroup uses the OCI runtime-spec found +[here](https://github.com/opencontainers/runtime-spec). + +## Examples + +### Create a new cgroup + +This creates a new cgroup using a static path for all subsystems under `/test`. + +* /sys/fs/cgroup/cpu/test +* /sys/fs/cgroup/memory/test +* etc.... + +It uses a single hierarchy and specifies cpu shares as a resource constraint and +uses the v1 implementation of cgroups. 
+ + +```go +shares := uint64(100) +control, err := cgroups.New(cgroups.V1, cgroups.StaticPath("/test"), &specs.LinuxResources{ + CPU: &specs.CPU{ + Shares: &shares, + }, +}) +defer control.Delete() +``` + +### Create with systemd slice support + + +```go +control, err := cgroups.New(cgroups.Systemd, cgroups.Slice("system.slice", "runc-test"), &specs.LinuxResources{ + CPU: &specs.CPU{ + Shares: &shares, + }, +}) + +``` + +### Load an existing cgroup + +```go +control, err = cgroups.Load(cgroups.V1, cgroups.StaticPath("/test")) +``` + +### Add a process to the cgroup + +```go +if err := control.Add(cgroups.Process{Pid:1234}); err != nil { +} +``` + +### Update the cgroup + +To update the resources applied in the cgroup + +```go +shares = uint64(200) +if err := control.Update(&specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Shares: &shares, + }, +}); err != nil { +} +``` + +### Freeze and Thaw the cgroup + +```go +if err := control.Freeze(); err != nil { +} +if err := control.Thaw(); err != nil { +} +``` + +### List all processes in the cgroup or recursively + +```go +processes, err := control.Processes(cgroups.Devices, recursive) +``` + +### Get Stats on the cgroup + +```go +stats, err := control.Stat() +``` + +By adding `cgroups.IgnoreNotExist` all non-existent files will be ignored, e.g. swap memory stats without swap enabled +```go +stats, err := control.Stat(cgroups.IgnoreNotExist) +``` + +### Move process across cgroups + +This allows you to take processes from one cgroup and move them to another. + +```go +err := control.MoveTo(destination) +``` + +### Create subcgroup + +```go +subCgroup, err := control.New("child", resources) +``` + +### Registering for memory events + +This allows you to get notified by an eventfd for v1 memory cgroups events. 
+ +```go +event := cgroups.MemoryThresholdEvent(50 * 1024 * 1024, false) +efd, err := control.RegisterMemoryEvent(event) +``` + +```go +event := cgroups.MemoryPressureEvent(cgroups.MediumPressure, cgroups.DefaultMode) +efd, err := control.RegisterMemoryEvent(event) +``` + +```go +efd, err := control.OOMEventFD() +// or by using RegisterMemoryEvent +event := cgroups.OOMEvent() +efd, err := control.RegisterMemoryEvent(event) +``` + +### Attention + +All static path should not include `/sys/fs/cgroup/` prefix, it should start with your own cgroups name + +## Project details + +Cgroups is a containerd sub-project, licensed under the [Apache 2.0 license](./LICENSE). +As a containerd sub-project, you will find the: + + * [Project governance](https://github.com/containerd/project/blob/master/GOVERNANCE.md), + * [Maintainers](https://github.com/containerd/project/blob/master/MAINTAINERS), + * and [Contributing guidelines](https://github.com/containerd/project/blob/master/CONTRIBUTING.md) + +information in our [`containerd/project`](https://github.com/containerd/project) repository. 
diff --git a/vendor/github.com/containerd/cgroups/Vagrantfile b/vendor/github.com/containerd/cgroups/Vagrantfile new file mode 100644 index 0000000000..4596ad8a7d --- /dev/null +++ b/vendor/github.com/containerd/cgroups/Vagrantfile @@ -0,0 +1,46 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +Vagrant.configure("2") do |config| +# Fedora box is used for testing cgroup v2 support + config.vm.box = "fedora/32-cloud-base" + config.vm.provider :virtualbox do |v| + v.memory = 2048 + v.cpus = 2 + end + config.vm.provider :libvirt do |v| + v.memory = 2048 + v.cpus = 2 + end + config.vm.provision "shell", inline: <<-SHELL + set -eux -o pipefail + # configuration + GO_VERSION="1.15" + + # install gcc and Golang + dnf -y install gcc + curl -fsSL "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" | tar Cxz /usr/local + + # setup env vars + cat >> /etc/profile.d/sh.local < /test.sh < 0 { + return nil + } + + // Even the kernel is compiled with the CFQ scheduler, the cgroup may not use + // block devices with the CFQ scheduler. If so, we should fallback to throttle.* files. 
+ settings = []blkioStatSettings{ + { + name: "throttle.io_serviced", + entry: &stats.Blkio.IoServicedRecursive, + }, + { + name: "throttle.io_service_bytes", + entry: &stats.Blkio.IoServiceBytesRecursive, + }, + } + for _, t := range settings { + if err := b.readEntry(devices, path, t.name, t.entry); err != nil { + return err + } + } + return nil +} + +func (b *blkioController) readEntry(devices map[deviceKey]string, path, name string, entry *[]*v1.BlkIOEntry) error { + f, err := os.Open(filepath.Join(b.Path(path), "blkio."+name)) + if err != nil { + return err + } + defer f.Close() + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkIOStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return fmt.Errorf("invalid line found while parsing %s: %s", path, sc.Text()) + } + } + major, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return err + } + minor, err := strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return err + } + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err := strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return err + } + *entry = append(*entry, &v1.BlkIOEntry{ + Device: devices[deviceKey{major, minor}], + Major: major, + Minor: minor, + Op: op, + Value: v, + }) + } + return sc.Err() +} + +func createBlkioSettings(blkio *specs.LinuxBlockIO) []blkioSettings { + settings := []blkioSettings{} + + if blkio.Weight != nil { + settings = append(settings, + blkioSettings{ + name: "weight", + value: blkio.Weight, + format: uintf, + }) + } + if blkio.LeafWeight != nil { + settings = append(settings, + blkioSettings{ + name: "leaf_weight", + value: blkio.LeafWeight, + format: uintf, + }) + } + for _, wd := range blkio.WeightDevice { + if wd.Weight != nil { + settings = append(settings, + blkioSettings{ + name: "weight_device", 
+ value: wd, + format: weightdev, + }) + } + if wd.LeafWeight != nil { + settings = append(settings, + blkioSettings{ + name: "leaf_weight_device", + value: wd, + format: weightleafdev, + }) + } + } + for _, t := range []struct { + name string + list []specs.LinuxThrottleDevice + }{ + { + name: "throttle.read_bps_device", + list: blkio.ThrottleReadBpsDevice, + }, + { + name: "throttle.read_iops_device", + list: blkio.ThrottleReadIOPSDevice, + }, + { + name: "throttle.write_bps_device", + list: blkio.ThrottleWriteBpsDevice, + }, + { + name: "throttle.write_iops_device", + list: blkio.ThrottleWriteIOPSDevice, + }, + } { + for _, td := range t.list { + settings = append(settings, blkioSettings{ + name: t.name, + value: td, + format: throttleddev, + }) + } + } + return settings +} + +type blkioSettings struct { + name string + value interface{} + format func(v interface{}) []byte +} + +type blkioStatSettings struct { + name string + entry *[]*v1.BlkIOEntry +} + +func uintf(v interface{}) []byte { + return []byte(strconv.FormatUint(uint64(*v.(*uint16)), 10)) +} + +func weightdev(v interface{}) []byte { + wd := v.(specs.LinuxWeightDevice) + return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, *wd.Weight)) +} + +func weightleafdev(v interface{}) []byte { + wd := v.(specs.LinuxWeightDevice) + return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, *wd.LeafWeight)) +} + +func throttleddev(v interface{}) []byte { + td := v.(specs.LinuxThrottleDevice) + return []byte(fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)) +} + +func splitBlkIOStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +type deviceKey struct { + major, minor uint64 +} + +// getDevices makes a best effort attempt to read all the devices into a map +// keyed by major and minor number. Since devices may be mapped multiple times, +// we err on taking the first occurrence. 
+func getDevices(r io.Reader) (map[deviceKey]string, error) { + + var ( + s = bufio.NewScanner(r) + devices = make(map[deviceKey]string) + ) + for s.Scan() { + fields := strings.Fields(s.Text()) + major, err := strconv.Atoi(fields[0]) + if err != nil { + return nil, err + } + minor, err := strconv.Atoi(fields[1]) + if err != nil { + return nil, err + } + key := deviceKey{ + major: uint64(major), + minor: uint64(minor), + } + if _, ok := devices[key]; ok { + continue + } + devices[key] = filepath.Join("/dev", fields[2]) + } + return devices, s.Err() +} diff --git a/vendor/github.com/containerd/cgroups/cgroup.go b/vendor/github.com/containerd/cgroups/cgroup.go new file mode 100644 index 0000000000..e0e014b282 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/cgroup.go @@ -0,0 +1,552 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" +) + +// New returns a new control via the cgroup cgroups interface +func New(hierarchy Hierarchy, path Path, resources *specs.LinuxResources, opts ...InitOpts) (Cgroup, error) { + config := newInitConfig() + for _, o := range opts { + if err := o(config); err != nil { + return nil, err + } + } + subsystems, err := hierarchy() + if err != nil { + return nil, err + } + var active []Subsystem + for _, s := range subsystems { + // check if subsystem exists + if err := initializeSubsystem(s, path, resources); err != nil { + if err == ErrControllerNotActive { + if config.InitCheck != nil { + if skerr := config.InitCheck(s, path, err); skerr != nil { + if skerr != ErrIgnoreSubsystem { + return nil, skerr + } + } + } + continue + } + return nil, err + } + active = append(active, s) + } + return &cgroup{ + path: path, + subsystems: active, + }, nil +} + +// Load will load an existing cgroup and allow it to be controlled +// All static path should not include `/sys/fs/cgroup/` prefix, it should start with your own cgroups name +func Load(hierarchy Hierarchy, path Path, opts ...InitOpts) (Cgroup, error) { + config := newInitConfig() + for _, o := range opts { + if err := o(config); err != nil { + return nil, err + } + } + var activeSubsystems []Subsystem + subsystems, err := hierarchy() + if err != nil { + return nil, err + } + // check that the subsystems still exist, and keep only those that actually exist + for _, s := range pathers(subsystems) { + p, err := path(s.Name()) + if err != nil { + if os.IsNotExist(errors.Cause(err)) { + return nil, ErrCgroupDeleted + } + if err == ErrControllerNotActive { + if config.InitCheck != nil { + if skerr := config.InitCheck(s, path, err); skerr != nil { + if skerr != ErrIgnoreSubsystem { + return nil, 
skerr + } + } + } + continue + } + return nil, err + } + if _, err := os.Lstat(s.Path(p)); err != nil { + if os.IsNotExist(err) { + continue + } + return nil, err + } + activeSubsystems = append(activeSubsystems, s) + } + // if we do not have any active systems then the cgroup is deleted + if len(activeSubsystems) == 0 { + return nil, ErrCgroupDeleted + } + return &cgroup{ + path: path, + subsystems: activeSubsystems, + }, nil +} + +type cgroup struct { + path Path + + subsystems []Subsystem + mu sync.Mutex + err error +} + +// New returns a new sub cgroup +func (c *cgroup) New(name string, resources *specs.LinuxResources) (Cgroup, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + path := subPath(c.path, name) + for _, s := range c.subsystems { + if err := initializeSubsystem(s, path, resources); err != nil { + return nil, err + } + } + return &cgroup{ + path: path, + subsystems: c.subsystems, + }, nil +} + +// Subsystems returns all the subsystems that are currently being +// consumed by the group +func (c *cgroup) Subsystems() []Subsystem { + return c.subsystems +} + +// Add moves the provided process into the new cgroup +func (c *cgroup) Add(process Process) error { + if process.Pid <= 0 { + return ErrInvalidPid + } + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + return c.add(process) +} + +func (c *cgroup) add(process Process) error { + for _, s := range pathers(c.subsystems) { + p, err := c.path(s.Name()) + if err != nil { + return err + } + if err := retryingWriteFile( + filepath.Join(s.Path(p), cgroupProcs), + []byte(strconv.Itoa(process.Pid)), + defaultFilePerm, + ); err != nil { + return err + } + } + return nil +} + +// AddTask moves the provided tasks (threads) into the new cgroup +func (c *cgroup) AddTask(process Process) error { + if process.Pid <= 0 { + return ErrInvalidPid + } + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + return c.addTask(process) +} 
+ +func (c *cgroup) addTask(process Process) error { + for _, s := range pathers(c.subsystems) { + p, err := c.path(s.Name()) + if err != nil { + return err + } + if err := retryingWriteFile( + filepath.Join(s.Path(p), cgroupTasks), + []byte(strconv.Itoa(process.Pid)), + defaultFilePerm, + ); err != nil { + return err + } + } + return nil +} + +// Delete will remove the control group from each of the subsystems registered +func (c *cgroup) Delete() error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + var errs []string + for _, s := range c.subsystems { + if d, ok := s.(deleter); ok { + sp, err := c.path(s.Name()) + if err != nil { + return err + } + if err := d.Delete(sp); err != nil { + errs = append(errs, string(s.Name())) + } + continue + } + if p, ok := s.(pather); ok { + sp, err := c.path(s.Name()) + if err != nil { + return err + } + path := p.Path(sp) + if err := remove(path); err != nil { + errs = append(errs, path) + } + } + } + if len(errs) > 0 { + return fmt.Errorf("cgroups: unable to remove paths %s", strings.Join(errs, ", ")) + } + c.err = ErrCgroupDeleted + return nil +} + +// Stat returns the current metrics for the cgroup +func (c *cgroup) Stat(handlers ...ErrorHandler) (*v1.Metrics, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + if len(handlers) == 0 { + handlers = append(handlers, errPassthrough) + } + var ( + stats = &v1.Metrics{ + CPU: &v1.CPUStat{ + Throttling: &v1.Throttle{}, + Usage: &v1.CPUUsage{}, + }, + } + wg = &sync.WaitGroup{} + errs = make(chan error, len(c.subsystems)) + ) + for _, s := range c.subsystems { + if ss, ok := s.(stater); ok { + sp, err := c.path(s.Name()) + if err != nil { + return nil, err + } + wg.Add(1) + go func() { + defer wg.Done() + if err := ss.Stat(sp, stats); err != nil { + for _, eh := range handlers { + if herr := eh(err); herr != nil { + errs <- herr + } + } + } + }() + } + } + wg.Wait() + close(errs) + for err := range errs { + 
return nil, err + } + return stats, nil +} + +// Update updates the cgroup with the new resource values provided +// +// Be prepared to handle EBUSY when trying to update a cgroup with +// live processes and other operations like Stats being performed at the +// same time +func (c *cgroup) Update(resources *specs.LinuxResources) error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + for _, s := range c.subsystems { + if u, ok := s.(updater); ok { + sp, err := c.path(s.Name()) + if err != nil { + return err + } + if err := u.Update(sp, resources); err != nil { + return err + } + } + } + return nil +} + +// Processes returns the processes running inside the cgroup along +// with the subsystem used, pid, and path +func (c *cgroup) Processes(subsystem Name, recursive bool) ([]Process, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + return c.processes(subsystem, recursive) +} + +func (c *cgroup) processes(subsystem Name, recursive bool) ([]Process, error) { + s := c.getSubsystem(subsystem) + sp, err := c.path(subsystem) + if err != nil { + return nil, err + } + path := s.(pather).Path(sp) + var processes []Process + err = filepath.Walk(path, func(p string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !recursive && info.IsDir() { + if p == path { + return nil + } + return filepath.SkipDir + } + dir, name := filepath.Split(p) + if name != cgroupProcs { + return nil + } + procs, err := readPids(dir, subsystem) + if err != nil { + return err + } + processes = append(processes, procs...) 
+ return nil + }) + return processes, err +} + +// Tasks returns the tasks running inside the cgroup along +// with the subsystem used, pid, and path +func (c *cgroup) Tasks(subsystem Name, recursive bool) ([]Task, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + return c.tasks(subsystem, recursive) +} + +func (c *cgroup) tasks(subsystem Name, recursive bool) ([]Task, error) { + s := c.getSubsystem(subsystem) + sp, err := c.path(subsystem) + if err != nil { + return nil, err + } + path := s.(pather).Path(sp) + var tasks []Task + err = filepath.Walk(path, func(p string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !recursive && info.IsDir() { + if p == path { + return nil + } + return filepath.SkipDir + } + dir, name := filepath.Split(p) + if name != cgroupTasks { + return nil + } + procs, err := readTasksPids(dir, subsystem) + if err != nil { + return err + } + tasks = append(tasks, procs...) + return nil + }) + return tasks, err +} + +// Freeze freezes the entire cgroup and all the processes inside it +func (c *cgroup) Freeze() error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + s := c.getSubsystem(Freezer) + if s == nil { + return ErrFreezerNotSupported + } + sp, err := c.path(Freezer) + if err != nil { + return err + } + return s.(*freezerController).Freeze(sp) +} + +// Thaw thaws out the cgroup and all the processes inside it +func (c *cgroup) Thaw() error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + s := c.getSubsystem(Freezer) + if s == nil { + return ErrFreezerNotSupported + } + sp, err := c.path(Freezer) + if err != nil { + return err + } + return s.(*freezerController).Thaw(sp) +} + +// OOMEventFD returns the memory cgroup's out of memory event fd that triggers +// when processes inside the cgroup receive an oom event. Returns +// ErrMemoryNotSupported if memory cgroups is not supported. 
+func (c *cgroup) OOMEventFD() (uintptr, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return 0, c.err + } + s := c.getSubsystem(Memory) + if s == nil { + return 0, ErrMemoryNotSupported + } + sp, err := c.path(Memory) + if err != nil { + return 0, err + } + return s.(*memoryController).memoryEvent(sp, OOMEvent()) +} + +// RegisterMemoryEvent allows the ability to register for all v1 memory cgroups +// notifications. +func (c *cgroup) RegisterMemoryEvent(event MemoryEvent) (uintptr, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return 0, c.err + } + s := c.getSubsystem(Memory) + if s == nil { + return 0, ErrMemoryNotSupported + } + sp, err := c.path(Memory) + if err != nil { + return 0, err + } + return s.(*memoryController).memoryEvent(sp, event) +} + +// State returns the state of the cgroup and its processes +func (c *cgroup) State() State { + c.mu.Lock() + defer c.mu.Unlock() + c.checkExists() + if c.err != nil && c.err == ErrCgroupDeleted { + return Deleted + } + s := c.getSubsystem(Freezer) + if s == nil { + return Thawed + } + sp, err := c.path(Freezer) + if err != nil { + return Unknown + } + state, err := s.(*freezerController).state(sp) + if err != nil { + return Unknown + } + return state +} + +// MoveTo does a recursive move subsystem by subsystem of all the processes +// inside the group +func (c *cgroup) MoveTo(destination Cgroup) error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + for _, s := range c.subsystems { + processes, err := c.processes(s.Name(), true) + if err != nil { + return err + } + for _, p := range processes { + if err := destination.Add(p); err != nil { + if strings.Contains(err.Error(), "no such process") { + continue + } + return err + } + } + } + return nil +} + +func (c *cgroup) getSubsystem(n Name) Subsystem { + for _, s := range c.subsystems { + if s.Name() == n { + return s + } + } + return nil +} + +func (c *cgroup) checkExists() { + for _, s := range 
pathers(c.subsystems) { + p, err := c.path(s.Name()) + if err != nil { + return + } + if _, err := os.Lstat(s.Path(p)); err != nil { + if os.IsNotExist(err) { + c.err = ErrCgroupDeleted + return + } + } + } +} diff --git a/vendor/github.com/containerd/cgroups/control.go b/vendor/github.com/containerd/cgroups/control.go new file mode 100644 index 0000000000..a4cb9b8324 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/control.go @@ -0,0 +1,92 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "os" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + cgroupProcs = "cgroup.procs" + cgroupTasks = "tasks" + defaultDirPerm = 0755 +) + +// defaultFilePerm is a var so that the test framework can change the filemode +// of all files created when the tests are running. The difference between the +// tests and real world use is that files like "cgroup.procs" will exist when writing +// to a read cgroup filesystem and do not exist prior when running in the tests. 
+// this is set to a non 0 value in the test code +var defaultFilePerm = os.FileMode(0) + +type Process struct { + // Subsystem is the name of the subsystem that the process is in + Subsystem Name + // Pid is the process id of the process + Pid int + // Path is the full path of the subsystem and location that the process is in + Path string +} + +type Task struct { + // Subsystem is the name of the subsystem that the task is in + Subsystem Name + // Pid is the process id of the task + Pid int + // Path is the full path of the subsystem and location that the task is in + Path string +} + +// Cgroup handles interactions with the individual groups to perform +// actions on them as them main interface to this cgroup package +type Cgroup interface { + // New creates a new cgroup under the calling cgroup + New(string, *specs.LinuxResources) (Cgroup, error) + // Add adds a process to the cgroup (cgroup.procs) + Add(Process) error + // AddTask adds a process to the cgroup (tasks) + AddTask(Process) error + // Delete removes the cgroup as a whole + Delete() error + // MoveTo moves all the processes under the calling cgroup to the provided one + // subsystems are moved one at a time + MoveTo(Cgroup) error + // Stat returns the stats for all subsystems in the cgroup + Stat(...ErrorHandler) (*v1.Metrics, error) + // Update updates all the subsystems with the provided resource changes + Update(resources *specs.LinuxResources) error + // Processes returns all the processes in a select subsystem for the cgroup + Processes(Name, bool) ([]Process, error) + // Tasks returns all the tasks in a select subsystem for the cgroup + Tasks(Name, bool) ([]Task, error) + // Freeze freezes or pauses all processes inside the cgroup + Freeze() error + // Thaw thaw or resumes all processes inside the cgroup + Thaw() error + // OOMEventFD returns the memory subsystem's event fd for OOM events + OOMEventFD() (uintptr, error) + // RegisterMemoryEvent returns the memory subsystems event fd for 
whatever memory event was + // registered for. Can alternatively register for the oom event with this method. + RegisterMemoryEvent(MemoryEvent) (uintptr, error) + // State returns the cgroups current state + State() State + // Subsystems returns all the subsystems in the cgroup + Subsystems() []Subsystem +} diff --git a/vendor/github.com/containerd/cgroups/cpu.go b/vendor/github.com/containerd/cgroups/cpu.go new file mode 100644 index 0000000000..27024f17b8 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/cpu.go @@ -0,0 +1,125 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewCpu(root string) *cpuController { + return &cpuController{ + root: filepath.Join(root, string(Cpu)), + } +} + +type cpuController struct { + root string +} + +func (c *cpuController) Name() Name { + return Cpu +} + +func (c *cpuController) Path(path string) string { + return filepath.Join(c.root, path) +} + +func (c *cpuController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(c.Path(path), defaultDirPerm); err != nil { + return err + } + if cpu := resources.CPU; cpu != nil { + for _, t := range []struct { + name string + ivalue *int64 + uvalue *uint64 + }{ + { + name: "rt_period_us", + uvalue: cpu.RealtimePeriod, + }, + { + name: "rt_runtime_us", + ivalue: cpu.RealtimeRuntime, + }, + { + name: "shares", + uvalue: cpu.Shares, + }, + { + name: "cfs_period_us", + uvalue: cpu.Period, + }, + { + name: "cfs_quota_us", + ivalue: cpu.Quota, + }, + } { + var value []byte + if t.uvalue != nil { + value = []byte(strconv.FormatUint(*t.uvalue, 10)) + } else if t.ivalue != nil { + value = []byte(strconv.FormatInt(*t.ivalue, 10)) + } + if value != nil { + if err := retryingWriteFile( + filepath.Join(c.Path(path), "cpu."+t.name), + value, + defaultFilePerm, + ); err != nil { + return err + } + } + } + } + return nil +} + +func (c *cpuController) Update(path string, resources *specs.LinuxResources) error { + return c.Create(path, resources) +} + +func (c *cpuController) Stat(path string, stats *v1.Metrics) error { + f, err := os.Open(filepath.Join(c.Path(path), "cpu.stat")) + if err != nil { + return err + } + defer f.Close() + // get or create the cpu field because cpuacct can also set values on this struct + sc := bufio.NewScanner(f) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return err + } + switch key { + case 
"nr_periods": + stats.CPU.Throttling.Periods = v + case "nr_throttled": + stats.CPU.Throttling.ThrottledPeriods = v + case "throttled_time": + stats.CPU.Throttling.ThrottledTime = v + } + } + return sc.Err() +} diff --git a/vendor/github.com/containerd/cgroups/cpuacct.go b/vendor/github.com/containerd/cgroups/cpuacct.go new file mode 100644 index 0000000000..e5fc864bd7 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/cpuacct.go @@ -0,0 +1,123 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" +) + +const nanosecondsInSecond = 1000000000 + +var clockTicks = getClockTicks() + +func NewCpuacct(root string) *cpuacctController { + return &cpuacctController{ + root: filepath.Join(root, string(Cpuacct)), + } +} + +type cpuacctController struct { + root string +} + +func (c *cpuacctController) Name() Name { + return Cpuacct +} + +func (c *cpuacctController) Path(path string) string { + return filepath.Join(c.root, path) +} + +func (c *cpuacctController) Stat(path string, stats *v1.Metrics) error { + user, kernel, err := c.getUsage(path) + if err != nil { + return err + } + total, err := readUint(filepath.Join(c.Path(path), "cpuacct.usage")) + if err != nil { + return err + } + percpu, err := c.percpuUsage(path) + if err != nil { + return err + } + stats.CPU.Usage.Total = total + stats.CPU.Usage.User = user + stats.CPU.Usage.Kernel = kernel + stats.CPU.Usage.PerCPU = percpu + return nil +} + +func (c *cpuacctController) percpuUsage(path string) ([]uint64, error) { + var usage []uint64 + data, err := ioutil.ReadFile(filepath.Join(c.Path(path), "cpuacct.usage_percpu")) + if err != nil { + return nil, err + } + for _, v := range strings.Fields(string(data)) { + u, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, err + } + usage = append(usage, u) + } + return usage, nil +} + +func (c *cpuacctController) getUsage(path string) (user uint64, kernel uint64, err error) { + statPath := filepath.Join(c.Path(path), "cpuacct.stat") + data, err := ioutil.ReadFile(statPath) + if err != nil { + return 0, 0, err + } + fields := strings.Fields(string(data)) + if len(fields) != 4 { + return 0, 0, fmt.Errorf("%q is expected to have 4 fields", statPath) + } + for _, t := range []struct { + index int + name string + value *uint64 + }{ + { + index: 0, + name: "user", + value: &user, + }, + { + index: 2, + name: "system", 
+ value: &kernel, + }, + } { + if fields[t.index] != t.name { + return 0, 0, fmt.Errorf("expected field %q but found %q in %q", t.name, fields[t.index], statPath) + } + v, err := strconv.ParseUint(fields[t.index+1], 10, 64) + if err != nil { + return 0, 0, err + } + *t.value = v + } + return (user * nanosecondsInSecond) / clockTicks, (kernel * nanosecondsInSecond) / clockTicks, nil +} diff --git a/vendor/github.com/containerd/cgroups/cpuset.go b/vendor/github.com/containerd/cgroups/cpuset.go new file mode 100644 index 0000000000..3cae173bdd --- /dev/null +++ b/vendor/github.com/containerd/cgroups/cpuset.go @@ -0,0 +1,159 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewCpuset(root string) *cpusetController { + return &cpusetController{ + root: filepath.Join(root, string(Cpuset)), + } +} + +type cpusetController struct { + root string +} + +func (c *cpusetController) Name() Name { + return Cpuset +} + +func (c *cpusetController) Path(path string) string { + return filepath.Join(c.root, path) +} + +func (c *cpusetController) Create(path string, resources *specs.LinuxResources) error { + if err := c.ensureParent(c.Path(path), c.root); err != nil { + return err + } + if err := os.MkdirAll(c.Path(path), defaultDirPerm); err != nil { + return err + } + if err := c.copyIfNeeded(c.Path(path), filepath.Dir(c.Path(path))); err != nil { + return err + } + if resources.CPU != nil { + for _, t := range []struct { + name string + value string + }{ + { + name: "cpus", + value: resources.CPU.Cpus, + }, + { + name: "mems", + value: resources.CPU.Mems, + }, + } { + if t.value != "" { + if err := retryingWriteFile( + filepath.Join(c.Path(path), "cpuset."+t.name), + []byte(t.value), + defaultFilePerm, + ); err != nil { + return err + } + } + } + } + return nil +} + +func (c *cpusetController) Update(path string, resources *specs.LinuxResources) error { + return c.Create(path, resources) +} + +func (c *cpusetController) getValues(path string) (cpus []byte, mems []byte, err error) { + if cpus, err = ioutil.ReadFile(filepath.Join(path, "cpuset.cpus")); err != nil && !os.IsNotExist(err) { + return + } + if mems, err = ioutil.ReadFile(filepath.Join(path, "cpuset.mems")); err != nil && !os.IsNotExist(err) { + return + } + return cpus, mems, nil +} + +// ensureParent makes sure that the parent directory of current is created +// and populated with the proper cpus and mems files copied from +// it's parent. 
+func (c *cpusetController) ensureParent(current, root string) error { + parent := filepath.Dir(current) + if _, err := filepath.Rel(root, parent); err != nil { + return nil + } + // Avoid infinite recursion. + if parent == current { + return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") + } + if cleanPath(parent) != root { + if err := c.ensureParent(parent, root); err != nil { + return err + } + } + if err := os.MkdirAll(current, defaultDirPerm); err != nil { + return err + } + return c.copyIfNeeded(current, parent) +} + +// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func (c *cpusetController) copyIfNeeded(current, parent string) error { + var ( + err error + currentCpus, currentMems []byte + parentCpus, parentMems []byte + ) + if currentCpus, currentMems, err = c.getValues(current); err != nil { + return err + } + if parentCpus, parentMems, err = c.getValues(parent); err != nil { + return err + } + if isEmpty(currentCpus) { + if err := retryingWriteFile( + filepath.Join(current, "cpuset.cpus"), + parentCpus, + defaultFilePerm, + ); err != nil { + return err + } + } + if isEmpty(currentMems) { + if err := retryingWriteFile( + filepath.Join(current, "cpuset.mems"), + parentMems, + defaultFilePerm, + ); err != nil { + return err + } + } + return nil +} + +func isEmpty(b []byte) bool { + return len(bytes.Trim(b, "\n")) == 0 +} diff --git a/vendor/github.com/containerd/cgroups/devices.go b/vendor/github.com/containerd/cgroups/devices.go new file mode 100644 index 0000000000..7792566d5e --- /dev/null +++ b/vendor/github.com/containerd/cgroups/devices.go @@ -0,0 +1,92 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + allowDeviceFile = "devices.allow" + denyDeviceFile = "devices.deny" + wildcard = -1 +) + +func NewDevices(root string) *devicesController { + return &devicesController{ + root: filepath.Join(root, string(Devices)), + } +} + +type devicesController struct { + root string +} + +func (d *devicesController) Name() Name { + return Devices +} + +func (d *devicesController) Path(path string) string { + return filepath.Join(d.root, path) +} + +func (d *devicesController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(d.Path(path), defaultDirPerm); err != nil { + return err + } + for _, device := range resources.Devices { + file := denyDeviceFile + if device.Allow { + file = allowDeviceFile + } + if device.Type == "" { + device.Type = "a" + } + if err := retryingWriteFile( + filepath.Join(d.Path(path), file), + []byte(deviceString(device)), + defaultFilePerm, + ); err != nil { + return err + } + } + return nil +} + +func (d *devicesController) Update(path string, resources *specs.LinuxResources) error { + return d.Create(path, resources) +} + +func deviceString(device specs.LinuxDeviceCgroup) string { + return fmt.Sprintf("%s %s:%s %s", + device.Type, + deviceNumber(device.Major), + deviceNumber(device.Minor), + device.Access, + ) +} + +func deviceNumber(number *int64) string { + if number == nil || *number == wildcard { + return "*" + } + return fmt.Sprint(*number) +} diff --git 
a/vendor/github.com/containerd/cgroups/errors.go b/vendor/github.com/containerd/cgroups/errors.go new file mode 100644 index 0000000000..f1ad8315c8 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/errors.go @@ -0,0 +1,47 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "errors" + "os" +) + +var ( + ErrInvalidPid = errors.New("cgroups: pid must be greater than 0") + ErrMountPointNotExist = errors.New("cgroups: cgroup mountpoint does not exist") + ErrInvalidFormat = errors.New("cgroups: parsing file with invalid format failed") + ErrFreezerNotSupported = errors.New("cgroups: freezer cgroup not supported on this system") + ErrMemoryNotSupported = errors.New("cgroups: memory cgroup not supported on this system") + ErrCgroupDeleted = errors.New("cgroups: cgroup deleted") + ErrNoCgroupMountDestination = errors.New("cgroups: cannot find cgroup mount destination") +) + +// ErrorHandler is a function that handles and acts on errors +type ErrorHandler func(err error) error + +// IgnoreNotExist ignores any errors that are for not existing files +func IgnoreNotExist(err error) error { + if os.IsNotExist(err) { + return nil + } + return err +} + +func errPassthrough(err error) error { + return err +} diff --git a/vendor/github.com/containerd/cgroups/freezer.go b/vendor/github.com/containerd/cgroups/freezer.go new file mode 100644 index 0000000000..59a7e71283 --- /dev/null +++ 
b/vendor/github.com/containerd/cgroups/freezer.go @@ -0,0 +1,82 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "io/ioutil" + "path/filepath" + "strings" + "time" +) + +func NewFreezer(root string) *freezerController { + return &freezerController{ + root: filepath.Join(root, string(Freezer)), + } +} + +type freezerController struct { + root string +} + +func (f *freezerController) Name() Name { + return Freezer +} + +func (f *freezerController) Path(path string) string { + return filepath.Join(f.root, path) +} + +func (f *freezerController) Freeze(path string) error { + return f.waitState(path, Frozen) +} + +func (f *freezerController) Thaw(path string) error { + return f.waitState(path, Thawed) +} + +func (f *freezerController) changeState(path string, state State) error { + return retryingWriteFile( + filepath.Join(f.root, path, "freezer.state"), + []byte(strings.ToUpper(string(state))), + defaultFilePerm, + ) +} + +func (f *freezerController) state(path string) (State, error) { + current, err := ioutil.ReadFile(filepath.Join(f.root, path, "freezer.state")) + if err != nil { + return "", err + } + return State(strings.ToLower(strings.TrimSpace(string(current)))), nil +} + +func (f *freezerController) waitState(path string, state State) error { + for { + if err := f.changeState(path, state); err != nil { + return err + } + current, err := f.state(path) + if err != nil { + return err + } + if current 
== state { + return nil + } + time.Sleep(1 * time.Millisecond) + } +} diff --git a/vendor/github.com/containerd/cgroups/go.mod b/vendor/github.com/containerd/cgroups/go.mod new file mode 100644 index 0000000000..eed71ffb48 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/go.mod @@ -0,0 +1,18 @@ +module github.com/containerd/cgroups + +go 1.13 + +require ( + github.com/cilium/ebpf v0.4.0 + github.com/coreos/go-systemd/v22 v22.1.0 + github.com/cpuguy83/go-md2man/v2 v2.0.0 // indirect + github.com/docker/go-units v0.4.0 + github.com/godbus/dbus/v5 v5.0.3 + github.com/gogo/protobuf v1.3.2 + github.com/opencontainers/runtime-spec v1.0.2 + github.com/pkg/errors v0.9.1 + github.com/sirupsen/logrus v1.7.0 + github.com/stretchr/testify v1.6.1 + github.com/urfave/cli v1.22.2 + golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c +) diff --git a/vendor/github.com/containerd/cgroups/go.sum b/vendor/github.com/containerd/cgroups/go.sum new file mode 100644 index 0000000000..f086102370 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/go.sum @@ -0,0 +1,82 @@ +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/cilium/ebpf v0.4.0 h1:QlHdikaxALkqWasW8hAC1mfR0jdmvbfaBdBPFmRSglA= +github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= +github.com/coreos/go-systemd/v22 v22.1.0 h1:kq/SbG2BCKLkDKkjQf5OWwKWUKj1lgs3lFI4PxnR5lg= +github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= +github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/cpuguy83/go-md2man/v2 v2.0.0 h1:EoUDS0afbrsXAZ9YQ9jdu/mZ2sXgT1/2yyNng4PGlyM= +github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= +github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= +github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/godbus/dbus/v5 v5.0.3 h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME= +github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M= +github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= +github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= +github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= +github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= +github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/urfave/cli v1.22.2 h1:gsqYFH8bb9ekPA12kRo0hfjngWQjkJPlN9R0N78BoUo= +github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net 
v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c h1:VwygUrnw9jn88c4u8GD3rZQbqrP/tgas88tPUbBxQrk= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/vendor/github.com/containerd/cgroups/hierarchy.go b/vendor/github.com/containerd/cgroups/hierarchy.go new file mode 100644 index 0000000000..ca3f1b9380 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/hierarchy.go @@ -0,0 +1,20 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +// Hierarchy enables both unified and split hierarchy for cgroups +type Hierarchy func() ([]Subsystem, error) diff --git a/vendor/github.com/containerd/cgroups/hugetlb.go b/vendor/github.com/containerd/cgroups/hugetlb.go new file mode 100644 index 0000000000..c0eb03b24d --- /dev/null +++ b/vendor/github.com/containerd/cgroups/hugetlb.go @@ -0,0 +1,109 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewHugetlb(root string) (*hugetlbController, error) { + sizes, err := hugePageSizes() + if err != nil { + return nil, err + } + + return &hugetlbController{ + root: filepath.Join(root, string(Hugetlb)), + sizes: sizes, + }, nil +} + +type hugetlbController struct { + root string + sizes []string +} + +func (h *hugetlbController) Name() Name { + return Hugetlb +} + +func (h *hugetlbController) Path(path string) string { + return filepath.Join(h.root, path) +} + +func (h *hugetlbController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(h.Path(path), defaultDirPerm); err != nil { + return err + } + for _, limit := range resources.HugepageLimits { + if err := retryingWriteFile( + filepath.Join(h.Path(path), strings.Join([]string{"hugetlb", limit.Pagesize, "limit_in_bytes"}, ".")), + 
[]byte(strconv.FormatUint(limit.Limit, 10)), + defaultFilePerm, + ); err != nil { + return err + } + } + return nil +} + +func (h *hugetlbController) Stat(path string, stats *v1.Metrics) error { + for _, size := range h.sizes { + s, err := h.readSizeStat(path, size) + if err != nil { + return err + } + stats.Hugetlb = append(stats.Hugetlb, s) + } + return nil +} + +func (h *hugetlbController) readSizeStat(path, size string) (*v1.HugetlbStat, error) { + s := v1.HugetlbStat{ + Pagesize: size, + } + for _, t := range []struct { + name string + value *uint64 + }{ + { + name: "usage_in_bytes", + value: &s.Usage, + }, + { + name: "max_usage_in_bytes", + value: &s.Max, + }, + { + name: "failcnt", + value: &s.Failcnt, + }, + } { + v, err := readUint(filepath.Join(h.Path(path), strings.Join([]string{"hugetlb", size, t.name}, "."))) + if err != nil { + return nil, err + } + *t.value = v + } + return &s, nil +} diff --git a/vendor/github.com/containerd/cgroups/memory.go b/vendor/github.com/containerd/cgroups/memory.go new file mode 100644 index 0000000000..e271866ef9 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/memory.go @@ -0,0 +1,480 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "bufio" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +// MemoryEvent is an interface that V1 memory Cgroup notifications implement. Arg returns the +// file name whose fd should be written to "cgroups.event_control". EventFile returns the name of +// the file that supports the notification api e.g. "memory.usage_in_bytes". +type MemoryEvent interface { + Arg() string + EventFile() string +} + +type memoryThresholdEvent struct { + threshold uint64 + swap bool +} + +// MemoryThresholdEvent returns a new memory threshold event to be used with RegisterMemoryEvent. +// If swap is true, the event will be registered using memory.memsw.usage_in_bytes +func MemoryThresholdEvent(threshold uint64, swap bool) MemoryEvent { + return &memoryThresholdEvent{ + threshold, + swap, + } +} + +func (m *memoryThresholdEvent) Arg() string { + return strconv.FormatUint(m.threshold, 10) +} + +func (m *memoryThresholdEvent) EventFile() string { + if m.swap { + return "memory.memsw.usage_in_bytes" + } + return "memory.usage_in_bytes" +} + +type oomEvent struct{} + +// OOMEvent returns a new oom event to be used with RegisterMemoryEvent. +func OOMEvent() MemoryEvent { + return &oomEvent{} +} + +func (oom *oomEvent) Arg() string { + return "" +} + +func (oom *oomEvent) EventFile() string { + return "memory.oom_control" +} + +type memoryPressureEvent struct { + pressureLevel MemoryPressureLevel + hierarchy EventNotificationMode +} + +// MemoryPressureEvent returns a new memory pressure event to be used with RegisterMemoryEvent. 
+func MemoryPressureEvent(pressureLevel MemoryPressureLevel, hierarchy EventNotificationMode) MemoryEvent { + return &memoryPressureEvent{ + pressureLevel, + hierarchy, + } +} + +func (m *memoryPressureEvent) Arg() string { + return string(m.pressureLevel) + "," + string(m.hierarchy) +} + +func (m *memoryPressureEvent) EventFile() string { + return "memory.pressure_level" +} + +// MemoryPressureLevel corresponds to the memory pressure levels defined +// for memory cgroups. +type MemoryPressureLevel string + +// The three memory pressure levels are as follows. +// - The "low" level means that the system is reclaiming memory for new +// allocations. Monitoring this reclaiming activity might be useful for +// maintaining cache level. Upon notification, the program (typically +// "Activity Manager") might analyze vmstat and act in advance (i.e. +// prematurely shutdown unimportant services). +// - The "medium" level means that the system is experiencing medium memory +// pressure, the system might be making swap, paging out active file caches, +// etc. Upon this event applications may decide to further analyze +// vmstat/zoneinfo/memcg or internal memory usage statistics and free any +// resources that can be easily reconstructed or re-read from a disk. +// - The "critical" level means that the system is actively thrashing, it is +// about to out of memory (OOM) or even the in-kernel OOM killer is on its +// way to trigger. Applications should do whatever they can to help the +// system. It might be too late to consult with vmstat or any other +// statistics, so it is advisable to take an immediate action. +// "https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt" Section 11 +const ( + LowPressure MemoryPressureLevel = "low" + MediumPressure MemoryPressureLevel = "medium" + CriticalPressure MemoryPressureLevel = "critical" +) + +// EventNotificationMode corresponds to the notification modes +// for the memory cgroups pressure level notifications. 
+type EventNotificationMode string + +// There are three optional modes that specify different propagation behavior: +// - "default": this is the default behavior specified above. This mode is the +// same as omitting the optional mode parameter, preserved by backwards +// compatibility. +// - "hierarchy": events always propagate up to the root, similar to the default +// behavior, except that propagation continues regardless of whether there are +// event listeners at each level, with the "hierarchy" mode. In the above +// example, groups A, B, and C will receive notification of memory pressure. +// - "local": events are pass-through, i.e. they only receive notifications when +// memory pressure is experienced in the memcg for which the notification is +// registered. In the above example, group C will receive notification if +// registered for "local" notification and the group experiences memory +// pressure. However, group B will never receive notification, regardless if +// there is an event listener for group C or not, if group B is registered for +// local notification. +// "https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt" Section 11 +const ( + DefaultMode EventNotificationMode = "default" + LocalMode EventNotificationMode = "local" + HierarchyMode EventNotificationMode = "hierarchy" +) + +// NewMemory returns a Memory controller given the root folder of cgroups. +// It may optionally accept other configuration options, such as IgnoreModules(...) +func NewMemory(root string, options ...func(*memoryController)) *memoryController { + mc := &memoryController{ + root: filepath.Join(root, string(Memory)), + ignored: map[string]struct{}{}, + } + for _, opt := range options { + opt(mc) + } + return mc +} + +// IgnoreModules configure the memory controller to not read memory metrics for some +// module names (e.g. 
passing "memsw" would avoid all the memory.memsw.* entries) +func IgnoreModules(names ...string) func(*memoryController) { + return func(mc *memoryController) { + for _, name := range names { + mc.ignored[name] = struct{}{} + } + } +} + +// OptionalSwap allows the memory controller to not fail if cgroups is not accounting +// Swap memory (there are no memory.memsw.* entries) +func OptionalSwap() func(*memoryController) { + return func(mc *memoryController) { + _, err := os.Stat(filepath.Join(mc.root, "memory.memsw.usage_in_bytes")) + if os.IsNotExist(err) { + mc.ignored["memsw"] = struct{}{} + } + } +} + +type memoryController struct { + root string + ignored map[string]struct{} +} + +func (m *memoryController) Name() Name { + return Memory +} + +func (m *memoryController) Path(path string) string { + return filepath.Join(m.root, path) +} + +func (m *memoryController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(m.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Memory == nil { + return nil + } + return m.set(path, getMemorySettings(resources)) +} + +func (m *memoryController) Update(path string, resources *specs.LinuxResources) error { + if resources.Memory == nil { + return nil + } + g := func(v *int64) bool { + return v != nil && *v > 0 + } + settings := getMemorySettings(resources) + if g(resources.Memory.Limit) && g(resources.Memory.Swap) { + // if the updated swap value is larger than the current memory limit set the swap changes first + // then set the memory limit as swap must always be larger than the current limit + current, err := readUint(filepath.Join(m.Path(path), "memory.limit_in_bytes")) + if err != nil { + return err + } + if current < uint64(*resources.Memory.Swap) { + settings[0], settings[1] = settings[1], settings[0] + } + } + return m.set(path, settings) +} + +func (m *memoryController) Stat(path string, stats *v1.Metrics) error { + fMemStat, err := 
os.Open(filepath.Join(m.Path(path), "memory.stat")) + if err != nil { + return err + } + defer fMemStat.Close() + stats.Memory = &v1.MemoryStat{ + Usage: &v1.MemoryEntry{}, + Swap: &v1.MemoryEntry{}, + Kernel: &v1.MemoryEntry{}, + KernelTCP: &v1.MemoryEntry{}, + } + if err := m.parseStats(fMemStat, stats.Memory); err != nil { + return err + } + + fMemOomControl, err := os.Open(filepath.Join(m.Path(path), "memory.oom_control")) + if err != nil { + return err + } + defer fMemOomControl.Close() + stats.MemoryOomControl = &v1.MemoryOomControl{} + if err := m.parseOomControlStats(fMemOomControl, stats.MemoryOomControl); err != nil { + return err + } + for _, t := range []struct { + module string + entry *v1.MemoryEntry + }{ + { + module: "", + entry: stats.Memory.Usage, + }, + { + module: "memsw", + entry: stats.Memory.Swap, + }, + { + module: "kmem", + entry: stats.Memory.Kernel, + }, + { + module: "kmem.tcp", + entry: stats.Memory.KernelTCP, + }, + } { + if _, ok := m.ignored[t.module]; ok { + continue + } + for _, tt := range []struct { + name string + value *uint64 + }{ + { + name: "usage_in_bytes", + value: &t.entry.Usage, + }, + { + name: "max_usage_in_bytes", + value: &t.entry.Max, + }, + { + name: "failcnt", + value: &t.entry.Failcnt, + }, + { + name: "limit_in_bytes", + value: &t.entry.Limit, + }, + } { + parts := []string{"memory"} + if t.module != "" { + parts = append(parts, t.module) + } + parts = append(parts, tt.name) + v, err := readUint(filepath.Join(m.Path(path), strings.Join(parts, "."))) + if err != nil { + return err + } + *tt.value = v + } + } + return nil +} + +func (m *memoryController) parseStats(r io.Reader, stat *v1.MemoryStat) error { + var ( + raw = make(map[string]uint64) + sc = bufio.NewScanner(r) + line int + ) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return fmt.Errorf("%d: %v", line, err) + } + raw[key] = v + line++ + } + if err := sc.Err(); err != nil { + return err + } + stat.Cache = raw["cache"] + 
stat.RSS = raw["rss"] + stat.RSSHuge = raw["rss_huge"] + stat.MappedFile = raw["mapped_file"] + stat.Dirty = raw["dirty"] + stat.Writeback = raw["writeback"] + stat.PgPgIn = raw["pgpgin"] + stat.PgPgOut = raw["pgpgout"] + stat.PgFault = raw["pgfault"] + stat.PgMajFault = raw["pgmajfault"] + stat.InactiveAnon = raw["inactive_anon"] + stat.ActiveAnon = raw["active_anon"] + stat.InactiveFile = raw["inactive_file"] + stat.ActiveFile = raw["active_file"] + stat.Unevictable = raw["unevictable"] + stat.HierarchicalMemoryLimit = raw["hierarchical_memory_limit"] + stat.HierarchicalSwapLimit = raw["hierarchical_memsw_limit"] + stat.TotalCache = raw["total_cache"] + stat.TotalRSS = raw["total_rss"] + stat.TotalRSSHuge = raw["total_rss_huge"] + stat.TotalMappedFile = raw["total_mapped_file"] + stat.TotalDirty = raw["total_dirty"] + stat.TotalWriteback = raw["total_writeback"] + stat.TotalPgPgIn = raw["total_pgpgin"] + stat.TotalPgPgOut = raw["total_pgpgout"] + stat.TotalPgFault = raw["total_pgfault"] + stat.TotalPgMajFault = raw["total_pgmajfault"] + stat.TotalInactiveAnon = raw["total_inactive_anon"] + stat.TotalActiveAnon = raw["total_active_anon"] + stat.TotalInactiveFile = raw["total_inactive_file"] + stat.TotalActiveFile = raw["total_active_file"] + stat.TotalUnevictable = raw["total_unevictable"] + return nil +} + +func (m *memoryController) parseOomControlStats(r io.Reader, stat *v1.MemoryOomControl) error { + var ( + raw = make(map[string]uint64) + sc = bufio.NewScanner(r) + line int + ) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return fmt.Errorf("%d: %v", line, err) + } + raw[key] = v + line++ + } + if err := sc.Err(); err != nil { + return err + } + stat.OomKillDisable = raw["oom_kill_disable"] + stat.UnderOom = raw["under_oom"] + stat.OomKill = raw["oom_kill"] + return nil +} + +func (m *memoryController) set(path string, settings []memorySettings) error { + for _, t := range settings { + if t.value != nil { + if err := 
retryingWriteFile( + filepath.Join(m.Path(path), "memory."+t.name), + []byte(strconv.FormatInt(*t.value, 10)), + defaultFilePerm, + ); err != nil { + return err + } + } + } + return nil +} + +type memorySettings struct { + name string + value *int64 +} + +func getMemorySettings(resources *specs.LinuxResources) []memorySettings { + mem := resources.Memory + var swappiness *int64 + if mem.Swappiness != nil { + v := int64(*mem.Swappiness) + swappiness = &v + } + return []memorySettings{ + { + name: "limit_in_bytes", + value: mem.Limit, + }, + { + name: "soft_limit_in_bytes", + value: mem.Reservation, + }, + { + name: "memsw.limit_in_bytes", + value: mem.Swap, + }, + { + name: "kmem.limit_in_bytes", + value: mem.Kernel, + }, + { + name: "kmem.tcp.limit_in_bytes", + value: mem.KernelTCP, + }, + { + name: "oom_control", + value: getOomControlValue(mem), + }, + { + name: "swappiness", + value: swappiness, + }, + } +} + +func getOomControlValue(mem *specs.LinuxMemory) *int64 { + if mem.DisableOOMKiller != nil && *mem.DisableOOMKiller { + i := int64(1) + return &i + } + return nil +} + +func (m *memoryController) memoryEvent(path string, event MemoryEvent) (uintptr, error) { + root := m.Path(path) + efd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) + if err != nil { + return 0, err + } + evtFile, err := os.Open(filepath.Join(root, event.EventFile())) + if err != nil { + unix.Close(efd) + return 0, err + } + defer evtFile.Close() + data := fmt.Sprintf("%d %d %s", efd, evtFile.Fd(), event.Arg()) + evctlPath := filepath.Join(root, "cgroup.event_control") + if err := retryingWriteFile(evctlPath, []byte(data), 0700); err != nil { + unix.Close(efd) + return 0, err + } + return uintptr(efd), nil +} diff --git a/vendor/github.com/containerd/cgroups/named.go b/vendor/github.com/containerd/cgroups/named.go new file mode 100644 index 0000000000..06b16c3b15 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/named.go @@ -0,0 +1,39 @@ +/* + Copyright The containerd Authors. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import "path/filepath" + +func NewNamed(root string, name Name) *namedController { + return &namedController{ + root: root, + name: name, + } +} + +type namedController struct { + root string + name Name +} + +func (n *namedController) Name() Name { + return n.name +} + +func (n *namedController) Path(path string) string { + return filepath.Join(n.root, string(n.name), path) +} diff --git a/vendor/github.com/containerd/cgroups/net_cls.go b/vendor/github.com/containerd/cgroups/net_cls.go new file mode 100644 index 0000000000..839b06de08 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/net_cls.go @@ -0,0 +1,61 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "os" + "path/filepath" + "strconv" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewNetCls(root string) *netclsController { + return &netclsController{ + root: filepath.Join(root, string(NetCLS)), + } +} + +type netclsController struct { + root string +} + +func (n *netclsController) Name() Name { + return NetCLS +} + +func (n *netclsController) Path(path string) string { + return filepath.Join(n.root, path) +} + +func (n *netclsController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(n.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Network != nil && resources.Network.ClassID != nil && *resources.Network.ClassID > 0 { + return retryingWriteFile( + filepath.Join(n.Path(path), "net_cls.classid"), + []byte(strconv.FormatUint(uint64(*resources.Network.ClassID), 10)), + defaultFilePerm, + ) + } + return nil +} + +func (n *netclsController) Update(path string, resources *specs.LinuxResources) error { + return n.Create(path, resources) +} diff --git a/vendor/github.com/containerd/cgroups/net_prio.go b/vendor/github.com/containerd/cgroups/net_prio.go new file mode 100644 index 0000000000..6362fd084f --- /dev/null +++ b/vendor/github.com/containerd/cgroups/net_prio.go @@ -0,0 +1,65 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewNetPrio(root string) *netprioController { + return &netprioController{ + root: filepath.Join(root, string(NetPrio)), + } +} + +type netprioController struct { + root string +} + +func (n *netprioController) Name() Name { + return NetPrio +} + +func (n *netprioController) Path(path string) string { + return filepath.Join(n.root, path) +} + +func (n *netprioController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(n.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Network != nil { + for _, prio := range resources.Network.Priorities { + if err := retryingWriteFile( + filepath.Join(n.Path(path), "net_prio.ifpriomap"), + formatPrio(prio.Name, prio.Priority), + defaultFilePerm, + ); err != nil { + return err + } + } + } + return nil +} + +func formatPrio(name string, prio uint32) []byte { + return []byte(fmt.Sprintf("%s %d", name, prio)) +} diff --git a/vendor/github.com/containerd/cgroups/opts.go b/vendor/github.com/containerd/cgroups/opts.go new file mode 100644 index 0000000000..a1449e298d --- /dev/null +++ b/vendor/github.com/containerd/cgroups/opts.go @@ -0,0 +1,61 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "github.com/pkg/errors" +) + +var ( + // ErrIgnoreSubsystem allows the specific subsystem to be skipped + ErrIgnoreSubsystem = errors.New("skip subsystem") + // ErrDevicesRequired is returned when the devices subsystem is required but + // does not exist or is not active + ErrDevicesRequired = errors.New("devices subsystem is required") +) + +// InitOpts allows configuration for the creation or loading of a cgroup +type InitOpts func(*InitConfig) error + +// InitConfig provides configuration options for the creation +// or loading of a cgroup and its subsystems +type InitConfig struct { + // InitCheck can be used to check initialization errors from the subsystem + InitCheck InitCheck +} + +func newInitConfig() *InitConfig { + return &InitConfig{ + InitCheck: RequireDevices, + } +} + +// InitCheck allows subsystems errors to be checked when initialized or loaded +type InitCheck func(Subsystem, Path, error) error + +// AllowAny allows any subsystem errors to be skipped +func AllowAny(_ Subsystem, _ Path, _ error) error { + return ErrIgnoreSubsystem +} + +// RequireDevices requires the device subsystem but no others +func RequireDevices(s Subsystem, _ Path, _ error) error { + if s.Name() == Devices { + return ErrDevicesRequired + } + return ErrIgnoreSubsystem +} diff --git a/vendor/github.com/containerd/cgroups/paths.go b/vendor/github.com/containerd/cgroups/paths.go new file mode 100644 index 0000000000..27197ecad7 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/paths.go @@ -0,0 +1,107 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "fmt" + "path/filepath" + + "github.com/pkg/errors" +) + +type Path func(subsystem Name) (string, error) + +func RootPath(subsystem Name) (string, error) { + return "/", nil +} + +// StaticPath returns a static path to use for all cgroups +func StaticPath(path string) Path { + return func(_ Name) (string, error) { + return path, nil + } +} + +// NestedPath will nest the cgroups based on the calling processes cgroup +// placing its child processes inside its own path +func NestedPath(suffix string) Path { + paths, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + return errorPath(err) + } + return existingPath(paths, suffix) +} + +// PidPath will return the correct cgroup paths for an existing process running inside a cgroup +// This is commonly used for the Load function to restore an existing container +func PidPath(pid int) Path { + p := fmt.Sprintf("/proc/%d/cgroup", pid) + paths, err := parseCgroupFile(p) + if err != nil { + return errorPath(errors.Wrapf(err, "parse cgroup file %s", p)) + } + return existingPath(paths, "") +} + +// ErrControllerNotActive is returned when a controller is not supported or enabled +var ErrControllerNotActive = errors.New("controller is not supported") + +func existingPath(paths map[string]string, suffix string) Path { + // localize the paths based on the root mount dest for nested cgroups + for n, p := range paths { + dest, err := getCgroupDestination(n) + if err != nil { + return errorPath(err) + } + rel, err := filepath.Rel(dest, p) + if err != nil { + return errorPath(err) + } + 
if rel == "." { + rel = dest + } + paths[n] = filepath.Join("/", rel) + } + return func(name Name) (string, error) { + root, ok := paths[string(name)] + if !ok { + if root, ok = paths["name="+string(name)]; !ok { + return "", ErrControllerNotActive + } + } + if suffix != "" { + return filepath.Join(root, suffix), nil + } + return root, nil + } +} + +func subPath(path Path, subName string) Path { + return func(name Name) (string, error) { + p, err := path(name) + if err != nil { + return "", err + } + return filepath.Join(p, subName), nil + } +} + +func errorPath(err error) Path { + return func(_ Name) (string, error) { + return "", err + } +} diff --git a/vendor/github.com/containerd/cgroups/perf_event.go b/vendor/github.com/containerd/cgroups/perf_event.go new file mode 100644 index 0000000000..648786db68 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/perf_event.go @@ -0,0 +1,37 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import "path/filepath" + +func NewPerfEvent(root string) *PerfEventController { + return &PerfEventController{ + root: filepath.Join(root, string(PerfEvent)), + } +} + +type PerfEventController struct { + root string +} + +func (p *PerfEventController) Name() Name { + return PerfEvent +} + +func (p *PerfEventController) Path(path string) string { + return filepath.Join(p.root, path) +} diff --git a/vendor/github.com/containerd/cgroups/pids.go b/vendor/github.com/containerd/cgroups/pids.go new file mode 100644 index 0000000000..ce78e44c18 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/pids.go @@ -0,0 +1,86 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewPids(root string) *pidsController { + return &pidsController{ + root: filepath.Join(root, string(Pids)), + } +} + +type pidsController struct { + root string +} + +func (p *pidsController) Name() Name { + return Pids +} + +func (p *pidsController) Path(path string) string { + return filepath.Join(p.root, path) +} + +func (p *pidsController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(p.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Pids != nil && resources.Pids.Limit > 0 { + return retryingWriteFile( + filepath.Join(p.Path(path), "pids.max"), + []byte(strconv.FormatInt(resources.Pids.Limit, 10)), + defaultFilePerm, + ) + } + return nil +} + +func (p *pidsController) Update(path string, resources *specs.LinuxResources) error { + return p.Create(path, resources) +} + +func (p *pidsController) Stat(path string, stats *v1.Metrics) error { + current, err := readUint(filepath.Join(p.Path(path), "pids.current")) + if err != nil { + return err + } + var max uint64 + maxData, err := ioutil.ReadFile(filepath.Join(p.Path(path), "pids.max")) + if err != nil { + return err + } + if maxS := strings.TrimSpace(string(maxData)); maxS != "max" { + if max, err = parseUint(maxS, 10, 64); err != nil { + return err + } + } + stats.Pids = &v1.PidsStat{ + Current: current, + Limit: max, + } + return nil +} diff --git a/vendor/github.com/containerd/cgroups/rdma.go b/vendor/github.com/containerd/cgroups/rdma.go new file mode 100644 index 0000000000..b6f0d416c7 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/rdma.go @@ -0,0 +1,154 @@ +/* + Copyright The containerd Authors. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +type rdmaController struct { + root string +} + +func (p *rdmaController) Name() Name { + return Rdma +} + +func (p *rdmaController) Path(path string) string { + return filepath.Join(p.root, path) +} + +func NewRdma(root string) *rdmaController { + return &rdmaController{ + root: filepath.Join(root, string(Rdma)), + } +} + +func createCmdString(device string, limits *specs.LinuxRdma) string { + var cmdString string + + cmdString = device + if limits.HcaHandles != nil { + cmdString = cmdString + " " + "hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + + if limits.HcaObjects != nil { + cmdString = cmdString + " " + "hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +func (p *rdmaController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(p.Path(path), defaultDirPerm); err != nil { + return err + } + + for device, limit := range resources.Rdma { + if device != "" && (limit.HcaHandles != nil || limit.HcaObjects != nil) { + return retryingWriteFile( + filepath.Join(p.Path(path), "rdma.max"), + []byte(createCmdString(device, &limit)), + defaultFilePerm, + ) + } + } + return nil +} + +func (p *rdmaController) Update(path 
string, resources *specs.LinuxResources) error { + return p.Create(path, resources) +} + +func parseRdmaKV(raw string, entry *v1.RdmaEntry) { + var value uint64 + var err error + + parts := strings.Split(raw, "=") + switch len(parts) { + case 2: + if parts[1] == "max" { + value = math.MaxUint32 + } else { + value, err = parseUint(parts[1], 10, 32) + if err != nil { + return + } + } + if parts[0] == "hca_handle" { + entry.HcaHandles = uint32(value) + } else if parts[0] == "hca_object" { + entry.HcaObjects = uint32(value) + } + } +} + +func toRdmaEntry(strEntries []string) []*v1.RdmaEntry { + var rdmaEntries []*v1.RdmaEntry + for i := range strEntries { + parts := strings.Fields(strEntries[i]) + switch len(parts) { + case 3: + entry := new(v1.RdmaEntry) + entry.Device = parts[0] + parseRdmaKV(parts[1], entry) + parseRdmaKV(parts[2], entry) + + rdmaEntries = append(rdmaEntries, entry) + default: + continue + } + } + return rdmaEntries +} + +func (p *rdmaController) Stat(path string, stats *v1.Metrics) error { + + currentData, err := ioutil.ReadFile(filepath.Join(p.Path(path), "rdma.current")) + if err != nil { + return err + } + currentPerDevices := strings.Split(string(currentData), "\n") + + maxData, err := ioutil.ReadFile(filepath.Join(p.Path(path), "rdma.max")) + if err != nil { + return err + } + maxPerDevices := strings.Split(string(maxData), "\n") + + // If device got removed between reading two files, ignore returning + // stats. 
+ if len(currentPerDevices) != len(maxPerDevices) { + return nil + } + + currentEntries := toRdmaEntry(currentPerDevices) + maxEntries := toRdmaEntry(maxPerDevices) + + stats.Rdma = &v1.RdmaStat{ + Current: currentEntries, + Limit: maxEntries, + } + return nil +} diff --git a/vendor/github.com/containerd/cgroups/state.go b/vendor/github.com/containerd/cgroups/state.go new file mode 100644 index 0000000000..cfeabbbc60 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/state.go @@ -0,0 +1,28 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +// State is a type that represents the state of the current cgroup +type State string + +const ( + Unknown State = "" + Thawed State = "thawed" + Frozen State = "frozen" + Freezing State = "freezing" + Deleted State = "deleted" +) diff --git a/vendor/github.com/containerd/cgroups/subsystem.go b/vendor/github.com/containerd/cgroups/subsystem.go new file mode 100644 index 0000000000..b2f41854d2 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/subsystem.go @@ -0,0 +1,116 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "fmt" + "os" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +// Name is a typed name for a cgroup subsystem +type Name string + +const ( + Devices Name = "devices" + Hugetlb Name = "hugetlb" + Freezer Name = "freezer" + Pids Name = "pids" + NetCLS Name = "net_cls" + NetPrio Name = "net_prio" + PerfEvent Name = "perf_event" + Cpuset Name = "cpuset" + Cpu Name = "cpu" + Cpuacct Name = "cpuacct" + Memory Name = "memory" + Blkio Name = "blkio" + Rdma Name = "rdma" +) + +// Subsystems returns a complete list of the default cgroups +// available on most linux systems +func Subsystems() []Name { + n := []Name{ + Freezer, + Pids, + NetCLS, + NetPrio, + PerfEvent, + Cpuset, + Cpu, + Cpuacct, + Memory, + Blkio, + Rdma, + } + if !RunningInUserNS() { + n = append(n, Devices) + } + if _, err := os.Stat("/sys/kernel/mm/hugepages"); err == nil { + n = append(n, Hugetlb) + } + return n +} + +type Subsystem interface { + Name() Name +} + +type pather interface { + Subsystem + Path(path string) string +} + +type creator interface { + Subsystem + Create(path string, resources *specs.LinuxResources) error +} + +type deleter interface { + Subsystem + Delete(path string) error +} + +type stater interface { + Subsystem + Stat(path string, stats *v1.Metrics) error +} + +type updater interface { + Subsystem + Update(path string, resources *specs.LinuxResources) error +} + +// SingleSubsystem returns a single cgroup subsystem within the base Hierarchy +func SingleSubsystem(baseHierarchy 
Hierarchy, subsystem Name) Hierarchy { + return func() ([]Subsystem, error) { + subsystems, err := baseHierarchy() + if err != nil { + return nil, err + } + for _, s := range subsystems { + if s.Name() == subsystem { + return []Subsystem{ + s, + }, nil + } + } + return nil, fmt.Errorf("unable to find subsystem %s", subsystem) + } +} diff --git a/vendor/github.com/containerd/cgroups/systemd.go b/vendor/github.com/containerd/cgroups/systemd.go new file mode 100644 index 0000000000..c17f34a626 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/systemd.go @@ -0,0 +1,155 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "path/filepath" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + SystemdDbus Name = "systemd" + defaultSlice = "system.slice" +) + +var ( + canDelegate bool + once sync.Once +) + +func Systemd() ([]Subsystem, error) { + root, err := v1MountPoint() + if err != nil { + return nil, err + } + defaultSubsystems, err := defaults(root) + if err != nil { + return nil, err + } + s, err := NewSystemd(root) + if err != nil { + return nil, err + } + // make sure the systemd controller is added first + return append([]Subsystem{s}, defaultSubsystems...), nil +} + +func Slice(slice, name string) Path { + if slice == "" { + slice = defaultSlice + } + return func(subsystem Name) (string, error) { + return filepath.Join(slice, name), nil + } +} + +func NewSystemd(root string) (*SystemdController, error) { + return &SystemdController{ + root: root, + }, nil +} + +type SystemdController struct { + mu sync.Mutex + root string +} + +func (s *SystemdController) Name() Name { + return SystemdDbus +} + +func (s *SystemdController) Create(path string, _ *specs.LinuxResources) error { + conn, err := systemdDbus.New() + if err != nil { + return err + } + defer conn.Close() + slice, name := splitName(path) + // We need to see if systemd can handle the delegate property + // Systemd will return an error if it cannot handle delegate regardless + // of its bool setting. + checkDelegate := func() { + canDelegate = true + dlSlice := newProperty("Delegate", true) + if _, err := conn.StartTransientUnit(slice, "testdelegate", []systemdDbus.Property{dlSlice}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + // Starting with systemd v237, Delegate is not even a property of slices anymore, + // so the D-Bus call fails with "InvalidArgs" error. 
+ if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") { + canDelegate = false + } + } + } + + conn.StopUnit(slice, "testDelegate", nil) + } + once.Do(checkDelegate) + properties := []systemdDbus.Property{ + systemdDbus.PropDescription("cgroup " + name), + systemdDbus.PropWants(slice), + newProperty("DefaultDependencies", false), + newProperty("MemoryAccounting", true), + newProperty("CPUAccounting", true), + newProperty("BlockIOAccounting", true), + } + + // If we can delegate, we add the property back in + if canDelegate { + properties = append(properties, newProperty("Delegate", true)) + } + + ch := make(chan string) + _, err = conn.StartTransientUnit(name, "replace", properties, ch) + if err != nil { + return err + } + <-ch + return nil +} + +func (s *SystemdController) Delete(path string) error { + conn, err := systemdDbus.New() + if err != nil { + return err + } + defer conn.Close() + _, name := splitName(path) + ch := make(chan string) + _, err = conn.StopUnit(name, "replace", ch) + if err != nil { + return err + } + <-ch + return nil +} + +func newProperty(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func splitName(path string) (slice string, unit string) { + slice, unit = filepath.Split(path) + return strings.TrimSuffix(slice, "/"), unit +} diff --git a/vendor/github.com/containerd/cgroups/ticks.go b/vendor/github.com/containerd/cgroups/ticks.go new file mode 100644 index 0000000000..84dc38d0cc --- /dev/null +++ b/vendor/github.com/containerd/cgroups/ticks.go @@ -0,0 +1,26 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +func getClockTicks() uint64 { + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. + // See https://github.com/containerd/cgroups/pull/12 for + // more details. + return 100 +} diff --git a/vendor/github.com/containerd/cgroups/utils.go b/vendor/github.com/containerd/cgroups/utils.go new file mode 100644 index 0000000000..ed894b3ec4 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/utils.go @@ -0,0 +1,404 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package cgroups + +import ( + "bufio" + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "time" + + units "github.com/docker/go-units" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +var ( + nsOnce sync.Once + inUserNS bool + checkMode sync.Once + cgMode CGMode +) + +const unifiedMountpoint = "/sys/fs/cgroup" + +// CGMode is the cgroups mode of the host system +type CGMode int + +const ( + // Unavailable cgroup mountpoint + Unavailable CGMode = iota + // Legacy cgroups v1 + Legacy + // Hybrid with cgroups v1 and v2 controllers mounted + Hybrid + // Unified with only cgroups v2 mounted + Unified +) + +// Mode returns the cgroups mode running on the host +func Mode() CGMode { + checkMode.Do(func() { + var st unix.Statfs_t + if err := unix.Statfs(unifiedMountpoint, &st); err != nil { + cgMode = Unavailable + return + } + switch st.Type { + case unix.CGROUP2_SUPER_MAGIC: + cgMode = Unified + default: + cgMode = Legacy + if err := unix.Statfs(filepath.Join(unifiedMountpoint, "unified"), &st); err != nil { + return + } + if st.Type == unix.CGROUP2_SUPER_MAGIC { + cgMode = Hybrid + } + } + }) + return cgMode +} + +// RunningInUserNS detects whether we are currently running in a user namespace. +// Copied from github.com/lxc/lxd/shared/util.go +func RunningInUserNS() bool { + nsOnce.Do(func() { + file, err := os.Open("/proc/self/uid_map") + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return + } + defer file.Close() + + buf := bufio.NewReader(file) + l, _, err := buf.ReadLine() + if err != nil { + return + } + + line := string(l) + var a, b, c int64 + fmt.Sscanf(line, "%d %d %d", &a, &b, &c) + + /* + * We assume we are in the initial user namespace if we have a full + * range - 4294967295 uids starting at uid 0. 
+ */ + if a == 0 && b == 0 && c == 4294967295 { + return + } + inUserNS = true + }) + return inUserNS +} + +// defaults returns all known groups +func defaults(root string) ([]Subsystem, error) { + h, err := NewHugetlb(root) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + s := []Subsystem{ + NewNamed(root, "systemd"), + NewFreezer(root), + NewPids(root), + NewNetCls(root), + NewNetPrio(root), + NewPerfEvent(root), + NewCpuset(root), + NewCpu(root), + NewCpuacct(root), + NewMemory(root), + NewBlkio(root), + NewRdma(root), + } + // only add the devices cgroup if we are not in a user namespace + // because modifications are not allowed + if !RunningInUserNS() { + s = append(s, NewDevices(root)) + } + // add the hugetlb cgroup if error wasn't due to missing hugetlb + // cgroup support on the host + if err == nil { + s = append(s, h) + } + return s, nil +} + +// remove will remove a cgroup path handling EAGAIN and EBUSY errors and +// retrying the remove after a exp timeout +func remove(path string) error { + delay := 10 * time.Millisecond + for i := 0; i < 5; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + if err := os.RemoveAll(path); err == nil { + return nil + } + } + return fmt.Errorf("cgroups: unable to remove path %q", path) +} + +// readPids will read all the pids of processes in a cgroup by the provided path +func readPids(path string, subsystem Name) ([]Process, error) { + f, err := os.Open(filepath.Join(path, cgroupProcs)) + if err != nil { + return nil, err + } + defer f.Close() + var ( + out []Process + s = bufio.NewScanner(f) + ) + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, Process{ + Pid: pid, + Subsystem: subsystem, + Path: path, + }) + } + } + if err := s.Err(); err != nil { + // failed to read all pids? 
+ return nil, err + } + return out, nil +} + +// readTasksPids will read all the pids of tasks in a cgroup by the provided path +func readTasksPids(path string, subsystem Name) ([]Task, error) { + f, err := os.Open(filepath.Join(path, cgroupTasks)) + if err != nil { + return nil, err + } + defer f.Close() + var ( + out []Task + s = bufio.NewScanner(f) + ) + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, Task{ + Pid: pid, + Subsystem: subsystem, + Path: path, + }) + } + } + if err := s.Err(); err != nil { + return nil, err + } + return out, nil +} + +func hugePageSizes() ([]string, error) { + var ( + pageSizes []string + sizeList = []string{"B", "KB", "MB", "GB", "TB", "PB"} + ) + files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages") + if err != nil { + return nil, err + } + for _, st := range files { + nameArray := strings.Split(st.Name(), "-") + pageSize, err := units.RAMInBytes(nameArray[1]) + if err != nil { + return nil, err + } + pageSizes = append(pageSizes, units.CustomSize("%g%s", float64(pageSize), 1024.0, sizeList)) + } + return pageSizes, nil +} + +func readUint(path string) (uint64, error) { + v, err := ioutil.ReadFile(path) + if err != nil { + return 0, err + } + return parseUint(strings.TrimSpace(string(v)), 10, 64) +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + v, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. 
Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && + intErr.(*strconv.NumError).Err == strconv.ErrRange && + intValue < 0 { + return 0, nil + } + return 0, err + } + return v, nil +} + +func parseKV(raw string) (string, uint64, error) { + parts := strings.Fields(raw) + switch len(parts) { + case 2: + v, err := parseUint(parts[1], 10, 64) + if err != nil { + return "", 0, err + } + return parts[0], v, nil + default: + return "", 0, ErrInvalidFormat + } +} + +func parseCgroupFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + return parseCgroupFromReader(f) +} + +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + var ( + cgroups = make(map[string]string) + s = bufio.NewScanner(r) + ) + for s.Scan() { + var ( + text = s.Text() + parts = strings.SplitN(text, ":", 3) + ) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: %q", text) + } + for _, subs := range strings.Split(parts[1], ",") { + if subs != "" { + cgroups[subs] = parts[2] + } + } + } + if err := s.Err(); err != nil { + return nil, err + } + return cgroups, nil +} + +func getCgroupDestination(subsystem string) (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + fields := strings.Split(s.Text(), " ") + if len(fields) < 10 { + // broken mountinfo? 
+ continue + } + if fields[len(fields)-3] != "cgroup" { + continue + } + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[3], nil + } + } + } + if err := s.Err(); err != nil { + return "", err + } + return "", ErrNoCgroupMountDestination +} + +func pathers(subystems []Subsystem) []pather { + var out []pather + for _, s := range subystems { + if p, ok := s.(pather); ok { + out = append(out, p) + } + } + return out +} + +func initializeSubsystem(s Subsystem, path Path, resources *specs.LinuxResources) error { + if c, ok := s.(creator); ok { + p, err := path(s.Name()) + if err != nil { + return err + } + if err := c.Create(p, resources); err != nil { + return err + } + } else if c, ok := s.(pather); ok { + p, err := path(s.Name()) + if err != nil { + return err + } + // do the default create if the group does not have a custom one + if err := os.MkdirAll(c.Path(p), defaultDirPerm); err != nil { + return err + } + } + return nil +} + +func cleanPath(path string) string { + if path == "" { + return "" + } + path = filepath.Clean(path) + if !filepath.IsAbs(path) { + path, _ = filepath.Rel(string(os.PathSeparator), filepath.Clean(string(os.PathSeparator)+path)) + } + return path +} + +func retryingWriteFile(path string, data []byte, mode os.FileMode) error { + // Retry writes on EINTR; see: + // https://github.com/golang/go/issues/38033 + for { + err := ioutil.WriteFile(path, data, mode) + if err == nil { + return nil + } else if !errors.Is(err, syscall.EINTR) { + return err + } + } +} diff --git a/vendor/github.com/containerd/cgroups/v1.go b/vendor/github.com/containerd/cgroups/v1.go new file mode 100644 index 0000000000..2ec215c06f --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v1.go @@ -0,0 +1,73 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strings" +) + +// V1 returns all the groups in the default cgroups mountpoint in a single hierarchy +func V1() ([]Subsystem, error) { + root, err := v1MountPoint() + if err != nil { + return nil, err + } + subsystems, err := defaults(root) + if err != nil { + return nil, err + } + var enabled []Subsystem + for _, s := range pathers(subsystems) { + // check and remove the default groups that do not exist + if _, err := os.Lstat(s.Path("/")); err == nil { + enabled = append(enabled, s) + } + } + return enabled, nil +} + +// v1MountPoint returns the mount point where the cgroup +// mountpoints are mounted in a single hiearchy +func v1MountPoint() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + var ( + text = scanner.Text() + fields = strings.Split(text, " ") + numFields = len(fields) + ) + if numFields < 10 { + return "", fmt.Errorf("mountinfo: bad entry %q", text) + } + if fields[numFields-3] == "cgroup" { + return filepath.Dir(fields[4]), nil + } + } + if err := scanner.Err(); err != nil { + return "", err + } + return "", ErrMountPointNotExist +} diff --git a/vendor/github.com/coreos/go-systemd/v22/LICENSE b/vendor/github.com/coreos/go-systemd/v22/LICENSE new file mode 100644 index 0000000000..37ec93a14f --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/LICENSE @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 
+http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. 
For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. 
+ +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file 
distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
+ +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/coreos/go-systemd/v22/NOTICE b/vendor/github.com/coreos/go-systemd/v22/NOTICE new file mode 100644 index 0000000000..23a0ada2fb --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/NOTICE @@ -0,0 +1,5 @@ +CoreOS Project +Copyright 2018 CoreOS, Inc + +This product includes software developed at CoreOS, Inc. +(http://www.coreos.com/). diff --git a/vendor/github.com/coreos/go-systemd/v22/dbus/dbus.go b/vendor/github.com/coreos/go-systemd/v22/dbus/dbus.go new file mode 100644 index 0000000000..91584a1668 --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/dbus.go @@ -0,0 +1,240 @@ +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Integration with the systemd D-Bus API. See http://www.freedesktop.org/wiki/Software/systemd/dbus/ +package dbus + +import ( + "encoding/hex" + "fmt" + "os" + "strconv" + "strings" + "sync" + + "github.com/godbus/dbus/v5" +) + +const ( + alpha = `abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ` + num = `0123456789` + alphanum = alpha + num + signalBuffer = 100 +) + +// needsEscape checks whether a byte in a potential dbus ObjectPath needs to be escaped +func needsEscape(i int, b byte) bool { + // Escape everything that is not a-z-A-Z-0-9 + // Also escape 0-9 if it's the first character + return strings.IndexByte(alphanum, b) == -1 || + (i == 0 && strings.IndexByte(num, b) != -1) +} + +// PathBusEscape sanitizes a constituent string of a dbus ObjectPath using the +// rules that systemd uses for serializing special characters. +func PathBusEscape(path string) string { + // Special case the empty string + if len(path) == 0 { + return "_" + } + n := []byte{} + for i := 0; i < len(path); i++ { + c := path[i] + if needsEscape(i, c) { + e := fmt.Sprintf("_%x", c) + n = append(n, []byte(e)...) + } else { + n = append(n, c) + } + } + return string(n) +} + +// pathBusUnescape is the inverse of PathBusEscape. +func pathBusUnescape(path string) string { + if path == "_" { + return "" + } + n := []byte{} + for i := 0; i < len(path); i++ { + c := path[i] + if c == '_' && i+2 < len(path) { + res, err := hex.DecodeString(path[i+1 : i+3]) + if err == nil { + n = append(n, res...) 
+ } + i += 2 + } else { + n = append(n, c) + } + } + return string(n) +} + +// Conn is a connection to systemd's dbus endpoint. +type Conn struct { + // sysconn/sysobj are only used to call dbus methods + sysconn *dbus.Conn + sysobj dbus.BusObject + + // sigconn/sigobj are only used to receive dbus signals + sigconn *dbus.Conn + sigobj dbus.BusObject + + jobListener struct { + jobs map[dbus.ObjectPath]chan<- string + sync.Mutex + } + subStateSubscriber struct { + updateCh chan<- *SubStateUpdate + errCh chan<- error + sync.Mutex + ignore map[dbus.ObjectPath]int64 + cleanIgnore int64 + } + propertiesSubscriber struct { + updateCh chan<- *PropertiesUpdate + errCh chan<- error + sync.Mutex + } +} + +// New establishes a connection to any available bus and authenticates. +// Callers should call Close() when done with the connection. +func New() (*Conn, error) { + conn, err := NewSystemConnection() + if err != nil && os.Geteuid() == 0 { + return NewSystemdConnection() + } + return conn, err +} + +// NewSystemConnection establishes a connection to the system bus and authenticates. +// Callers should call Close() when done with the connection +func NewSystemConnection() (*Conn, error) { + return NewConnection(func() (*dbus.Conn, error) { + return dbusAuthHelloConnection(dbus.SystemBusPrivate) + }) +} + +// NewUserConnection establishes a connection to the session bus and +// authenticates. This can be used to connect to systemd user instances. +// Callers should call Close() when done with the connection. +func NewUserConnection() (*Conn, error) { + return NewConnection(func() (*dbus.Conn, error) { + return dbusAuthHelloConnection(dbus.SessionBusPrivate) + }) +} + +// NewSystemdConnection establishes a private, direct connection to systemd. +// This can be used for communicating with systemd without a dbus daemon. +// Callers should call Close() when done with the connection. 
+func NewSystemdConnection() (*Conn, error) { + return NewConnection(func() (*dbus.Conn, error) { + // We skip Hello when talking directly to systemd. + return dbusAuthConnection(func(opts ...dbus.ConnOption) (*dbus.Conn, error) { + return dbus.Dial("unix:path=/run/systemd/private") + }) + }) +} + +// Close closes an established connection +func (c *Conn) Close() { + c.sysconn.Close() + c.sigconn.Close() +} + +// NewConnection establishes a connection to a bus using a caller-supplied function. +// This allows connecting to remote buses through a user-supplied mechanism. +// The supplied function may be called multiple times, and should return independent connections. +// The returned connection must be fully initialised: the org.freedesktop.DBus.Hello call must have succeeded, +// and any authentication should be handled by the function. +func NewConnection(dialBus func() (*dbus.Conn, error)) (*Conn, error) { + sysconn, err := dialBus() + if err != nil { + return nil, err + } + + sigconn, err := dialBus() + if err != nil { + sysconn.Close() + return nil, err + } + + c := &Conn{ + sysconn: sysconn, + sysobj: systemdObject(sysconn), + sigconn: sigconn, + sigobj: systemdObject(sigconn), + } + + c.subStateSubscriber.ignore = make(map[dbus.ObjectPath]int64) + c.jobListener.jobs = make(map[dbus.ObjectPath]chan<- string) + + // Setup the listeners on jobs so that we can get completions + c.sigconn.BusObject().Call("org.freedesktop.DBus.AddMatch", 0, + "type='signal', interface='org.freedesktop.systemd1.Manager', member='JobRemoved'") + + c.dispatch() + return c, nil +} + +// GetManagerProperty returns the value of a property on the org.freedesktop.systemd1.Manager +// interface. The value is returned in its string representation, as defined at +// https://developer.gnome.org/glib/unstable/gvariant-text.html +func (c *Conn) GetManagerProperty(prop string) (string, error) { + variant, err := c.sysobj.GetProperty("org.freedesktop.systemd1.Manager." 
+ prop) + if err != nil { + return "", err + } + return variant.String(), nil +} + +func dbusAuthConnection(createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) { + conn, err := createBus() + if err != nil { + return nil, err + } + + // Only use EXTERNAL method, and hardcode the uid (not username) + // to avoid a username lookup (which requires a dynamically linked + // libc) + methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(os.Getuid()))} + + err = conn.Auth(methods) + if err != nil { + conn.Close() + return nil, err + } + + return conn, nil +} + +func dbusAuthHelloConnection(createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) { + conn, err := dbusAuthConnection(createBus) + if err != nil { + return nil, err + } + + if err = conn.Hello(); err != nil { + conn.Close() + return nil, err + } + + return conn, nil +} + +func systemdObject(conn *dbus.Conn) dbus.BusObject { + return conn.Object("org.freedesktop.systemd1", dbus.ObjectPath("/org/freedesktop/systemd1")) +} diff --git a/vendor/github.com/coreos/go-systemd/v22/dbus/methods.go b/vendor/github.com/coreos/go-systemd/v22/dbus/methods.go new file mode 100644 index 0000000000..e38659d7be --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/methods.go @@ -0,0 +1,600 @@ +// Copyright 2015, 2018 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package dbus + +import ( + "errors" + "fmt" + "path" + "strconv" + + "github.com/godbus/dbus/v5" +) + +func (c *Conn) jobComplete(signal *dbus.Signal) { + var id uint32 + var job dbus.ObjectPath + var unit string + var result string + dbus.Store(signal.Body, &id, &job, &unit, &result) + c.jobListener.Lock() + out, ok := c.jobListener.jobs[job] + if ok { + out <- result + delete(c.jobListener.jobs, job) + } + c.jobListener.Unlock() +} + +func (c *Conn) startJob(ch chan<- string, job string, args ...interface{}) (int, error) { + if ch != nil { + c.jobListener.Lock() + defer c.jobListener.Unlock() + } + + var p dbus.ObjectPath + err := c.sysobj.Call(job, 0, args...).Store(&p) + if err != nil { + return 0, err + } + + if ch != nil { + c.jobListener.jobs[p] = ch + } + + // ignore error since 0 is fine if conversion fails + jobID, _ := strconv.Atoi(path.Base(string(p))) + + return jobID, nil +} + +// StartUnit enqueues a start job and depending jobs, if any (unless otherwise +// specified by the mode string). +// +// Takes the unit to activate, plus a mode string. The mode needs to be one of +// replace, fail, isolate, ignore-dependencies, ignore-requirements. If +// "replace" the call will start the unit and its dependencies, possibly +// replacing already queued jobs that conflict with this. If "fail" the call +// will start the unit and its dependencies, but will fail if this would change +// an already queued job. If "isolate" the call will start the unit in question +// and terminate all units that aren't dependencies of it. If +// "ignore-dependencies" it will start a unit but ignore all its dependencies. +// If "ignore-requirements" it will start a unit but only ignore the +// requirement dependencies. It is not recommended to make use of the latter +// two options. +// +// If the provided channel is non-nil, a result string will be sent to it upon +// job completion: one of done, canceled, timeout, failed, dependency, skipped. 
+// done indicates successful execution of a job. canceled indicates that a job +// has been canceled before it finished execution. timeout indicates that the +// job timeout was reached. failed indicates that the job failed. dependency +// indicates that a job this job has been depending on failed and the job hence +// has been removed too. skipped indicates that a job was skipped because it +// didn't apply to the units current state. +// +// If no error occurs, the ID of the underlying systemd job will be returned. There +// does exist the possibility for no error to be returned, but for the returned job +// ID to be 0. In this case, the actual underlying ID is not 0 and this datapoint +// should not be considered authoritative. +// +// If an error does occur, it will be returned to the user alongside a job ID of 0. +func (c *Conn) StartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ch, "org.freedesktop.systemd1.Manager.StartUnit", name, mode) +} + +// StopUnit is similar to StartUnit but stops the specified unit rather +// than starting it. +func (c *Conn) StopUnit(name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ch, "org.freedesktop.systemd1.Manager.StopUnit", name, mode) +} + +// ReloadUnit reloads a unit. Reloading is done only if the unit is already running and fails otherwise. +func (c *Conn) ReloadUnit(name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadUnit", name, mode) +} + +// RestartUnit restarts a service. If a service is restarted that isn't +// running it will be started. +func (c *Conn) RestartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ch, "org.freedesktop.systemd1.Manager.RestartUnit", name, mode) +} + +// TryRestartUnit is like RestartUnit, except that a service that isn't running +// is not affected by the restart. 
+func (c *Conn) TryRestartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ch, "org.freedesktop.systemd1.Manager.TryRestartUnit", name, mode) +} + +// ReloadOrRestartUnit attempts a reload if the unit supports it and use a restart +// otherwise. +func (c *Conn) ReloadOrRestartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadOrRestartUnit", name, mode) +} + +// ReloadOrTryRestartUnit attempts a reload if the unit supports it and use a "Try" +// flavored restart otherwise. +func (c *Conn) ReloadOrTryRestartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadOrTryRestartUnit", name, mode) +} + +// StartTransientUnit() may be used to create and start a transient unit, which +// will be released as soon as it is not running or referenced anymore or the +// system is rebooted. name is the unit name including suffix, and must be +// unique. mode is the same as in StartUnit(), properties contains properties +// of the unit. +func (c *Conn) StartTransientUnit(name string, mode string, properties []Property, ch chan<- string) (int, error) { + return c.startJob(ch, "org.freedesktop.systemd1.Manager.StartTransientUnit", name, mode, properties, make([]PropertyCollection, 0)) +} + +// KillUnit takes the unit name and a UNIX signal number to send. All of the unit's +// processes are killed. +func (c *Conn) KillUnit(name string, signal int32) { + c.sysobj.Call("org.freedesktop.systemd1.Manager.KillUnit", 0, name, "all", signal).Store() +} + +// ResetFailedUnit resets the "failed" state of a specific unit. +func (c *Conn) ResetFailedUnit(name string) error { + return c.sysobj.Call("org.freedesktop.systemd1.Manager.ResetFailedUnit", 0, name).Store() +} + +// SystemState returns the systemd state. Equivalent to `systemctl is-system-running`. 
+func (c *Conn) SystemState() (*Property, error) { + var err error + var prop dbus.Variant + + obj := c.sysconn.Object("org.freedesktop.systemd1", "/org/freedesktop/systemd1") + err = obj.Call("org.freedesktop.DBus.Properties.Get", 0, "org.freedesktop.systemd1.Manager", "SystemState").Store(&prop) + if err != nil { + return nil, err + } + + return &Property{Name: "SystemState", Value: prop}, nil +} + +// getProperties takes the unit path and returns all of its dbus object properties, for the given dbus interface +func (c *Conn) getProperties(path dbus.ObjectPath, dbusInterface string) (map[string]interface{}, error) { + var err error + var props map[string]dbus.Variant + + if !path.IsValid() { + return nil, fmt.Errorf("invalid unit name: %v", path) + } + + obj := c.sysconn.Object("org.freedesktop.systemd1", path) + err = obj.Call("org.freedesktop.DBus.Properties.GetAll", 0, dbusInterface).Store(&props) + if err != nil { + return nil, err + } + + out := make(map[string]interface{}, len(props)) + for k, v := range props { + out[k] = v.Value() + } + + return out, nil +} + +// GetUnitProperties takes the (unescaped) unit name and returns all of its dbus object properties. +func (c *Conn) GetUnitProperties(unit string) (map[string]interface{}, error) { + path := unitPath(unit) + return c.getProperties(path, "org.freedesktop.systemd1.Unit") +} + +// GetUnitPathProperties takes the (escaped) unit path and returns all of its dbus object properties. +func (c *Conn) GetUnitPathProperties(path dbus.ObjectPath) (map[string]interface{}, error) { + return c.getProperties(path, "org.freedesktop.systemd1.Unit") +} + +// GetAllProperties takes the (unescaped) unit name and returns all of its dbus object properties. 
+func (c *Conn) GetAllProperties(unit string) (map[string]interface{}, error) { + path := unitPath(unit) + return c.getProperties(path, "") +} + +func (c *Conn) getProperty(unit string, dbusInterface string, propertyName string) (*Property, error) { + var err error + var prop dbus.Variant + + path := unitPath(unit) + if !path.IsValid() { + return nil, errors.New("invalid unit name: " + unit) + } + + obj := c.sysconn.Object("org.freedesktop.systemd1", path) + err = obj.Call("org.freedesktop.DBus.Properties.Get", 0, dbusInterface, propertyName).Store(&prop) + if err != nil { + return nil, err + } + + return &Property{Name: propertyName, Value: prop}, nil +} + +func (c *Conn) GetUnitProperty(unit string, propertyName string) (*Property, error) { + return c.getProperty(unit, "org.freedesktop.systemd1.Unit", propertyName) +} + +// GetServiceProperty returns property for given service name and property name +func (c *Conn) GetServiceProperty(service string, propertyName string) (*Property, error) { + return c.getProperty(service, "org.freedesktop.systemd1.Service", propertyName) +} + +// GetUnitTypeProperties returns the extra properties for a unit, specific to the unit type. +// Valid values for unitType: Service, Socket, Target, Device, Mount, Automount, Snapshot, Timer, Swap, Path, Slice, Scope +// return "dbus.Error: Unknown interface" if the unitType is not the correct type of the unit +func (c *Conn) GetUnitTypeProperties(unit string, unitType string) (map[string]interface{}, error) { + path := unitPath(unit) + return c.getProperties(path, "org.freedesktop.systemd1."+unitType) +} + +// SetUnitProperties() may be used to modify certain unit properties at runtime. +// Not all properties may be changed at runtime, but many resource management +// settings (primarily those in systemd.cgroup(5)) may. The changes are applied +// instantly, and stored on disk for future boots, unless runtime is true, in which +// case the settings only apply until the next reboot. 
name is the name of the unit +// to modify. properties are the settings to set, encoded as an array of property +// name and value pairs. +func (c *Conn) SetUnitProperties(name string, runtime bool, properties ...Property) error { + return c.sysobj.Call("org.freedesktop.systemd1.Manager.SetUnitProperties", 0, name, runtime, properties).Store() +} + +func (c *Conn) GetUnitTypeProperty(unit string, unitType string, propertyName string) (*Property, error) { + return c.getProperty(unit, "org.freedesktop.systemd1."+unitType, propertyName) +} + +type UnitStatus struct { + Name string // The primary unit name as string + Description string // The human readable description string + LoadState string // The load state (i.e. whether the unit file has been loaded successfully) + ActiveState string // The active state (i.e. whether the unit is currently started or not) + SubState string // The sub state (a more fine-grained version of the active state that is specific to the unit type, which the active state is not) + Followed string // A unit that is being followed in its state by this unit, if there is any, otherwise the empty string. + Path dbus.ObjectPath // The unit object path + JobId uint32 // If there is a job queued for the job unit the numeric job id, 0 otherwise + JobType string // The job type as string + JobPath dbus.ObjectPath // The job object path +} + +type storeFunc func(retvalues ...interface{}) error + +func (c *Conn) listUnitsInternal(f storeFunc) ([]UnitStatus, error) { + result := make([][]interface{}, 0) + err := f(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + status := make([]UnitStatus, len(result)) + statusInterface := make([]interface{}, len(status)) + for i := range status { + statusInterface[i] = &status[i] + } + + err = dbus.Store(resultInterface, statusInterface...) 
+ if err != nil { + return nil, err + } + + return status, nil +} + +// ListUnits returns an array with all currently loaded units. Note that +// units may be known by multiple names at the same time, and hence there might +// be more unit names loaded than actual units behind them. +// Also note that a unit is only loaded if it is active and/or enabled. +// Units that are both disabled and inactive will thus not be returned. +func (c *Conn) ListUnits() ([]UnitStatus, error) { + return c.listUnitsInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnits", 0).Store) +} + +// ListUnitsFiltered returns an array with units filtered by state. +// It takes a list of units' statuses to filter. +func (c *Conn) ListUnitsFiltered(states []string) ([]UnitStatus, error) { + return c.listUnitsInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitsFiltered", 0, states).Store) +} + +// ListUnitsByPatterns returns an array with units. +// It takes a list of units' statuses and names to filter. +// Note that units may be known by multiple names at the same time, +// and hence there might be more unit names loaded than actual units behind them. +func (c *Conn) ListUnitsByPatterns(states []string, patterns []string) ([]UnitStatus, error) { + return c.listUnitsInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitsByPatterns", 0, states, patterns).Store) +} + +// ListUnitsByNames returns an array with units. It takes a list of units' +// names and returns an UnitStatus array. Comparing to ListUnitsByPatterns +// method, this method returns statuses even for inactive or non-existing +// units. Input array should contain exact unit names, but not patterns. 
+// Note: Requires systemd v230 or higher +func (c *Conn) ListUnitsByNames(units []string) ([]UnitStatus, error) { + return c.listUnitsInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitsByNames", 0, units).Store) +} + +type UnitFile struct { + Path string + Type string +} + +func (c *Conn) listUnitFilesInternal(f storeFunc) ([]UnitFile, error) { + result := make([][]interface{}, 0) + err := f(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + files := make([]UnitFile, len(result)) + fileInterface := make([]interface{}, len(files)) + for i := range files { + fileInterface[i] = &files[i] + } + + err = dbus.Store(resultInterface, fileInterface...) + if err != nil { + return nil, err + } + + return files, nil +} + +// ListUnitFiles returns an array of all available units on disk. +func (c *Conn) ListUnitFiles() ([]UnitFile, error) { + return c.listUnitFilesInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitFiles", 0).Store) +} + +// ListUnitFilesByPatterns returns an array of all available units on disk matched the patterns. +func (c *Conn) ListUnitFilesByPatterns(states []string, patterns []string) ([]UnitFile, error) { + return c.listUnitFilesInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitFilesByPatterns", 0, states, patterns).Store) +} + +type LinkUnitFileChange EnableUnitFileChange + +// LinkUnitFiles() links unit files (that are located outside of the +// usual unit search paths) into the unit search path. +// +// It takes a list of absolute paths to unit files to link and two +// booleans. The first boolean controls whether the unit shall be +// enabled for runtime only (true, /run), or persistently (false, +// /etc). +// The second controls whether symlinks pointing to other units shall +// be replaced if necessary. +// +// This call returns a list of the changes made. 
The list consists of +// structures with three strings: the type of the change (one of symlink +// or unlink), the file name of the symlink and the destination of the +// symlink. +func (c *Conn) LinkUnitFiles(files []string, runtime bool, force bool) ([]LinkUnitFileChange, error) { + result := make([][]interface{}, 0) + err := c.sysobj.Call("org.freedesktop.systemd1.Manager.LinkUnitFiles", 0, files, runtime, force).Store(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]LinkUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) + if err != nil { + return nil, err + } + + return changes, nil +} + +// EnableUnitFiles() may be used to enable one or more units in the system (by +// creating symlinks to them in /etc or /run). +// +// It takes a list of unit files to enable (either just file names or full +// absolute paths if the unit files are residing outside the usual unit +// search paths), and two booleans: the first controls whether the unit shall +// be enabled for runtime only (true, /run), or persistently (false, /etc). +// The second one controls whether symlinks pointing to other units shall +// be replaced if necessary. +// +// This call returns one boolean and an array with the changes made. The +// boolean signals whether the unit files contained any enablement +// information (i.e. an [Install]) section. The changes list consists of +// structures with three strings: the type of the change (one of symlink +// or unlink), the file name of the symlink and the destination of the +// symlink. 
+func (c *Conn) EnableUnitFiles(files []string, runtime bool, force bool) (bool, []EnableUnitFileChange, error) { + var carries_install_info bool + + result := make([][]interface{}, 0) + err := c.sysobj.Call("org.freedesktop.systemd1.Manager.EnableUnitFiles", 0, files, runtime, force).Store(&carries_install_info, &result) + if err != nil { + return false, nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]EnableUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) + if err != nil { + return false, nil, err + } + + return carries_install_info, changes, nil +} + +type EnableUnitFileChange struct { + Type string // Type of the change (one of symlink or unlink) + Filename string // File name of the symlink + Destination string // Destination of the symlink +} + +// DisableUnitFiles() may be used to disable one or more units in the system (by +// removing symlinks to them from /etc or /run). +// +// It takes a list of unit files to disable (either just file names or full +// absolute paths if the unit files are residing outside the usual unit +// search paths), and one boolean: whether the unit was enabled for runtime +// only (true, /run), or persistently (false, /etc). +// +// This call returns an array with the changes made. The changes list +// consists of structures with three strings: the type of the change (one of +// symlink or unlink), the file name of the symlink and the destination of the +// symlink. 
+func (c *Conn) DisableUnitFiles(files []string, runtime bool) ([]DisableUnitFileChange, error) { + result := make([][]interface{}, 0) + err := c.sysobj.Call("org.freedesktop.systemd1.Manager.DisableUnitFiles", 0, files, runtime).Store(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]DisableUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) + if err != nil { + return nil, err + } + + return changes, nil +} + +type DisableUnitFileChange struct { + Type string // Type of the change (one of symlink or unlink) + Filename string // File name of the symlink + Destination string // Destination of the symlink +} + +// MaskUnitFiles masks one or more units in the system +// +// It takes three arguments: +// * list of units to mask (either just file names or full +// absolute paths if the unit files are residing outside +// the usual unit search paths) +// * runtime to specify whether the unit was enabled for runtime +// only (true, /run/systemd/..), or persistently (false, /etc/systemd/..) +// * force flag +func (c *Conn) MaskUnitFiles(files []string, runtime bool, force bool) ([]MaskUnitFileChange, error) { + result := make([][]interface{}, 0) + err := c.sysobj.Call("org.freedesktop.systemd1.Manager.MaskUnitFiles", 0, files, runtime, force).Store(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]MaskUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) 
+ if err != nil { + return nil, err + } + + return changes, nil +} + +type MaskUnitFileChange struct { + Type string // Type of the change (one of symlink or unlink) + Filename string // File name of the symlink + Destination string // Destination of the symlink +} + +// UnmaskUnitFiles unmasks one or more units in the system +// +// It takes two arguments: +// * list of unit files to mask (either just file names or full +// absolute paths if the unit files are residing outside +// the usual unit search paths) +// * runtime to specify whether the unit was enabled for runtime +// only (true, /run/systemd/..), or persistently (false, /etc/systemd/..) +func (c *Conn) UnmaskUnitFiles(files []string, runtime bool) ([]UnmaskUnitFileChange, error) { + result := make([][]interface{}, 0) + err := c.sysobj.Call("org.freedesktop.systemd1.Manager.UnmaskUnitFiles", 0, files, runtime).Store(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]UnmaskUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) + if err != nil { + return nil, err + } + + return changes, nil +} + +type UnmaskUnitFileChange struct { + Type string // Type of the change (one of symlink or unlink) + Filename string // File name of the symlink + Destination string // Destination of the symlink +} + +// Reload instructs systemd to scan for and reload unit files. This is +// equivalent to a 'systemctl daemon-reload'. 
+func (c *Conn) Reload() error { + return c.sysobj.Call("org.freedesktop.systemd1.Manager.Reload", 0).Store() +} + +func unitPath(name string) dbus.ObjectPath { + return dbus.ObjectPath("/org/freedesktop/systemd1/unit/" + PathBusEscape(name)) +} + +// unitName returns the unescaped base element of the supplied escaped path +func unitName(dpath dbus.ObjectPath) string { + return pathBusUnescape(path.Base(string(dpath))) +} diff --git a/vendor/github.com/coreos/go-systemd/v22/dbus/properties.go b/vendor/github.com/coreos/go-systemd/v22/dbus/properties.go new file mode 100644 index 0000000000..fb42b62733 --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/properties.go @@ -0,0 +1,237 @@ +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dbus + +import ( + "github.com/godbus/dbus/v5" +) + +// From the systemd docs: +// +// The properties array of StartTransientUnit() may take many of the settings +// that may also be configured in unit files. Not all parameters are currently +// accepted though, but we plan to cover more properties with future release. 
+// Currently you may set the Description, Slice and all dependency types of +// units, as well as RemainAfterExit, ExecStart for service units, +// TimeoutStopUSec and PIDs for scope units, and CPUAccounting, CPUShares, +// BlockIOAccounting, BlockIOWeight, BlockIOReadBandwidth, +// BlockIOWriteBandwidth, BlockIODeviceWeight, MemoryAccounting, MemoryLimit, +// DevicePolicy, DeviceAllow for services/scopes/slices. These fields map +// directly to their counterparts in unit files and as normal D-Bus object +// properties. The exception here is the PIDs field of scope units which is +// used for construction of the scope only and specifies the initial PIDs to +// add to the scope object. + +type Property struct { + Name string + Value dbus.Variant +} + +type PropertyCollection struct { + Name string + Properties []Property +} + +type execStart struct { + Path string // the binary path to execute + Args []string // an array with all arguments to pass to the executed command, starting with argument 0 + UncleanIsFailure bool // a boolean whether it should be considered a failure if the process exits uncleanly +} + +// PropExecStart sets the ExecStart service property. The first argument is a +// slice with the binary path to execute followed by the arguments to pass to +// the executed command. See +// http://www.freedesktop.org/software/systemd/man/systemd.service.html#ExecStart= +func PropExecStart(command []string, uncleanIsFailure bool) Property { + execStarts := []execStart{ + { + Path: command[0], + Args: command, + UncleanIsFailure: uncleanIsFailure, + }, + } + + return Property{ + Name: "ExecStart", + Value: dbus.MakeVariant(execStarts), + } +} + +// PropRemainAfterExit sets the RemainAfterExit service property. 
See +// http://www.freedesktop.org/software/systemd/man/systemd.service.html#RemainAfterExit= +func PropRemainAfterExit(b bool) Property { + return Property{ + Name: "RemainAfterExit", + Value: dbus.MakeVariant(b), + } +} + +// PropType sets the Type service property. See +// http://www.freedesktop.org/software/systemd/man/systemd.service.html#Type= +func PropType(t string) Property { + return Property{ + Name: "Type", + Value: dbus.MakeVariant(t), + } +} + +// PropDescription sets the Description unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit#Description= +func PropDescription(desc string) Property { + return Property{ + Name: "Description", + Value: dbus.MakeVariant(desc), + } +} + +func propDependency(name string, units []string) Property { + return Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +// PropRequires sets the Requires unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Requires= +func PropRequires(units ...string) Property { + return propDependency("Requires", units) +} + +// PropRequiresOverridable sets the RequiresOverridable unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequiresOverridable= +func PropRequiresOverridable(units ...string) Property { + return propDependency("RequiresOverridable", units) +} + +// PropRequisite sets the Requisite unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Requisite= +func PropRequisite(units ...string) Property { + return propDependency("Requisite", units) +} + +// PropRequisiteOverridable sets the RequisiteOverridable unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequisiteOverridable= +func PropRequisiteOverridable(units ...string) Property { + return propDependency("RequisiteOverridable", units) +} + +// PropWants sets the Wants unit property. 
See
+// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Wants=
+func PropWants(units ...string) Property {
+	return propDependency("Wants", units)
+}
+
+// PropBindsTo sets the BindsTo unit property. See
+// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#BindsTo=
+func PropBindsTo(units ...string) Property {
+	return propDependency("BindsTo", units)
+}
+
+// PropRequiredBy sets the RequiredBy unit property. See
+// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequiredBy=
+func PropRequiredBy(units ...string) Property {
+	return propDependency("RequiredBy", units)
+}
+
+// PropRequiredByOverridable sets the RequiredByOverridable unit property. See
+// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequiredByOverridable=
+func PropRequiredByOverridable(units ...string) Property {
+	return propDependency("RequiredByOverridable", units)
+}
+
+// PropWantedBy sets the WantedBy unit property. See
+// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#WantedBy=
+func PropWantedBy(units ...string) Property {
+	return propDependency("WantedBy", units)
+}
+
+// PropBoundBy sets the BoundBy unit property. See
+// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#BoundBy=
+func PropBoundBy(units ...string) Property {
+	return propDependency("BoundBy", units)
+}
+
+// PropConflicts sets the Conflicts unit property. See
+// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Conflicts=
+func PropConflicts(units ...string) Property {
+	return propDependency("Conflicts", units)
+}
+
+// PropConflictedBy sets the ConflictedBy unit property. See
+// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#ConflictedBy=
+func PropConflictedBy(units ...string) Property {
+	return propDependency("ConflictedBy", units)
+}
+
+// PropBefore sets the Before unit property. 
See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Before= +func PropBefore(units ...string) Property { + return propDependency("Before", units) +} + +// PropAfter sets the After unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#After= +func PropAfter(units ...string) Property { + return propDependency("After", units) +} + +// PropOnFailure sets the OnFailure unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#OnFailure= +func PropOnFailure(units ...string) Property { + return propDependency("OnFailure", units) +} + +// PropTriggers sets the Triggers unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Triggers= +func PropTriggers(units ...string) Property { + return propDependency("Triggers", units) +} + +// PropTriggeredBy sets the TriggeredBy unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#TriggeredBy= +func PropTriggeredBy(units ...string) Property { + return propDependency("TriggeredBy", units) +} + +// PropPropagatesReloadTo sets the PropagatesReloadTo unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#PropagatesReloadTo= +func PropPropagatesReloadTo(units ...string) Property { + return propDependency("PropagatesReloadTo", units) +} + +// PropRequiresMountsFor sets the RequiresMountsFor unit property. See +// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequiresMountsFor= +func PropRequiresMountsFor(units ...string) Property { + return propDependency("RequiresMountsFor", units) +} + +// PropSlice sets the Slice unit property. 
See +// http://www.freedesktop.org/software/systemd/man/systemd.resource-control.html#Slice= +func PropSlice(slice string) Property { + return Property{ + Name: "Slice", + Value: dbus.MakeVariant(slice), + } +} + +// PropPids sets the PIDs field of scope units used in the initial construction +// of the scope only and specifies the initial PIDs to add to the scope object. +// See https://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/#properties +func PropPids(pids ...uint32) Property { + return Property{ + Name: "PIDs", + Value: dbus.MakeVariant(pids), + } +} diff --git a/vendor/github.com/coreos/go-systemd/v22/dbus/set.go b/vendor/github.com/coreos/go-systemd/v22/dbus/set.go new file mode 100644 index 0000000000..17c5d48565 --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/set.go @@ -0,0 +1,47 @@ +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package dbus + +type set struct { + data map[string]bool +} + +func (s *set) Add(value string) { + s.data[value] = true +} + +func (s *set) Remove(value string) { + delete(s.data, value) +} + +func (s *set) Contains(value string) (exists bool) { + _, exists = s.data[value] + return +} + +func (s *set) Length() int { + return len(s.data) +} + +func (s *set) Values() (values []string) { + for val := range s.data { + values = append(values, val) + } + return +} + +func newSet() *set { + return &set{make(map[string]bool)} +} diff --git a/vendor/github.com/coreos/go-systemd/v22/dbus/subscription.go b/vendor/github.com/coreos/go-systemd/v22/dbus/subscription.go new file mode 100644 index 0000000000..7e370fea21 --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/subscription.go @@ -0,0 +1,333 @@ +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dbus + +import ( + "errors" + "log" + "time" + + "github.com/godbus/dbus/v5" +) + +const ( + cleanIgnoreInterval = int64(10 * time.Second) + ignoreInterval = int64(30 * time.Millisecond) +) + +// Subscribe sets up this connection to subscribe to all systemd dbus events. +// This is required before calling SubscribeUnits. When the connection closes +// systemd will automatically stop sending signals so there is no need to +// explicitly call Unsubscribe(). 
+func (c *Conn) Subscribe() error { + c.sigconn.BusObject().Call("org.freedesktop.DBus.AddMatch", 0, + "type='signal',interface='org.freedesktop.systemd1.Manager',member='UnitNew'") + c.sigconn.BusObject().Call("org.freedesktop.DBus.AddMatch", 0, + "type='signal',interface='org.freedesktop.DBus.Properties',member='PropertiesChanged'") + + return c.sigobj.Call("org.freedesktop.systemd1.Manager.Subscribe", 0).Store() +} + +// Unsubscribe this connection from systemd dbus events. +func (c *Conn) Unsubscribe() error { + return c.sigobj.Call("org.freedesktop.systemd1.Manager.Unsubscribe", 0).Store() +} + +func (c *Conn) dispatch() { + ch := make(chan *dbus.Signal, signalBuffer) + + c.sigconn.Signal(ch) + + go func() { + for { + signal, ok := <-ch + if !ok { + return + } + + if signal.Name == "org.freedesktop.systemd1.Manager.JobRemoved" { + c.jobComplete(signal) + } + + if c.subStateSubscriber.updateCh == nil && + c.propertiesSubscriber.updateCh == nil { + continue + } + + var unitPath dbus.ObjectPath + switch signal.Name { + case "org.freedesktop.systemd1.Manager.JobRemoved": + unitName := signal.Body[2].(string) + c.sysobj.Call("org.freedesktop.systemd1.Manager.GetUnit", 0, unitName).Store(&unitPath) + case "org.freedesktop.systemd1.Manager.UnitNew": + unitPath = signal.Body[1].(dbus.ObjectPath) + case "org.freedesktop.DBus.Properties.PropertiesChanged": + if signal.Body[0].(string) == "org.freedesktop.systemd1.Unit" { + unitPath = signal.Path + + if len(signal.Body) >= 2 { + if changed, ok := signal.Body[1].(map[string]dbus.Variant); ok { + c.sendPropertiesUpdate(unitPath, changed) + } + } + } + } + + if unitPath == dbus.ObjectPath("") { + continue + } + + c.sendSubStateUpdate(unitPath) + } + }() +} + +// SubscribeUnits returns two unbuffered channels which will receive all changed units every +// interval. Deleted units are sent as nil. 
+func (c *Conn) SubscribeUnits(interval time.Duration) (<-chan map[string]*UnitStatus, <-chan error) { + return c.SubscribeUnitsCustom(interval, 0, func(u1, u2 *UnitStatus) bool { return *u1 != *u2 }, nil) +} + +// SubscribeUnitsCustom is like SubscribeUnits but lets you specify the buffer +// size of the channels, the comparison function for detecting changes and a filter +// function for cutting down on the noise that your channel receives. +func (c *Conn) SubscribeUnitsCustom(interval time.Duration, buffer int, isChanged func(*UnitStatus, *UnitStatus) bool, filterUnit func(string) bool) (<-chan map[string]*UnitStatus, <-chan error) { + old := make(map[string]*UnitStatus) + statusChan := make(chan map[string]*UnitStatus, buffer) + errChan := make(chan error, buffer) + + go func() { + for { + timerChan := time.After(interval) + + units, err := c.ListUnits() + if err == nil { + cur := make(map[string]*UnitStatus) + for i := range units { + if filterUnit != nil && filterUnit(units[i].Name) { + continue + } + cur[units[i].Name] = &units[i] + } + + // add all new or changed units + changed := make(map[string]*UnitStatus) + for n, u := range cur { + if oldU, ok := old[n]; !ok || isChanged(oldU, u) { + changed[n] = u + } + delete(old, n) + } + + // add all deleted units + for oldN := range old { + changed[oldN] = nil + } + + old = cur + + if len(changed) != 0 { + statusChan <- changed + } + } else { + errChan <- err + } + + <-timerChan + } + }() + + return statusChan, errChan +} + +type SubStateUpdate struct { + UnitName string + SubState string +} + +// SetSubStateSubscriber writes to updateCh when any unit's substate changes. +// Although this writes to updateCh on every state change, the reported state +// may be more recent than the change that generated it (due to an unavoidable +// race in the systemd dbus interface). 
That is, this method provides a good +// way to keep a current view of all units' states, but is not guaranteed to +// show every state transition they go through. Furthermore, state changes +// will only be written to the channel with non-blocking writes. If updateCh +// is full, it attempts to write an error to errCh; if errCh is full, the error +// passes silently. +func (c *Conn) SetSubStateSubscriber(updateCh chan<- *SubStateUpdate, errCh chan<- error) { + if c == nil { + msg := "nil receiver" + select { + case errCh <- errors.New(msg): + default: + log.Printf("full error channel while reporting: %s\n", msg) + } + return + } + + c.subStateSubscriber.Lock() + defer c.subStateSubscriber.Unlock() + c.subStateSubscriber.updateCh = updateCh + c.subStateSubscriber.errCh = errCh +} + +func (c *Conn) sendSubStateUpdate(unitPath dbus.ObjectPath) { + c.subStateSubscriber.Lock() + defer c.subStateSubscriber.Unlock() + + if c.subStateSubscriber.updateCh == nil { + return + } + + isIgnored := c.shouldIgnore(unitPath) + defer c.cleanIgnore() + if isIgnored { + return + } + + info, err := c.GetUnitPathProperties(unitPath) + if err != nil { + select { + case c.subStateSubscriber.errCh <- err: + default: + log.Printf("full error channel while reporting: %s\n", err) + } + return + } + defer c.updateIgnore(unitPath, info) + + name, ok := info["Id"].(string) + if !ok { + msg := "failed to cast info.Id" + select { + case c.subStateSubscriber.errCh <- errors.New(msg): + default: + log.Printf("full error channel while reporting: %s\n", err) + } + return + } + substate, ok := info["SubState"].(string) + if !ok { + msg := "failed to cast info.SubState" + select { + case c.subStateSubscriber.errCh <- errors.New(msg): + default: + log.Printf("full error channel while reporting: %s\n", msg) + } + return + } + + update := &SubStateUpdate{name, substate} + select { + case c.subStateSubscriber.updateCh <- update: + default: + msg := "update channel is full" + select { + case 
c.subStateSubscriber.errCh <- errors.New(msg): + default: + log.Printf("full error channel while reporting: %s\n", msg) + } + return + } +} + +// The ignore functions work around a wart in the systemd dbus interface. +// Requesting the properties of an unloaded unit will cause systemd to send a +// pair of UnitNew/UnitRemoved signals. Because we need to get a unit's +// properties on UnitNew (as that's the only indication of a new unit coming up +// for the first time), we would enter an infinite loop if we did not attempt +// to detect and ignore these spurious signals. The signal themselves are +// indistinguishable from relevant ones, so we (somewhat hackishly) ignore an +// unloaded unit's signals for a short time after requesting its properties. +// This means that we will miss e.g. a transient unit being restarted +// *immediately* upon failure and also a transient unit being started +// immediately after requesting its status (with systemctl status, for example, +// because this causes a UnitNew signal to be sent which then causes us to fetch +// the properties). 
+ +func (c *Conn) shouldIgnore(path dbus.ObjectPath) bool { + t, ok := c.subStateSubscriber.ignore[path] + return ok && t >= time.Now().UnixNano() +} + +func (c *Conn) updateIgnore(path dbus.ObjectPath, info map[string]interface{}) { + loadState, ok := info["LoadState"].(string) + if !ok { + return + } + + // unit is unloaded - it will trigger bad systemd dbus behavior + if loadState == "not-found" { + c.subStateSubscriber.ignore[path] = time.Now().UnixNano() + ignoreInterval + } +} + +// without this, ignore would grow unboundedly over time +func (c *Conn) cleanIgnore() { + now := time.Now().UnixNano() + if c.subStateSubscriber.cleanIgnore < now { + c.subStateSubscriber.cleanIgnore = now + cleanIgnoreInterval + + for p, t := range c.subStateSubscriber.ignore { + if t < now { + delete(c.subStateSubscriber.ignore, p) + } + } + } +} + +// PropertiesUpdate holds a map of a unit's changed properties +type PropertiesUpdate struct { + UnitName string + Changed map[string]dbus.Variant +} + +// SetPropertiesSubscriber writes to updateCh when any unit's properties +// change. Every property change reported by systemd will be sent; that is, no +// transitions will be "missed" (as they might be with SetSubStateSubscriber). +// However, state changes will only be written to the channel with non-blocking +// writes. If updateCh is full, it attempts to write an error to errCh; if +// errCh is full, the error passes silently. 
+func (c *Conn) SetPropertiesSubscriber(updateCh chan<- *PropertiesUpdate, errCh chan<- error) { + c.propertiesSubscriber.Lock() + defer c.propertiesSubscriber.Unlock() + c.propertiesSubscriber.updateCh = updateCh + c.propertiesSubscriber.errCh = errCh +} + +// we don't need to worry about shouldIgnore() here because +// sendPropertiesUpdate doesn't call GetProperties() +func (c *Conn) sendPropertiesUpdate(unitPath dbus.ObjectPath, changedProps map[string]dbus.Variant) { + c.propertiesSubscriber.Lock() + defer c.propertiesSubscriber.Unlock() + + if c.propertiesSubscriber.updateCh == nil { + return + } + + update := &PropertiesUpdate{unitName(unitPath), changedProps} + + select { + case c.propertiesSubscriber.updateCh <- update: + default: + msg := "update channel is full" + select { + case c.propertiesSubscriber.errCh <- errors.New(msg): + default: + log.Printf("full error channel while reporting: %s\n", msg) + } + return + } +} diff --git a/vendor/github.com/coreos/go-systemd/v22/dbus/subscription_set.go b/vendor/github.com/coreos/go-systemd/v22/dbus/subscription_set.go new file mode 100644 index 0000000000..5b408d5847 --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/subscription_set.go @@ -0,0 +1,57 @@ +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package dbus + +import ( + "time" +) + +// SubscriptionSet returns a subscription set which is like conn.Subscribe but +// can filter to only return events for a set of units. +type SubscriptionSet struct { + *set + conn *Conn +} + +func (s *SubscriptionSet) filter(unit string) bool { + return !s.Contains(unit) +} + +// Subscribe starts listening for dbus events for all of the units in the set. +// Returns channels identical to conn.SubscribeUnits. +func (s *SubscriptionSet) Subscribe() (<-chan map[string]*UnitStatus, <-chan error) { + // TODO: Make fully evented by using systemd 209 with properties changed values + return s.conn.SubscribeUnitsCustom(time.Second, 0, + mismatchUnitStatus, + func(unit string) bool { return s.filter(unit) }, + ) +} + +// NewSubscriptionSet returns a new subscription set. +func (conn *Conn) NewSubscriptionSet() *SubscriptionSet { + return &SubscriptionSet{newSet(), conn} +} + +// mismatchUnitStatus returns true if the provided UnitStatus objects +// are not equivalent. false is returned if the objects are equivalent. +// Only the Name, Description and state-related fields are used in +// the comparison. +func mismatchUnitStatus(u1, u2 *UnitStatus) bool { + return u1.Name != u2.Name || + u1.Description != u2.Description || + u1.LoadState != u2.LoadState || + u1.ActiveState != u2.ActiveState || + u1.SubState != u2.SubState +} diff --git a/vendor/github.com/docker/go-units/CONTRIBUTING.md b/vendor/github.com/docker/go-units/CONTRIBUTING.md new file mode 100644 index 0000000000..9ea86d784e --- /dev/null +++ b/vendor/github.com/docker/go-units/CONTRIBUTING.md @@ -0,0 +1,67 @@ +# Contributing to go-units + +Want to hack on go-units? Awesome! Here are instructions to get you started. + +go-units is a part of the [Docker](https://www.docker.com) project, and follows +the same rules and principles. If you're already familiar with the way +Docker does things, you'll feel right at home. 
+ +Otherwise, go read Docker's +[contributions guidelines](https://github.com/docker/docker/blob/master/CONTRIBUTING.md), +[issue triaging](https://github.com/docker/docker/blob/master/project/ISSUE-TRIAGE.md), +[review process](https://github.com/docker/docker/blob/master/project/REVIEWING.md) and +[branches and tags](https://github.com/docker/docker/blob/master/project/BRANCHES-AND-TAGS.md). + +### Sign your work + +The sign-off is a simple line at the end of the explanation for the patch. Your +signature certifies that you wrote the patch or otherwise have the right to pass +it on as an open-source patch. The rules are pretty simple: if you can certify +the below (from [developercertificate.org](http://developercertificate.org/)): + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. +660 York Street, Suite 102, +San Francisco, CA 94110 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. 
+ +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +Then you just add a line to every git commit message: + + Signed-off-by: Joe Smith + +Use your real name (sorry, no pseudonyms or anonymous contributions.) + +If you set your `user.name` and `user.email` git configs, you can sign your +commit automatically with `git commit -s`. diff --git a/vendor/github.com/docker/go-units/LICENSE b/vendor/github.com/docker/go-units/LICENSE new file mode 100644 index 0000000000..b55b37bc31 --- /dev/null +++ b/vendor/github.com/docker/go-units/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + https://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2015 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/vendor/github.com/docker/go-units/MAINTAINERS b/vendor/github.com/docker/go-units/MAINTAINERS new file mode 100644 index 0000000000..4aac7c7411 --- /dev/null +++ b/vendor/github.com/docker/go-units/MAINTAINERS @@ -0,0 +1,46 @@ +# go-units maintainers file +# +# This file describes who runs the docker/go-units project and how. +# This is a living document - if you see something out of date or missing, speak up! +# +# It is structured to be consumable by both humans and programs. +# To extract its contents programmatically, use any TOML-compliant parser. +# +# This file is compiled into the MAINTAINERS file in docker/opensource. +# +[Org] + [Org."Core maintainers"] + people = [ + "akihirosuda", + "dnephin", + "thajeztah", + "vdemeester", + ] + +[people] + +# A reference list of all people associated with the project. +# All other sections should refer to people by their canonical key +# in the people section. + + # ADD YOURSELF HERE IN ALPHABETICAL ORDER + + [people.akihirosuda] + Name = "Akihiro Suda" + Email = "akihiro.suda.cz@hco.ntt.co.jp" + GitHub = "AkihiroSuda" + + [people.dnephin] + Name = "Daniel Nephin" + Email = "dnephin@gmail.com" + GitHub = "dnephin" + + [people.thajeztah] + Name = "Sebastiaan van Stijn" + Email = "github@gone.nl" + GitHub = "thaJeztah" + + [people.vdemeester] + Name = "Vincent Demeester" + Email = "vincent@sbr.pm" + GitHub = "vdemeester" \ No newline at end of file diff --git a/vendor/github.com/docker/go-units/README.md b/vendor/github.com/docker/go-units/README.md new file mode 100644 index 0000000000..4f70a4e134 --- /dev/null +++ b/vendor/github.com/docker/go-units/README.md @@ -0,0 +1,16 @@ +[![GoDoc](https://godoc.org/github.com/docker/go-units?status.svg)](https://godoc.org/github.com/docker/go-units) + +# Introduction + +go-units is a library to transform human friendly measurements into machine friendly values. 
+ +## Usage + +See the [docs in godoc](https://godoc.org/github.com/docker/go-units) for examples and documentation. + +## Copyright and license + +Copyright © 2015 Docker, Inc. + +go-units is licensed under the Apache License, Version 2.0. +See [LICENSE](LICENSE) for the full text of the license. diff --git a/vendor/github.com/docker/go-units/circle.yml b/vendor/github.com/docker/go-units/circle.yml new file mode 100644 index 0000000000..af9d605529 --- /dev/null +++ b/vendor/github.com/docker/go-units/circle.yml @@ -0,0 +1,11 @@ +dependencies: + post: + # install golint + - go get golang.org/x/lint/golint + +test: + pre: + # run analysis before tests + - go vet ./... + - test -z "$(golint ./... | tee /dev/stderr)" + - test -z "$(gofmt -s -l . | tee /dev/stderr)" diff --git a/vendor/github.com/docker/go-units/duration.go b/vendor/github.com/docker/go-units/duration.go new file mode 100644 index 0000000000..48dd8744d4 --- /dev/null +++ b/vendor/github.com/docker/go-units/duration.go @@ -0,0 +1,35 @@ +// Package units provides helper function to parse and print size and time units +// in human-readable format. +package units + +import ( + "fmt" + "time" +) + +// HumanDuration returns a human-readable approximation of a duration +// (eg. "About a minute", "4 hours ago", etc.). 
+func HumanDuration(d time.Duration) string { + if seconds := int(d.Seconds()); seconds < 1 { + return "Less than a second" + } else if seconds == 1 { + return "1 second" + } else if seconds < 60 { + return fmt.Sprintf("%d seconds", seconds) + } else if minutes := int(d.Minutes()); minutes == 1 { + return "About a minute" + } else if minutes < 60 { + return fmt.Sprintf("%d minutes", minutes) + } else if hours := int(d.Hours() + 0.5); hours == 1 { + return "About an hour" + } else if hours < 48 { + return fmt.Sprintf("%d hours", hours) + } else if hours < 24*7*2 { + return fmt.Sprintf("%d days", hours/24) + } else if hours < 24*30*2 { + return fmt.Sprintf("%d weeks", hours/24/7) + } else if hours < 24*365*2 { + return fmt.Sprintf("%d months", hours/24/30) + } + return fmt.Sprintf("%d years", int(d.Hours())/24/365) +} diff --git a/vendor/github.com/docker/go-units/size.go b/vendor/github.com/docker/go-units/size.go new file mode 100644 index 0000000000..85f6ab0715 --- /dev/null +++ b/vendor/github.com/docker/go-units/size.go @@ -0,0 +1,108 @@ +package units + +import ( + "fmt" + "regexp" + "strconv" + "strings" +) + +// See: http://en.wikipedia.org/wiki/Binary_prefix +const ( + // Decimal + + KB = 1000 + MB = 1000 * KB + GB = 1000 * MB + TB = 1000 * GB + PB = 1000 * TB + + // Binary + + KiB = 1024 + MiB = 1024 * KiB + GiB = 1024 * MiB + TiB = 1024 * GiB + PiB = 1024 * TiB +) + +type unitMap map[string]int64 + +var ( + decimalMap = unitMap{"k": KB, "m": MB, "g": GB, "t": TB, "p": PB} + binaryMap = unitMap{"k": KiB, "m": MiB, "g": GiB, "t": TiB, "p": PiB} + sizeRegex = regexp.MustCompile(`^(\d+(\.\d+)*) ?([kKmMgGtTpP])?[iI]?[bB]?$`) +) + +var decimapAbbrs = []string{"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"} +var binaryAbbrs = []string{"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"} + +func getSizeAndUnit(size float64, base float64, _map []string) (float64, string) { + i := 0 + unitsLimit := len(_map) - 1 + for size >= base && i < unitsLimit { + 
size = size / base + i++ + } + return size, _map[i] +} + +// CustomSize returns a human-readable approximation of a size +// using custom format. +func CustomSize(format string, size float64, base float64, _map []string) string { + size, unit := getSizeAndUnit(size, base, _map) + return fmt.Sprintf(format, size, unit) +} + +// HumanSizeWithPrecision allows the size to be in any precision, +// instead of 4 digit precision used in units.HumanSize. +func HumanSizeWithPrecision(size float64, precision int) string { + size, unit := getSizeAndUnit(size, 1000.0, decimapAbbrs) + return fmt.Sprintf("%.*g%s", precision, size, unit) +} + +// HumanSize returns a human-readable approximation of a size +// capped at 4 valid numbers (eg. "2.746 MB", "796 KB"). +func HumanSize(size float64) string { + return HumanSizeWithPrecision(size, 4) +} + +// BytesSize returns a human-readable size in bytes, kibibytes, +// mebibytes, gibibytes, or tebibytes (eg. "44kiB", "17MiB"). +func BytesSize(size float64) string { + return CustomSize("%.4g%s", size, 1024.0, binaryAbbrs) +} + +// FromHumanSize returns an integer from a human-readable specification of a +// size using SI standard (eg. "44kB", "17MB"). +func FromHumanSize(size string) (int64, error) { + return parseSize(size, decimalMap) +} + +// RAMInBytes parses a human-readable string representing an amount of RAM +// in bytes, kibibytes, mebibytes, gibibytes, or tebibytes and +// returns the number of bytes, or -1 if the string is unparseable. +// Units are case-insensitive, and the 'b' suffix is optional. +func RAMInBytes(size string) (int64, error) { + return parseSize(size, binaryMap) +} + +// Parses the human-readable size string into the amount it represents. 
+func parseSize(sizeStr string, uMap unitMap) (int64, error) { + matches := sizeRegex.FindStringSubmatch(sizeStr) + if len(matches) != 4 { + return -1, fmt.Errorf("invalid size: '%s'", sizeStr) + } + + size, err := strconv.ParseFloat(matches[1], 64) + if err != nil { + return -1, err + } + + unitPrefix := strings.ToLower(matches[3]) + if mul, ok := uMap[unitPrefix]; ok { + size *= float64(mul) + } + + return int64(size), nil +} diff --git a/vendor/github.com/docker/go-units/ulimit.go b/vendor/github.com/docker/go-units/ulimit.go new file mode 100644 index 0000000000..fca0400cc8 --- /dev/null +++ b/vendor/github.com/docker/go-units/ulimit.go @@ -0,0 +1,123 @@ +package units + +import ( + "fmt" + "strconv" + "strings" +) + +// Ulimit is a human friendly version of Rlimit. +type Ulimit struct { + Name string + Hard int64 + Soft int64 +} + +// Rlimit specifies the resource limits, such as max open files. +type Rlimit struct { + Type int `json:"type,omitempty"` + Hard uint64 `json:"hard,omitempty"` + Soft uint64 `json:"soft,omitempty"` +} + +const ( + // magic numbers for making the syscall + // some of these are defined in the syscall package, but not all. + // Also since Windows client doesn't get access to the syscall package, need to + // define these here + rlimitAs = 9 + rlimitCore = 4 + rlimitCPU = 0 + rlimitData = 2 + rlimitFsize = 1 + rlimitLocks = 10 + rlimitMemlock = 8 + rlimitMsgqueue = 12 + rlimitNice = 13 + rlimitNofile = 7 + rlimitNproc = 6 + rlimitRss = 5 + rlimitRtprio = 14 + rlimitRttime = 15 + rlimitSigpending = 11 + rlimitStack = 3 +) + +var ulimitNameMapping = map[string]int{ + //"as": rlimitAs, // Disabled since this doesn't seem usable with the way Docker inits a container. 
+ "core": rlimitCore, + "cpu": rlimitCPU, + "data": rlimitData, + "fsize": rlimitFsize, + "locks": rlimitLocks, + "memlock": rlimitMemlock, + "msgqueue": rlimitMsgqueue, + "nice": rlimitNice, + "nofile": rlimitNofile, + "nproc": rlimitNproc, + "rss": rlimitRss, + "rtprio": rlimitRtprio, + "rttime": rlimitRttime, + "sigpending": rlimitSigpending, + "stack": rlimitStack, +} + +// ParseUlimit parses and returns a Ulimit from the specified string. +func ParseUlimit(val string) (*Ulimit, error) { + parts := strings.SplitN(val, "=", 2) + if len(parts) != 2 { + return nil, fmt.Errorf("invalid ulimit argument: %s", val) + } + + if _, exists := ulimitNameMapping[parts[0]]; !exists { + return nil, fmt.Errorf("invalid ulimit type: %s", parts[0]) + } + + var ( + soft int64 + hard = &soft // default to soft in case no hard was set + temp int64 + err error + ) + switch limitVals := strings.Split(parts[1], ":"); len(limitVals) { + case 2: + temp, err = strconv.ParseInt(limitVals[1], 10, 64) + if err != nil { + return nil, err + } + hard = &temp + fallthrough + case 1: + soft, err = strconv.ParseInt(limitVals[0], 10, 64) + if err != nil { + return nil, err + } + default: + return nil, fmt.Errorf("too many limit value arguments - %s, can only have up to two, `soft[:hard]`", parts[1]) + } + + if *hard != -1 { + if soft == -1 { + return nil, fmt.Errorf("ulimit soft limit must be less than or equal to hard limit: soft: -1 (unlimited), hard: %d", *hard) + } + if soft > *hard { + return nil, fmt.Errorf("ulimit soft limit must be less than or equal to hard limit: %d > %d", soft, *hard) + } + } + + return &Ulimit{Name: parts[0], Soft: soft, Hard: *hard}, nil +} + +// GetRlimit returns the RLimit corresponding to Ulimit. 
+func (u *Ulimit) GetRlimit() (*Rlimit, error) { + t, exists := ulimitNameMapping[u.Name] + if !exists { + return nil, fmt.Errorf("invalid ulimit name %s", u.Name) + } + + return &Rlimit{Type: t, Soft: uint64(u.Soft), Hard: uint64(u.Hard)}, nil +} + +func (u *Ulimit) String() string { + return fmt.Sprintf("%s=%d:%d", u.Name, u.Soft, u.Hard) +} diff --git a/vendor/github.com/godbus/dbus/v5/.travis.yml b/vendor/github.com/godbus/dbus/v5/.travis.yml new file mode 100644 index 0000000000..dd67672048 --- /dev/null +++ b/vendor/github.com/godbus/dbus/v5/.travis.yml @@ -0,0 +1,50 @@ +dist: bionic +language: go +go_import_path: github.com/godbus/dbus + +go: + - 1.11.x + - 1.12.x + - 1.13.x + - tip + +matrix: + fast_finish: true + allow_failures: + - go: tip + +addons: + apt: + packages: + - dbus + - dbus-x11 + +before_install: + - export GO111MODULE=on + +script: + - go test -v -race -mod=readonly ./... # Run all the tests with the race detector enabled + - go vet ./... # go vet is the official Go static analyzer + +jobs: + include: + # The build matrix doesn't cover build stages, so manually expand + # the jobs with anchors + - &multiarch + stage: "Multiarch Test" + go: 1.11.x + env: TARGETS="386 arm arm64 ppc64le" + before_install: + - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + script: + - | + set -e + for target in $TARGETS; do + printf "\e[1mRunning test suite under ${target}.\e[0m\n" + GOARCH="$target" go test -v ./... 
+ printf "\n\n" + done + - <<: *multiarch + go: 1.12.x + - <<: *multiarch + go: 1.13.x diff --git a/vendor/github.com/godbus/dbus/v5/CONTRIBUTING.md b/vendor/github.com/godbus/dbus/v5/CONTRIBUTING.md new file mode 100644 index 0000000000..c88f9b2bdd --- /dev/null +++ b/vendor/github.com/godbus/dbus/v5/CONTRIBUTING.md @@ -0,0 +1,50 @@ +# How to Contribute + +## Getting Started + +- Fork the repository on GitHub +- Read the [README](README.markdown) for build and test instructions +- Play with the project, submit bugs, submit patches! + +## Contribution Flow + +This is a rough outline of what a contributor's workflow looks like: + +- Create a topic branch from where you want to base your work (usually master). +- Make commits of logical units. +- Make sure your commit messages are in the proper format (see below). +- Push your changes to a topic branch in your fork of the repository. +- Make sure the tests pass, and add any new tests as appropriate. +- Submit a pull request to the original repository. + +Thanks for your contributions! + +### Format of the Commit Message + +We follow a rough convention for commit messages that is designed to answer two +questions: what changed and why. The subject line should feature the what and +the body of the commit should describe the why. + +``` +scripts: add the test-cluster command + +this uses tmux to setup a test cluster that you can easily kill and +start for debugging. + +Fixes #38 +``` + +The format can be described more formally as follows: + +``` +: + + + +