Skip to content
This repository was archived by the owner on May 12, 2021. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,9 @@ DEFDISABLENESTINGCHECKS := false
DEFMSIZE9P := 8192
DEFHOTPLUGVFIOONROOTBUS := false

# Default cgroup model
DEFSANDBOXCGROUPONLY ?= false

SED = sed

CLI_DIR = cli
Expand Down Expand Up @@ -424,6 +427,7 @@ USER_VARS += DEFDISABLENESTINGCHECKS
USER_VARS += DEFMSIZE9P
USER_VARS += DEFHOTPLUGVFIOONROOTBUS
USER_VARS += DEFENTROPYSOURCE
USER_VARS += DEFSANDBOXCGROUPONLY
USER_VARS += BUILDFLAGS


Expand Down Expand Up @@ -579,6 +583,7 @@ $(GENERATED_FILES): %: %.in $(MAKEFILE_LIST) VERSION .git-commit
-e "s|@DEFMSIZE9P@|$(DEFMSIZE9P)|g" \
-e "s|@DEFHOTPLUGONROOTBUS@|$(DEFHOTPLUGVFIOONROOTBUS)|g" \
-e "s|@DEFENTROPYSOURCE@|$(DEFENTROPYSOURCE)|g" \
-e "s|@DEFSANDBOXCGROUPONLY@|$(DEFSANDBOXCGROUPONLY)|g" \
$< > $@

generate-config: $(CONFIGS)
Expand Down
8 changes: 8 additions & 0 deletions cli/config/configuration-acrn.toml.in
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,14 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# (default: false)
#disable_new_netns = true

# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
# The container cgroups in the host are not created, just one single cgroup per sandbox.
# The sandbox cgroup is not constrained by the runtime
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
# See: https://godoc.org/github.com/kata-containers/runtime/virtcontainers#ContainerType
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@

# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# They may break compatibility, and are prepared for a big version bump.
Expand Down
8 changes: 8 additions & 0 deletions cli/config/configuration-fc.toml.in
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,14 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# (default: false)
#disable_new_netns = true

# if enable, the runtime will add all the kata processes inside one dedicated cgroup.
# The container cgroups in the host are not created, just one single cgroup per sandbox.
# The sandbox cgroup is not constrained by the runtime
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
# See: https://godoc.org/github.com/kata-containers/runtime/virtcontainers#ContainerType
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@

# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# They may break compatibility, and are prepared for a big version bump.
Expand Down
6 changes: 6 additions & 0 deletions cli/config/configuration-nemu.toml.in
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,12 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# (default: false)
#disable_new_netns = true

# if enable, the runtime use the parent cgroup of a container PodSandbox. This
# should be enabled for users where the caller setup the parent cgroup of the
# containers running in a sandbox so all the resouces of the kata container run
# in the same cgroup and performance isolation its more accurate.
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@

# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# They may break compatibility, and are prepared for a big version bump.
Expand Down
8 changes: 8 additions & 0 deletions cli/config/configuration-qemu.toml.in
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,14 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# (default: false)
#disable_new_netns = true

# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
# The container cgroups in the host are not created, just one single cgroup per sandbox.
# The sandbox cgroup is not constrained by the runtime
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
# See: https://godoc.org/github.com/kata-containers/runtime/virtcontainers#ContainerType
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@

# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# They may break compatibility, and are prepared for a big version bump.
Expand Down
2 changes: 2 additions & 0 deletions cli/kata-env.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ type RuntimeInfo struct {
Trace bool
DisableGuestSeccomp bool
DisableNewNetNs bool
SandboxCgroupOnly bool
Experimental []exp.Feature
Path string
}
Expand Down Expand Up @@ -187,6 +188,7 @@ func getRuntimeInfo(configFile string, config oci.RuntimeConfig) RuntimeInfo {
Config: runtimeConfig,
Path: runtimePath,
DisableNewNetNs: config.DisableNewNetNs,
SandboxCgroupOnly: config.SandboxCgroupOnly,
Experimental: config.Experimental,
DisableGuestSeccomp: config.DisableGuestSeccomp,
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/katautils/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ type runtime struct {
Tracing bool `toml:"enable_tracing"`
DisableNewNetNs bool `toml:"disable_new_netns"`
DisableGuestSeccomp bool `toml:"disable_guest_seccomp"`
SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"`
Experimental []string `toml:"experimental"`
InterNetworkModel string `toml:"internetworking_model"`
}
Expand Down Expand Up @@ -1054,6 +1055,7 @@ func LoadConfiguration(configPath string, ignoreLogging, builtIn bool) (resolved
config.ProxyConfig = vc.ProxyConfig{Debug: config.Debug}
}

config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly
config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs
for _, f := range tomlConf.Runtime.Experimental {
feature := exp.Get(f)
Expand Down
7 changes: 7 additions & 0 deletions virtcontainers/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,13 @@ func createSandboxFromConfig(ctx context.Context, sandboxConfig SandboxConfig, f
return nil, err
}

// Move runtime to sandbox cgroup so all process are created there.
if s.config.SandboxCgroupOnly {
if err := s.setupSandboxCgroup(); err != nil {
return nil, err
}
}

// cleanup sandbox resources in case of any failure
defer func() {
if err != nil {
Expand Down
190 changes: 0 additions & 190 deletions virtcontainers/cgroups.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@ package virtcontainers
import (
"bufio"
"fmt"
"math"
"os"
"path/filepath"
"strings"

"github.com/containerd/cgroups"
"github.com/kata-containers/runtime/virtcontainers/pkg/annotations"
specs "github.com/opencontainers/runtime-spec/specs-go"
)

Expand Down Expand Up @@ -144,194 +142,6 @@ func parentCgroup(hierarchy cgroups.Hierarchy, path string) (cgroups.Cgroup, err
return parentCgroup, nil
}

func (s *Sandbox) updateCgroups() error {
if s.state.CgroupPath == "" {
s.Logger().Warn("sandbox's cgroup won't be updated: cgroup path is empty")
return nil
}

cgroup, err := cgroupsLoadFunc(V1Constraints, cgroups.StaticPath(s.state.CgroupPath))
if err != nil {
return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err)
}

if err := s.constrainHypervisor(cgroup); err != nil {
return err
}

if len(s.containers) <= 1 {
// nothing to update
return nil
}

resources, err := s.resources()
if err != nil {
return err
}

if err := cgroup.Update(&resources); err != nil {
return fmt.Errorf("Could not update cgroup %v: %v", s.state.CgroupPath, err)
}

return nil
}

func (s *Sandbox) deleteCgroups() error {
s.Logger().Debug("Deleting sandbox cgroup")

path := cgroupNoConstraintsPath(s.state.CgroupPath)
s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup")
noConstraintsCgroup, err := cgroupsLoadFunc(V1NoConstraints, cgroups.StaticPath(path))
if err == cgroups.ErrCgroupDeleted {
// cgroup already deleted
return nil
}

if err != nil {
return fmt.Errorf("Could not load cgroup without constraints %v: %v", path, err)
}

// move running process here, that way cgroup can be removed
parent, err := parentCgroup(V1NoConstraints, path)
if err != nil {
// parent cgroup doesn't exist, that means there are no process running
// and the no constraints cgroup was removed.
s.Logger().WithError(err).Warn("Parent cgroup doesn't exist")
return nil
}

if err := noConstraintsCgroup.MoveTo(parent); err != nil {
// Don't fail, cgroup can be deleted
s.Logger().WithError(err).Warn("Could not move process from no constraints to parent cgroup")
}

return noConstraintsCgroup.Delete()
}

func (s *Sandbox) constrainHypervisor(cgroup cgroups.Cgroup) error {
pids := s.hypervisor.getPids()
if len(pids) == 0 || pids[0] == 0 {
return fmt.Errorf("Invalid hypervisor PID: %+v", pids)
}

// Move hypervisor into cgroups without constraints,
// those cgroups are not yet supported.
resources := &specs.LinuxResources{}
path := cgroupNoConstraintsPath(s.state.CgroupPath)
noConstraintsCgroup, err := cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources)
if err != nil {
return fmt.Errorf("Could not create cgroup %v: %v", path, err)
}
for _, pid := range pids {
if pid <= 0 {
s.Logger().Warnf("Invalid hypervisor pid: %d", pid)
continue
}
if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil {
return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err)
}
}

// when new container joins, new CPU could be hotplugged, so we
// have to query fresh vcpu info from hypervisor for every time.
tids, err := s.hypervisor.getThreadIDs()
if err != nil {
return fmt.Errorf("failed to get thread ids from hypervisor: %v", err)
}
if len(tids.vcpus) == 0 {
// If there's no tid returned from the hypervisor, this is not
// a bug. It simply means there is nothing to constrain, hence
// let's return without any error from here.
return nil
}

// We are about to move just the vcpus (threads) into cgroups with constraints.
// Move whole hypervisor process whould be easier but the IO/network performance
// whould be impacted.
for _, i := range tids.vcpus {
// In contrast, AddTask will write thread id to `tasks`
// After this, vcpu threads are in "vcpu" sub-cgroup, other threads in
// qemu will be left in parent cgroup untouched.
if err := cgroup.AddTask(cgroups.Process{
Pid: i,
}); err != nil {
return err
}
}

return nil
}

func (s *Sandbox) resources() (specs.LinuxResources, error) {
resources := specs.LinuxResources{
CPU: s.cpuResources(),
}

return resources, nil
}

func (s *Sandbox) cpuResources() *specs.LinuxCPU {
// Use default period and quota if they are not specified.
// Container will inherit the constraints from its parent.
quota := int64(0)
period := uint64(0)
shares := uint64(0)
realtimePeriod := uint64(0)
realtimeRuntime := int64(0)

cpu := &specs.LinuxCPU{
Quota: &quota,
Period: &period,
Shares: &shares,
RealtimePeriod: &realtimePeriod,
RealtimeRuntime: &realtimeRuntime,
}

for _, c := range s.containers {
ann := c.GetAnnotations()
if ann[annotations.ContainerTypeKey] == string(PodSandbox) {
// skip sandbox container
continue
}

if c.config.Resources.CPU == nil {
continue
}

if c.config.Resources.CPU.Shares != nil {
shares = uint64(math.Max(float64(*c.config.Resources.CPU.Shares), float64(shares)))
}

if c.config.Resources.CPU.Quota != nil {
quota += *c.config.Resources.CPU.Quota
}

if c.config.Resources.CPU.Period != nil {
period = uint64(math.Max(float64(*c.config.Resources.CPU.Period), float64(period)))
}

if c.config.Resources.CPU.Cpus != "" {
cpu.Cpus += c.config.Resources.CPU.Cpus + ","
}

if c.config.Resources.CPU.RealtimeRuntime != nil {
realtimeRuntime += *c.config.Resources.CPU.RealtimeRuntime
}

if c.config.Resources.CPU.RealtimePeriod != nil {
realtimePeriod += *c.config.Resources.CPU.RealtimePeriod
}

if c.config.Resources.CPU.Mems != "" {
cpu.Mems += c.config.Resources.CPU.Mems + ","
}
}

cpu.Cpus = strings.Trim(cpu.Cpus, " \n\t,")

return validCPUResources(cpu)
}

// validCPUResources checks CPU resources coherency
func validCPUResources(cpuSpec *specs.LinuxCPU) *specs.LinuxCPU {
if cpuSpec == nil {
Expand Down
12 changes: 6 additions & 6 deletions virtcontainers/cgroups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,12 @@ func TestUpdateCgroups(t *testing.T) {
}

// empty path
err := s.updateCgroups()
err := s.cgroupsUpdate()
assert.NoError(err)

// path doesn't exist
s.state.CgroupPath = "/abc/123/rgb"
err = s.updateCgroups()
err = s.cgroupsUpdate()
assert.Error(err)

if os.Getuid() != 0 {
Expand All @@ -152,7 +152,7 @@ func TestUpdateCgroups(t *testing.T) {
s.hypervisor = &mockHypervisor{mockPid: 0}

// bad pid
err = s.updateCgroups()
err = s.cgroupsUpdate()
assert.Error(err)

// fake workload
Expand All @@ -161,7 +161,7 @@ func TestUpdateCgroups(t *testing.T) {
s.hypervisor = &mockHypervisor{mockPid: cmd.Process.Pid}

// no containers
err = s.updateCgroups()
err = s.cgroupsUpdate()
assert.NoError(err)

s.config = &SandboxConfig{}
Expand All @@ -186,11 +186,11 @@ func TestUpdateCgroups(t *testing.T) {
},
}

err = s.updateCgroups()
err = s.cgroupsUpdate()
assert.NoError(err)

// cleanup
assert.NoError(cmd.Process.Kill())
err = s.deleteCgroups()
err = s.cgroupsDelete()
assert.NoError(err)
}
Loading