diff --git a/cmd/containerd-shim-runhcs-v1/clone.go b/cmd/containerd-shim-runhcs-v1/clone.go new file mode 100644 index 0000000000..221aabacab --- /dev/null +++ b/cmd/containerd-shim-runhcs-v1/clone.go @@ -0,0 +1,50 @@ +package main + +import ( + "context" + + "github.com/Microsoft/hcsshim/internal/clone" + "github.com/Microsoft/hcsshim/internal/uvm" +) + +// saveAsTemplate saves the UVM and container inside it as a template and also stores the +// relevant information in the registry so that clones can be created from this template. +// Every cloned uvm gets its own NIC and we do not want to create clones of a template +// which still has a NIC attached to it. So remove the NICs attached to the template uvm +// before saving it. +// Similar to the NIC scenario we do not want to create clones from a template with an +// active GCS connection so close the GCS connection too. +func saveAsTemplate(ctx context.Context, templateTask *hcsTask) (err error) { + var utc *uvm.UVMTemplateConfig + var templateConfig *clone.TemplateConfig + + if err = templateTask.host.RemoveAllNICs(ctx); err != nil { + return err + } + + if err = templateTask.host.CloseGCSConnection(); err != nil { + return err + } + + utc, err = templateTask.host.GenerateTemplateConfig() + if err != nil { + return err + } + + templateConfig = &clone.TemplateConfig{ + TemplateUVMID: utc.UVMID, + TemplateUVMResources: utc.Resources, + TemplateUVMCreateOpts: utc.CreateOpts, + TemplateContainerID: templateTask.id, + TemplateContainerSpec: *templateTask.taskSpec, + } + + if err = clone.SaveTemplateConfig(ctx, templateConfig); err != nil { + return err + } + + if err = templateTask.host.SaveAsTemplate(ctx); err != nil { + return err + } + return nil +} diff --git a/cmd/containerd-shim-runhcs-v1/exec_clone.go b/cmd/containerd-shim-runhcs-v1/exec_clone.go new file mode 100644 index 0000000000..078149f140 --- /dev/null +++ b/cmd/containerd-shim-runhcs-v1/exec_clone.go @@ -0,0 +1,67 @@ +package main + +import ( + "context" + + "github.com/Microsoft/hcsshim/internal/cmd" + "github.com/Microsoft/hcsshim/internal/cow" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/uvm" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +func newClonedExec( + ctx context.Context, + events publisher, + tid string, + host *uvm.UtilityVM, + c cow.Container, + id, bundle string, + isWCOW bool, + spec *specs.Process, + io cmd.UpstreamIO) *clonedExec { + log.G(ctx).WithFields(logrus.Fields{ + "tid": tid, + "eid": id, // Init exec ID is always same as Task ID + "bundle": bundle, + }).Debug("newClonedExec") + + he := &hcsExec{ + events: events, + tid: tid, + host: host, + c: c, + id: id, + bundle: bundle, + isWCOW: isWCOW, + spec: spec, + io: io, + processDone: make(chan struct{}), + state: shimExecStateCreated, + exitStatus: 255, // By design for non-exited process status. + exited: make(chan struct{}), + } + + ce := &clonedExec{ + he, + } + go he.waitForContainerExit() + return ce +} + +var _ = (shimExec)(&clonedExec{}) + +// clonedExec inherits from hcsExec. The only difference between these two is that +// on starting a clonedExec it doesn't attempt to start the container even if the +// exec is the init process. This is because in case of clonedExec the container is +// already running inside the pod. 
+type clonedExec struct { + *hcsExec +} + +func (ce *clonedExec) Start(ctx context.Context) (err error) { + // A cloned exec should never initialize the container as it should + // already be running. + return ce.startInternal(ctx, false) +} diff --git a/cmd/containerd-shim-runhcs-v1/exec_hcs.go b/cmd/containerd-shim-runhcs-v1/exec_hcs.go index 728ef51c10..7c2115c2cc 100644 --- a/cmd/containerd-shim-runhcs-v1/exec_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/exec_hcs.go @@ -181,7 +181,7 @@ func (he *hcsExec) Status() *task.StateResponse { } } -func (he *hcsExec) Start(ctx context.Context) (err error) { +func (he *hcsExec) startInternal(ctx context.Context, initializeContainer bool) (err error) { he.sl.Lock() defer he.sl.Unlock() if he.state != shimExecStateCreated { @@ -192,8 +192,7 @@ func (he *hcsExec) Start(ctx context.Context) (err error) { he.exitFromCreatedL(ctx, 1) } }() - if he.id == he.tid { - // This is the init exec. We need to start the container itself + if initializeContainer { err = he.c.Start(ctx) if err != nil { return err @@ -257,6 +256,12 @@ func (he *hcsExec) Start(ctx context.Context) (err error) { return nil } +func (he *hcsExec) Start(ctx context.Context) (err error) { + // If he.id == he.tid then this is the init exec. + // We need to initialize the container itself before starting this exec. + return he.startInternal(ctx, he.id == he.tid) +} + func (he *hcsExec) Kill(ctx context.Context, signal uint32) error { he.sl.Lock() defer he.sl.Unlock() @@ -414,6 +419,8 @@ func (he *hcsExec) exitFromCreatedL(ctx context.Context, status int) { // // 6. Close `he.exited` channel to unblock any waiters who might have called // `Create`/`Wait`/`Start` which is a valid pattern. +// +// 7. Finally, save the UVM and this container as a template if specified. func (he *hcsExec) waitForExit() { ctx, span := trace.StartSpan(context.Background(), "hcsExec::waitForExit") defer span.End() diff --git a/cmd/containerd-shim-runhcs-v1/pod.go b/cmd/containerd-shim-runhcs-v1/pod.go index 2d89b0d8ea..6e341b83fb 100644 --- a/cmd/containerd-shim-runhcs-v1/pod.go +++ b/cmd/containerd-shim-runhcs-v1/pod.go @@ -152,21 +152,8 @@ func createPod(ctx context.Context, events publisher, req *task.CreateTaskReques // isolated. Process isolated WCOW gets the namespace endpoints // automatically. 
if parent != nil { - nsid := "" - if s.Windows != nil && s.Windows.Network != nil { - nsid = s.Windows.Network.NetworkNamespace - } - - if nsid != "" { - endpoints, err := hcsoci.GetNamespaceEndpoints(ctx, nsid) - if err != nil { - return nil, err - } - err = parent.AddNetNS(ctx, nsid) - if err != nil { - return nil, err - } - err = parent.AddEndpointsToNS(ctx, nsid, endpoints) + if s.Windows != nil && s.Windows.Network != nil && s.Windows.Network.NetworkNamespace != "" { + err = hcsoci.SetupNetworkNamespace(ctx, parent, s.Windows.Network.NetworkNamespace) if err != nil { return nil, err } @@ -241,6 +228,16 @@ func (p *pod) ID() string { return p.id } +func (p *pod) GetCloneAnnotations(ctx context.Context, s *specs.Spec) (bool, string, error) { + isTemplate, templateID, err := oci.ParseCloneAnnotations(ctx, s) + if err != nil { + return false, "", err + } else if (isTemplate || templateID != "") && p.host == nil { + return false, "", fmt.Errorf("save as template and creating clones is only supported for hyper-v isolated containers") + } + return isTemplate, templateID, nil +} + func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (_ shimTask, err error) { if req.ID == p.id { return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists", req.ID) @@ -283,7 +280,17 @@ func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *sp sid) } - st, err := newHcsTask(ctx, p.events, p.host, false, req, s) + _, templateID, err := p.GetCloneAnnotations(ctx, s) + if err != nil { + return nil, err + } + + var st shimTask + if templateID != "" { + st, err = newClonedHcsTask(ctx, p.events, p.host, false, req, s, templateID) + } else { + st, err = newHcsTask(ctx, p.events, p.host, false, req, s) + } if err != nil { return nil, err } diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go index aa93019e1f..c16271c79c 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go @@ -123,6 +123,7 @@ func newHcsTask( }).Debug("newHcsTask") owner := filepath.Base(os.Args[0]) + isTemplate := oci.ParseAnnotationsSaveAsTemplate(ctx, s) io, err := cmd.NewNpipeIO(ctx, req.Stdin, req.Stdout, req.Stderr, req.Terminal) if err != nil { @@ -162,14 +163,16 @@ func newHcsTask( } ht := &hcsTask{ - events: events, - id: req.ID, - isWCOW: oci.IsWCOW(s), - c: system, - cr: resources, - ownsHost: ownsParent, - host: parent, - closed: make(chan struct{}), + events: events, + id: req.ID, + isWCOW: oci.IsWCOW(s), + c: system, + cr: resources, + ownsHost: ownsParent, + host: parent, + closed: make(chan struct{}), + taskSpec: s, + isTemplate: isTemplate, } ht.init = newHcsExec( ctx, @@ -189,9 +192,121 @@ func newHcsTask( // handle this case. go ht.waitForHostExit() } + + // In the normal case the `Signal` call from the caller killed this task's + // init process. Or the init process ran to completion - this will mostly + // happen when we are creating a template and want to wait for init process + // to finish before we save the template. 
In such cases do not tear down the + // container after init exits - because we need the container in the template + go ht.waitInitExit(!isTemplate) + + // Publish the created event + ht.events.publishEvent( + ctx, + runtime.TaskCreateEventTopic, + &eventstypes.TaskCreate{ + ContainerID: req.ID, + Bundle: req.Bundle, + Rootfs: req.Rootfs, + IO: &eventstypes.TaskIO{ + Stdin: req.Stdin, + Stdout: req.Stdout, + Stderr: req.Stderr, + Terminal: req.Terminal, + }, + Checkpoint: "", + Pid: uint32(ht.init.Pid()), + }) + return ht, nil +} + +// newClonedTask creates a container within `parent`. The parent must be already cloned +// from a template and hence this container must already be present inside that parent. +// This function simply creates the go wrapper around the container that is already +// running inside the cloned parent. +// This task MAY own the UVM that it is running in but as of now the cloning feature is +// only used for WCOW hyper-V isolated containers and for WCOW, the wcowPodSandboxTask +// owns that UVM. +func newClonedHcsTask( + ctx context.Context, + events publisher, + parent *uvm.UtilityVM, + ownsParent bool, + req *task.CreateTaskRequest, + s *specs.Spec, + templateID string) (_ shimTask, err error) { + log.G(ctx).WithFields(logrus.Fields{ + "tid": req.ID, + "ownsParent": ownsParent, + "templateid": templateID, + }).Debug("newClonedHcsTask") + + owner := filepath.Base(os.Args[0]) + + if parent.OS() != "windows" { + return nil, fmt.Errorf("cloned task can only be created inside a windows host") + } + + io, err := cmd.NewNpipeIO(ctx, req.Stdin, req.Stdout, req.Stderr, req.Terminal) + if err != nil { + return nil, err + } + + var netNS string + if s.Windows != nil && + s.Windows.Network != nil { + netNS = s.Windows.Network.NetworkNamespace + } + + // This is a cloned task. Use the templateid as the ID of the container here + // because that's the ID of this container inside the UVM. + opts := hcsoci.CreateOptions{ + ID: templateID, + Owner: owner, + Spec: s, + HostingSystem: parent, + NetworkNamespace: netNS, + } + system, resources, err := hcsoci.CloneContainer(ctx, &opts) + if err != nil { + return nil, err + } + + ht := &hcsTask{ + events: events, + id: req.ID, + isWCOW: oci.IsWCOW(s), + c: system, + cr: resources, + ownsHost: ownsParent, + host: parent, + closed: make(chan struct{}), + templateID: templateID, + taskSpec: s, + isTemplate: false, + } + ht.init = newClonedExec( + ctx, + events, + req.ID, + parent, + system, + req.ID, + req.Bundle, + ht.isWCOW, + s.Process, + io) + + if parent != nil { + // We have a parent UVM. Listen for its exit and forcibly close this + // task. This is not expected but in the event of a UVM crash we need to + // handle this case. + go ht.waitForHostExit() + } + // In the normal case the `Signal` call from the caller killed this task's // init process. - go ht.waitInitExit() + go ht.waitInitExit(true) // Publish the created event ht.events.publishEvent( @@ -268,6 +383,22 @@ type hcsTask struct { // closeHostOnce is used to close `host`. This will only be used if // `ownsHost==true` and `host != nil`. closeHostOnce sync.Once + + // templateID represents the id of the template container from which this container + // is cloned. The parent UVM (inside which this container is running) identifies this + // container with it's original id (i.e the id that was assigned to this container + // at the time of template creation i.e the templateID). 
Hence, every request that + // is sent to the GCS must actually use templateID to reference this container. + // A non-empty templateID specifies that this task was cloned. + templateID string + + // if isTemplate is true then this container will be saved as a template as soon + // as its init process exits. Note: templateID and isTemplate are mutually exclusive. + // i.e isTemplate can not be true when templateID is not empty. + isTemplate bool + + // taskSpec represents the spec/configuration for this task. + taskSpec *specs.Spec } func (ht *hcsTask) ID() string { @@ -451,7 +582,7 @@ func (ht *hcsTask) Wait() *task.StateResponse { return ht.init.Wait() } -func (ht *hcsTask) waitInitExit() { +func (ht *hcsTask) waitInitExit(destroyContainer bool) { ctx, span := trace.StartSpan(context.Background(), "hcsTask::waitInitExit") defer span.End() span.AddAttributes(trace.StringAttribute("tid", ht.id)) @@ -459,8 +590,20 @@ func (ht *hcsTask) waitInitExit() { // Wait for it to exit on its own ht.init.Wait() - // Close the host and event the exit - ht.close(ctx) + if destroyContainer { + // Close the host and event the exit + ht.close(ctx) + } else { + // Close the container's host, but do not close or terminate the container itself + ht.closeHost(ctx) + } + + if ht.isTemplate { + // Save the host as a template + if err := saveAsTemplate(ctx, ht); err != nil { + log.G(ctx).WithError(err).Error("failed to save as template") + } + } } // waitForHostExit waits for the host virtual machine to exit. Once exited diff --git a/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go b/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go index 9f60d6398c..88a467b09d 100644 --- a/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go +++ b/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go @@ -7,6 +7,7 @@ import ( "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats" + "github.com/Microsoft/hcsshim/internal/clone" "github.com/Microsoft/hcsshim/internal/cmd" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/shimdiag" @@ -170,6 +171,10 @@ func (wpst *wcowPodSandboxTask) close(ctx context.Context) { if err := wpst.host.Close(); err != nil { log.G(ctx).WithError(err).Error("failed host vm shutdown") } + // cleanup template state if any exists + if err := clone.RemoveSavedTemplateConfig(wpst.host.ID()); err != nil { + log.G(ctx).WithError(err).Error("failed to cleanup template config state for vm") + } } // Send the `init` exec exit notification always. exit := wpst.init.Status() diff --git a/internal/clone/registry.go b/internal/clone/registry.go new file mode 100644 index 0000000000..ed5ef3dce4 --- /dev/null +++ b/internal/clone/registry.go @@ -0,0 +1,168 @@ +package clone + +import ( + "bytes" + "context" + "encoding/gob" + "fmt" + + "github.com/Microsoft/hcsshim/internal/regstate" + "github.com/Microsoft/hcsshim/internal/uvm" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + configRoot = "LateClone" + configKey = "UVMConfig" + templateConfigCurrentSerialVersionID = 1 +) + +// TemplateConfig struct maintains all of the information about a template. This includes +// the information for both the template container and the template UVM. This struct is +// serialized and stored in the registry and hence is version controlled. +// Note: Update the `templateConfigCurrentSerialVersionID` when this structure definition +// is changed. 
+type TemplateConfig struct { + SerialVersionID uint32 + TemplateUVMID string + TemplateUVMResources []uvm.Cloneable + TemplateUVMCreateOpts uvm.OptionsWCOW + TemplateContainerID string + // Below we store the container spec for the template container so that when + // cloning containers we can verify that a different spec is not provided for the + // cloned container. + TemplateContainerSpec specs.Spec +} + +// When encoding interfaces gob requires us to register the struct types that we will be +// using under those interfaces. This registration needs to happen on both sides i.e the +// side which encodes the data (i.e the shim process of the template) and the side which +// decodes the data (i.e the shim process of the clone). +// Go init function: https://golang.org/doc/effective_go.html#init +func init() { + // Register the pointer to structs because that is what is being stored. + gob.Register(&uvm.VSMBShare{}) + gob.Register(&uvm.SCSIMount{}) +} + +func encodeTemplateConfig(templateConfig *TemplateConfig) ([]byte, error) { + var buf bytes.Buffer + + encoder := gob.NewEncoder(&buf) + if err := encoder.Encode(templateConfig); err != nil { + return nil, fmt.Errorf("error while encoding template config: %s", err) + } + return buf.Bytes(), nil +} + +func decodeTemplateConfig(encodedBytes []byte) (*TemplateConfig, error) { + var templateConfig TemplateConfig + + reader := bytes.NewReader(encodedBytes) + decoder := gob.NewDecoder(reader) + if err := decoder.Decode(&templateConfig); err != nil { + return nil, fmt.Errorf("error while decoding template config: %s", err) + } + return &templateConfig, nil +} + +// loadPersistedUVMConfig loads a persisted config from the registry that matches the given ID +// If not found returns `regstate.NotFoundError` +func loadPersistedUVMConfig(id string) ([]byte, error) { + sk, err := regstate.Open(configRoot, false) + if err != nil { + return nil, err + } + defer sk.Close() + + var encodedConfig []byte + if err := sk.Get(id, configKey, &encodedConfig); err != nil { + return nil, err + } + return encodedConfig, nil +} + +// storePersistedUVMConfig stores the given config to the registry. +// If the store fails returns the store error. +func storePersistedUVMConfig(id string, encodedConfig []byte) error { + sk, err := regstate.Open(configRoot, false) + if err != nil { + return err + } + defer sk.Close() + + if err := sk.Create(id, configKey, encodedConfig); err != nil { + return err + } + return nil +} + +// removePersistedUVMConfig removes any persisted state associated with this config. If the config +// is not found in the registery `Remove` returns no error. +func removePersistedUVMConfig(id string) error { + sk, err := regstate.Open(configRoot, false) + if err != nil { + if regstate.IsNotFoundError(err) { + return nil + } + return err + } + defer sk.Close() + + if err := sk.Remove(id); err != nil { + if regstate.IsNotFoundError(err) { + return nil + } + return err + } + return nil +} + +// Saves all the information required to create a clone from the template +// of this container into the registry. 
+func SaveTemplateConfig(ctx context.Context, templateConfig *TemplateConfig) error {
+	_, err := loadPersistedUVMConfig(templateConfig.TemplateUVMID)
+	if !regstate.IsNotFoundError(err) {
+		return fmt.Errorf("parent VM(ID: %s) config shouldn't exist in the registry (%s)", templateConfig.TemplateUVMID, err)
+	}
+
+	// set the serial version before encoding
+	templateConfig.SerialVersionID = templateConfigCurrentSerialVersionID
+
+	encodedBytes, err := encodeTemplateConfig(templateConfig)
+	if err != nil {
+		return fmt.Errorf("failed to encode template config: %s", err)
+	}
+
+	if err := storePersistedUVMConfig(templateConfig.TemplateUVMID, encodedBytes); err != nil {
+		return fmt.Errorf("failed to store encoded template config: %s", err)
+	}
+
+	return nil
+}
+
+// RemoveSavedTemplateConfig removes all the state associated with the template with the
+// given ID. If there is no state associated with this ID then the function simply
+// returns without doing anything.
+func RemoveSavedTemplateConfig(id string) error {
+	return removePersistedUVMConfig(id)
+}
+
+// FetchTemplateConfig retrieves the TemplateConfig for the template with the given ID
+// from the registry.
+func FetchTemplateConfig(ctx context.Context, id string) (*TemplateConfig, error) {
+	encodedBytes, err := loadPersistedUVMConfig(id)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch encoded template config: %s", err)
+	}
+
+	templateConfig, err := decodeTemplateConfig(encodedBytes)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode template config: %s", err)
+	}
+
+	if templateConfig.SerialVersionID != templateConfigCurrentSerialVersionID {
+		return nil, fmt.Errorf("serialized version of TemplateConfig: %d doesn't match with the current version: %d", templateConfig.SerialVersionID, templateConfigCurrentSerialVersionID)
+	}
+
+	return templateConfig, nil
+}
diff --git a/internal/cow/cow.go b/internal/cow/cow.go
index 8193315f06..89c1955131 100644
--- a/internal/cow/cow.go
+++ b/internal/cow/cow.go
@@ -80,4 +80,6 @@ type Container interface {
 	// container to be terminated by some error condition (including calling
 	// Close).
 	Wait() error
+	// Modify sends a modify request to the container.
+	Modify(ctx context.Context, config interface{}) error
 }
diff --git a/internal/gcs/container.go b/internal/gcs/container.go
index 218d138f00..8e4ca07905 100644
--- a/internal/gcs/container.go
+++ b/internal/gcs/container.go
@@ -61,6 +61,23 @@ func (gc *GuestConnection) CreateContainer(ctx context.Context, cid string, conf
 	return c, nil
 }
 
+// CloneContainer just creates the wrappers and sets up notification requests for a
+// container that is already running inside the UVM (after cloning).
+func (gc *GuestConnection) CloneContainer(ctx context.Context, cid string) (_ *Container, err error) {
+	c := &Container{
+		gc:       gc,
+		id:       cid,
+		notifyCh: make(chan struct{}),
+		closeCh:  make(chan struct{}),
+	}
+	err = gc.requestNotify(cid, c.notifyCh)
+	if err != nil {
+		return nil, err
+	}
+	go c.waitBackground()
+	return c, nil
+}
+
 // OS returns the operating system of the container, "linux" or "windows".
func (c *Container) OS() string { return c.gc.os diff --git a/internal/hcsoci/clone.go b/internal/hcsoci/clone.go new file mode 100644 index 0000000000..4d26f21d0b --- /dev/null +++ b/internal/hcsoci/clone.go @@ -0,0 +1,50 @@ +// +build windows + +package hcsoci + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/cow" + "github.com/Microsoft/hcsshim/internal/requesttype" + hcsschema "github.com/Microsoft/hcsshim/internal/schema2" +) + +const ( + MappedDirectoryResourcePath = "Container/MappedDirectories" + MappedPipeResourcePath = "Container/MappedPipes" +) + +// Usually mounts specified in the container config are added in the container doc +// that is passed along with the container creation reuqest. However, for cloned containers +// we don't send any create container request so we must add the mounts one by one by +// doing Modify requests to that container. +func addMountsToClone(ctx context.Context, c cow.Container, mounts *mountsConfig) error { + // TODO(ambarve) : Find out if there is a way to send request for all the mounts + // at the same time to save time + for _, md := range mounts.mdsv2 { + requestDocument := &hcsschema.ModifySettingRequest{ + RequestType: requesttype.Add, + ResourcePath: MappedDirectoryResourcePath, + Settings: md, + } + err := c.Modify(ctx, requestDocument) + if err != nil { + return fmt.Errorf("error while adding mapped directory (%s) to the container: %s", md.HostPath, err) + } + } + + for _, mp := range mounts.mpsv2 { + requestDocument := &hcsschema.ModifySettingRequest{ + RequestType: requesttype.Add, + ResourcePath: MappedPipeResourcePath, + Settings: mp, + } + err := c.Modify(ctx, requestDocument) + if err != nil { + return fmt.Errorf("error while adding mapped pipe (%s) to the container: %s", mp.HostPath, err) + } + } + return nil +} diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go index e425316298..f87adb1e99 100644 --- a/internal/hcsoci/create.go +++ b/internal/hcsoci/create.go @@ -11,6 +11,7 @@ import ( "strconv" "github.com/Microsoft/go-winio/pkg/guid" + "github.com/Microsoft/hcsshim/internal/clone" "github.com/Microsoft/hcsshim/internal/cow" "github.com/Microsoft/hcsshim/internal/hcs" "github.com/Microsoft/hcsshim/internal/log" @@ -65,26 +66,133 @@ type createOptionsInternal struct { actualOwner string // Owner for the container actualNetworkNamespace string ccgState *hcsschema.ContainerCredentialGuardState // Container Credential Guard information to be attached to HCS container document + isTemplate bool // Are we going to save this container as a template + templateID string // Template ID of the template from which this container is being cloned } -// CreateContainer creates a container. It can cope with a wide variety of -// scenarios, including v1 HCS schema calls, as well as more complex v2 HCS schema -// calls. Note we always return the resources that have been allocated, even in the -// case of an error. This provides support for the debugging option not to -// release the resources on failure, so that the client can make the necessary -// call to release resources that have been allocated as part of calling this function. -func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { +// compares two slices of strings and returns true if they are same, returns false otherwise. +// The elements in the slices don't have to be in the same order for them to be equal. 
+func cmpSlices(s1, s2 []string) bool {
+	equal := (len(s1) == len(s2))
+	for i := 0; equal && i < len(s1); i++ {
+		found := false
+		for j := 0; !found && j < len(s2); j++ {
+			found = (s1[i] == s2[j])
+		}
+		equal = equal && found
+	}
+	return equal
+}
+
+// compareMounts compares two mount structs and returns true if they are equal, false otherwise.
+func compareMounts(m1, m2 specs.Mount) bool {
+	return cmpSlices(m1.Options, m2.Options) && (m1.Source == m2.Source) && (m1.Destination == m2.Destination) && (m1.Type == m2.Type)
+}
+
+// verifyCloneContainerSpecs compares the container creation spec provided during the template container
+// creation and the spec provided during cloned container creation and checks that all the fields match
+// (except for certain fields that are allowed to be different).
+func verifyCloneContainerSpecs(templateSpec, cloneSpec *specs.Spec) error {
+	// Following fields can be different in the template and clone specs.
+	// 1. Process
+	// 2. Annotations - Only the template/cloning related annotations can be different.
+	// 3. Windows.LayerFolders - Only the last (i.e. scratch) layer can be different.
+
+	if templateSpec.Version != cloneSpec.Version {
+		return fmt.Errorf("OCI Runtime Spec version of template (%s) doesn't match with the Spec version of clone (%s)", templateSpec.Version, cloneSpec.Version)
+	}
+
+	// for annotations, check that the values of the memory & cpu annotations are the same
+	if templateSpec.Annotations[oci.AnnotationContainerMemorySizeInMB] != cloneSpec.Annotations[oci.AnnotationContainerMemorySizeInMB] {
+		return fmt.Errorf("memory size limit for template and clone containers can not be different")
+	}
+	if templateSpec.Annotations[oci.AnnotationContainerProcessorCount] != cloneSpec.Annotations[oci.AnnotationContainerProcessorCount] {
+		return fmt.Errorf("processor count for template and clone containers can not be different")
+	}
+	if templateSpec.Annotations[oci.AnnotationContainerProcessorLimit] != cloneSpec.Annotations[oci.AnnotationContainerProcessorLimit] {
+		return fmt.Errorf("processor limit for template and clone containers can not be different")
+	}
+
+	// LayerFolders should be identical except for the last element.
+	if !cmpSlices(templateSpec.Windows.LayerFolders[:len(templateSpec.Windows.LayerFolders)-1], cloneSpec.Windows.LayerFolders[:len(cloneSpec.Windows.LayerFolders)-1]) {
+		return fmt.Errorf("layers provided for template container and clone container don't match.
Check the image specified in container config") + } + + if templateSpec.Windows.HyperV != cloneSpec.Windows.HyperV { + return fmt.Errorf("HyperV spec for template and clone containers can not be different") + } + + if templateSpec.Windows.Network.AllowUnqualifiedDNSQuery != cloneSpec.Windows.Network.AllowUnqualifiedDNSQuery { + return fmt.Errorf("different values for allow unqualified DNS query can not be provided for template and clones") + } + if templateSpec.Windows.Network.NetworkSharedContainerName != cloneSpec.Windows.Network.NetworkSharedContainerName { + return fmt.Errorf("different network shared name can not be provided for template and clones") + } + if !cmpSlices(templateSpec.Windows.Network.DNSSearchList, cloneSpec.Windows.Network.DNSSearchList) { + return fmt.Errorf("different DNS search list can not be provided for template and clones") + } + return nil +} + +func validateContainerConfig(ctx context.Context, coi *createOptionsInternal) error { + + if coi.HostingSystem != nil && coi.HostingSystem.IsTemplate && !coi.isTemplate { + return fmt.Errorf("only a template container can be created inside a template pod. Any other combination is not valid") + } + + if coi.HostingSystem != nil && coi.templateID != "" && !coi.HostingSystem.IsClone { + return fmt.Errorf("A container can not be cloned inside a non cloned POD") + } + + if coi.templateID != "" { + // verify that the configurations provided for the template for + // this clone are same. + tc, err := clone.FetchTemplateConfig(ctx, coi.HostingSystem.TemplateID) + if err != nil { + return fmt.Errorf("config validation failed : %s", err) + } + if err := verifyCloneContainerSpecs(&tc.TemplateContainerSpec, coi.Spec); err != nil { + return err + } + } + + if coi.HostingSystem != nil && coi.HostingSystem.IsTemplate { + if len(coi.Spec.Windows.Devices) != 0 { + return fmt.Errorf("Mapped Devices are not supported for template containers") + } + + if _, ok := coi.Spec.Windows.CredentialSpec.(string); ok { + return fmt.Errorf("gMSA specifications are not supported for template containers") + } + + if coi.Spec.Windows.Servicing { + return fmt.Errorf("template containers can't be started in servicing mode") + } + + // check that no mounts are specified. + if len(coi.Spec.Mounts) > 0 { + return fmt.Errorf("user specified mounts are not permitted for template containers") + } + } + return nil +} + +func initializeCreateOptions(ctx context.Context, createOptions *CreateOptions) (*createOptionsInternal, error) { coi := &createOptionsInternal{ CreateOptions: createOptions, actualID: createOptions.ID, actualOwner: createOptions.Owner, } + if coi.Spec == nil { + return nil, fmt.Errorf("Spec must be supplied") + } + // Defaults if omitted by caller. if coi.actualID == "" { g, err := guid.NewV4() if err != nil { - return nil, nil, err + return nil, err } coi.actualID = g.String() } @@ -92,10 +200,6 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C coi.actualOwner = filepath.Base(os.Args[0]) } - if coi.Spec == nil { - return nil, nil, fmt.Errorf("Spec must be supplied") - } - if coi.HostingSystem != nil { // By definition, a hosting system can only be supplied for a v2 Xenon. 
coi.actualSchemaVersion = schemaversion.SchemaV21() @@ -103,10 +207,64 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C coi.actualSchemaVersion = schemaversion.DetermineSchemaVersion(coi.SchemaVersion) } + coi.isTemplate = oci.ParseAnnotationsSaveAsTemplate(ctx, createOptions.Spec) + coi.templateID = oci.ParseAnnotationsTemplateID(ctx, createOptions.Spec) + log.G(ctx).WithFields(logrus.Fields{ "options": fmt.Sprintf("%+v", createOptions), "schema": coi.actualSchemaVersion, - }).Debug("hcsshim::CreateContainer") + }).Debug("hcsshim::initializeCreateOptions") + + return coi, nil +} + +// configureSandboxNetwork creates a new network namespace for the pod (sandbox) +// if required and then adds that namespace to the pod. +func configureSandboxNetwork(ctx context.Context, coi *createOptionsInternal, r *resources.Resources) error { + if coi.NetworkNamespace != "" { + r.SetNetNS(coi.NetworkNamespace) + } else { + err := createNetworkNamespace(ctx, coi, r) + if err != nil { + return err + } + } + coi.actualNetworkNamespace = r.NetNS() + + if coi.HostingSystem != nil { + ct, _, err := oci.GetSandboxTypeAndID(coi.Spec.Annotations) + if err != nil { + return err + } + // Only add the network namespace to a standalone or sandbox + // container but not a workload container in a sandbox that inherits + // the namespace. + if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox { + if err = SetupNetworkNamespace(ctx, coi.HostingSystem, coi.actualNetworkNamespace); err != nil { + return err + } + r.SetAddedNetNSToVM(true) + } + } + + return nil +} + +// CreateContainer creates a container. It can cope with a wide variety of +// scenarios, including v1 HCS schema calls, as well as more complex v2 HCS schema +// calls. Note we always return the resources that have been allocated, even in the +// case of an error. This provides support for the debugging option not to +// release the resources on failure, so that the client can make the necessary +// call to release resources that have been allocated as part of calling this function. +func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { + coi, err := initializeCreateOptions(ctx, createOptions) + if err != nil { + return nil, nil, err + } + + if err := validateContainerConfig(ctx, coi); err != nil { + return nil, nil, fmt.Errorf("container config validation failed: %s", err) + } r := resources.NewContainerResources(createOptions.ID) defer func() { @@ -130,41 +288,10 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C if coi.Spec.Windows != nil && coi.Spec.Windows.Network != nil && schemaversion.IsV21(coi.actualSchemaVersion) { + err = configureSandboxNetwork(ctx, coi, r) + if err != nil { + return nil, r, fmt.Errorf("failure while creating namespace for container: %s", err) - if coi.NetworkNamespace != "" { - r.SetNetNS(coi.NetworkNamespace) - } else { - err := createNetworkNamespace(ctx, coi, r) - if err != nil { - return nil, r, err - } - } - coi.actualNetworkNamespace = r.NetNS() - if coi.HostingSystem != nil { - ct, _, err := oci.GetSandboxTypeAndID(coi.Spec.Annotations) - if err != nil { - return nil, r, err - } - // Only add the network namespace to a standalone or sandbox - // container but not a workload container in a sandbox that inherits - // the namespace. 
- if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox { - endpoints, err := GetNamespaceEndpoints(ctx, coi.actualNetworkNamespace) - if err != nil { - return nil, r, err - } - err = coi.HostingSystem.AddNetNS(ctx, coi.actualNetworkNamespace) - if err != nil { - return nil, r, err - } - err = coi.HostingSystem.AddEndpointsToNS(ctx, coi.actualNetworkNamespace, endpoints) - if err != nil { - // Best effort clean up the NS - coi.HostingSystem.RemoveNetNS(ctx, coi.actualNetworkNamespace) - return nil, r, err - } - r.SetAddedNetNSToVM(true) - } } } @@ -234,6 +361,64 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C return system, r, nil } +// CloneContainer is similar to CreateContainer but it does not add layers or namespace like +// CreateContainer does. Also, instead of sending create container request it sends a modify +// request to an existing container. CloneContainer only works for WCOW. +func CloneContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { + coi, err := initializeCreateOptions(ctx, createOptions) + if err != nil { + return nil, nil, err + } + + if err := validateContainerConfig(ctx, coi); err != nil { + return nil, nil, err + } + + if coi.Spec.Windows == nil || coi.HostingSystem == nil { + return nil, nil, fmt.Errorf("CloneContainer is only supported for Hyper-v isolated WCOW ") + } + + r := resources.NewContainerResources(createOptions.ID) + defer func() { + if err != nil { + if !coi.DoNotReleaseResourcesOnFailure { + resources.ReleaseResources(ctx, r, coi.HostingSystem, true) + } + } + }() + + if coi.HostingSystem != nil { + n := coi.HostingSystem.ContainerCounter() + if coi.Spec.Linux != nil { + r.SetContainerRootInUVM(fmt.Sprintf(lcowRootInUVM, createOptions.ID)) + } else { + r.SetContainerRootInUVM(fmt.Sprintf(wcowRootInUVM, strconv.FormatUint(n, 16))) + } + } + + if err = setupMounts(ctx, coi, r); err != nil { + return nil, r, err + } + + mounts, err := createMountsConfig(ctx, coi) + if err != nil { + return nil, r, err + } + + c, err := coi.HostingSystem.CloneContainer(ctx, coi.actualID) + if err != nil { + return nil, r, err + } + + // Everything that is usually added to the container during the createContainer + // request (via the gcsDocument) must be hot added here. + if err := addMountsToClone(ctx, c, mounts); err != nil { + return nil, r, err + } + + return c, r, nil +} + // isV2Xenon returns true if the create options are for a HCS schema V2 xenon container // with a hosting VM func (coi *createOptionsInternal) isV2Xenon() bool { diff --git a/internal/hcsoci/hcsdoc_wcow.go b/internal/hcsoci/hcsdoc_wcow.go index 4e3a06852f..a89992f256 100644 --- a/internal/hcsoci/hcsdoc_wcow.go +++ b/internal/hcsoci/hcsdoc_wcow.go @@ -23,6 +23,60 @@ import ( "github.com/sirupsen/logrus" ) +// A simple wrapper struct around the container mount configs that should be added to the +// container. +type mountsConfig struct { + mdsv1 []schema1.MappedDir + mpsv1 []schema1.MappedPipe + mdsv2 []hcsschema.MappedDirectory + mpsv2 []hcsschema.MappedPipe +} + +func createMountsConfig(ctx context.Context, coi *createOptionsInternal) (*mountsConfig, error) { + // Add the mounts as mapped directories or mapped pipes + // TODO: Mapped pipes to add in v2 schema. 
+ var config mountsConfig + for _, mount := range coi.Spec.Mounts { + if mount.Type != "" { + return nil, fmt.Errorf("invalid container spec - Mount.Type '%s' must not be set", mount.Type) + } + if uvm.IsPipe(mount.Source) { + src, dst := uvm.GetContainerPipeMapping(coi.HostingSystem, mount) + config.mpsv1 = append(config.mpsv1, schema1.MappedPipe{HostPath: src, ContainerPipeName: dst}) + config.mpsv2 = append(config.mpsv2, hcsschema.MappedPipe{HostPath: src, ContainerPipeName: dst}) + } else { + readOnly := false + for _, o := range mount.Options { + if strings.ToLower(o) == "ro" { + readOnly = true + } + } + mdv1 := schema1.MappedDir{HostPath: mount.Source, ContainerPath: mount.Destination, ReadOnly: readOnly} + mdv2 := hcsschema.MappedDirectory{ContainerPath: mount.Destination, ReadOnly: readOnly} + if coi.HostingSystem == nil { + mdv2.HostPath = mount.Source + } else { + uvmPath, err := coi.HostingSystem.GetVSMBUvmPath(ctx, mount.Source, readOnly) + if err != nil { + if err == uvm.ErrNotAttached { + // It could also be a scsi mount. + uvmPath, err = coi.HostingSystem.GetScsiUvmPath(ctx, mount.Source) + if err != nil { + return nil, err + } + } else { + return nil, err + } + } + mdv2.HostPath = uvmPath + } + config.mdsv1 = append(config.mdsv1, mdv1) + config.mdsv2 = append(config.mdsv2, mdv2) + } + } + return &config, nil +} + // createWindowsContainerDocument creates documents for passing to HCS or GCS to create // a container, both hosted and process isolated. It creates both v1 and v2 // container objects, WCOW only. The containers storage should have been mounted already. @@ -182,7 +236,14 @@ func createWindowsContainerDocument(ctx context.Context, coi *createOptionsInter v2Container.Networking = &hcsschema.Networking{} v1.EndpointList = coi.Spec.Windows.Network.EndpointList - v2Container.Networking.Namespace = coi.actualNetworkNamespace + + // Use the reserved network namespace for containers created inside + // cloned or template UVMs. + if coi.HostingSystem != nil && (coi.HostingSystem.IsTemplate || coi.HostingSystem.IsClone) { + v2Container.Networking.Namespace = uvm.DEFAULT_CLONE_NETWORK_NAMESPACE_ID + } else { + v2Container.Networking.Namespace = coi.actualNetworkNamespace + } v1.AllowUnqualifiedDNSQuery = coi.Spec.Windows.Network.AllowUnqualifiedDNSQuery v2Container.Networking.AllowUnqualifiedDnsQuery = v1.AllowUnqualifiedDNSQuery @@ -268,60 +329,17 @@ func createWindowsContainerDocument(ctx context.Context, coi *createOptionsInter } } - // Add the mounts as mapped directories or mapped pipes - // TODO: Mapped pipes to add in v2 schema. 
- var ( - mdsv1 []schema1.MappedDir - mpsv1 []schema1.MappedPipe - mdsv2 []hcsschema.MappedDirectory - mpsv2 []hcsschema.MappedPipe - ) - for _, mount := range coi.Spec.Mounts { - if mount.Type != "" { - return nil, nil, fmt.Errorf("invalid container spec - Mount.Type '%s' must not be set", mount.Type) - } - if uvm.IsPipe(mount.Source) { - src, dst := uvm.GetContainerPipeMapping(coi.HostingSystem, mount) - mpsv1 = append(mpsv1, schema1.MappedPipe{HostPath: src, ContainerPipeName: dst}) - mpsv2 = append(mpsv2, hcsschema.MappedPipe{HostPath: src, ContainerPipeName: dst}) - } else { - readOnly := false - for _, o := range mount.Options { - if strings.ToLower(o) == "ro" { - readOnly = true - } - } - mdv1 := schema1.MappedDir{HostPath: mount.Source, ContainerPath: mount.Destination, ReadOnly: readOnly} - mdv2 := hcsschema.MappedDirectory{ContainerPath: mount.Destination, ReadOnly: readOnly} - if coi.HostingSystem == nil { - mdv2.HostPath = mount.Source - } else { - uvmPath, err := coi.HostingSystem.GetVSMBUvmPath(ctx, mount.Source, readOnly) - if err != nil { - if err == uvm.ErrNotAttached { - // It could also be a scsi mount. - uvmPath, err = coi.HostingSystem.GetScsiUvmPath(ctx, mount.Source) - if err != nil { - return nil, nil, err - } - } else { - return nil, nil, err - } - } - mdv2.HostPath = uvmPath - } - mdsv1 = append(mdsv1, mdv1) - mdsv2 = append(mdsv2, mdv2) - } + mounts, err := createMountsConfig(ctx, coi) + if err != nil { + return nil, nil, err } - - v1.MappedDirectories = mdsv1 - v2Container.MappedDirectories = mdsv2 - if len(mpsv1) > 0 && osversion.Get().Build < osversion.RS3 { + v1.MappedDirectories = mounts.mdsv1 + v2Container.MappedDirectories = mounts.mdsv2 + if len(mounts.mpsv1) > 0 && osversion.Get().Build < osversion.RS3 { return nil, nil, fmt.Errorf("named pipe mounts are not supported on this version of Windows") } - v1.MappedPipes = mpsv1 - v2Container.MappedPipes = mpsv2 + v1.MappedPipes = mounts.mpsv1 + v2Container.MappedPipes = mounts.mpsv2 // add assigned devices to the container definition if err := parseAssignedDevices(ctx, coi, v2Container); err != nil { diff --git a/internal/hcsoci/network.go b/internal/hcsoci/network.go index d33bd6b838..dd0ddca374 100644 --- a/internal/hcsoci/network.go +++ b/internal/hcsoci/network.go @@ -3,6 +3,7 @@ package hcsoci import ( "context" + "github.com/Microsoft/hcsshim/hcn" "github.com/Microsoft/hcsshim/internal/hns" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" @@ -68,3 +69,64 @@ func GetNamespaceEndpoints(ctx context.Context, netNS string) ([]*hns.HNSEndpoin } return endpoints, nil } + +// Network namespace setup is a bit different for templates and clones. +// For templates and clones we use a special network namespace ID. +// Details about this can be found in the Networking section of the late-clone wiki page. +// +// In this function we take the namespace ID of the namespace that was created for this +// UVM. We hot add the namespace (with the default ID if this is a template). We get the +// endpoints associated with this namespace and then hot add those endpoints (by changing +// their namespace IDs by the deafult IDs if it is a template). 
+func SetupNetworkNamespace(ctx context.Context, hostingSystem *uvm.UtilityVM, nsid string) error { + nsidInsideUVM := nsid + if hostingSystem.IsTemplate || hostingSystem.IsClone { + nsidInsideUVM = uvm.DEFAULT_CLONE_NETWORK_NAMESPACE_ID + } + + // Query endpoints with actual nsid + endpoints, err := GetNamespaceEndpoints(ctx, nsid) + if err != nil { + return err + } + + // Add the network namespace inside the UVM if it is not a clone. (Clones will + // inherit the namespace from template) + if !hostingSystem.IsClone { + // Get the namespace struct from the actual nsid. + hcnNamespace, err := hcn.GetNamespaceByID(nsid) + if err != nil { + return err + } + + // All templates should have a special NSID so that it + // will be easier to debug. Override it here. + if hostingSystem.IsTemplate { + hcnNamespace.Id = nsidInsideUVM + } + + if err = hostingSystem.AddNetNS(ctx, hcnNamespace); err != nil { + return err + } + } + + // If adding a network endpoint to clones or a template override nsid associated + // with it. + if hostingSystem.IsClone || hostingSystem.IsTemplate { + // replace nsid for each endpoint + for _, ep := range endpoints { + ep.Namespace = &hns.Namespace{ + ID: nsidInsideUVM, + } + } + } + + if err = hostingSystem.AddEndpointsToNS(ctx, nsidInsideUVM, endpoints); err != nil { + // Best effort clean up the NS + if removeErr := hostingSystem.RemoveNetNS(ctx, nsidInsideUVM); removeErr != nil { + log.G(ctx).Warn(removeErr) + } + return err + } + return nil +} diff --git a/internal/hcsoci/resources_wcow.go b/internal/hcsoci/resources_wcow.go index 9ace171729..0ef3ea5482 100644 --- a/internal/hcsoci/resources_wcow.go +++ b/internal/hcsoci/resources_wcow.go @@ -60,6 +60,53 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r r.SetLayers(layers) } + if err := setupMounts(ctx, coi, r); err != nil { + return err + } + + if cs, ok := coi.Spec.Windows.CredentialSpec.(string); ok { + // Only need to create a CCG instance for v2 containers + if schemaversion.IsV21(coi.actualSchemaVersion) { + hypervisorIsolated := coi.HostingSystem != nil + ccgInstance, ccgResource, err := credentials.CreateCredentialGuard(ctx, coi.actualID, cs, hypervisorIsolated) + if err != nil { + return err + } + coi.ccgState = ccgInstance.CredentialGuard + r.Add(ccgResource) + if hypervisorIsolated { + // If hypervisor isolated we need to add an hvsocket service table entry + // By default HVSocket won't allow something inside the VM to connect + // back to a process on the host. We need to update the HVSocket service table + // to allow a connection to CCG.exe on the host, so that GMSA can function. + // We need to hot add this here because at UVM creation time we don't know what containers + // will be launched in the UVM, nonetheless if they will ask for GMSA. This is a workaround + // for the previous design requirement for CCG V2 where the service entry + // must be present in the UVM'S HCS document before being sent over as hot adding + // an HvSocket service was not possible. + hvSockConfig := ccgInstance.HvSocketConfig + if err := coi.HostingSystem.UpdateHvSocketService(ctx, hvSockConfig.ServiceId, hvSockConfig.ServiceConfig); err != nil { + return fmt.Errorf("failed to update hvsocket service: %s", err) + } + } + } + } + + if coi.HostingSystem != nil && coi.hasWindowsAssignedDevices() { + windowsDevices, closers, err := handleAssignedDevicesWindows(ctx, coi.HostingSystem, coi.Spec.Annotations, coi.Spec.Windows.Devices) + if err != nil { + return err + } + r.Add(closers...) 
+ coi.Spec.Windows.Devices = windowsDevices + } + + return nil +} + +// setupMount adds the custom mounts requested in the container configuration of this +// request. +func setupMounts(ctx context.Context, coi *createOptionsInternal, r *resources.Resources) error { // Validate each of the mounts. If this is a V2 Xenon, we have to add them as // VSMB shares to the utility VM. For V1 Xenon and Argons, there's nothing for // us to do as it's done by HCS. @@ -121,42 +168,5 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r } } - if cs, ok := coi.Spec.Windows.CredentialSpec.(string); ok { - // Only need to create a CCG instance for v2 containers - if schemaversion.IsV21(coi.actualSchemaVersion) { - hypervisorIsolated := coi.HostingSystem != nil - ccgInstance, ccgResource, err := credentials.CreateCredentialGuard(ctx, coi.actualID, cs, hypervisorIsolated) - if err != nil { - return err - } - coi.ccgState = ccgInstance.CredentialGuard - r.Add(ccgResource) - if hypervisorIsolated { - // If hypervisor isolated we need to add an hvsocket service table entry - // By default HVSocket won't allow something inside the VM to connect - // back to a process on the host. We need to update the HVSocket service table - // to allow a connection to CCG.exe on the host, so that GMSA can function. - // We need to hot add this here because at UVM creation time we don't know what containers - // will be launched in the UVM, nonetheless if they will ask for GMSA. This is a workaround - // for the previous design requirement for CCG V2 where the service entry - // must be present in the UVM'S HCS document before being sent over as hot adding - // an HvSocket service was not possible. - hvSockConfig := ccgInstance.HvSocketConfig - if err := coi.HostingSystem.UpdateHvSocketService(ctx, hvSockConfig.ServiceId, hvSockConfig.ServiceConfig); err != nil { - return fmt.Errorf("failed to update hvsocket service: %s", err) - } - } - } - } - - if coi.HostingSystem != nil && coi.hasWindowsAssignedDevices() { - windowsDevices, closers, err := handleAssignedDevicesWindows(ctx, coi.HostingSystem, coi.Spec.Annotations, coi.Spec.Windows.Devices) - if err != nil { - return err - } - r.Add(closers...) 
-		coi.Spec.Windows.Devices = windowsDevices
-	}
-
 	return nil
 }
diff --git a/internal/layers/layers.go b/internal/layers/layers.go
index fb5429bcc0..01d8529cc6 100644
--- a/internal/layers/layers.go
+++ b/internal/layers/layers.go
@@ -125,6 +125,9 @@ func MountContainerLayers(ctx context.Context, layerFolders []string, guestRoot
 		if uvm.OS() == "windows" {
 			options := uvm.DefaultVSMBOptions(true)
 			options.TakeBackupPrivilege = true
+			if uvm.IsTemplate {
+				uvm.SetSaveableVSMBOptions(options, options.ReadOnly)
+			}
 			if _, err := uvm.AddVSMB(ctx, layerPath, options); err != nil {
 				return "", fmt.Errorf("failed to add VSMB layer: %s", err)
 			}
diff --git a/internal/oci/uvm.go b/internal/oci/uvm.go
index 9c8003cd25..3d57222804 100644
--- a/internal/oci/uvm.go
+++ b/internal/oci/uvm.go
@@ -3,10 +3,12 @@ package oci
 import (
 	"context"
 	"errors"
+	"fmt"
 	"strconv"
 	"strings"
 
 	runhcsopts "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options"
+	"github.com/Microsoft/hcsshim/internal/clone"
 	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/logfields"
 	"github.com/Microsoft/hcsshim/internal/uvm"
@@ -138,6 +140,21 @@ const (
 
 	// annotation used to specify the cpugroup ID that a UVM should be assigned to
 	annotationCPUGroupID = "io.microsoft.virtualmachine.cpugroup.id"
+
+	// annotationSaveAsTemplate must be used with a pod & container creation request.
+	// If this annotation is present in the request then it will save the UVM (pod)
+	// and the container(s) inside it as a template. However, this also means that this
+	// pod and the containers inside this pod will permanently stay in the
+	// paused/templated state and can not be resumed again.
+	annotationSaveAsTemplate = "io.microsoft.virtualmachine.saveastemplate"
+
+	// annotationTemplateID should be used when creating a pod or a container from a template.
+	// When creating a pod from a template use the ID of the templated pod as the
+	// TemplateID and when creating a container use the ID of the templated container as
+	// the TemplateID. It is the client's responsibility to make sure that the sandbox
+	// within which a cloned container is created has itself been created from the
+	// same template.
+	annotationTemplateID = "io.microsoft.virtualmachine.templateid"
 )
 
 // parseAnnotationsBool searches `a` for `key` and if found verifies that the
@@ -328,6 +345,32 @@ func parseAnnotationsString(a map[string]string, key string, def string) string
 	return def
 }
+
+// ParseAnnotationsSaveAsTemplate searches for the boolean value which specifies
+// if this create request should be considered as a template creation request. If the
+// value is found it returns the actual value, otherwise it returns false.
+func ParseAnnotationsSaveAsTemplate(ctx context.Context, s *specs.Spec) bool {
+	return parseAnnotationsBool(ctx, s.Annotations, annotationSaveAsTemplate, false)
+}
+
+// ParseAnnotationsTemplateID searches for the templateID in the create request. If the
+// value is found it returns the value, otherwise it returns an empty string.
+func ParseAnnotationsTemplateID(ctx context.Context, s *specs.Spec) string { + return parseAnnotationsString(s.Annotations, annotationTemplateID, "") +} + +func ParseCloneAnnotations(ctx context.Context, s *specs.Spec) (isTemplate bool, templateID string, err error) { + templateID = ParseAnnotationsTemplateID(ctx, s) + isTemplate = ParseAnnotationsSaveAsTemplate(ctx, s) + if templateID != "" && isTemplate { + return false, "", fmt.Errorf("templateID and save as template flags can not be passed in the same request") + } + + if (isTemplate || templateID != "") && !IsWCOW(s) { + return false, "", fmt.Errorf("save as template and creating clones is only available for WCOW") + } + return +} + // handleAnnotationKernelDirectBoot handles parsing annotationKernelDirectBoot and setting // implied annotations from the result. func handleAnnotationKernelDirectBoot(ctx context.Context, a map[string]string, lopts *uvm.OptionsLCOW) { @@ -369,6 +412,26 @@ func handleAnnotationFullyPhysicallyBacked(ctx context.Context, a map[string]str } } +// handleCloneAnnotations handles parsing annotations related to template creation and cloning +// Since late cloning is only supported for WCOW this function only deals with WCOW options. +func handleCloneAnnotations(ctx context.Context, a map[string]string, wopts *uvm.OptionsWCOW) (err error) { + wopts.IsTemplate = parseAnnotationsBool(ctx, a, annotationSaveAsTemplate, false) + templateID := parseAnnotationsString(a, annotationTemplateID, "") + if templateID != "" { + tc, err := clone.FetchTemplateConfig(ctx, templateID) + if err != nil { + return err + } + wopts.TemplateConfig = &uvm.UVMTemplateConfig{ + UVMID: tc.TemplateUVMID, + CreateOpts: tc.TemplateUVMCreateOpts, + Resources: tc.TemplateUVMResources, + } + wopts.IsClone = true + } + return nil +} + // SpecToUVMCreateOpts parses `s` and returns either `*uvm.OptionsLCOW` or // `*uvm.OptionsWCOW`. func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) (interface{}, error) { @@ -419,6 +482,9 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( wopts.DisableCompartmentNamespace = parseAnnotationsBool(ctx, s.Annotations, annotationDisableCompartmentNamespace, wopts.DisableCompartmentNamespace) wopts.CPUGroupID = parseAnnotationsString(s.Annotations, annotationCPUGroupID, wopts.CPUGroupID) handleAnnotationFullyPhysicallyBacked(ctx, s.Annotations, wopts) + if err := handleCloneAnnotations(ctx, s.Annotations, wopts); err != nil { + return nil, err + } return wopts, nil } return nil, errors.New("cannot create UVM opts spec is not LCOW or WCOW") diff --git a/internal/uvm/clone.go b/internal/uvm/clone.go index 7d25713e0e..5a7977b482 100644 --- a/internal/uvm/clone.go +++ b/internal/uvm/clone.go @@ -2,7 +2,9 @@ package uvm import ( "context" + "fmt" + "github.com/Microsoft/hcsshim/internal/cow" hcsschema "github.com/Microsoft/hcsshim/internal/schema2" "github.com/pkg/errors" ) @@ -50,7 +52,7 @@ type Cloneable interface { Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error } -// cloneData contains all the information that might be required during cloning process of +// A struct to keep all the information that might be required during cloning process of // a resource. 
type cloneData struct { // doc spec for the clone @@ -68,15 +70,22 @@ type UVMTemplateConfig struct { UVMID string // Array of all resources that will be required while making a clone from this template Resources []Cloneable + // The OptionsWCOW used for template uvm creation + CreateOpts OptionsWCOW } // Captures all the information that is necessary to properly save this UVM as a template // and create clones from this template later. The struct returned by this method must be // later on made available while creating a clone from this template. -func (uvm *UtilityVM) GenerateTemplateConfig() *UVMTemplateConfig { +func (uvm *UtilityVM) GenerateTemplateConfig() (*UVMTemplateConfig, error) { + if _, ok := uvm.createOpts.(OptionsWCOW); !ok { + return nil, fmt.Errorf("template config can only be created for a WCOW uvm") + } + // Add all the SCSI Mounts and VSMB shares into the list of clones templateConfig := &UVMTemplateConfig{ - UVMID: uvm.ID(), + UVMID: uvm.ID(), + CreateOpts: uvm.createOpts.(OptionsWCOW), } for _, vsmbShare := range uvm.vsmbDirShares { @@ -95,13 +104,13 @@ func (uvm *UtilityVM) GenerateTemplateConfig() *UVMTemplateConfig { } } - return templateConfig + return templateConfig, nil } // Pauses the uvm and then saves it as a template. This uvm can not be restarted or used // after it is successfully saved. -// uvm must be in the paused state before we attempt to save it. save call will throw the -// VM in incorrect state exception if it is not in the paused state at the time of saving. +// uvm must be in the paused state before it can be saved as a template.save call will throw +// an incorrect uvm state exception if uvm is not in the paused state at the time of saving. func (uvm *UtilityVM) SaveAsTemplate(ctx context.Context) error { if err := uvm.hcsSystem.Pause(ctx); err != nil { return errors.Wrap(err, "error pausing the VM") @@ -115,3 +124,16 @@ func (uvm *UtilityVM) SaveAsTemplate(ctx context.Context) error { } return nil } + +// CloneContainer attaches back to a container that is already running inside the UVM +// because of the clone +func (uvm *UtilityVM) CloneContainer(ctx context.Context, id string) (cow.Container, error) { + if uvm.gc == nil { + return nil, fmt.Errorf("clone container cannot work without external GCS connection") + } + c, err := uvm.gc.CloneContainer(ctx, id) + if err != nil { + return nil, fmt.Errorf("failed to clone container %s: %s", id, err) + } + return c, nil +} diff --git a/internal/uvm/create.go b/internal/uvm/create.go index 5f69b015b2..47eda3cca9 100644 --- a/internal/uvm/create.go +++ b/internal/uvm/create.go @@ -87,6 +87,37 @@ type Options struct { CPUGroupID string } +// compares the create opts used during template creation with the create opts +// provided for clone creation. If they don't match (except for a few fields) +// then clone creation is failed. +func verifyCloneUvmCreateOpts(templateOpts, cloneOpts *OptionsWCOW) bool { + // Following fields can be different in the template and clone configurations. + // 1. the scratch layer path. i.e the last element of the LayerFolders path. + // 2. IsTemplate, IsClone and TemplateConfig variables. + // 3. ID + // 4. AdditionalHCSDocumentJSON + + // Save the original values of the fields that we want to ignore and replace them with + // the same values as that of the other object. So that we can simply use `==` operator. 
diff --git a/internal/uvm/create.go b/internal/uvm/create.go
index 5f69b015b2..47eda3cca9 100644
--- a/internal/uvm/create.go
+++ b/internal/uvm/create.go
@@ -87,6 +87,37 @@ type Options struct {
 	CPUGroupID string
 }
 
+// verifyCloneUvmCreateOpts compares the create opts used during template creation with the
+// create opts provided for clone creation. If they don't match (except for a few fields),
+// clone creation fails.
+func verifyCloneUvmCreateOpts(templateOpts, cloneOpts *OptionsWCOW) bool {
+	// The following fields are allowed to differ between the template and clone configurations:
+	// 1. the scratch layer path, i.e. the last element of the LayerFolders path.
+	// 2. IsTemplate, IsClone and TemplateConfig variables.
+	// 3. ID
+	// 4. AdditionHCSDocumentJSON
+
+	// Save the original values of the fields that we want to ignore and replace them with
+	// the values from the other object so that we can simply use the `==` operator.
+	templateIDBackup := templateOpts.ID
+	templateAdditionalJsonBackup := templateOpts.AdditionHCSDocumentJSON
+	templateOpts.ID = cloneOpts.ID
+	templateOpts.AdditionHCSDocumentJSON = cloneOpts.AdditionHCSDocumentJSON
+
+	// We can't use the `==` operator on structs which include slices in them. So compare the
+	// LayerFolders separately and then directly compare the Options struct.
+	result := (len(templateOpts.LayerFolders) == len(cloneOpts.LayerFolders))
+	for i := 0; result && i < len(templateOpts.LayerFolders)-1; i++ {
+		result = result && (templateOpts.LayerFolders[i] == cloneOpts.LayerFolders[i])
+	}
+	result = result && (*templateOpts.Options == *cloneOpts.Options)
+
+	// restore the original values
+	templateOpts.ID = templateIDBackup
+	templateOpts.AdditionHCSDocumentJSON = templateAdditionalJsonBackup
+	return result
+}
+
 // Verifies that the final UVM options are correct and supported.
 func verifyOptions(ctx context.Context, options interface{}) error {
 	switch opts := options.(type) {
@@ -123,9 +154,18 @@ func verifyOptions(ctx context.Context, options interface{}) error {
 		if len(opts.LayerFolders) < 2 {
 			return errors.New("at least 2 LayerFolders must be supplied")
 		}
 		if opts.IsClone && opts.TemplateConfig == nil {
 			return errors.New("template config can not be nil when creating clone")
 		}
+		if opts.IsClone && !verifyCloneUvmCreateOpts(&opts.TemplateConfig.CreateOpts, opts) {
+			return errors.New("clone configuration doesn't match the template configuration")
+		}
+		if opts.IsClone && !opts.ExternalGuestConnection {
+			return errors.New("external GCS connection cannot be disabled for clones")
+		}
+		if opts.IsTemplate && opts.FullyPhysicallyBacked {
+			return errors.New("a template cannot be created from a fully physically backed UVM")
+		}
 	}
 	return nil
 }
@@ -211,7 +251,7 @@ func (uvm *UtilityVM) Close() (err error) {
 	}
 
 	if err := uvm.CloseGCSConnection(); err != nil {
-		log.G(ctx).Errorf("close gcs connection failed: %f", err)
+		log.G(ctx).Errorf("close GCS connection failed: %s", err)
 	}
 
 	// outputListener will only be nil for a Create -> Stop without a Start.
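To make the comparison rules above concrete, here is an illustrative sketch of a template/clone option pair that the check is meant to accept; only the ID, the clone-related fields, and the final (scratch) layer differ. NewDefaultOptionsWCOW and the paths are assumptions used purely for illustration, and the comparison itself runs inside verifyOptions when CreateWCOW validates the clone's options.

	// Illustrative values only.
	templateOpts := uvm.NewDefaultOptionsWCOW("template-uvm", "")
	templateOpts.LayerFolders = []string{`C:\layers\base`, `C:\scratch\template`}

	cloneOpts := uvm.NewDefaultOptionsWCOW("clone-1", "")
	cloneOpts.LayerFolders = []string{`C:\layers\base`, `C:\scratch\clone-1`} // only the scratch layer differs
	cloneOpts.IsClone = true
	cloneOpts.TemplateConfig = &uvm.UVMTemplateConfig{
		UVMID:      "template-uvm",
		CreateOpts: *templateOpts,
	}
	// uvm.CreateWCOW(ctx, cloneOpts) would pass the verifyCloneUvmCreateOpts check with
	// these options, since everything other than the ignored fields matches the template.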
diff --git a/internal/uvm/create_lcow.go b/internal/uvm/create_lcow.go
index 1ce25e01f7..1b3308e0ba 100644
--- a/internal/uvm/create_lcow.go
+++ b/internal/uvm/create_lcow.go
@@ -168,6 +168,7 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error
 		physicallyBacked:        !opts.AllowOvercommit,
 		devicesPhysicallyBacked: opts.FullyPhysicallyBacked,
 		cpuGroupID:              opts.CPUGroupID,
+		createOpts:              opts,
 	}
 
 	defer func() {
diff --git a/internal/uvm/create_wcow.go b/internal/uvm/create_wcow.go
index 51d609895c..79bd93ec7e 100644
--- a/internal/uvm/create_wcow.go
+++ b/internal/uvm/create_wcow.go
@@ -215,6 +215,7 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error
 		physicallyBacked:        !opts.AllowOvercommit,
 		devicesPhysicallyBacked: opts.FullyPhysicallyBacked,
 		cpuGroupID:              opts.CPUGroupID,
+		createOpts:              *opts,
 	}
 
 	defer func() {
@@ -277,11 +278,7 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error
 			},
 		}
 
-		uvm.scsiLocations[0][0] = &SCSIMount{
-			vm:       uvm,
-			HostPath: doc.VirtualMachine.Devices.Scsi["0"].Attachments["0"].Path,
-			refCount: 1,
-		}
+		uvm.scsiLocations[0][0] = newSCSIMount(uvm, doc.VirtualMachine.Devices.Scsi["0"].Attachments["0"].Path, "", "", 1, 0, 0, false)
 	} else {
 		doc.VirtualMachine.RestoreState = &hcsschema.RestoreState{}
 		doc.VirtualMachine.RestoreState.TemplateSystemId = opts.TemplateConfig.UVMID
@@ -304,16 +301,14 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error
 		uvm.namespaces[DEFAULT_CLONE_NETWORK_NAMESPACE_ID] = &namespaceInfo{
 			nics: make(map[string]*nicInfo),
 		}
-		uvm.IsClone = true
+		uvm.TemplateID = opts.TemplateConfig.UVMID
 	}
 
 	// Add appropriate VSMB share options if this UVM needs to be saved as a template
 	if opts.IsTemplate {
 		for _, share := range doc.VirtualMachine.Devices.VirtualSmb.Shares {
-			share.Options.PseudoDirnotify = true
-			share.Options.NoLocks = true
-			share.Options.NoDirectmap = true
+			uvm.SetSaveableVSMBOptions(share.Options, share.Options.ReadOnly)
 		}
 		uvm.IsTemplate = true
 	}
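The SetSaveableVSMBOptions helper used above is defined in internal/uvm/vsmb.go later in this change. As a sketch of the intended use outside of CreateWCOW (variable names are illustrative), a share added to a UVM that will be saved as a template might be prepared like this:

	// vm is a *uvm.UtilityVM that will later be saved as a template.
	opts := vm.DefaultVSMBOptions(true /* readOnly */)
	vm.SetSaveableVSMBOptions(opts, true /* readOnly */)
	// For read-only shares this keeps ShareRead, CacheIo, ReadOnly and PseudoOplocks set;
	// for writable shares those are cleared and NoOplocks is used instead. In both cases
	// NoLocks, PseudoDirnotify and NoDirectmap are set so the share is compatible with
	// saving the UVM.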
diff --git a/internal/uvm/network.go b/internal/uvm/network.go
index 2b0769d7af..4be39676f9 100644
--- a/internal/uvm/network.go
+++ b/internal/uvm/network.go
@@ -56,13 +56,16 @@ func (endpoints *NetworkEndpoints) Release(ctx context.Context) error {
 	return nil
 }
 
-// AddNetNS adds network namespace inside the guest.
+// AddNetNS adds a network namespace inside the guest without actually querying for the
+// namespace by its ID. It uses the given namespace struct as-is in the guest request.
+// This function is mostly used when we need to override the values inside the namespace
+// struct returned by GetNamespaceByID. For most use cases AddNetNSByID is more appropriate.
 //
-// If a namespace with `id` already exists returns `ErrNetNSAlreadyAttached`.
-func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error {
+// If a namespace with the same id already exists, returns `ErrNetNSAlreadyAttached`.
+func (uvm *UtilityVM) AddNetNS(ctx context.Context, hcnNamespace *hcn.HostComputeNamespace) error {
 	uvm.m.Lock()
 	defer uvm.m.Unlock()
-	if _, ok := uvm.namespaces[id]; ok {
+	if _, ok := uvm.namespaces[hcnNamespace.Id]; ok {
 		return ErrNetNSAlreadyAttached
 	}
 
@@ -70,10 +73,6 @@ func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error {
 	// Add a Guest Network namespace. On LCOW we add the adapters
 	// dynamically.
 	if uvm.operatingSystem == "windows" {
-		hcnNamespace, err := hcn.GetNamespaceByID(id)
-		if err != nil {
-			return err
-		}
 		guestNamespace := hcsschema.ModifySettingRequest{
 			GuestRequest: guestrequest.GuestRequest{
 				ResourceType: guestrequest.ResourceTypeNetworkNamespace,
@@ -90,12 +89,28 @@ func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error {
 	if uvm.namespaces == nil {
 		uvm.namespaces = make(map[string]*namespaceInfo)
 	}
-	uvm.namespaces[id] = &namespaceInfo{
+	uvm.namespaces[hcnNamespace.Id] = &namespaceInfo{
 		nics: make(map[string]*nicInfo),
 	}
 	return nil
 }
 
+// AddNetNSByID finds the namespace with the given `id` and adds that
+// network namespace inside the guest.
+//
+// If a namespace with `id` already exists returns `ErrNetNSAlreadyAttached`.
+func (uvm *UtilityVM) AddNetNSByID(ctx context.Context, id string) error {
+	hcnNamespace, err := hcn.GetNamespaceByID(id)
+	if err != nil {
+		return err
+	}
+
+	if err = uvm.AddNetNS(ctx, hcnNamespace); err != nil {
+		return err
+	}
+	return nil
+}
+
 // AddEndpointsToNS adds all unique `endpoints` to the network namespace
 // matching `id`. On failure does not roll back any previously successfully
 // added endpoints.
@@ -314,3 +329,15 @@ func (uvm *UtilityVM) removeNIC(ctx context.Context, id guid.GUID, endpoint *hns
 	}
 	return nil
 }
+
+// RemoveAllNICs removes all NICs added to this uvm.
+func (uvm *UtilityVM) RemoveAllNICs(ctx context.Context) error {
+	for _, ns := range uvm.namespaces {
+		for _, ninfo := range ns.nics {
+			if err := uvm.removeNIC(ctx, ninfo.ID, ninfo.Endpoint); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
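A brief sketch of how the new namespace and NIC helpers are intended to be used; `vm`, `ctx` and `nsID` are assumed to be supplied by the caller:

	// Common path: resolve the namespace by ID and add it to the guest.
	if err := vm.AddNetNSByID(ctx, nsID); err != nil {
		return err
	}

	// Override path: build (or adjust) the namespace document yourself, for example
	// when the values returned by hcn.GetNamespaceByID need to be overridden.
	ns := &hcn.HostComputeNamespace{Id: nsID}
	if err := vm.AddNetNS(ctx, ns); err != nil && err != uvm.ErrNetNSAlreadyAttached {
		return err
	}

	// Detach every NIC that was added to this UVM, e.g. before saving it as a template.
	if err := vm.RemoveAllNICs(ctx); err != nil {
		return err
	}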
diff --git a/internal/uvm/scsi.go b/internal/uvm/scsi.go
index 03569ffa39..d211352ad2 100644
--- a/internal/uvm/scsi.go
+++ b/internal/uvm/scsi.go
@@ -448,17 +448,21 @@ func (sm *SCSIMount) Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) er
 	)
 
 	if !sm.readOnly {
-		// Copy this scsi disk
-		// TODO(ambarve): This is a writeable SCSI mount. It can either be the
-		// scratch VHD of the UVM or it can be a SCSI mount that belongs to some
-		// container which is being automatically cloned here as a part of UVM
-		// cloning process. We will receive a request for creation of this
-		// container later on which will specify the storage path for this
-		// container. However, that storage location is not available now so we
-		// just use the storage of the uvm instead. Find a better way for handling
-		// this. Problem with this approach is that the scratch VHD of the container
-		// will not be automatically cleaned after container exits. It will stay
-		// there as long as the UVM keeps running.
+		// This is a writeable SCSI mount. It must be either the
+		// 1. scratch VHD of the UVM or
+		// 2. scratch VHD of the container.
+		// A user provided writable SCSI mount is not allowed on the template UVM
+		// or container, so this SCSI mount has to be the scratch VHD of either the
+		// UVM or the container. The container inside this UVM will automatically be
+		// cloned here when we are cloning the uvm itself. We will receive a
+		// request for the creation of this container later and that request will
+		// specify the storage path for this container. However, that storage
+		// location is not available now so we just use the storage path of the
+		// uvm instead.
+		// TODO(ambarve): Find a better way of handling this. The problem with this
+		// approach is that the scratch VHD of the container will not be
+		// automatically cleaned up after the container exits. It will stay there as
+		// long as the UVM keeps running.
 		// For the scratch VHD of the VM (always attached at Controller:0, LUN:0)
 		// clone it in the scratch folder
diff --git a/internal/uvm/types.go b/internal/uvm/types.go
index 7d25ba2f0e..5f29166edf 100644
--- a/internal/uvm/types.go
+++ b/internal/uvm/types.go
@@ -122,4 +122,12 @@ type UtilityVM struct {
 
 	// specifies if this UVM is a cloned from a template
 	IsClone bool
+
+	// ID of the template from which this clone was created. Only applies when IsClone
+	// is true.
+	TemplateID string
+
+	// The CreateOpts used to create this uvm. These can be either of type
+	// uvm.OptionsLCOW or uvm.OptionsWCOW.
+	createOpts interface{}
 }
diff --git a/internal/uvm/vsmb.go b/internal/uvm/vsmb.go
index 201c7bb3ed..e15fb42554 100644
--- a/internal/uvm/vsmb.go
+++ b/internal/uvm/vsmb.go
@@ -60,6 +60,29 @@ func (uvm *UtilityVM) DefaultVSMBOptions(readOnly bool) *hcsschema.VirtualSmbSha
 	return opts
 }
 
+// SetSaveableVSMBOptions adjusts the given VSMB share options so that the share can be
+// used on a UVM that will be saved as a template.
+func (uvm *UtilityVM) SetSaveableVSMBOptions(opts *hcsschema.VirtualSmbShareOptions, readOnly bool) {
+	if readOnly {
+		opts.ShareRead = true
+		opts.CacheIo = true
+		opts.ReadOnly = true
+		opts.PseudoOplocks = true
+		opts.NoOplocks = false
+	} else {
+		// Using NoOpLocks can cause intermittent Access denied failures due to
+		// a VSMB bug that was fixed but not backported to RS5/19H1.
+		opts.ShareRead = false
+		opts.CacheIo = false
+		opts.ReadOnly = false
+		opts.PseudoOplocks = false
+		opts.NoOplocks = true
+	}
+	opts.NoLocks = true
+	opts.PseudoDirnotify = true
+	opts.NoDirectmap = true
+}
+
 // findVSMBShare finds a share by `hostPath`. If not found returns `ErrNotAttached`.
 func (uvm *UtilityVM) findVSMBShare(ctx context.Context, m map[string]*VSMBShare, shareKey string) (*VSMBShare, error) {
 	share, ok := m[shareKey]