diff --git a/cmd/containerd-shim-runhcs-v1/clone.go b/cmd/containerd-shim-runhcs-v1/clone.go new file mode 100644 index 0000000000..221aabacab --- /dev/null +++ b/cmd/containerd-shim-runhcs-v1/clone.go @@ -0,0 +1,50 @@ +package main + +import ( + "context" + + "github.com/Microsoft/hcsshim/internal/clone" + "github.com/Microsoft/hcsshim/internal/uvm" +) + +// saveAsTemplate saves the UVM and container inside it as a template and also stores the +// relevant information in the registry so that clones can be created from this template. +// Every cloned uvm gets its own NIC and we do not want to create clones of a template +// which still has a NIC attached to it. So remove the NICs attached to the template uvm +// before saving it. +// Similar to the NIC scenario we do not want to create clones from a template with an +// active GCS connection so close the GCS connection too. +func saveAsTemplate(ctx context.Context, templateTask *hcsTask) (err error) { + var utc *uvm.UVMTemplateConfig + var templateConfig *clone.TemplateConfig + + if err = templateTask.host.RemoveAllNICs(ctx); err != nil { + return err + } + + if err = templateTask.host.CloseGCSConnection(); err != nil { + return err + } + + utc, err = templateTask.host.GenerateTemplateConfig() + if err != nil { + return err + } + + templateConfig = &clone.TemplateConfig{ + TemplateUVMID: utc.UVMID, + TemplateUVMResources: utc.Resources, + TemplateUVMCreateOpts: utc.CreateOpts, + TemplateContainerID: templateTask.id, + TemplateContainerSpec: *templateTask.taskSpec, + } + + if err = clone.SaveTemplateConfig(ctx, templateConfig); err != nil { + return err + } + + if err = templateTask.host.SaveAsTemplate(ctx); err != nil { + return err + } + return nil +} diff --git a/cmd/containerd-shim-runhcs-v1/exec_clone.go b/cmd/containerd-shim-runhcs-v1/exec_clone.go new file mode 100644 index 0000000000..078149f140 --- /dev/null +++ b/cmd/containerd-shim-runhcs-v1/exec_clone.go @@ -0,0 +1,67 @@ +package main + +import ( + "context" + + "github.com/Microsoft/hcsshim/internal/cmd" + "github.com/Microsoft/hcsshim/internal/cow" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/uvm" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +func newClonedExec( + ctx context.Context, + events publisher, + tid string, + host *uvm.UtilityVM, + c cow.Container, + id, bundle string, + isWCOW bool, + spec *specs.Process, + io cmd.UpstreamIO) *clonedExec { + log.G(ctx).WithFields(logrus.Fields{ + "tid": tid, + "eid": id, // Init exec ID is always same as Task ID + "bundle": bundle, + }).Debug("newClonedExec") + + he := &hcsExec{ + events: events, + tid: tid, + host: host, + c: c, + id: id, + bundle: bundle, + isWCOW: isWCOW, + spec: spec, + io: io, + processDone: make(chan struct{}), + state: shimExecStateCreated, + exitStatus: 255, // By design for non-exited process status. + exited: make(chan struct{}), + } + + ce := &clonedExec{ + he, + } + go he.waitForContainerExit() + return ce +} + +var _ = (shimExec)(&clonedExec{}) + +// clonedExec inherits from hcsExec. The only difference between these two is that +// on starting a clonedExec it doesn't attempt to start the container even if the +// exec is the init process. This is because in case of clonedExec the container is +// already running inside the pod. 
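+// Concretely, hcsExec.Start calls startInternal(ctx, he.id == he.tid) so that an init
+// exec starts its container, while clonedExec.Start (below) always calls
+// startInternal(ctx, false).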
+type clonedExec struct { + *hcsExec +} + +func (ce *clonedExec) Start(ctx context.Context) (err error) { + // A cloned exec should never initialize the container as it should + // already be running. + return ce.startInternal(ctx, false) +} diff --git a/cmd/containerd-shim-runhcs-v1/exec_hcs.go b/cmd/containerd-shim-runhcs-v1/exec_hcs.go index 728ef51c10..7c2115c2cc 100644 --- a/cmd/containerd-shim-runhcs-v1/exec_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/exec_hcs.go @@ -181,7 +181,7 @@ func (he *hcsExec) Status() *task.StateResponse { } } -func (he *hcsExec) Start(ctx context.Context) (err error) { +func (he *hcsExec) startInternal(ctx context.Context, initializeContainer bool) (err error) { he.sl.Lock() defer he.sl.Unlock() if he.state != shimExecStateCreated { @@ -192,8 +192,7 @@ func (he *hcsExec) Start(ctx context.Context) (err error) { he.exitFromCreatedL(ctx, 1) } }() - if he.id == he.tid { - // This is the init exec. We need to start the container itself + if initializeContainer { err = he.c.Start(ctx) if err != nil { return err @@ -257,6 +256,12 @@ func (he *hcsExec) Start(ctx context.Context) (err error) { return nil } +func (he *hcsExec) Start(ctx context.Context) (err error) { + // If he.id == he.tid then this is the init exec. + // We need to initialize the container itself before starting this exec. + return he.startInternal(ctx, he.id == he.tid) +} + func (he *hcsExec) Kill(ctx context.Context, signal uint32) error { he.sl.Lock() defer he.sl.Unlock() @@ -414,6 +419,8 @@ func (he *hcsExec) exitFromCreatedL(ctx context.Context, status int) { // // 6. Close `he.exited` channel to unblock any waiters who might have called // `Create`/`Wait`/`Start` which is a valid pattern. +// +// 7. Finally, save the UVM and this container as a template if specified. func (he *hcsExec) waitForExit() { ctx, span := trace.StartSpan(context.Background(), "hcsExec::waitForExit") defer span.End() diff --git a/cmd/containerd-shim-runhcs-v1/pod.go b/cmd/containerd-shim-runhcs-v1/pod.go index 2d89b0d8ea..6e341b83fb 100644 --- a/cmd/containerd-shim-runhcs-v1/pod.go +++ b/cmd/containerd-shim-runhcs-v1/pod.go @@ -152,21 +152,8 @@ func createPod(ctx context.Context, events publisher, req *task.CreateTaskReques // isolated. Process isolated WCOW gets the namespace endpoints // automatically. 
if parent != nil { - nsid := "" - if s.Windows != nil && s.Windows.Network != nil { - nsid = s.Windows.Network.NetworkNamespace - } - - if nsid != "" { - endpoints, err := hcsoci.GetNamespaceEndpoints(ctx, nsid) - if err != nil { - return nil, err - } - err = parent.AddNetNS(ctx, nsid) - if err != nil { - return nil, err - } - err = parent.AddEndpointsToNS(ctx, nsid, endpoints) + if s.Windows != nil && s.Windows.Network != nil && s.Windows.Network.NetworkNamespace != "" { + err = hcsoci.SetupNetworkNamespace(ctx, parent, s.Windows.Network.NetworkNamespace) if err != nil { return nil, err } @@ -241,6 +228,16 @@ func (p *pod) ID() string { return p.id } +func (p *pod) GetCloneAnnotations(ctx context.Context, s *specs.Spec) (bool, string, error) { + isTemplate, templateID, err := oci.ParseCloneAnnotations(ctx, s) + if err != nil { + return false, "", err + } else if (isTemplate || templateID != "") && p.host == nil { + return false, "", fmt.Errorf("save as template and creating clones is only supported for hyper-v isolated containers") + } + return isTemplate, templateID, nil +} + func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (_ shimTask, err error) { if req.ID == p.id { return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists", req.ID) @@ -283,7 +280,17 @@ func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *sp sid) } - st, err := newHcsTask(ctx, p.events, p.host, false, req, s) + _, templateID, err := p.GetCloneAnnotations(ctx, s) + if err != nil { + return nil, err + } + + var st shimTask + if templateID != "" { + st, err = newClonedHcsTask(ctx, p.events, p.host, false, req, s, templateID) + } else { + st, err = newHcsTask(ctx, p.events, p.host, false, req, s) + } if err != nil { return nil, err } diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go index aa93019e1f..c16271c79c 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go @@ -123,6 +123,7 @@ func newHcsTask( }).Debug("newHcsTask") owner := filepath.Base(os.Args[0]) + isTemplate := oci.ParseAnnotationsSaveAsTemplate(ctx, s) io, err := cmd.NewNpipeIO(ctx, req.Stdin, req.Stdout, req.Stderr, req.Terminal) if err != nil { @@ -162,14 +163,16 @@ func newHcsTask( } ht := &hcsTask{ - events: events, - id: req.ID, - isWCOW: oci.IsWCOW(s), - c: system, - cr: resources, - ownsHost: ownsParent, - host: parent, - closed: make(chan struct{}), + events: events, + id: req.ID, + isWCOW: oci.IsWCOW(s), + c: system, + cr: resources, + ownsHost: ownsParent, + host: parent, + closed: make(chan struct{}), + taskSpec: s, + isTemplate: isTemplate, } ht.init = newHcsExec( ctx, @@ -189,9 +192,121 @@ func newHcsTask( // handle this case. go ht.waitForHostExit() } + + // In the normal case the `Signal` call from the caller killed this task's + // init process. Or the init process ran to completion - this will mostly + // happen when we are creating a template and want to wait for init process + // to finish before we save the template. 
In such cases do not tear down the + // container after init exits - because we need the container in the template + go ht.waitInitExit(!isTemplate) + + // Publish the created event + ht.events.publishEvent( + ctx, + runtime.TaskCreateEventTopic, + &eventstypes.TaskCreate{ + ContainerID: req.ID, + Bundle: req.Bundle, + Rootfs: req.Rootfs, + IO: &eventstypes.TaskIO{ + Stdin: req.Stdin, + Stdout: req.Stdout, + Stderr: req.Stderr, + Terminal: req.Terminal, + }, + Checkpoint: "", + Pid: uint32(ht.init.Pid()), + }) + return ht, nil +} + +// newClonedTask creates a container within `parent`. The parent must be already cloned +// from a template and hence this container must already be present inside that parent. +// This function simply creates the go wrapper around the container that is already +// running inside the cloned parent. +// This task MAY own the UVM that it is running in but as of now the cloning feature is +// only used for WCOW hyper-V isolated containers and for WCOW, the wcowPodSandboxTask +// owns that UVM. +func newClonedHcsTask( + ctx context.Context, + events publisher, + parent *uvm.UtilityVM, + ownsParent bool, + req *task.CreateTaskRequest, + s *specs.Spec, + templateID string) (_ shimTask, err error) { + log.G(ctx).WithFields(logrus.Fields{ + "tid": req.ID, + "ownsParent": ownsParent, + "templateid": templateID, + }).Debug("newClonedHcsTask") + + owner := filepath.Base(os.Args[0]) + + if parent.OS() != "windows" { + return nil, fmt.Errorf("cloned task can only be created inside a windows host") + } + + io, err := cmd.NewNpipeIO(ctx, req.Stdin, req.Stdout, req.Stderr, req.Terminal) + if err != nil { + return nil, err + } + + var netNS string + if s.Windows != nil && + s.Windows.Network != nil { + netNS = s.Windows.Network.NetworkNamespace + } + + // This is a cloned task. Use the templateid as the ID of the container here + // because that's the ID of this container inside the UVM. + opts := hcsoci.CreateOptions{ + ID: templateID, + Owner: owner, + Spec: s, + HostingSystem: parent, + NetworkNamespace: netNS, + } + system, resources, err := hcsoci.CloneContainer(ctx, &opts) + if err != nil { + return nil, err + } + + ht := &hcsTask{ + events: events, + id: req.ID, + isWCOW: oci.IsWCOW(s), + c: system, + cr: resources, + ownsHost: ownsParent, + host: parent, + closed: make(chan struct{}), + templateID: templateID, + taskSpec: s, + isTemplate: false, + } + ht.init = newClonedExec( + ctx, + events, + req.ID, + parent, + system, + req.ID, + req.Bundle, + ht.isWCOW, + s.Process, + io) + + if parent != nil { + // We have a parent UVM. Listen for its exit and forcibly close this + // task. This is not expected but in the event of a UVM crash we need to + // handle this case. + go ht.waitForHostExit() + } + // In the normal case the `Signal` call from the caller killed this task's // init process. - go ht.waitInitExit() + go ht.waitInitExit(true) // Publish the created event ht.events.publishEvent( @@ -268,6 +383,22 @@ type hcsTask struct { // closeHostOnce is used to close `host`. This will only be used if // `ownsHost==true` and `host != nil`. closeHostOnce sync.Once + + // templateID represents the id of the template container from which this container + // is cloned. The parent UVM (inside which this container is running) identifies this + // container with it's original id (i.e the id that was assigned to this container + // at the time of template creation i.e the templateID). 
Hence, every request that + // is sent to the GCS must actually use templateID to reference this container. + // A non-empty templateID specifies that this task was cloned. + templateID string + + // if isTemplate is true then this container will be saved as a template as soon + // as its init process exits. Note: templateID and isTemplate are mutually exclusive. + // i.e isTemplate can not be true when templateID is not empty. + isTemplate bool + + // taskSpec represents the spec/configuration for this task. + taskSpec *specs.Spec } func (ht *hcsTask) ID() string { @@ -451,7 +582,7 @@ func (ht *hcsTask) Wait() *task.StateResponse { return ht.init.Wait() } -func (ht *hcsTask) waitInitExit() { +func (ht *hcsTask) waitInitExit(destroyContainer bool) { ctx, span := trace.StartSpan(context.Background(), "hcsTask::waitInitExit") defer span.End() span.AddAttributes(trace.StringAttribute("tid", ht.id)) @@ -459,8 +590,20 @@ func (ht *hcsTask) waitInitExit() { // Wait for it to exit on its own ht.init.Wait() - // Close the host and event the exit - ht.close(ctx) + if destroyContainer { + // Close the host and event the exit + ht.close(ctx) + } else { + // Close the container's host, but do not close or terminate the container itself + ht.closeHost(ctx) + } + + if ht.isTemplate { + // Save the host as a template + if err := saveAsTemplate(ctx, ht); err != nil { + log.G(ctx).WithError(err).Error("failed to save as template") + } + } } // waitForHostExit waits for the host virtual machine to exit. Once exited diff --git a/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go b/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go index 9f60d6398c..88a467b09d 100644 --- a/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go +++ b/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go @@ -7,6 +7,7 @@ import ( "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats" + "github.com/Microsoft/hcsshim/internal/clone" "github.com/Microsoft/hcsshim/internal/cmd" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/shimdiag" @@ -170,6 +171,10 @@ func (wpst *wcowPodSandboxTask) close(ctx context.Context) { if err := wpst.host.Close(); err != nil { log.G(ctx).WithError(err).Error("failed host vm shutdown") } + // cleanup template state if any exists + if err := clone.RemoveSavedTemplateConfig(wpst.host.ID()); err != nil { + log.G(ctx).WithError(err).Error("failed to cleanup template config state for vm") + } } // Send the `init` exec exit notification always. exit := wpst.init.Status() diff --git a/internal/clone/registry.go b/internal/clone/registry.go new file mode 100644 index 0000000000..ed5ef3dce4 --- /dev/null +++ b/internal/clone/registry.go @@ -0,0 +1,168 @@ +package clone + +import ( + "bytes" + "context" + "encoding/gob" + "fmt" + + "github.com/Microsoft/hcsshim/internal/regstate" + "github.com/Microsoft/hcsshim/internal/uvm" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + configRoot = "LateClone" + configKey = "UVMConfig" + templateConfigCurrentSerialVersionID = 1 +) + +// TemplateConfig struct maintains all of the information about a template. This includes +// the information for both the template container and the template UVM. This struct is +// serialized and stored in the registry and hence is version controlled. +// Note: Update the `templateConfigCurrentSerialVersionID` when this structure definition +// is changed. 
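+// The config is serialized with encoding/gob (see encodeTemplateConfig and
+// decodeTemplateConfig below) and persisted through the regstate package under the
+// `configRoot`/`configKey` constants defined above, keyed by the template UVM ID.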
+type TemplateConfig struct {
+	SerialVersionID       uint32
+	TemplateUVMID         string
+	TemplateUVMResources  []uvm.Cloneable
+	TemplateUVMCreateOpts uvm.OptionsWCOW
+	TemplateContainerID   string
+	// Below we store the container spec for the template container so that when
+	// cloning containers we can verify that a different spec is not provided for the
+	// cloned container.
+	TemplateContainerSpec specs.Spec
+}
+
+// When encoding interfaces gob requires us to register the struct types that we will be
+// using under those interfaces. This registration needs to happen on both sides, i.e. the
+// side which encodes the data (the shim process of the template) and the side which
+// decodes the data (the shim process of the clone).
+// Go init function: https://golang.org/doc/effective_go.html#init
+func init() {
+	// Register the pointers to the structs because that is what is being stored.
+	gob.Register(&uvm.VSMBShare{})
+	gob.Register(&uvm.SCSIMount{})
+}
+
+func encodeTemplateConfig(templateConfig *TemplateConfig) ([]byte, error) {
+	var buf bytes.Buffer
+
+	encoder := gob.NewEncoder(&buf)
+	if err := encoder.Encode(templateConfig); err != nil {
+		return nil, fmt.Errorf("error while encoding template config: %s", err)
+	}
+	return buf.Bytes(), nil
+}
+
+func decodeTemplateConfig(encodedBytes []byte) (*TemplateConfig, error) {
+	var templateConfig TemplateConfig
+
+	reader := bytes.NewReader(encodedBytes)
+	decoder := gob.NewDecoder(reader)
+	if err := decoder.Decode(&templateConfig); err != nil {
+		return nil, fmt.Errorf("error while decoding template config: %s", err)
+	}
+	return &templateConfig, nil
+}
+
+// loadPersistedUVMConfig loads a persisted config from the registry that matches the given ID.
+// If not found it returns `regstate.NotFoundError`.
+func loadPersistedUVMConfig(id string) ([]byte, error) {
+	sk, err := regstate.Open(configRoot, false)
+	if err != nil {
+		return nil, err
+	}
+	defer sk.Close()
+
+	var encodedConfig []byte
+	if err := sk.Get(id, configKey, &encodedConfig); err != nil {
+		return nil, err
+	}
+	return encodedConfig, nil
+}
+
+// storePersistedUVMConfig stores the given config in the registry.
+// If the store fails it returns the store error.
+func storePersistedUVMConfig(id string, encodedConfig []byte) error {
+	sk, err := regstate.Open(configRoot, false)
+	if err != nil {
+		return err
+	}
+	defer sk.Close()
+
+	if err := sk.Create(id, configKey, encodedConfig); err != nil {
+		return err
+	}
+	return nil
+}
+
+// removePersistedUVMConfig removes any persisted state associated with this config. If the
+// config is not found in the registry `Remove` returns no error.
+func removePersistedUVMConfig(id string) error {
+	sk, err := regstate.Open(configRoot, false)
+	if err != nil {
+		if regstate.IsNotFoundError(err) {
+			return nil
+		}
+		return err
+	}
+	defer sk.Close()
+
+	if err := sk.Remove(id); err != nil {
+		if regstate.IsNotFoundError(err) {
+			return nil
+		}
+		return err
+	}
+	return nil
+}
+
+// SaveTemplateConfig saves all the information required to create a clone from the
+// template of this container into the registry.
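+// A typical caller is saveAsTemplate in cmd/containerd-shim-runhcs-v1/clone.go, which
+// builds the config from the UVM and the template container's task, roughly as follows
+// (error handling elided):
+//
+//	utc, _ := templateTask.host.GenerateTemplateConfig()
+//	tc := &clone.TemplateConfig{
+//		TemplateUVMID:         utc.UVMID,
+//		TemplateUVMResources:  utc.Resources,
+//		TemplateUVMCreateOpts: utc.CreateOpts,
+//		TemplateContainerID:   templateTask.id,
+//		TemplateContainerSpec: *templateTask.taskSpec,
+//	}
+//	_ = clone.SaveTemplateConfig(ctx, tc)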
+func SaveTemplateConfig(ctx context.Context, templateConfig *TemplateConfig) error { + _, err := loadPersistedUVMConfig(templateConfig.TemplateUVMID) + if !regstate.IsNotFoundError(err) { + return fmt.Errorf("parent VM(ID: %s) config shouldn't exit in registry (%s)", templateConfig.TemplateUVMID, err) + } + + // set the serial version before encoding + templateConfig.SerialVersionID = templateConfigCurrentSerialVersionID + + encodedBytes, err := encodeTemplateConfig(templateConfig) + if err != nil { + return fmt.Errorf("failed to encode template config: %s", err) + } + + if err := storePersistedUVMConfig(templateConfig.TemplateUVMID, encodedBytes); err != nil { + return fmt.Errorf("failed to store encoded template config: %s", err) + } + + return nil +} + +// Removes all the state associated with the template with given ID +// If there is no state associated with this ID then the function simply returns without +// doing anything. +func RemoveSavedTemplateConfig(id string) error { + return removePersistedUVMConfig(id) +} + +// Retrieves the UVMTemplateConfig for the template with given ID from the registry. +func FetchTemplateConfig(ctx context.Context, id string) (*TemplateConfig, error) { + encodedBytes, err := loadPersistedUVMConfig(id) + if err != nil { + return nil, fmt.Errorf("failed to fetch encoded template config: %s", err) + } + + templateConfig, err := decodeTemplateConfig(encodedBytes) + if err != nil { + return nil, fmt.Errorf("failed to decode template config: %s", err) + } + + if templateConfig.SerialVersionID != templateConfigCurrentSerialVersionID { + return nil, fmt.Errorf("serialized version of TemplateConfig: %d doesn't match with the current version: %d", templateConfig.SerialVersionID, templateConfigCurrentSerialVersionID) + } + + return templateConfig, nil +} diff --git a/internal/cow/cow.go b/internal/cow/cow.go index 8193315f06..89c1955131 100644 --- a/internal/cow/cow.go +++ b/internal/cow/cow.go @@ -80,4 +80,6 @@ type Container interface { // container to be terminated by some error condition (including calling // Close). Wait() error + // Modify this container + Modify(ctx context.Context, config interface{}) error } diff --git a/internal/gcs/container.go b/internal/gcs/container.go index 218d138f00..8e4ca07905 100644 --- a/internal/gcs/container.go +++ b/internal/gcs/container.go @@ -61,6 +61,23 @@ func (gc *GuestConnection) CreateContainer(ctx context.Context, cid string, conf return c, nil } +// CloneContainer just creates the wrappers and sets up notification requests for a +// container that is already running inside the UVM (after cloning). +func (gc *GuestConnection) CloneContainer(ctx context.Context, cid string) (_ *Container, err error) { + c := &Container{ + gc: gc, + id: cid, + notifyCh: make(chan struct{}), + closeCh: make(chan struct{}), + } + err = gc.requestNotify(cid, c.notifyCh) + if err != nil { + return nil, err + } + go c.waitBackground() + return c, nil +} + // OS returns the operating system of the container, "linux" or "windows". func (c *Container) OS() string { return c.gc.os diff --git a/internal/gcs/guestconnection.go b/internal/gcs/guestconnection.go index 47005acc1b..acffdee37d 100644 --- a/internal/gcs/guestconnection.go +++ b/internal/gcs/guestconnection.go @@ -57,7 +57,7 @@ type GuestConnectionConfig struct { } // Connect establishes a GCS connection. `gcc.Conn` will be closed by this function. 
-func (gcc *GuestConnectionConfig) Connect(ctx context.Context) (_ *GuestConnection, err error) { +func (gcc *GuestConnectionConfig) Connect(ctx context.Context, isColdStart bool) (_ *GuestConnection, err error) { ctx, span := trace.StartSpan(ctx, "gcs::GuestConnectionConfig::Connect") defer span.End() defer func() { oc.SetSpanStatus(span, err) }() @@ -73,7 +73,7 @@ func (gcc *GuestConnectionConfig) Connect(ctx context.Context) (_ *GuestConnecti gc.brdg.Wait() gc.clearNotifies() }() - err = gc.connect(ctx) + err = gc.connect(ctx, isColdStart) if err != nil { gc.Close() return nil, err @@ -105,7 +105,10 @@ func (gc *GuestConnection) Protocol() uint32 { } // connect establishes a GCS connection. It must not be called more than once. -func (gc *GuestConnection) connect(ctx context.Context) (err error) { +// isColdStart should be true when the UVM is being connected to for the first time post-boot. +// It should be false for subsequent connections (e.g. when connecting to a UVM that has +// been cloned). +func (gc *GuestConnection) connect(ctx context.Context, isColdStart bool) (err error) { req := negotiateProtocolRequest{ MinimumVersion: protocolVersion, MaximumVersion: protocolVersion, @@ -123,7 +126,7 @@ func (gc *GuestConnection) connect(ctx context.Context) (err error) { if gc.os == "" { gc.os = "windows" } - if resp.Capabilities.SendHostCreateMessage { + if isColdStart && resp.Capabilities.SendHostCreateMessage { createReq := containerCreate{ requestBase: makeRequest(ctx, nullContainerID), ContainerConfig: anyInString{&uvmConfig{ diff --git a/internal/gcs/guestconnection_test.go b/internal/gcs/guestconnection_test.go index a77fcfd0d9..a878bae2ff 100644 --- a/internal/gcs/guestconnection_test.go +++ b/internal/gcs/guestconnection_test.go @@ -152,7 +152,7 @@ func connectGcs(ctx context.Context, t *testing.T) *GuestConnection { Log: logrus.NewEntry(logrus.StandardLogger()), IoListen: npipeIoListen, } - gc, err := gcc.Connect(context.Background()) + gc, err := gcc.Connect(context.Background(), true) if err != nil { c.Close() t.Fatal(err) diff --git a/internal/hcsoci/clone.go b/internal/hcsoci/clone.go new file mode 100644 index 0000000000..4d26f21d0b --- /dev/null +++ b/internal/hcsoci/clone.go @@ -0,0 +1,50 @@ +// +build windows + +package hcsoci + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/cow" + "github.com/Microsoft/hcsshim/internal/requesttype" + hcsschema "github.com/Microsoft/hcsshim/internal/schema2" +) + +const ( + MappedDirectoryResourcePath = "Container/MappedDirectories" + MappedPipeResourcePath = "Container/MappedPipes" +) + +// Usually mounts specified in the container config are added in the container doc +// that is passed along with the container creation reuqest. However, for cloned containers +// we don't send any create container request so we must add the mounts one by one by +// doing Modify requests to that container. 
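+// Each mount therefore becomes a single hcsschema.ModifySettingRequest; for a mapped
+// directory the request built below looks roughly like:
+//
+//	&hcsschema.ModifySettingRequest{
+//		RequestType:  requesttype.Add,
+//		ResourcePath: MappedDirectoryResourcePath, // "Container/MappedDirectories"
+//		Settings:     md,                          // an hcsschema.MappedDirectory
+//	}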
+func addMountsToClone(ctx context.Context, c cow.Container, mounts *mountsConfig) error { + // TODO(ambarve) : Find out if there is a way to send request for all the mounts + // at the same time to save time + for _, md := range mounts.mdsv2 { + requestDocument := &hcsschema.ModifySettingRequest{ + RequestType: requesttype.Add, + ResourcePath: MappedDirectoryResourcePath, + Settings: md, + } + err := c.Modify(ctx, requestDocument) + if err != nil { + return fmt.Errorf("error while adding mapped directory (%s) to the container: %s", md.HostPath, err) + } + } + + for _, mp := range mounts.mpsv2 { + requestDocument := &hcsschema.ModifySettingRequest{ + RequestType: requesttype.Add, + ResourcePath: MappedPipeResourcePath, + Settings: mp, + } + err := c.Modify(ctx, requestDocument) + if err != nil { + return fmt.Errorf("error while adding mapped pipe (%s) to the container: %s", mp.HostPath, err) + } + } + return nil +} diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go index e425316298..f87adb1e99 100644 --- a/internal/hcsoci/create.go +++ b/internal/hcsoci/create.go @@ -11,6 +11,7 @@ import ( "strconv" "github.com/Microsoft/go-winio/pkg/guid" + "github.com/Microsoft/hcsshim/internal/clone" "github.com/Microsoft/hcsshim/internal/cow" "github.com/Microsoft/hcsshim/internal/hcs" "github.com/Microsoft/hcsshim/internal/log" @@ -65,26 +66,133 @@ type createOptionsInternal struct { actualOwner string // Owner for the container actualNetworkNamespace string ccgState *hcsschema.ContainerCredentialGuardState // Container Credential Guard information to be attached to HCS container document + isTemplate bool // Are we going to save this container as a template + templateID string // Template ID of the template from which this container is being cloned } -// CreateContainer creates a container. It can cope with a wide variety of -// scenarios, including v1 HCS schema calls, as well as more complex v2 HCS schema -// calls. Note we always return the resources that have been allocated, even in the -// case of an error. This provides support for the debugging option not to -// release the resources on failure, so that the client can make the necessary -// call to release resources that have been allocated as part of calling this function. -func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { +// compares two slices of strings and returns true if they are same, returns false otherwise. +// The elements in the slices don't have to be in the same order for them to be equal. +func cmpSlices(s1, s2 []string) bool { + equal := (len(s1) == len(s2)) + for i := 0; equal && i < len(s1); i++ { + found := false + for j := 0; !found && j < len(s2); j++ { + found = (s1[i] == s2[j]) + } + equal = equal && found + } + return equal +} + +// Compares to mount structs and returns true if they are equal, returns false otherwise. +func compareMounts(m1, m2 specs.Mount) bool { + return cmpSlices(m1.Options, m2.Options) && (m1.Source == m2.Source) && (m1.Destination == m2.Destination) && (m1.Type == m2.Type) +} + +// verifyCloneContainerSpecs compares the container creation spec provided during the template container +// creation and the spec provided during cloned container creation and checks that all the fields match +// (except for the certain fields that are allowed to be different). +func verifyCloneContainerSpecs(templateSpec, cloneSpec *specs.Spec) error { + // Following fields can be different in the template and clone specs. + // 1. 
Process + // 2. Annotations - Only the template/cloning related annotations can be different. + // 3. Windows.LayerFolders - Only the last i.e scratch layer can be different. + + if templateSpec.Version != cloneSpec.Version { + return fmt.Errorf("OCI Runtime Spec version of template (%s) doesn't match with the Spec version of clone (%s)", templateSpec.Version, cloneSpec.Version) + } + + // for annotations check that the values of memory & cpu annotations are same + if templateSpec.Annotations[oci.AnnotationContainerMemorySizeInMB] != cloneSpec.Annotations[oci.AnnotationContainerMemorySizeInMB] { + return fmt.Errorf("memory size limit for template and clone containers can not be different") + } + if templateSpec.Annotations[oci.AnnotationContainerProcessorCount] != cloneSpec.Annotations[oci.AnnotationContainerProcessorCount] { + return fmt.Errorf("processor count for template and clone containers can not be different") + } + if templateSpec.Annotations[oci.AnnotationContainerProcessorLimit] != cloneSpec.Annotations[oci.AnnotationContainerProcessorLimit] { + return fmt.Errorf("processor limit for template and clone containers can not be different") + } + + // LayerFolders should be identical except for the last element. + if !cmpSlices(templateSpec.Windows.LayerFolders[:len(templateSpec.Windows.LayerFolders)-1], cloneSpec.Windows.LayerFolders[:len(cloneSpec.Windows.LayerFolders)-1]) { + return fmt.Errorf("layers provided for template container and clone container don't match. Check the image specified in container config") + } + + if templateSpec.Windows.HyperV != cloneSpec.Windows.HyperV { + return fmt.Errorf("HyperV spec for template and clone containers can not be different") + } + + if templateSpec.Windows.Network.AllowUnqualifiedDNSQuery != cloneSpec.Windows.Network.AllowUnqualifiedDNSQuery { + return fmt.Errorf("different values for allow unqualified DNS query can not be provided for template and clones") + } + if templateSpec.Windows.Network.NetworkSharedContainerName != cloneSpec.Windows.Network.NetworkSharedContainerName { + return fmt.Errorf("different network shared name can not be provided for template and clones") + } + if !cmpSlices(templateSpec.Windows.Network.DNSSearchList, cloneSpec.Windows.Network.DNSSearchList) { + return fmt.Errorf("different DNS search list can not be provided for template and clones") + } + return nil +} + +func validateContainerConfig(ctx context.Context, coi *createOptionsInternal) error { + + if coi.HostingSystem != nil && coi.HostingSystem.IsTemplate && !coi.isTemplate { + return fmt.Errorf("only a template container can be created inside a template pod. Any other combination is not valid") + } + + if coi.HostingSystem != nil && coi.templateID != "" && !coi.HostingSystem.IsClone { + return fmt.Errorf("A container can not be cloned inside a non cloned POD") + } + + if coi.templateID != "" { + // verify that the configurations provided for the template for + // this clone are same. 
+ tc, err := clone.FetchTemplateConfig(ctx, coi.HostingSystem.TemplateID) + if err != nil { + return fmt.Errorf("config validation failed : %s", err) + } + if err := verifyCloneContainerSpecs(&tc.TemplateContainerSpec, coi.Spec); err != nil { + return err + } + } + + if coi.HostingSystem != nil && coi.HostingSystem.IsTemplate { + if len(coi.Spec.Windows.Devices) != 0 { + return fmt.Errorf("Mapped Devices are not supported for template containers") + } + + if _, ok := coi.Spec.Windows.CredentialSpec.(string); ok { + return fmt.Errorf("gMSA specifications are not supported for template containers") + } + + if coi.Spec.Windows.Servicing { + return fmt.Errorf("template containers can't be started in servicing mode") + } + + // check that no mounts are specified. + if len(coi.Spec.Mounts) > 0 { + return fmt.Errorf("user specified mounts are not permitted for template containers") + } + } + return nil +} + +func initializeCreateOptions(ctx context.Context, createOptions *CreateOptions) (*createOptionsInternal, error) { coi := &createOptionsInternal{ CreateOptions: createOptions, actualID: createOptions.ID, actualOwner: createOptions.Owner, } + if coi.Spec == nil { + return nil, fmt.Errorf("Spec must be supplied") + } + // Defaults if omitted by caller. if coi.actualID == "" { g, err := guid.NewV4() if err != nil { - return nil, nil, err + return nil, err } coi.actualID = g.String() } @@ -92,10 +200,6 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C coi.actualOwner = filepath.Base(os.Args[0]) } - if coi.Spec == nil { - return nil, nil, fmt.Errorf("Spec must be supplied") - } - if coi.HostingSystem != nil { // By definition, a hosting system can only be supplied for a v2 Xenon. coi.actualSchemaVersion = schemaversion.SchemaV21() @@ -103,10 +207,64 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C coi.actualSchemaVersion = schemaversion.DetermineSchemaVersion(coi.SchemaVersion) } + coi.isTemplate = oci.ParseAnnotationsSaveAsTemplate(ctx, createOptions.Spec) + coi.templateID = oci.ParseAnnotationsTemplateID(ctx, createOptions.Spec) + log.G(ctx).WithFields(logrus.Fields{ "options": fmt.Sprintf("%+v", createOptions), "schema": coi.actualSchemaVersion, - }).Debug("hcsshim::CreateContainer") + }).Debug("hcsshim::initializeCreateOptions") + + return coi, nil +} + +// configureSandboxNetwork creates a new network namespace for the pod (sandbox) +// if required and then adds that namespace to the pod. +func configureSandboxNetwork(ctx context.Context, coi *createOptionsInternal, r *resources.Resources) error { + if coi.NetworkNamespace != "" { + r.SetNetNS(coi.NetworkNamespace) + } else { + err := createNetworkNamespace(ctx, coi, r) + if err != nil { + return err + } + } + coi.actualNetworkNamespace = r.NetNS() + + if coi.HostingSystem != nil { + ct, _, err := oci.GetSandboxTypeAndID(coi.Spec.Annotations) + if err != nil { + return err + } + // Only add the network namespace to a standalone or sandbox + // container but not a workload container in a sandbox that inherits + // the namespace. + if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox { + if err = SetupNetworkNamespace(ctx, coi.HostingSystem, coi.actualNetworkNamespace); err != nil { + return err + } + r.SetAddedNetNSToVM(true) + } + } + + return nil +} + +// CreateContainer creates a container. It can cope with a wide variety of +// scenarios, including v1 HCS schema calls, as well as more complex v2 HCS schema +// calls. 
Note we always return the resources that have been allocated, even in the +// case of an error. This provides support for the debugging option not to +// release the resources on failure, so that the client can make the necessary +// call to release resources that have been allocated as part of calling this function. +func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { + coi, err := initializeCreateOptions(ctx, createOptions) + if err != nil { + return nil, nil, err + } + + if err := validateContainerConfig(ctx, coi); err != nil { + return nil, nil, fmt.Errorf("container config validation failed: %s", err) + } r := resources.NewContainerResources(createOptions.ID) defer func() { @@ -130,41 +288,10 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C if coi.Spec.Windows != nil && coi.Spec.Windows.Network != nil && schemaversion.IsV21(coi.actualSchemaVersion) { + err = configureSandboxNetwork(ctx, coi, r) + if err != nil { + return nil, r, fmt.Errorf("failure while creating namespace for container: %s", err) - if coi.NetworkNamespace != "" { - r.SetNetNS(coi.NetworkNamespace) - } else { - err := createNetworkNamespace(ctx, coi, r) - if err != nil { - return nil, r, err - } - } - coi.actualNetworkNamespace = r.NetNS() - if coi.HostingSystem != nil { - ct, _, err := oci.GetSandboxTypeAndID(coi.Spec.Annotations) - if err != nil { - return nil, r, err - } - // Only add the network namespace to a standalone or sandbox - // container but not a workload container in a sandbox that inherits - // the namespace. - if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox { - endpoints, err := GetNamespaceEndpoints(ctx, coi.actualNetworkNamespace) - if err != nil { - return nil, r, err - } - err = coi.HostingSystem.AddNetNS(ctx, coi.actualNetworkNamespace) - if err != nil { - return nil, r, err - } - err = coi.HostingSystem.AddEndpointsToNS(ctx, coi.actualNetworkNamespace, endpoints) - if err != nil { - // Best effort clean up the NS - coi.HostingSystem.RemoveNetNS(ctx, coi.actualNetworkNamespace) - return nil, r, err - } - r.SetAddedNetNSToVM(true) - } } } @@ -234,6 +361,64 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C return system, r, nil } +// CloneContainer is similar to CreateContainer but it does not add layers or namespace like +// CreateContainer does. Also, instead of sending create container request it sends a modify +// request to an existing container. CloneContainer only works for WCOW. 
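+// The shim invokes this from newClonedHcsTask with the template container's ID, since
+// that is the ID the container already has inside the cloned UVM, roughly:
+//
+//	opts := hcsoci.CreateOptions{
+//		ID:               templateID,
+//		Owner:            owner,
+//		Spec:             s,
+//		HostingSystem:    parent,
+//		NetworkNamespace: netNS,
+//	}
+//	system, resources, err := hcsoci.CloneContainer(ctx, &opts)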
+func CloneContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { + coi, err := initializeCreateOptions(ctx, createOptions) + if err != nil { + return nil, nil, err + } + + if err := validateContainerConfig(ctx, coi); err != nil { + return nil, nil, err + } + + if coi.Spec.Windows == nil || coi.HostingSystem == nil { + return nil, nil, fmt.Errorf("CloneContainer is only supported for Hyper-v isolated WCOW ") + } + + r := resources.NewContainerResources(createOptions.ID) + defer func() { + if err != nil { + if !coi.DoNotReleaseResourcesOnFailure { + resources.ReleaseResources(ctx, r, coi.HostingSystem, true) + } + } + }() + + if coi.HostingSystem != nil { + n := coi.HostingSystem.ContainerCounter() + if coi.Spec.Linux != nil { + r.SetContainerRootInUVM(fmt.Sprintf(lcowRootInUVM, createOptions.ID)) + } else { + r.SetContainerRootInUVM(fmt.Sprintf(wcowRootInUVM, strconv.FormatUint(n, 16))) + } + } + + if err = setupMounts(ctx, coi, r); err != nil { + return nil, r, err + } + + mounts, err := createMountsConfig(ctx, coi) + if err != nil { + return nil, r, err + } + + c, err := coi.HostingSystem.CloneContainer(ctx, coi.actualID) + if err != nil { + return nil, r, err + } + + // Everything that is usually added to the container during the createContainer + // request (via the gcsDocument) must be hot added here. + if err := addMountsToClone(ctx, c, mounts); err != nil { + return nil, r, err + } + + return c, r, nil +} + // isV2Xenon returns true if the create options are for a HCS schema V2 xenon container // with a hosting VM func (coi *createOptionsInternal) isV2Xenon() bool { diff --git a/internal/hcsoci/hcsdoc_wcow.go b/internal/hcsoci/hcsdoc_wcow.go index 4e3a06852f..a89992f256 100644 --- a/internal/hcsoci/hcsdoc_wcow.go +++ b/internal/hcsoci/hcsdoc_wcow.go @@ -23,6 +23,60 @@ import ( "github.com/sirupsen/logrus" ) +// A simple wrapper struct around the container mount configs that should be added to the +// container. +type mountsConfig struct { + mdsv1 []schema1.MappedDir + mpsv1 []schema1.MappedPipe + mdsv2 []hcsschema.MappedDirectory + mpsv2 []hcsschema.MappedPipe +} + +func createMountsConfig(ctx context.Context, coi *createOptionsInternal) (*mountsConfig, error) { + // Add the mounts as mapped directories or mapped pipes + // TODO: Mapped pipes to add in v2 schema. + var config mountsConfig + for _, mount := range coi.Spec.Mounts { + if mount.Type != "" { + return nil, fmt.Errorf("invalid container spec - Mount.Type '%s' must not be set", mount.Type) + } + if uvm.IsPipe(mount.Source) { + src, dst := uvm.GetContainerPipeMapping(coi.HostingSystem, mount) + config.mpsv1 = append(config.mpsv1, schema1.MappedPipe{HostPath: src, ContainerPipeName: dst}) + config.mpsv2 = append(config.mpsv2, hcsschema.MappedPipe{HostPath: src, ContainerPipeName: dst}) + } else { + readOnly := false + for _, o := range mount.Options { + if strings.ToLower(o) == "ro" { + readOnly = true + } + } + mdv1 := schema1.MappedDir{HostPath: mount.Source, ContainerPath: mount.Destination, ReadOnly: readOnly} + mdv2 := hcsschema.MappedDirectory{ContainerPath: mount.Destination, ReadOnly: readOnly} + if coi.HostingSystem == nil { + mdv2.HostPath = mount.Source + } else { + uvmPath, err := coi.HostingSystem.GetVSMBUvmPath(ctx, mount.Source, readOnly) + if err != nil { + if err == uvm.ErrNotAttached { + // It could also be a scsi mount. 
+ uvmPath, err = coi.HostingSystem.GetScsiUvmPath(ctx, mount.Source) + if err != nil { + return nil, err + } + } else { + return nil, err + } + } + mdv2.HostPath = uvmPath + } + config.mdsv1 = append(config.mdsv1, mdv1) + config.mdsv2 = append(config.mdsv2, mdv2) + } + } + return &config, nil +} + // createWindowsContainerDocument creates documents for passing to HCS or GCS to create // a container, both hosted and process isolated. It creates both v1 and v2 // container objects, WCOW only. The containers storage should have been mounted already. @@ -182,7 +236,14 @@ func createWindowsContainerDocument(ctx context.Context, coi *createOptionsInter v2Container.Networking = &hcsschema.Networking{} v1.EndpointList = coi.Spec.Windows.Network.EndpointList - v2Container.Networking.Namespace = coi.actualNetworkNamespace + + // Use the reserved network namespace for containers created inside + // cloned or template UVMs. + if coi.HostingSystem != nil && (coi.HostingSystem.IsTemplate || coi.HostingSystem.IsClone) { + v2Container.Networking.Namespace = uvm.DEFAULT_CLONE_NETWORK_NAMESPACE_ID + } else { + v2Container.Networking.Namespace = coi.actualNetworkNamespace + } v1.AllowUnqualifiedDNSQuery = coi.Spec.Windows.Network.AllowUnqualifiedDNSQuery v2Container.Networking.AllowUnqualifiedDnsQuery = v1.AllowUnqualifiedDNSQuery @@ -268,60 +329,17 @@ func createWindowsContainerDocument(ctx context.Context, coi *createOptionsInter } } - // Add the mounts as mapped directories or mapped pipes - // TODO: Mapped pipes to add in v2 schema. - var ( - mdsv1 []schema1.MappedDir - mpsv1 []schema1.MappedPipe - mdsv2 []hcsschema.MappedDirectory - mpsv2 []hcsschema.MappedPipe - ) - for _, mount := range coi.Spec.Mounts { - if mount.Type != "" { - return nil, nil, fmt.Errorf("invalid container spec - Mount.Type '%s' must not be set", mount.Type) - } - if uvm.IsPipe(mount.Source) { - src, dst := uvm.GetContainerPipeMapping(coi.HostingSystem, mount) - mpsv1 = append(mpsv1, schema1.MappedPipe{HostPath: src, ContainerPipeName: dst}) - mpsv2 = append(mpsv2, hcsschema.MappedPipe{HostPath: src, ContainerPipeName: dst}) - } else { - readOnly := false - for _, o := range mount.Options { - if strings.ToLower(o) == "ro" { - readOnly = true - } - } - mdv1 := schema1.MappedDir{HostPath: mount.Source, ContainerPath: mount.Destination, ReadOnly: readOnly} - mdv2 := hcsschema.MappedDirectory{ContainerPath: mount.Destination, ReadOnly: readOnly} - if coi.HostingSystem == nil { - mdv2.HostPath = mount.Source - } else { - uvmPath, err := coi.HostingSystem.GetVSMBUvmPath(ctx, mount.Source, readOnly) - if err != nil { - if err == uvm.ErrNotAttached { - // It could also be a scsi mount. 
- uvmPath, err = coi.HostingSystem.GetScsiUvmPath(ctx, mount.Source) - if err != nil { - return nil, nil, err - } - } else { - return nil, nil, err - } - } - mdv2.HostPath = uvmPath - } - mdsv1 = append(mdsv1, mdv1) - mdsv2 = append(mdsv2, mdv2) - } + mounts, err := createMountsConfig(ctx, coi) + if err != nil { + return nil, nil, err } - - v1.MappedDirectories = mdsv1 - v2Container.MappedDirectories = mdsv2 - if len(mpsv1) > 0 && osversion.Get().Build < osversion.RS3 { + v1.MappedDirectories = mounts.mdsv1 + v2Container.MappedDirectories = mounts.mdsv2 + if len(mounts.mpsv1) > 0 && osversion.Get().Build < osversion.RS3 { return nil, nil, fmt.Errorf("named pipe mounts are not supported on this version of Windows") } - v1.MappedPipes = mpsv1 - v2Container.MappedPipes = mpsv2 + v1.MappedPipes = mounts.mpsv1 + v2Container.MappedPipes = mounts.mpsv2 // add assigned devices to the container definition if err := parseAssignedDevices(ctx, coi, v2Container); err != nil { diff --git a/internal/hcsoci/network.go b/internal/hcsoci/network.go index d33bd6b838..dd0ddca374 100644 --- a/internal/hcsoci/network.go +++ b/internal/hcsoci/network.go @@ -3,6 +3,7 @@ package hcsoci import ( "context" + "github.com/Microsoft/hcsshim/hcn" "github.com/Microsoft/hcsshim/internal/hns" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" @@ -68,3 +69,64 @@ func GetNamespaceEndpoints(ctx context.Context, netNS string) ([]*hns.HNSEndpoin } return endpoints, nil } + +// Network namespace setup is a bit different for templates and clones. +// For templates and clones we use a special network namespace ID. +// Details about this can be found in the Networking section of the late-clone wiki page. +// +// In this function we take the namespace ID of the namespace that was created for this +// UVM. We hot add the namespace (with the default ID if this is a template). We get the +// endpoints associated with this namespace and then hot add those endpoints (by changing +// their namespace IDs by the deafult IDs if it is a template). +func SetupNetworkNamespace(ctx context.Context, hostingSystem *uvm.UtilityVM, nsid string) error { + nsidInsideUVM := nsid + if hostingSystem.IsTemplate || hostingSystem.IsClone { + nsidInsideUVM = uvm.DEFAULT_CLONE_NETWORK_NAMESPACE_ID + } + + // Query endpoints with actual nsid + endpoints, err := GetNamespaceEndpoints(ctx, nsid) + if err != nil { + return err + } + + // Add the network namespace inside the UVM if it is not a clone. (Clones will + // inherit the namespace from template) + if !hostingSystem.IsClone { + // Get the namespace struct from the actual nsid. + hcnNamespace, err := hcn.GetNamespaceByID(nsid) + if err != nil { + return err + } + + // All templates should have a special NSID so that it + // will be easier to debug. Override it here. + if hostingSystem.IsTemplate { + hcnNamespace.Id = nsidInsideUVM + } + + if err = hostingSystem.AddNetNS(ctx, hcnNamespace); err != nil { + return err + } + } + + // If adding a network endpoint to clones or a template override nsid associated + // with it. 
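+	// Note that only the namespace recorded on each endpoint is rewritten here; the
+	// endpoints themselves were queried from HNS under the original nsid above.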
+ if hostingSystem.IsClone || hostingSystem.IsTemplate { + // replace nsid for each endpoint + for _, ep := range endpoints { + ep.Namespace = &hns.Namespace{ + ID: nsidInsideUVM, + } + } + } + + if err = hostingSystem.AddEndpointsToNS(ctx, nsidInsideUVM, endpoints); err != nil { + // Best effort clean up the NS + if removeErr := hostingSystem.RemoveNetNS(ctx, nsidInsideUVM); removeErr != nil { + log.G(ctx).Warn(removeErr) + } + return err + } + return nil +} diff --git a/internal/hcsoci/resources_wcow.go b/internal/hcsoci/resources_wcow.go index 9ace171729..0ef3ea5482 100644 --- a/internal/hcsoci/resources_wcow.go +++ b/internal/hcsoci/resources_wcow.go @@ -60,6 +60,53 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r r.SetLayers(layers) } + if err := setupMounts(ctx, coi, r); err != nil { + return err + } + + if cs, ok := coi.Spec.Windows.CredentialSpec.(string); ok { + // Only need to create a CCG instance for v2 containers + if schemaversion.IsV21(coi.actualSchemaVersion) { + hypervisorIsolated := coi.HostingSystem != nil + ccgInstance, ccgResource, err := credentials.CreateCredentialGuard(ctx, coi.actualID, cs, hypervisorIsolated) + if err != nil { + return err + } + coi.ccgState = ccgInstance.CredentialGuard + r.Add(ccgResource) + if hypervisorIsolated { + // If hypervisor isolated we need to add an hvsocket service table entry + // By default HVSocket won't allow something inside the VM to connect + // back to a process on the host. We need to update the HVSocket service table + // to allow a connection to CCG.exe on the host, so that GMSA can function. + // We need to hot add this here because at UVM creation time we don't know what containers + // will be launched in the UVM, nonetheless if they will ask for GMSA. This is a workaround + // for the previous design requirement for CCG V2 where the service entry + // must be present in the UVM'S HCS document before being sent over as hot adding + // an HvSocket service was not possible. + hvSockConfig := ccgInstance.HvSocketConfig + if err := coi.HostingSystem.UpdateHvSocketService(ctx, hvSockConfig.ServiceId, hvSockConfig.ServiceConfig); err != nil { + return fmt.Errorf("failed to update hvsocket service: %s", err) + } + } + } + } + + if coi.HostingSystem != nil && coi.hasWindowsAssignedDevices() { + windowsDevices, closers, err := handleAssignedDevicesWindows(ctx, coi.HostingSystem, coi.Spec.Annotations, coi.Spec.Windows.Devices) + if err != nil { + return err + } + r.Add(closers...) + coi.Spec.Windows.Devices = windowsDevices + } + + return nil +} + +// setupMount adds the custom mounts requested in the container configuration of this +// request. +func setupMounts(ctx context.Context, coi *createOptionsInternal, r *resources.Resources) error { // Validate each of the mounts. If this is a V2 Xenon, we have to add them as // VSMB shares to the utility VM. For V1 Xenon and Argons, there's nothing for // us to do as it's done by HCS. 
@@ -121,42 +168,5 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r } } - if cs, ok := coi.Spec.Windows.CredentialSpec.(string); ok { - // Only need to create a CCG instance for v2 containers - if schemaversion.IsV21(coi.actualSchemaVersion) { - hypervisorIsolated := coi.HostingSystem != nil - ccgInstance, ccgResource, err := credentials.CreateCredentialGuard(ctx, coi.actualID, cs, hypervisorIsolated) - if err != nil { - return err - } - coi.ccgState = ccgInstance.CredentialGuard - r.Add(ccgResource) - if hypervisorIsolated { - // If hypervisor isolated we need to add an hvsocket service table entry - // By default HVSocket won't allow something inside the VM to connect - // back to a process on the host. We need to update the HVSocket service table - // to allow a connection to CCG.exe on the host, so that GMSA can function. - // We need to hot add this here because at UVM creation time we don't know what containers - // will be launched in the UVM, nonetheless if they will ask for GMSA. This is a workaround - // for the previous design requirement for CCG V2 where the service entry - // must be present in the UVM'S HCS document before being sent over as hot adding - // an HvSocket service was not possible. - hvSockConfig := ccgInstance.HvSocketConfig - if err := coi.HostingSystem.UpdateHvSocketService(ctx, hvSockConfig.ServiceId, hvSockConfig.ServiceConfig); err != nil { - return fmt.Errorf("failed to update hvsocket service: %s", err) - } - } - } - } - - if coi.HostingSystem != nil && coi.hasWindowsAssignedDevices() { - windowsDevices, closers, err := handleAssignedDevicesWindows(ctx, coi.HostingSystem, coi.Spec.Annotations, coi.Spec.Windows.Devices) - if err != nil { - return err - } - r.Add(closers...) - coi.Spec.Windows.Devices = windowsDevices - } - return nil } diff --git a/internal/layers/layers.go b/internal/layers/layers.go index fb5429bcc0..01d8529cc6 100644 --- a/internal/layers/layers.go +++ b/internal/layers/layers.go @@ -125,6 +125,9 @@ func MountContainerLayers(ctx context.Context, layerFolders []string, guestRoot if uvm.OS() == "windows" { options := uvm.DefaultVSMBOptions(true) options.TakeBackupPrivilege = true + if uvm.IsTemplate { + uvm.SetSaveableVSMBOptions(options, options.ReadOnly) + } if _, err := uvm.AddVSMB(ctx, layerPath, options); err != nil { return "", fmt.Errorf("failed to add VSMB layer: %s", err) } diff --git a/internal/oci/uvm.go b/internal/oci/uvm.go index 9c8003cd25..3d57222804 100644 --- a/internal/oci/uvm.go +++ b/internal/oci/uvm.go @@ -3,10 +3,12 @@ package oci import ( "context" "errors" + "fmt" "strconv" "strings" runhcsopts "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" + "github.com/Microsoft/hcsshim/internal/clone" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" "github.com/Microsoft/hcsshim/internal/uvm" @@ -138,6 +140,21 @@ const ( // annotation used to specify the cpugroup ID that a UVM should be assigned to annotationCPUGroupID = "io.microsoft.virtualmachine.cpugroup.id" + + // SaveAsTemplate annotation must be used with a pod & container creation request. + // If this annotation is present in the request then it will save the UVM (pod) + // and the container(s) inside it as a template. However, this also means that this + // pod and the containers inside this pod will permananetly stay in the + // paused/templated state and can not be resumed again. 
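+	// For example, a client requesting a template sets (illustrative value):
+	//	"io.microsoft.virtualmachine.saveastemplate": "true"
+	// while a clone request instead sets annotationTemplateID (below) to the ID of the
+	// templated pod or container. The two annotations are mutually exclusive, see
+	// ParseCloneAnnotations.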
+	annotationSaveAsTemplate = "io.microsoft.virtualmachine.saveastemplate"
+
+	// This annotation should be used when creating a pod or a container from a template.
+	// When creating a pod from a template use the ID of the templated pod as the
+	// TemplateID and when creating a container use the ID of the templated container as
+	// the TemplateID. It is the client's responsibility to make sure that the sandbox
+	// within which a cloned container is created was also created from the same template.
+	annotationTemplateID = "io.microsoft.virtualmachine.templateid"
 )
 
 // parseAnnotationsBool searches `a` for `key` and if found verifies that the
@@ -328,6 +345,32 @@ func parseAnnotationsString(a map[string]string, key string, def string) string
 	return def
 }
 
+// ParseAnnotationsSaveAsTemplate searches for the boolean value which specifies
+// if this create request should be considered as a template creation request. If the
+// value is found it is returned, otherwise false is returned.
+func ParseAnnotationsSaveAsTemplate(ctx context.Context, s *specs.Spec) bool {
+	return parseAnnotationsBool(ctx, s.Annotations, annotationSaveAsTemplate, false)
+}
+
+// ParseAnnotationsTemplateID searches for the templateID in the create request. If the
+// value is found it is returned, otherwise the empty string is returned.
+func ParseAnnotationsTemplateID(ctx context.Context, s *specs.Spec) string {
+	return parseAnnotationsString(s.Annotations, annotationTemplateID, "")
+}
+
+// ParseCloneAnnotations parses the template- and clone-related annotations of the given
+// spec and returns whether this request asks to save the result as a template and, for a
+// clone request, the ID of the template to clone from. The two options are mutually
+// exclusive and are only supported for WCOW.
+func ParseCloneAnnotations(ctx context.Context, s *specs.Spec) (isTemplate bool, templateID string, err error) {
+	templateID = ParseAnnotationsTemplateID(ctx, s)
+	isTemplate = ParseAnnotationsSaveAsTemplate(ctx, s)
+	if templateID != "" && isTemplate {
+		return false, "", fmt.Errorf("templateID and save as template flags can not be passed in the same request")
+	}
+
+	if (isTemplate || templateID != "") && !IsWCOW(s) {
+		return false, "", fmt.Errorf("save as template and creating clones is only available for WCOW")
+	}
+	return
+}
+
 // handleAnnotationKernelDirectBoot handles parsing annotationKernelDirectBoot and setting
 // implied annotations from the result.
 func handleAnnotationKernelDirectBoot(ctx context.Context, a map[string]string, lopts *uvm.OptionsLCOW) {
@@ -369,6 +412,26 @@ func handleAnnotationFullyPhysicallyBacked(ctx context.Context, a map[string]str
 	}
 }
 
+// handleCloneAnnotations handles parsing annotations related to template creation and cloning.
+// Since late cloning is only supported for WCOW this function only deals with WCOW options.
+func handleCloneAnnotations(ctx context.Context, a map[string]string, wopts *uvm.OptionsWCOW) (err error) {
+	wopts.IsTemplate = parseAnnotationsBool(ctx, a, annotationSaveAsTemplate, false)
+	templateID := parseAnnotationsString(a, annotationTemplateID, "")
+	if templateID != "" {
+		tc, err := clone.FetchTemplateConfig(ctx, templateID)
+		if err != nil {
+			return err
+		}
+		wopts.TemplateConfig = &uvm.UVMTemplateConfig{
+			UVMID:      tc.TemplateUVMID,
+			CreateOpts: tc.TemplateUVMCreateOpts,
+			Resources:  tc.TemplateUVMResources,
+		}
+		wopts.IsClone = true
+	}
+	return nil
+}
+
 // SpecToUVMCreateOpts parses `s` and returns either `*uvm.OptionsLCOW` or
 // `*uvm.OptionsWCOW`.
func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) (interface{}, error) { @@ -419,6 +482,9 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( wopts.DisableCompartmentNamespace = parseAnnotationsBool(ctx, s.Annotations, annotationDisableCompartmentNamespace, wopts.DisableCompartmentNamespace) wopts.CPUGroupID = parseAnnotationsString(s.Annotations, annotationCPUGroupID, wopts.CPUGroupID) handleAnnotationFullyPhysicallyBacked(ctx, s.Annotations, wopts) + if err := handleCloneAnnotations(ctx, s.Annotations, wopts); err != nil { + return nil, err + } return wopts, nil } return nil, errors.New("cannot create UVM opts spec is not LCOW or WCOW") diff --git a/internal/uvm/clone.go b/internal/uvm/clone.go new file mode 100644 index 0000000000..5a7977b482 --- /dev/null +++ b/internal/uvm/clone.go @@ -0,0 +1,139 @@ +package uvm + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/cow" + hcsschema "github.com/Microsoft/hcsshim/internal/schema2" + "github.com/pkg/errors" +) + +const ( + hcsComputeSystemSaveType = "AsTemplate" + // default namespace ID used for all template and clone VMs. + DEFAULT_CLONE_NETWORK_NAMESPACE_ID = "89EB8A86-E253-41FD-9800-E6D88EB2E18A" +) + +// Cloneable is a generic interface for cloning a specific resource. Not all resources can +// be cloned and so all resources might not implement this interface. This interface is +// mainly used during late cloning process to clone the resources associated with the UVM +// and the container. For some resources (like scratch VHDs of the UVM & container) +// cloning means actually creating a copy of that resource while for some resources it +// simply means adding that resource to the cloned VM without copying (like VSMB shares). +// The Clone function of that resource will deal with these details. +type Cloneable interface { + // A resource that supports cloning should also support serialization and + // deserialization operations. This is because during resource cloning a resource + // is usually serialized in one process and then deserialized and cloned in some + // other process. Care should be taken while serializing a resource to not include + // any state that will not be valid during the deserialization step. By default + // gob encoding is used to serialize and deserialize resources but a resource can + // implement `gob.GobEncoder` & `gob.GobDecoder` interfaces to provide its own + // serialization and deserialization functions. + + // A SerialVersionID is an identifier used to recognize a unique version of a + // resource. Every time the definition of the resource struct changes this ID is + // bumped up. This ID is used to ensure that we serialize and deserialize the + // same version of a resource. + GetSerialVersionID() uint32 + + // Clone function creates a clone of the resource on the UVM `vm` (i.e adds the + // cloned resource to the `vm`) + // `cd` parameter can be used to pass any other data that is required during the + // cloning process of that resource (for example, when cloning SCSI Mounts we + // might need scratchFolder). + // Clone function should be called on a valid struct (Mostly on the struct which + // is deserialized, and so Clone function should only depend on the fields that + // are exported in the struct). 
+ // The implementation of the clone function should avoid reading any data from the + // `vm` struct, it can add new fields to the vm struct but since the vm struct + // isn't fully ready at this point it shouldn't be used to read any data. + Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error +} + +// A struct to keep all the information that might be required during cloning process of +// a resource. +type cloneData struct { + // doc spec for the clone + doc *hcsschema.ComputeSystem + // scratchFolder of the clone + scratchFolder string + // UVMID of the clone + uvmID string +} + +// UVMTemplateConfig is just a wrapper struct that keeps together all the resources that +// need to be saved to create a template. +type UVMTemplateConfig struct { + // ID of the template vm + UVMID string + // Array of all resources that will be required while making a clone from this template + Resources []Cloneable + // The OptionsWCOW used for template uvm creation + CreateOpts OptionsWCOW +} + +// Captures all the information that is necessary to properly save this UVM as a template +// and create clones from this template later. The struct returned by this method must be +// later on made available while creating a clone from this template. +func (uvm *UtilityVM) GenerateTemplateConfig() (*UVMTemplateConfig, error) { + if _, ok := uvm.createOpts.(OptionsWCOW); !ok { + return nil, fmt.Errorf("template config can only be created for a WCOW uvm") + } + + // Add all the SCSI Mounts and VSMB shares into the list of clones + templateConfig := &UVMTemplateConfig{ + UVMID: uvm.ID(), + CreateOpts: uvm.createOpts.(OptionsWCOW), + } + + for _, vsmbShare := range uvm.vsmbDirShares { + templateConfig.Resources = append(templateConfig.Resources, vsmbShare) + } + + for _, vsmbShare := range uvm.vsmbFileShares { + templateConfig.Resources = append(templateConfig.Resources, vsmbShare) + } + + for _, location := range uvm.scsiLocations { + for _, scsiMount := range location { + if scsiMount != nil { + templateConfig.Resources = append(templateConfig.Resources, scsiMount) + } + } + } + + return templateConfig, nil +} + +// Pauses the uvm and then saves it as a template. This uvm can not be restarted or used +// after it is successfully saved. +// uvm must be in the paused state before it can be saved as a template.save call will throw +// an incorrect uvm state exception if uvm is not in the paused state at the time of saving. 
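To make the Cloneable contract above concrete, here is a minimal in-package sketch of a hypothetical resource implementing it. The `pipeMount` type and its behaviour are invented for illustration and are not part of this change; real implementations (SCSI mounts, VSMB shares) appear later in the patch.

```go
package uvm

import (
	"bytes"
	"context"
	"encoding/gob"
	"fmt"
)

const pipeMountCurrentSerialVersionID uint32 = 1

// pipeMount is a made-up resource that only needs to be re-added to the clone's
// config document; nothing has to be copied on disk for it.
type pipeMount struct {
	HostPath        string
	serialVersionID uint32
}

var _ Cloneable = &pipeMount{}

func (p *pipeMount) GetSerialVersionID() uint32 { return pipeMountCurrentSerialVersionID }

// Clone only records the resource against the clone in this sketch; a real resource
// would also mutate cd.doc (see the SCSI and VSMB implementations in this change).
func (p *pipeMount) Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error {
	_ = vm
	_ = cd
	return nil
}

// GobEncode serializes only the exported, re-creatable state, version ID first.
func (p *pipeMount) GobEncode() ([]byte, error) {
	var buf bytes.Buffer
	enc := gob.NewEncoder(&buf)
	if err := enc.Encode(pipeMountCurrentSerialVersionID); err != nil {
		return nil, err
	}
	if err := enc.Encode(p.HostPath); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}

// GobDecode decodes in the same order and rejects mismatched versions.
func (p *pipeMount) GobDecode(data []byte) error {
	dec := gob.NewDecoder(bytes.NewBuffer(data))
	if err := dec.Decode(&p.serialVersionID); err != nil {
		return err
	}
	if p.serialVersionID != pipeMountCurrentSerialVersionID {
		return fmt.Errorf("pipeMount serial version %d does not match current version %d", p.serialVersionID, pipeMountCurrentSerialVersionID)
	}
	return dec.Decode(&p.HostPath)
}
```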
+func (uvm *UtilityVM) SaveAsTemplate(ctx context.Context) error { + if err := uvm.hcsSystem.Pause(ctx); err != nil { + return errors.Wrap(err, "error pausing the VM") + } + + saveOptions := hcsschema.SaveOptions{ + SaveType: hcsComputeSystemSaveType, + } + if err := uvm.hcsSystem.Save(ctx, saveOptions); err != nil { + return errors.Wrap(err, "error saving the VM") + } + return nil +} + +// CloneContainer attaches back to a container that is already running inside the UVM +// because of the clone +func (uvm *UtilityVM) CloneContainer(ctx context.Context, id string) (cow.Container, error) { + if uvm.gc == nil { + return nil, fmt.Errorf("clone container cannot work without external GCS connection") + } + c, err := uvm.gc.CloneContainer(ctx, id) + if err != nil { + return nil, fmt.Errorf("failed to clone container %s: %s", id, err) + } + return c, nil +} diff --git a/internal/uvm/create.go b/internal/uvm/create.go index 1290ffcd4d..47eda3cca9 100644 --- a/internal/uvm/create.go +++ b/internal/uvm/create.go @@ -87,6 +87,37 @@ type Options struct { CPUGroupID string } +// compares the create opts used during template creation with the create opts +// provided for clone creation. If they don't match (except for a few fields) +// then clone creation is failed. +func verifyCloneUvmCreateOpts(templateOpts, cloneOpts *OptionsWCOW) bool { + // Following fields can be different in the template and clone configurations. + // 1. the scratch layer path. i.e the last element of the LayerFolders path. + // 2. IsTemplate, IsClone and TemplateConfig variables. + // 3. ID + // 4. AdditionalHCSDocumentJSON + + // Save the original values of the fields that we want to ignore and replace them with + // the same values as that of the other object. So that we can simply use `==` operator. + templateIDBackup := templateOpts.ID + templateAdditionalJsonBackup := templateOpts.AdditionHCSDocumentJSON + templateOpts.ID = cloneOpts.ID + templateOpts.AdditionHCSDocumentJSON = cloneOpts.AdditionHCSDocumentJSON + + // We can't use `==` operator on structs which include slices in them. So compare the + // Layerfolders separately and then directly compare the Options struct. + result := (len(templateOpts.LayerFolders) == len(cloneOpts.LayerFolders)) + for i := 0; result && i < len(templateOpts.LayerFolders)-1; i++ { + result = result && (templateOpts.LayerFolders[i] == cloneOpts.LayerFolders[i]) + } + result = result && (*templateOpts.Options == *cloneOpts.Options) + + // set original values + templateOpts.ID = templateIDBackup + templateOpts.AdditionHCSDocumentJSON = templateAdditionalJsonBackup + return result +} + // Verifies that the final UVM options are correct and supported. 
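A test-style sketch of how verifyCloneUvmCreateOpts above is expected to behave, i.e. which fields may legitimately differ between the template and the clone. It assumes NewDefaultOptionsWCOW fills identical defaults for both sides and that Options has a MemorySizeInMB field; the layer paths are invented. This is illustrative only and not part of the change.

```go
package uvm

import "testing"

func TestVerifyCloneUvmCreateOptsSketch(t *testing.T) {
	templateOpts := NewDefaultOptionsWCOW("template-id", "test-owner")
	templateOpts.LayerFolders = []string{`C:\layers\base`, `C:\layers\template-scratch`}

	// The clone may use a different ID and a different scratch (last) layer folder.
	cloneOpts := NewDefaultOptionsWCOW("clone-id", "test-owner")
	cloneOpts.LayerFolders = []string{`C:\layers\base`, `C:\layers\clone-scratch`}

	if !verifyCloneUvmCreateOpts(templateOpts, cloneOpts) {
		t.Fatal("expected configurations differing only in ID and scratch layer to match")
	}

	// Any other divergence, e.g. the memory size, should fail verification.
	cloneOpts.MemorySizeInMB = templateOpts.MemorySizeInMB + 512
	if verifyCloneUvmCreateOpts(templateOpts, cloneOpts) {
		t.Fatal("expected configurations with different memory sizes to be rejected")
	}
}
```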
func verifyOptions(ctx context.Context, options interface{}) error { switch opts := options.(type) { @@ -123,6 +154,18 @@ func verifyOptions(ctx context.Context, options interface{}) error { if len(opts.LayerFolders) < 2 { return errors.New("at least 2 LayerFolders must be supplied") } + if opts.IsClone && !verifyCloneUvmCreateOpts(&opts.TemplateConfig.CreateOpts, opts) { + return errors.New("clone configuration doesn't match with template configuration.") + } + if opts.IsClone && opts.TemplateConfig == nil { + return errors.New("template config can not be nil when creating clone") + } + if opts.IsClone && !opts.ExternalGuestConnection { + return errors.New("External gcs connection can not be disabled for clones") + } + if opts.IsTemplate && opts.FullyPhysicallyBacked { + return errors.New("Template can not be created from a full physically backed UVM") + } } return nil } @@ -206,11 +249,9 @@ func (uvm *UtilityVM) Close() (err error) { uvm.hcsSystem.Terminate(ctx) uvm.Wait() } - if uvm.gc != nil { - uvm.gc.Close() - } - if uvm.gcListener != nil { - uvm.gcListener.Close() + + if err := uvm.CloseGCSConnection(); err != nil { + log.G(ctx).Errorf("close GCS connection failed: %s", err) } // outputListener will only be nil for a Create -> Stop without a Start. In @@ -333,3 +374,15 @@ func (uvm *UtilityVM) normalizeMemorySize(ctx context.Context, requested uint64) func (uvm *UtilityVM) DevicesPhysicallyBacked() bool { return uvm.devicesPhysicallyBacked } + +// Closes the external GCS connection if it is being used and also closes the +// listener for GCS connection. +func (uvm *UtilityVM) CloseGCSConnection() (err error) { + if uvm.gc != nil { + err = uvm.gc.Close() + } + if uvm.gcListener != nil { + err = uvm.gcListener.Close() + } + return +} diff --git a/internal/uvm/create_lcow.go b/internal/uvm/create_lcow.go index 1ce25e01f7..1b3308e0ba 100644 --- a/internal/uvm/create_lcow.go +++ b/internal/uvm/create_lcow.go @@ -168,6 +168,7 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error physicallyBacked: !opts.AllowOvercommit, devicesPhysicallyBacked: opts.FullyPhysicallyBacked, cpuGroupID: opts.CPUGroupID, + createOpts: opts, } defer func() { diff --git a/internal/uvm/create_test.go b/internal/uvm/create_test.go index 3cdd896d59..0a6e82f322 100644 --- a/internal/uvm/create_test.go +++ b/internal/uvm/create_test.go @@ -26,3 +26,12 @@ func TestCreateWCOWBadLayerFolders(t *testing.T) { t.Fatal(err) } } + +func TestCreateClone(t *testing.T) { + opts := NewDefaultOptionsWCOW(t.Name(), "") + opts.IsClone = true + _, err := CreateWCOW(context.Background(), opts) + if err == nil { + t.Fatalf("CreateWCOW should fail when IsClone is true and TemplateConfig is not provided") + } +} diff --git a/internal/uvm/create_wcow.go b/internal/uvm/create_wcow.go index 02177f4fd5..79bd93ec7e 100644 --- a/internal/uvm/create_wcow.go +++ b/internal/uvm/create_wcow.go @@ -28,6 +28,22 @@ type OptionsWCOW struct { *Options LayerFolders []string // Set of folders for base layers and scratch. Ordered from top most read-only through base read-only layer, followed by scratch + + // IsTemplate specifies if this UVM will be saved as a template in future. Setting + // this option will also enable some VSMB Options during UVM creation that allow + // template creation. + IsTemplate bool + + // IsClone specifies if this UVM should be created by cloning a template. If + // IsClone is true then a valid UVMTemplateConfig struct must be passed in the + // `TemplateConfig` field. 
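In the same spirit as TestCreateClone above, a possible companion test (a sketch, not part of this change) covering the new template restriction added to verifyOptions:

```go
package uvm

import (
	"context"
	"testing"
)

// Sketch: verifyOptions now rejects templates built from fully physically backed UVMs,
// so CreateWCOW should return an error for this combination.
func TestCreateTemplateFullyPhysicallyBacked(t *testing.T) {
	opts := NewDefaultOptionsWCOW(t.Name(), "")
	opts.IsTemplate = true
	opts.FullyPhysicallyBacked = true
	if _, err := CreateWCOW(context.Background(), opts); err == nil {
		t.Fatalf("CreateWCOW should fail when IsTemplate and FullyPhysicallyBacked are both set")
	}
}
```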
+ IsClone bool + + // TemplateConfig is only used during clone creation. If a uvm is + // being cloned then this TemplateConfig struct must be passed + // which holds all the information about the template from + // which this clone should be created. + TemplateConfig *UVMTemplateConfig } // NewDefaultOptionsWCOW creates the default options for a bootable version of @@ -43,83 +59,21 @@ func NewDefaultOptionsWCOW(id, owner string) *OptionsWCOW { } } -// CreateWCOW creates an HCS compute system representing a utility VM. -// -// WCOW Notes: -// - The scratch is always attached to SCSI 0:0 -// -func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error) { - ctx, span := trace.StartSpan(ctx, "uvm::CreateWCOW") - defer span.End() - defer func() { oc.SetSpanStatus(span, err) }() - - if opts.ID == "" { - g, err := guid.NewV4() - if err != nil { - return nil, err - } - opts.ID = g.String() - } - - span.AddAttributes(trace.StringAttribute(logfields.UVMID, opts.ID)) - log.G(ctx).WithField("options", fmt.Sprintf("%+v", opts)).Debug("uvm::CreateWCOW options") - - uvm := &UtilityVM{ - id: opts.ID, - owner: opts.Owner, - operatingSystem: "windows", - scsiControllerCount: 1, - vsmbDirShares: make(map[string]*VSMBShare), - vsmbFileShares: make(map[string]*VSMBShare), - vpciDevices: make(map[string]*VPCIDevice), - physicallyBacked: !opts.AllowOvercommit, - devicesPhysicallyBacked: opts.FullyPhysicallyBacked, - cpuGroupID: opts.CPUGroupID, - } - - defer func() { - if err != nil { - uvm.Close() - } - }() - - if err := verifyOptions(ctx, opts); err != nil { - return nil, errors.Wrap(err, errBadUVMOpts.Error()) - } +func (uvm *UtilityVM) startExternalGcsListener(ctx context.Context) error { + log.G(ctx).WithField("vmID", uvm.runtimeID).Debug("Using external GCS bridge") - uvmFolder, err := uvmfolder.LocateUVMFolder(ctx, opts.LayerFolders) + l, err := winio.ListenHvsock(&winio.HvsockAddr{ + VMID: uvm.runtimeID, + ServiceID: gcs.WindowsGcsHvsockServiceID, + }) if err != nil { - return nil, fmt.Errorf("failed to locate utility VM folder from layer folders: %s", err) - } - - // TODO: BUGBUG Remove this. @jhowardmsft - // It should be the responsiblity of the caller to do the creation and population. - // - Update runhcs too (vm.go). - // - Remove comment in function header - // - Update tests that rely on this current behaviour. - // Create the RW scratch in the top-most layer folder, creating the folder if it doesn't already exist. - scratchFolder := opts.LayerFolders[len(opts.LayerFolders)-1] - - // Create the directory if it doesn't exist - if _, err := os.Stat(scratchFolder); os.IsNotExist(err) { - if err := os.MkdirAll(scratchFolder, 0777); err != nil { - return nil, fmt.Errorf("failed to create utility VM scratch folder: %s", err) - } - } - - // Create sandbox.vhdx in the scratch folder based on the template, granting the correct permissions to it - scratchPath := filepath.Join(scratchFolder, "sandbox.vhdx") - if _, err := os.Stat(scratchPath); os.IsNotExist(err) { - if err := wcow.CreateUVMScratch(ctx, uvmFolder, scratchFolder, uvm.id); err != nil { - return nil, fmt.Errorf("failed to create scratch: %s", err) - } - } else { - // Sandbox.vhdx exists, just need to grant vm access to it. 
- if err := wclayer.GrantVmAccess(ctx, uvm.id, scratchPath); err != nil { - return nil, errors.Wrap(err, "failed to grant vm access to scratch") - } + return err } + uvm.gcListener = l + return nil +} +func prepareConfigDoc(ctx context.Context, uvm *UtilityVM, opts *OptionsWCOW, uvmFolder string) (*hcsschema.ComputeSystem, error) { processorTopology, err := processorinfo.HostProcessorInfo(ctx) if err != nil { return nil, fmt.Errorf("failed to get host processor information: %s", err) @@ -200,16 +154,6 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error }, }, Devices: &hcsschema.Devices{ - Scsi: map[string]hcsschema.Scsi{ - "0": { - Attachments: map[string]hcsschema.Attachment{ - "0": { - Path: scratchPath, - Type_: "VirtualDisk", - }, - }, - }, - }, HvSocket: &hcsschema.HvSocket2{ HvSocketConfig: &hcsschema.HvSocketSystemConfig{ // Allow administrators and SYSTEM to bind to vsock sockets @@ -234,10 +178,139 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error } } - uvm.scsiLocations[0][0] = &SCSIMount{ - vm: uvm, - HostPath: doc.VirtualMachine.Devices.Scsi["0"].Attachments["0"].Path, - refCount: 1, + return doc, nil +} + +// CreateWCOW creates an HCS compute system representing a utility VM. +// The HCS Compute system can either be created from scratch or can be cloned from a +// template. +// +// WCOW Notes: +// - The scratch is always attached to SCSI 0:0 +// +func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error) { + ctx, span := trace.StartSpan(ctx, "uvm::CreateWCOW") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + if opts.ID == "" { + g, err := guid.NewV4() + if err != nil { + return nil, err + } + opts.ID = g.String() + } + + span.AddAttributes(trace.StringAttribute(logfields.UVMID, opts.ID)) + log.G(ctx).WithField("options", fmt.Sprintf("%+v", opts)).Debug("uvm::CreateWCOW options") + + uvm := &UtilityVM{ + id: opts.ID, + owner: opts.Owner, + operatingSystem: "windows", + scsiControllerCount: 1, + vsmbDirShares: make(map[string]*VSMBShare), + vsmbFileShares: make(map[string]*VSMBShare), + vpciDevices: make(map[string]*VPCIDevice), + physicallyBacked: !opts.AllowOvercommit, + devicesPhysicallyBacked: opts.FullyPhysicallyBacked, + cpuGroupID: opts.CPUGroupID, + createOpts: *opts, + } + + defer func() { + if err != nil { + uvm.Close() + } + }() + + if err := verifyOptions(ctx, opts); err != nil { + return nil, errors.Wrap(err, errBadUVMOpts.Error()) + } + + uvmFolder, err := uvmfolder.LocateUVMFolder(ctx, opts.LayerFolders) + if err != nil { + return nil, fmt.Errorf("failed to locate utility VM folder from layer folders: %s", err) + } + + // TODO: BUGBUG Remove this. @jhowardmsft + // It should be the responsiblity of the caller to do the creation and population. + // - Update runhcs too (vm.go). + // - Remove comment in function header + // - Update tests that rely on this current behaviour. + // Create the RW scratch in the top-most layer folder, creating the folder if it doesn't already exist. 
+ scratchFolder := opts.LayerFolders[len(opts.LayerFolders)-1] + + // Create the directory if it doesn't exist + if _, err := os.Stat(scratchFolder); os.IsNotExist(err) { + if err := os.MkdirAll(scratchFolder, 0777); err != nil { + return nil, fmt.Errorf("failed to create utility VM scratch folder: %s", err) + } + } + + doc, err := prepareConfigDoc(ctx, uvm, opts, uvmFolder) + if err != nil { + return nil, fmt.Errorf("error in preparing config doc: %s", err) + } + + if !opts.IsClone { + // Create sandbox.vhdx in the scratch folder based on the template, granting the correct permissions to it + scratchPath := filepath.Join(scratchFolder, "sandbox.vhdx") + if _, err := os.Stat(scratchPath); os.IsNotExist(err) { + if err := wcow.CreateUVMScratch(ctx, uvmFolder, scratchFolder, uvm.id); err != nil { + return nil, fmt.Errorf("failed to create scratch: %s", err) + } + } else { + // Sandbox.vhdx exists, just need to grant vm access to it. + if err := wclayer.GrantVmAccess(ctx, uvm.id, scratchPath); err != nil { + return nil, errors.Wrap(err, "failed to grant vm access to scratch") + } + } + + doc.VirtualMachine.Devices.Scsi = map[string]hcsschema.Scsi{ + "0": { + Attachments: map[string]hcsschema.Attachment{ + "0": { + Path: scratchPath, + Type_: "VirtualDisk", + }, + }, + }, + } + + uvm.scsiLocations[0][0] = newSCSIMount(uvm, doc.VirtualMachine.Devices.Scsi["0"].Attachments["0"].Path, "", "", 1, 0, 0, false) + } else { + doc.VirtualMachine.RestoreState = &hcsschema.RestoreState{} + doc.VirtualMachine.RestoreState.TemplateSystemId = opts.TemplateConfig.UVMID + + for _, cloneableResource := range opts.TemplateConfig.Resources { + err = cloneableResource.Clone(ctx, uvm, &cloneData{ + doc: doc, + scratchFolder: scratchFolder, + uvmID: opts.ID, + }) + if err != nil { + return nil, fmt.Errorf("failed while cloning: %s", err) + } + } + + // we add default clone namespace for each clone. Include it here. + if uvm.namespaces == nil { + uvm.namespaces = make(map[string]*namespaceInfo) + } + uvm.namespaces[DEFAULT_CLONE_NETWORK_NAMESPACE_ID] = &namespaceInfo{ + nics: make(map[string]*nicInfo), + } + uvm.IsClone = true + uvm.TemplateID = opts.TemplateConfig.UVMID + } + + // Add appropriate VSMB share options if this UVM needs to be saved as a template + if opts.IsTemplate { + for _, share := range doc.VirtualMachine.Devices.VirtualSmb.Shares { + uvm.SetSaveableVSMBOptions(share.Options, share.Options.ReadOnly) + } + uvm.IsTemplate = true } fullDoc, err := mergemaps.MergeJSON(doc, ([]byte)(opts.AdditionHCSDocumentJSON)) @@ -250,16 +323,11 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error return nil, fmt.Errorf("error while creating the compute system: %s", err) } + // All clones MUST use external gcs connection if opts.ExternalGuestConnection { - log.G(ctx).WithField("vmID", uvm.runtimeID).Debug("Using external GCS bridge") - l, err := winio.ListenHvsock(&winio.HvsockAddr{ - VMID: uvm.runtimeID, - ServiceID: gcs.WindowsGcsHvsockServiceID, - }) - if err != nil { + if err = uvm.startExternalGcsListener(ctx); err != nil { return nil, err } - uvm.gcListener = l } return uvm, nil diff --git a/internal/uvm/network.go b/internal/uvm/network.go index 2b0769d7af..4be39676f9 100644 --- a/internal/uvm/network.go +++ b/internal/uvm/network.go @@ -56,13 +56,16 @@ func (endpoints *NetworkEndpoints) Release(ctx context.Context) error { return nil } -// AddNetNS adds network namespace inside the guest. 
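Pulling the clone path of CreateWCOW together, a rough sketch of what a clone-side caller is expected to provide. In the real flow these options come out of handleCloneAnnotations; here the clone ID and scratch path are invented, and the options are derived from the template's own create options so that verifyCloneUvmCreateOpts accepts them.

```go
package uvm

import "context"

// createCloneSketch shows the minimum a caller must supply to take the clone path in
// CreateWCOW: IsClone, the TemplateConfig captured from the template UVM, an external
// GCS connection, and a fresh scratch (last) layer folder for the clone.
func createCloneSketch(ctx context.Context, tc *UVMTemplateConfig) (*UtilityVM, error) {
	// Start from the template's create options so the two configurations match, then
	// change only the fields a clone is allowed to change.
	optsCopy := tc.CreateOpts
	baseOpts := *tc.CreateOpts.Options
	optsCopy.Options = &baseOpts // don't alias the template's *Options
	optsCopy.ID = "clone-uvm-id" // hypothetical clone ID

	// Replace only the scratch (last) layer folder; shared read-only layers stay the same.
	layers := append([]string{}, tc.CreateOpts.LayerFolders...)
	layers[len(layers)-1] = `C:\layers\clone-scratch` // hypothetical scratch path
	optsCopy.LayerFolders = layers

	optsCopy.IsTemplate = false
	optsCopy.IsClone = true
	optsCopy.TemplateConfig = tc
	optsCopy.ExternalGuestConnection = true // clones must use the external GCS connection

	return CreateWCOW(ctx, &optsCopy)
}
```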
+// AddNetNS adds network namespace inside the guest without actually querying for the +// namespace by its ID. It uses the given namespace struct as it is in the guest request. +// This function is mostly used when we need to override the values inside the namespace +// struct returned by the GetNamespaceByID. For most uses cases AddNetNSByID is more appropriate. // -// If a namespace with `id` already exists returns `ErrNetNSAlreadyAttached`. -func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error { +// If a namespace with the same id already exists returns `ErrNetNSAlreadyAttached`. +func (uvm *UtilityVM) AddNetNS(ctx context.Context, hcnNamespace *hcn.HostComputeNamespace) error { uvm.m.Lock() defer uvm.m.Unlock() - if _, ok := uvm.namespaces[id]; ok { + if _, ok := uvm.namespaces[hcnNamespace.Id]; ok { return ErrNetNSAlreadyAttached } @@ -70,10 +73,6 @@ func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error { // Add a Guest Network namespace. On LCOW we add the adapters // dynamically. if uvm.operatingSystem == "windows" { - hcnNamespace, err := hcn.GetNamespaceByID(id) - if err != nil { - return err - } guestNamespace := hcsschema.ModifySettingRequest{ GuestRequest: guestrequest.GuestRequest{ ResourceType: guestrequest.ResourceTypeNetworkNamespace, @@ -90,12 +89,28 @@ func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error { if uvm.namespaces == nil { uvm.namespaces = make(map[string]*namespaceInfo) } - uvm.namespaces[id] = &namespaceInfo{ + uvm.namespaces[hcnNamespace.Id] = &namespaceInfo{ nics: make(map[string]*nicInfo), } return nil } +// AddNetNSByID adds finds the namespace with given `id` and adds that +// network namespace inside the guest. +// +// If a namespace with `id` already exists returns `ErrNetNSAlreadyAttached`. +func (uvm *UtilityVM) AddNetNSByID(ctx context.Context, id string) error { + hcnNamespace, err := hcn.GetNamespaceByID(id) + if err != nil { + return err + } + + if err = uvm.AddNetNS(ctx, hcnNamespace); err != nil { + return err + } + return nil +} + // AddEndpointsToNS adds all unique `endpoints` to the network namespace // matching `id`. On failure does not roll back any previously successfully // added endpoints. @@ -314,3 +329,15 @@ func (uvm *UtilityVM) removeNIC(ctx context.Context, id guid.GUID, endpoint *hns } return nil } + +// Removes all NICs added to this uvm. +func (uvm *UtilityVM) RemoveAllNICs(ctx context.Context) error { + for _, ns := range uvm.namespaces { + for _, ninfo := range ns.nics { + if err := uvm.removeNIC(ctx, ninfo.ID, ninfo.Endpoint); err != nil { + return err + } + } + } + return nil +} diff --git a/internal/uvm/scsi.go b/internal/uvm/scsi.go index 119c9a1088..d211352ad2 100644 --- a/internal/uvm/scsi.go +++ b/internal/uvm/scsi.go @@ -1,11 +1,17 @@ package uvm import ( + "bytes" "context" + "encoding/gob" "fmt" + "io/ioutil" + "os" + "path/filepath" "strconv" "github.com/Microsoft/go-winio/pkg/security" + "github.com/Microsoft/hcsshim/internal/copyfile" "github.com/Microsoft/hcsshim/internal/guestrequest" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/requesttype" @@ -32,6 +38,8 @@ const ( VMAccessTypeIndividual ) +const scsiCurrentSerialVersionID = 1 + var ( ErrNoAvailableLocation = fmt.Errorf("no available location") ErrNotAttached = fmt.Errorf("not attached") @@ -67,41 +75,53 @@ type SCSIMount struct { // read-only layers. As RO layers are shared, we perform ref-counting. 
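For callers, the practical difference between the two namespace entry points above: AddNetNSByID when the namespace should be added exactly as HNS reports it, AddNetNS when fields need to be overridden first. A hedged sketch follows; the namespace ID is invented, and the override to the shared clone namespace ID is an assumption about how the clone network setup uses this hook, not something shown in this hunk.

```go
package uvm

import (
	"context"

	"github.com/Microsoft/hcsshim/hcn"
)

// addNamespaceSketch shows both ways a namespace can be added to the guest.
func addNamespaceSketch(ctx context.Context, vm *UtilityVM, nsID string) error {
	// Common case: look the namespace up by ID and add it unchanged.
	if err := vm.AddNetNSByID(ctx, nsID); err != nil {
		return err
	}

	// Override case: fetch the namespace, change fields, then add the modified struct
	// directly. Here the Id is replaced with the shared clone namespace ID (assumed to
	// be what the cloned-UVM network setup wants, since clones reuse one namespace).
	hcnNS, err := hcn.GetNamespaceByID(nsID)
	if err != nil {
		return err
	}
	hcnNS.Id = DEFAULT_CLONE_NETWORK_NAMESPACE_ID
	return vm.AddNetNS(ctx, hcnNS)
}
```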
isLayer bool refCount uint32 + // specifies if this is a readonly layer + readOnly bool + // "VirtualDisk" or "PassThru" disk attachment type. + attachmentType string + // serialization ID + serialVersionID uint32 } func (sm *SCSIMount) logFormat() logrus.Fields { return logrus.Fields{ - "HostPath": sm.HostPath, - "UVMPath": sm.UVMPath, - "isLayer": sm.isLayer, - "refCount": sm.refCount, - "Controller": sm.Controller, - "LUN": sm.LUN, + "HostPath": sm.HostPath, + "UVMPath": sm.UVMPath, + "isLayer": sm.isLayer, + "refCount": sm.refCount, + "Controller": sm.Controller, + "LUN": sm.LUN, + "SerialVersionID": sm.serialVersionID, + } +} + +func newSCSIMount(uvm *UtilityVM, hostPath, uvmPath, attachmentType string, refCount uint32, controller int, lun int32, readOnly bool) *SCSIMount { + return &SCSIMount{ + vm: uvm, + HostPath: hostPath, + UVMPath: uvmPath, + refCount: refCount, + Controller: controller, + LUN: int32(lun), + readOnly: readOnly, + attachmentType: attachmentType, + serialVersionID: scsiCurrentSerialVersionID, } } // allocateSCSISlot finds the next available slot on the // SCSI controllers associated with a utility VM to use. // Lock must be held when calling this function -func (uvm *UtilityVM) allocateSCSISlot(ctx context.Context, hostPath string, uvmPath string) (*SCSIMount, error) { +func (uvm *UtilityVM) allocateSCSISlot(ctx context.Context) (int, int, error) { for controller, luns := range uvm.scsiLocations { for lun, sm := range luns { // If sm is nil, we have found an open slot so we allocate a new SCSIMount if sm == nil { - uvm.scsiLocations[controller][lun] = &SCSIMount{ - vm: uvm, - HostPath: hostPath, - UVMPath: uvmPath, - refCount: 1, - Controller: controller, - LUN: int32(lun), - } - log.G(ctx).WithFields(uvm.scsiLocations[controller][lun].logFormat()).Debug("allocated SCSI mount") - return uvm.scsiLocations[controller][lun], nil + return controller, lun, nil } } } - return nil, ErrNoAvailableLocation + return -1, -1, ErrNoAvailableLocation } func (uvm *UtilityVM) deallocateSCSIMount(ctx context.Context, sm *SCSIMount) { @@ -228,7 +248,7 @@ func (uvm *UtilityVM) AddSCSIPhysicalDisk(ctx context.Context, hostPath, uvmPath // // Returns result from calling modify with the given scsi mount func (uvm *UtilityVM) addSCSIActual(ctx context.Context, hostPath, uvmPath, attachmentType string, readOnly bool, vmAccess VMAccessType) (sm *SCSIMount, err error) { - sm, existed, err := uvm.allocateSCSIMount(ctx, hostPath, uvmPath, vmAccess) + sm, existed, err := uvm.allocateSCSIMount(ctx, readOnly, hostPath, uvmPath, attachmentType, vmAccess) if err != nil { return nil, err } @@ -294,9 +314,9 @@ func (uvm *UtilityVM) addSCSIActual(ctx context.Context, hostPath, uvmPath, atta // device or allocates a new one if not already present. // Returns the resulting *SCSIMount, a bool indicating if the scsi device was already present, // and error if any. 
-func (uvm *UtilityVM) allocateSCSIMount(ctx context.Context, hostPath, uvmPath string, vmAccess VMAccessType) (*SCSIMount, bool, error) { +func (uvm *UtilityVM) allocateSCSIMount(ctx context.Context, readOnly bool, hostPath, uvmPath, attachmentType string, vmAccess VMAccessType) (*SCSIMount, bool, error) { // Ensure the utility VM has access - err := uvm.grantAccess(ctx, hostPath, vmAccess) + err := grantAccess(ctx, uvm.id, hostPath, vmAccess) if err != nil { return nil, false, errors.Wrapf(err, "failed to grant VM access for SCSI mount") } @@ -311,11 +331,16 @@ func (uvm *UtilityVM) allocateSCSIMount(ctx context.Context, hostPath, uvmPath s return sm, true, nil } - sm, err := uvm.allocateSCSISlot(ctx, hostPath, uvmPath) + controller, lun, err := uvm.allocateSCSISlot(ctx) if err != nil { return nil, false, err } - return sm, false, nil + + uvm.scsiLocations[controller][lun] = newSCSIMount(uvm, hostPath, uvmPath, attachmentType, 1, controller, int32(lun), readOnly) + log.G(ctx).WithFields(uvm.scsiLocations[controller][lun].logFormat()).Debug("allocated SCSI mount") + + return uvm.scsiLocations[controller][lun], false, nil + } // GetScsiUvmPath returns the guest mounted path of a SCSI drive. @@ -332,13 +357,164 @@ func (uvm *UtilityVM) GetScsiUvmPath(ctx context.Context, hostPath string) (stri } // grantAccess helper function to grant access to a file for the vm or vm group -func (uvm *UtilityVM) grantAccess(ctx context.Context, hostPath string, vmAccess VMAccessType) error { +func grantAccess(ctx context.Context, uvmID string, hostPath string, vmAccess VMAccessType) error { switch vmAccess { case VMAccessTypeGroup: log.G(ctx).WithField("path", hostPath).Debug("granting vm group access") return security.GrantVmGroupAccess(hostPath) case VMAccessTypeIndividual: - return wclayer.GrantVmAccess(ctx, uvm.id, hostPath) + return wclayer.GrantVmAccess(ctx, uvmID, hostPath) + } + return nil +} + +var _ = (Cloneable)(&SCSIMount{}) + +// GobEncode serializes the SCSIMount struct +func (sm *SCSIMount) GobEncode() ([]byte, error) { + var buf bytes.Buffer + encoder := gob.NewEncoder(&buf) + errMsgFmt := "failed to encode SCSIMount: %s" + // encode only the fields that can be safely deserialized. + if err := encoder.Encode(sm.serialVersionID); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.HostPath); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.UVMPath); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.Controller); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.LUN); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.readOnly); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.attachmentType); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + return buf.Bytes(), nil +} + +// GobDecode deserializes the SCSIMount struct into the struct on which this is called +// (i.e the sm pointer) +func (sm *SCSIMount) GobDecode(data []byte) error { + buf := bytes.NewBuffer(data) + decoder := gob.NewDecoder(buf) + errMsgFmt := "failed to decode SCSIMount: %s" + // fields should be decoded in the same order in which they were encoded. 
+ if err := decoder.Decode(&sm.serialVersionID); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if sm.serialVersionID != scsiCurrentSerialVersionID { + return fmt.Errorf("Serialized version of SCSIMount: %d doesn't match with the current version: %d", sm.serialVersionID, scsiCurrentSerialVersionID) + } + if err := decoder.Decode(&sm.HostPath); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.UVMPath); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.Controller); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.LUN); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.readOnly); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.attachmentType); err != nil { + return fmt.Errorf(errMsgFmt, err) } return nil } + +// Clone function creates a clone of the SCSIMount `sm` and adds the cloned SCSIMount to +// the uvm `vm`. If `sm` is read only then it is simply added to the `vm`. But if it is a +// writeable mount(e.g a scratch layer) then a copy of it is made and that copy is added +// to the `vm`. +func (sm *SCSIMount) Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error { + var ( + dstVhdPath string = sm.HostPath + err error + dir string + conStr string = fmt.Sprintf("%d", sm.Controller) + lunStr string = fmt.Sprintf("%d", sm.LUN) + ) + + if !sm.readOnly { + // This is a writeable SCSI mount. It must be either the + // 1. scratch VHD of the UVM or + // 2. scratch VHD of the container. + // A user provided writable SCSI mount is not allowed on the template UVM + // or container and so this SCSI mount has to be the scratch VHD of the + // UVM or container. The container inside this UVM will automatically be + // cloned here when we are cloning the uvm itself. We will receive a + // request for creation of this container later and that request will + // specify the storage path for this container. However, that storage + // location is not available now so we just use the storage path of the + // uvm instead. + // TODO(ambarve): Find a better way for handling this. Problem with this + // approach is that the scratch VHD of the container will not be + // automatically cleaned after container exits. It will stay there as long + // as the UVM keeps running. 
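The encode/decode pair above is what lets a SCSIMount captured while templating be reconstructed in the process that builds the clone. A minimal in-package round-trip sketch of that contract; the paths and slot numbers are invented.

```go
package uvm

import (
	"bytes"
	"encoding/gob"
	"testing"
)

// Sketch: a SCSIMount serialized in one process can be decoded in another, as long as
// the serial version IDs match.
func TestSCSIMountGobRoundTripSketch(t *testing.T) {
	src := newSCSIMount(nil, `C:\layers\scratch\sandbox.vhdx`, `C:\`, "VirtualDisk", 1, 0, 0, false)

	var buf bytes.Buffer
	if err := gob.NewEncoder(&buf).Encode(src); err != nil {
		t.Fatal(err)
	}

	var dst SCSIMount
	if err := gob.NewDecoder(&buf).Decode(&dst); err != nil {
		t.Fatal(err)
	}
	if dst.HostPath != src.HostPath || dst.Controller != src.Controller || dst.LUN != src.LUN {
		t.Fatalf("round trip lost data: %+v", dst)
	}
}
```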
+ + // For the scratch VHD of the VM (always attached at Controller:0, LUN:0) + // clone it in the scratch folder + dir = cd.scratchFolder + if sm.Controller != 0 || sm.LUN != 0 { + dir, err = ioutil.TempDir(cd.scratchFolder, fmt.Sprintf("clone-mount-%d-%d", sm.Controller, sm.LUN)) + if err != nil { + return fmt.Errorf("error while creating directory for scsi mounts of clone vm: %s", err) + } + } + + // copy the VHDX + dstVhdPath = filepath.Join(dir, filepath.Base(sm.HostPath)) + log.G(ctx).WithFields(logrus.Fields{ + "source hostPath": sm.HostPath, + "controller": sm.Controller, + "LUN": sm.LUN, + "destination hostPath": dstVhdPath, + }).Debug("Creating a clone of SCSI mount") + + if err = copyfile.CopyFile(ctx, sm.HostPath, dstVhdPath, true); err != nil { + return err + } + + if err = grantAccess(ctx, cd.uvmID, dstVhdPath, VMAccessTypeIndividual); err != nil { + os.Remove(dstVhdPath) + return err + } + } + + if cd.doc.VirtualMachine.Devices.Scsi == nil { + cd.doc.VirtualMachine.Devices.Scsi = map[string]hcsschema.Scsi{} + } + + if _, ok := cd.doc.VirtualMachine.Devices.Scsi[conStr]; !ok { + cd.doc.VirtualMachine.Devices.Scsi[conStr] = hcsschema.Scsi{ + Attachments: map[string]hcsschema.Attachment{}, + } + } + + cd.doc.VirtualMachine.Devices.Scsi[conStr].Attachments[lunStr] = hcsschema.Attachment{ + Path: dstVhdPath, + Type_: sm.attachmentType, + } + + clonedScsiMount := newSCSIMount(vm, dstVhdPath, sm.UVMPath, sm.attachmentType, 1, sm.Controller, sm.LUN, sm.readOnly) + + vm.scsiLocations[sm.Controller][sm.LUN] = clonedScsiMount + + return nil +} + +func (sm *SCSIMount) GetSerialVersionID() uint32 { + return scsiCurrentSerialVersionID +} diff --git a/internal/uvm/start.go b/internal/uvm/start.go index c970a2fca7..7d8a8459e8 100644 --- a/internal/uvm/start.go +++ b/internal/uvm/start.go @@ -240,7 +240,7 @@ func (uvm *UtilityVM) Start(ctx context.Context) (err error) { Log: log.G(ctx).WithField(logfields.UVMID, uvm.id), IoListen: gcs.HvsockIoListen(uvm.runtimeID), } - uvm.gc, err = gcc.Connect(ctx) + uvm.gc, err = gcc.Connect(ctx, !uvm.IsClone) if err != nil { return err } diff --git a/internal/uvm/types.go b/internal/uvm/types.go index 5fcb323ba6..5f29166edf 100644 --- a/internal/uvm/types.go +++ b/internal/uvm/types.go @@ -116,4 +116,18 @@ type UtilityVM struct { // cpuGroupID is the ID of the cpugroup on the host that this UVM is assigned to cpuGroupID string + + // specifies if this UVM is created to be saved as a template + IsTemplate bool + + // specifies if this UVM is a cloned from a template + IsClone bool + + // ID of the template from which this clone was created. Only applies when IsClone + // is true + TemplateID string + + // The CreateOpts used to create this uvm. 
These can be either of type + // uvm.OptionsLCOW or uvm.OptionsWCOW + createOpts interface{} } diff --git a/internal/uvm/vsmb.go b/internal/uvm/vsmb.go index 9ecbbb1c6d..e15fb42554 100644 --- a/internal/uvm/vsmb.go +++ b/internal/uvm/vsmb.go @@ -1,7 +1,9 @@ package uvm import ( + "bytes" "context" + "encoding/gob" "fmt" "os" "path/filepath" @@ -17,23 +19,27 @@ import ( "golang.org/x/sys/windows" ) -const vsmbSharePrefix = `\\?\VMSMB\VSMB-{dcc079ae-60ba-4d07-847c-3493609c0870}\` +const ( + vsmbSharePrefix = `\\?\VMSMB\VSMB-{dcc079ae-60ba-4d07-847c-3493609c0870}\` + vsmbCurrentSerialVersionID = 1 +) // VSMBShare contains the host path for a Vsmb Mount type VSMBShare struct { // UVM the resource belongs to - vm *UtilityVM - HostPath string - refCount uint32 - name string - allowedFiles []string - guestPath string - readOnly bool + vm *UtilityVM + HostPath string + refCount uint32 + name string + allowedFiles []string + guestPath string + options hcsschema.VirtualSmbShareOptions + serialVersionID uint32 } // Release frees the resources of the corresponding vsmb Mount func (vsmb *VSMBShare) Release(ctx context.Context) error { - if err := vsmb.vm.RemoveVSMB(ctx, vsmb.HostPath, vsmb.readOnly); err != nil { + if err := vsmb.vm.RemoveVSMB(ctx, vsmb.HostPath, vsmb.options.ReadOnly); err != nil { return fmt.Errorf("failed to remove VSMB share: %s", err) } return nil @@ -54,6 +60,28 @@ func (uvm *UtilityVM) DefaultVSMBOptions(readOnly bool) *hcsschema.VirtualSmbSha return opts } +func (uvm *UtilityVM) SetSaveableVSMBOptions(opts *hcsschema.VirtualSmbShareOptions, readOnly bool) { + if readOnly { + opts.ShareRead = true + opts.CacheIo = true + opts.ReadOnly = true + opts.PseudoOplocks = true + opts.NoOplocks = false + } else { + // Using NoOpLocks can cause intermittent Access denied failures due to + // a VSMB bug that was fixed but not backported to RS5/19H1. + opts.ShareRead = false + opts.CacheIo = false + opts.ReadOnly = false + opts.PseudoOplocks = false + opts.NoOplocks = true + } + opts.NoLocks = true + opts.PseudoDirnotify = true + opts.NoDirectmap = true + return +} + // findVSMBShare finds a share by `hostPath`. If not found returns `ErrNotAttached`. func (uvm *UtilityVM) findVSMBShare(ctx context.Context, m map[string]*VSMBShare, shareKey string) (*VSMBShare, error) { share, ok := m[shareKey] @@ -172,10 +200,11 @@ func (uvm *UtilityVM) AddVSMB(ctx context.Context, hostPath string, options *hcs shareName := "s" + strconv.FormatUint(uvm.vsmbCounter, 16) share = &VSMBShare{ - vm: uvm, - name: shareName, - guestPath: vsmbSharePrefix + shareName, - readOnly: options.ReadOnly, + vm: uvm, + name: shareName, + guestPath: vsmbSharePrefix + shareName, + HostPath: hostPath, + serialVersionID: vsmbCurrentSerialVersionID, } } newAllowedFiles := share.allowedFiles @@ -211,6 +240,7 @@ func (uvm *UtilityVM) AddVSMB(ctx context.Context, hostPath string, options *hcs share.allowedFiles = newAllowedFiles share.refCount++ + share.options = *options m[shareKey] = share return share, nil } @@ -287,10 +317,107 @@ func (uvm *UtilityVM) GetVSMBUvmPath(ctx context.Context, hostPath string, readO return filepath.Join(share.guestPath, f), nil } -// getVSMBShareKey returns a string key which encapsulates the information that -// is used to look up an existing VSMB share. If a share is being added, but -// there is an existing share with the same key, the existing share will be used -// instead (and its ref count incremented). 
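A short sketch of how SetSaveableVSMBOptions above is meant to be used when preparing a share for a UVM that will be saved as a template: start from the defaults, then flip them to the save-compatible set before adding the share. Illustrative only; the layer path is invented.

```go
package uvm

import "context"

// addTemplateShareSketch adds a read-only directory share to a UVM that is going to be
// saved as a template, using the save-compatible VSMB options.
func addTemplateShareSketch(ctx context.Context, vm *UtilityVM) (*VSMBShare, error) {
	opts := vm.DefaultVSMBOptions(true) // read-only layer share
	vm.SetSaveableVSMBOptions(opts, opts.ReadOnly)
	return vm.AddVSMB(ctx, `C:\layers\base`, opts) // hypothetical layer path
}
```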
+var _ = (Cloneable)(&VSMBShare{}) + +// GobEncode serializes the VSMBShare struct +func (vsmb *VSMBShare) GobEncode() ([]byte, error) { + var buf bytes.Buffer + encoder := gob.NewEncoder(&buf) + errMsgFmt := "failed to encode VSMBShare: %s" + // encode only the fields that can be safely deserialized. + if err := encoder.Encode(vsmb.serialVersionID); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.HostPath); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.name); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.allowedFiles); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.guestPath); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.options); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + return buf.Bytes(), nil +} + +// GobDecode deserializes the VSMBShare struct into the struct on which this is called +// (i.e the vsmb pointer) +func (vsmb *VSMBShare) GobDecode(data []byte) error { + buf := bytes.NewBuffer(data) + decoder := gob.NewDecoder(buf) + errMsgFmt := "failed to decode VSMBShare: %s" + // fields should be decoded in the same order in which they were encoded. + // And verify the serialVersionID first + if err := decoder.Decode(&vsmb.serialVersionID); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if vsmb.serialVersionID != vsmbCurrentSerialVersionID { + return fmt.Errorf("Serialized version of VSMBShare %d doesn't match with the current version %d", vsmb.serialVersionID, vsmbCurrentSerialVersionID) + } + if err := decoder.Decode(&vsmb.HostPath); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&vsmb.name); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&vsmb.allowedFiles); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&vsmb.guestPath); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&vsmb.options); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + return nil +} + +// Clone creates a clone of the VSMBShare `vsmb` and adds that clone to the uvm `vm`. To +// clone VSMB share we just need to add it into the config doc of that VM and increase the +// vsmb counter. +func (vsmb *VSMBShare) Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error { + cd.doc.VirtualMachine.Devices.VirtualSmb.Shares = append(cd.doc.VirtualMachine.Devices.VirtualSmb.Shares, hcsschema.VirtualSmbShare{ + Name: vsmb.name, + Path: vsmb.HostPath, + Options: &vsmb.options, + AllowedFiles: vsmb.allowedFiles, + }) + vm.vsmbCounter++ + + clonedVSMB := &VSMBShare{ + vm: vm, + HostPath: vsmb.HostPath, + refCount: 1, + name: vsmb.name, + options: vsmb.options, + allowedFiles: vsmb.allowedFiles, + guestPath: vsmb.guestPath, + serialVersionID: vsmbCurrentSerialVersionID, + } + + if vsmb.options.RestrictFileAccess { + vm.vsmbFileShares[vsmb.HostPath] = clonedVSMB + } else { + vm.vsmbDirShares[vsmb.HostPath] = clonedVSMB + } + + return nil +} + +// getVSMBShareKey returns a string key which encapsulates the information that is used to +// look up an existing VSMB share. If a share is being added, but there is an existing +// share with the same key, the existing share will be used instead (and its ref count +// incremented). 
func getVSMBShareKey(hostPath string, readOnly bool) string { return fmt.Sprintf("%v-%v", hostPath, readOnly) } + +func (vsmb *VSMBShare) GetSerialVersionID() uint32 { + return vsmbCurrentSerialVersionID +}