From e37a4dc0402b1090427f4b07908710b923e039f4 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 10 Jun 2020 20:00:47 -0700 Subject: [PATCH 1/4] Add support for saving hcs compute system. This is one of the many small PRs that enable the support for late cloning. This PR simply adds the Go wrappers required for saving an HCS compute system, which is used during template creation. Signed-off-by: Amit Barve --- internal/hcs/callback.go | 1 + internal/hcs/system.go | 32 ++++++++++++++++++++++++++ internal/timeout/timeout.go | 4 ++++ internal/vmcompute/vmcompute.go | 23 ++++++++++++++++++ internal/vmcompute/zsyscall_windows.go | 24 +++++++++++++++++++ 5 files changed, 84 insertions(+) diff --git a/internal/hcs/callback.go b/internal/hcs/callback.go index 62ba81751b..cebbe75ad4 100644 --- a/internal/hcs/callback.go +++ b/internal/hcs/callback.go @@ -106,6 +106,7 @@ func newSystemChannels() notificationChannels { hcsNotificationSystemStartCompleted, hcsNotificationSystemPauseCompleted, hcsNotificationSystemResumeCompleted, + hcsNotificationSystemSaveCompleted, } { channels[notif] = make(notificationChannel, 1) } diff --git a/internal/hcs/system.go b/internal/hcs/system.go index 6120399c47..bda393a6d1 100644 --- a/internal/hcs/system.go +++ b/internal/hcs/system.go @@ -407,6 +407,38 @@ func (computeSystem *System) Resume(ctx context.Context) (err error) { return nil } +// Save the compute system +func (computeSystem *System) Save(ctx context.Context, options interface{}) (err error) { + operation := "hcsshim::System::Save" + + // hcsSaveComputeSystem is an async operation. Start the outer span + // here to measure the full save time. + ctx, span := trace.StartSpan(ctx, operation) + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes(trace.StringAttribute("cid", computeSystem.id)) + + saveOptions, err := json.Marshal(options) + if err != nil { + return err + } + + computeSystem.handleLock.RLock() + defer computeSystem.handleLock.RUnlock() + + if computeSystem.handle == 0 { + return makeSystemError(computeSystem, operation, "", ErrAlreadyClosed, nil) + } + + result, err := vmcompute.HcsSaveComputeSystem(ctx, computeSystem.handle, string(saveOptions)) + events, err := processAsyncHcsResult(ctx, err, result, computeSystem.callbackNumber, hcsNotificationSystemSaveCompleted, &timeout.SystemSave) + if err != nil { + return makeSystemError(computeSystem, operation, "", err, events) + } + + return nil +} + func (computeSystem *System) createProcess(ctx context.Context, operation string, c interface{}) (*Process, *vmcompute.HcsProcessInformation, error) { computeSystem.handleLock.RLock() defer computeSystem.handleLock.RUnlock() diff --git a/internal/timeout/timeout.go b/internal/timeout/timeout.go index ff3b6572e6..eaf39fa513 100644 --- a/internal/timeout/timeout.go +++ b/internal/timeout/timeout.go @@ -29,6 +29,9 @@ var ( // SystemResume is the timeout for resuming a compute system SystemResume time.Duration = defaultTimeout + // SystemSave is the timeout for saving a compute system + SystemSave time.Duration = defaultTimeout + // SyscallWatcher is the timeout before warning of a potential stuck platform syscall.
SyscallWatcher time.Duration = defaultTimeout @@ -51,6 +54,7 @@ func init() { SystemStart = durationFromEnvironment("HCSSHIM_TIMEOUT_SYSTEMSTART", SystemStart) SystemPause = durationFromEnvironment("HCSSHIM_TIMEOUT_SYSTEMPAUSE", SystemPause) SystemResume = durationFromEnvironment("HCSSHIM_TIMEOUT_SYSTEMRESUME", SystemResume) + SystemSave = durationFromEnvironment("HCSSHIM_TIMEOUT_SYSTEMSAVE", SystemSave) SyscallWatcher = durationFromEnvironment("HCSSHIM_TIMEOUT_SYSCALLWATCHER", SyscallWatcher) Tar2VHD = durationFromEnvironment("HCSSHIM_TIMEOUT_TAR2VHD", Tar2VHD) ExternalCommandToStart = durationFromEnvironment("HCSSHIM_TIMEOUT_EXTERNALCOMMANDSTART", ExternalCommandToStart) diff --git a/internal/vmcompute/vmcompute.go b/internal/vmcompute/vmcompute.go index e42bf8cfa7..32491f2c31 100644 --- a/internal/vmcompute/vmcompute.go +++ b/internal/vmcompute/vmcompute.go @@ -29,6 +29,7 @@ import ( //sys hcsModifyServiceSettings(settings string, result **uint16) (hr error) = vmcompute.HcsModifyServiceSettings? //sys hcsRegisterComputeSystemCallback(computeSystem HcsSystem, callback uintptr, context uintptr, callbackHandle *HcsCallback) (hr error) = vmcompute.HcsRegisterComputeSystemCallback? //sys hcsUnregisterComputeSystemCallback(callbackHandle HcsCallback) (hr error) = vmcompute.HcsUnregisterComputeSystemCallback? +//sys hcsSaveComputeSystem(computeSystem HcsSystem, options string, result **uint16) (hr error) = vmcompute.HcsSaveComputeSystem? //sys hcsCreateProcess(computeSystem HcsSystem, processParameters string, processInformation *HcsProcessInformation, process *HcsProcess, result **uint16) (hr error) = vmcompute.HcsCreateProcess? //sys hcsOpenProcess(computeSystem HcsSystem, pid uint32, process *HcsProcess, result **uint16) (hr error) = vmcompute.HcsOpenProcess? 
@@ -585,3 +586,25 @@ func HcsUnregisterProcessCallback(ctx gcontext.Context, callbackHandle HcsCallba return hcsUnregisterProcessCallback(callbackHandle) }) } + +func HcsSaveComputeSystem(ctx gcontext.Context, computeSystem HcsSystem, options string) (result string, hr error) { + ctx, span := trace.StartSpan(ctx, "HcsSaveComputeSystem") + defer span.End() + defer func() { + if result != "" { + span.AddAttributes(trace.StringAttribute("result", result)) + } + if hr != errVmcomputeOperationPending { + oc.SetSpanStatus(span, hr) + } + }() + + return result, execute(ctx, timeout.SyscallWatcher, func() error { + var resultp *uint16 + err := hcsSaveComputeSystem(computeSystem, options, &resultp) + if resultp != nil { + result = interop.ConvertAndFreeCoTaskMemString(resultp) + } + return err + }) +} diff --git a/internal/vmcompute/zsyscall_windows.go b/internal/vmcompute/zsyscall_windows.go index 8cfded4963..cae55058de 100644 --- a/internal/vmcompute/zsyscall_windows.go +++ b/internal/vmcompute/zsyscall_windows.go @@ -53,6 +53,7 @@ var ( procHcsModifyServiceSettings = modvmcompute.NewProc("HcsModifyServiceSettings") procHcsRegisterComputeSystemCallback = modvmcompute.NewProc("HcsRegisterComputeSystemCallback") procHcsUnregisterComputeSystemCallback = modvmcompute.NewProc("HcsUnregisterComputeSystemCallback") + procHcsSaveComputeSystem = modvmcompute.NewProc("HcsSaveComputeSystem") procHcsCreateProcess = modvmcompute.NewProc("HcsCreateProcess") procHcsOpenProcess = modvmcompute.NewProc("HcsOpenProcess") procHcsCloseProcess = modvmcompute.NewProc("HcsCloseProcess") @@ -366,6 +367,29 @@ func hcsUnregisterComputeSystemCallback(callbackHandle HcsCallback) (hr error) { return } +func hcsSaveComputeSystem(computeSystem HcsSystem, options string, result **uint16) (hr error) { + var _p0 *uint16 + _p0, hr = syscall.UTF16PtrFromString(options) + if hr != nil { + return + } + return _hcsSaveComputeSystem(computeSystem, _p0, result) +} + +func _hcsSaveComputeSystem(computeSystem HcsSystem, options *uint16, result **uint16) (hr error) { + if hr = procHcsSaveComputeSystem.Find(); hr != nil { + return + } + r0, _, _ := syscall.Syscall(procHcsSaveComputeSystem.Addr(), 3, uintptr(computeSystem), uintptr(unsafe.Pointer(options)), uintptr(unsafe.Pointer(result))) + if int32(r0) < 0 { + if r0&0x1fff0000 == 0x00070000 { + r0 &= 0xffff + } + hr = syscall.Errno(r0) + } + return +} + func hcsCreateProcess(computeSystem HcsSystem, processParameters string, processInformation *HcsProcessInformation, process *HcsProcess, result **uint16) (hr error) { var _p0 *uint16 _p0, hr = syscall.UTF16PtrFromString(processParameters) From 50112c71fc64c8e1694eec121a90423673efc528 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Thu, 11 Jun 2020 00:03:50 -0700 Subject: [PATCH 2/4] Add support for creating cloned UVMs. This is one of the many small PRs that enable the support for late cloning. This commit adds the template and clone creation support in the uvm module. 
Signed-off-by: Amit Barve --- internal/gcs/guestconnection.go | 11 +- internal/gcs/guestconnection_test.go | 2 +- internal/uvm/clone.go | 117 ++++++++++++ internal/uvm/create.go | 23 ++- internal/uvm/create_test.go | 9 + internal/uvm/create_wcow.go | 261 +++++++++++++++++---------- internal/uvm/scsi.go | 222 ++++++++++++++++++++--- internal/uvm/start.go | 2 +- internal/uvm/types.go | 6 + internal/uvm/vsmb.go | 139 ++++++++++++-- 10 files changed, 645 insertions(+), 147 deletions(-) create mode 100644 internal/uvm/clone.go diff --git a/internal/gcs/guestconnection.go b/internal/gcs/guestconnection.go index 4745d8dd75..43f99fd62e 100644 --- a/internal/gcs/guestconnection.go +++ b/internal/gcs/guestconnection.go @@ -57,7 +57,7 @@ type GuestConnectionConfig struct { } // Connect establishes a GCS connection. `gcc.Conn` will be closed by this function. -func (gcc *GuestConnectionConfig) Connect(ctx context.Context) (_ *GuestConnection, err error) { +func (gcc *GuestConnectionConfig) Connect(ctx context.Context, isColdStart bool) (_ *GuestConnection, err error) { ctx, span := trace.StartSpan(ctx, "gcs::GuestConnectionConfig::Connect") defer span.End() defer func() { oc.SetSpanStatus(span, err) }() @@ -73,7 +73,7 @@ func (gcc *GuestConnectionConfig) Connect(ctx context.Context) (_ *GuestConnecti gc.brdg.Wait() gc.clearNotifies() }() - err = gc.connect(ctx) + err = gc.connect(ctx, isColdStart) if err != nil { gc.Close() return nil, err @@ -105,7 +105,10 @@ func (gc *GuestConnection) Protocol() uint32 { } // connect establishes a GCS connection. It must not be called more than once. -func (gc *GuestConnection) connect(ctx context.Context) (err error) { +// isColdStart should be true when the UVM is being connected to for the first time post-boot. +// It should be false for subsequent connections (e.g. when connecting to a UVM that has +// been cloned). +func (gc *GuestConnection) connect(ctx context.Context, isColdStart bool) (err error) { req := negotiateProtocolRequest{ MinimumVersion: protocolVersion, MaximumVersion: protocolVersion, @@ -123,7 +126,7 @@ func (gc *GuestConnection) connect(ctx context.Context) (err error) { if gc.os == "" { gc.os = "windows" } - if resp.Capabilities.SendHostCreateMessage { + if isColdStart && resp.Capabilities.SendHostCreateMessage { createReq := containerCreate{ requestBase: makeRequest(ctx, nullContainerID), ContainerConfig: anyInString{&uvmConfig{ diff --git a/internal/gcs/guestconnection_test.go b/internal/gcs/guestconnection_test.go index a77fcfd0d9..a878bae2ff 100644 --- a/internal/gcs/guestconnection_test.go +++ b/internal/gcs/guestconnection_test.go @@ -152,7 +152,7 @@ func connectGcs(ctx context.Context, t *testing.T) *GuestConnection { Log: logrus.NewEntry(logrus.StandardLogger()), IoListen: npipeIoListen, } - gc, err := gcc.Connect(context.Background()) + gc, err := gcc.Connect(context.Background(), true) if err != nil { c.Close() t.Fatal(err) diff --git a/internal/uvm/clone.go b/internal/uvm/clone.go new file mode 100644 index 0000000000..7d25713e0e --- /dev/null +++ b/internal/uvm/clone.go @@ -0,0 +1,117 @@ +package uvm + +import ( + "context" + + hcsschema "github.com/Microsoft/hcsshim/internal/schema2" + "github.com/pkg/errors" +) + +const ( + hcsComputeSystemSaveType = "AsTemplate" + // default namespace ID used for all template and clone VMs. + DEFAULT_CLONE_NETWORK_NAMESPACE_ID = "89EB8A86-E253-41FD-9800-E6D88EB2E18A" +) + +// Cloneable is a generic interface for cloning a specific resource. 
Not all resources can +// be cloned and so all resources might not implement this interface. This interface is +// mainly used during late cloning process to clone the resources associated with the UVM +// and the container. For some resources (like scratch VHDs of the UVM & container) +// cloning means actually creating a copy of that resource while for some resources it +// simply means adding that resource to the cloned VM without copying (like VSMB shares). +// The Clone function of that resource will deal with these details. +type Cloneable interface { + // A resource that supports cloning should also support serialization and + // deserialization operations. This is because during resource cloning a resource + // is usually serialized in one process and then deserialized and cloned in some + // other process. Care should be taken while serializing a resource to not include + // any state that will not be valid during the deserialization step. By default + // gob encoding is used to serialize and deserialize resources but a resource can + // implement `gob.GobEncoder` & `gob.GobDecoder` interfaces to provide its own + // serialization and deserialization functions. + + // A SerialVersionID is an identifier used to recognize a unique version of a + // resource. Every time the definition of the resource struct changes this ID is + // bumped up. This ID is used to ensure that we serialize and deserialize the + // same version of a resource. + GetSerialVersionID() uint32 + + // Clone function creates a clone of the resource on the UVM `vm` (i.e adds the + // cloned resource to the `vm`) + // `cd` parameter can be used to pass any other data that is required during the + // cloning process of that resource (for example, when cloning SCSI Mounts we + // might need scratchFolder). + // Clone function should be called on a valid struct (Mostly on the struct which + // is deserialized, and so Clone function should only depend on the fields that + // are exported in the struct). + // The implementation of the clone function should avoid reading any data from the + // `vm` struct, it can add new fields to the vm struct but since the vm struct + // isn't fully ready at this point it shouldn't be used to read any data. + Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error +} + +// cloneData contains all the information that might be required during cloning process of +// a resource. +type cloneData struct { + // doc spec for the clone + doc *hcsschema.ComputeSystem + // scratchFolder of the clone + scratchFolder string + // UVMID of the clone + uvmID string +} + +// UVMTemplateConfig is just a wrapper struct that keeps together all the resources that +// need to be saved to create a template. +type UVMTemplateConfig struct { + // ID of the template vm + UVMID string + // Array of all resources that will be required while making a clone from this template + Resources []Cloneable +} + +// Captures all the information that is necessary to properly save this UVM as a template +// and create clones from this template later. The struct returned by this method must be +// later on made available while creating a clone from this template. 
+func (uvm *UtilityVM) GenerateTemplateConfig() *UVMTemplateConfig { + // Add all the SCSI Mounts and VSMB shares into the list of clones + templateConfig := &UVMTemplateConfig{ + UVMID: uvm.ID(), + } + + for _, vsmbShare := range uvm.vsmbDirShares { + templateConfig.Resources = append(templateConfig.Resources, vsmbShare) + } + + for _, vsmbShare := range uvm.vsmbFileShares { + templateConfig.Resources = append(templateConfig.Resources, vsmbShare) + } + + for _, location := range uvm.scsiLocations { + for _, scsiMount := range location { + if scsiMount != nil { + templateConfig.Resources = append(templateConfig.Resources, scsiMount) + } + } + } + + return templateConfig +} + +// Pauses the uvm and then saves it as a template. This uvm cannot be restarted or used +// after it is successfully saved. +// The uvm must be in the paused state before we attempt to save it; the save call will fail +// with a 'VM in incorrect state' error if it is not paused at the time of saving. +func (uvm *UtilityVM) SaveAsTemplate(ctx context.Context) error { + if err := uvm.hcsSystem.Pause(ctx); err != nil { + return errors.Wrap(err, "error pausing the VM") + } + + saveOptions := hcsschema.SaveOptions{ + SaveType: hcsComputeSystemSaveType, + } + if err := uvm.hcsSystem.Save(ctx, saveOptions); err != nil { + return errors.Wrap(err, "error saving the VM") + } + return nil +} diff --git a/internal/uvm/create.go b/internal/uvm/create.go index 1290ffcd4d..5f69b015b2 100644 --- a/internal/uvm/create.go +++ b/internal/uvm/create.go @@ -123,6 +123,9 @@ func verifyOptions(ctx context.Context, options interface{}) error { if len(opts.LayerFolders) < 2 { return errors.New("at least 2 LayerFolders must be supplied") } + if opts.IsClone && opts.TemplateConfig == nil { + return errors.New("template config can not be nil when creating clone") + } } return nil } @@ -206,11 +209,9 @@ func (uvm *UtilityVM) Close() (err error) { uvm.hcsSystem.Terminate(ctx) uvm.Wait() } - if uvm.gc != nil { - uvm.gc.Close() - } - if uvm.gcListener != nil { - uvm.gcListener.Close() + + if err := uvm.CloseGCSConnection(); err != nil { + log.G(ctx).Errorf("close gcs connection failed: %s", err) } // outputListener will only be nil for a Create -> Stop without a Start. In @@ -333,3 +334,15 @@ func (uvm *UtilityVM) normalizeMemorySize(ctx context.Context, requested uint64) func (uvm *UtilityVM) DevicesPhysicallyBacked() bool { return uvm.devicesPhysicallyBacked } + +// Closes the external GCS connection if it is being used and also closes the +// listener for the GCS connection.
+func (uvm *UtilityVM) CloseGCSConnection() (err error) { + if uvm.gc != nil { + err = uvm.gc.Close() + } + if uvm.gcListener != nil { + err = uvm.gcListener.Close() + } + return +} diff --git a/internal/uvm/create_test.go b/internal/uvm/create_test.go index 3cdd896d59..0a6e82f322 100644 --- a/internal/uvm/create_test.go +++ b/internal/uvm/create_test.go @@ -26,3 +26,12 @@ func TestCreateWCOWBadLayerFolders(t *testing.T) { t.Fatal(err) } } + +func TestCreateClone(t *testing.T) { + opts := NewDefaultOptionsWCOW(t.Name(), "") + opts.IsClone = true + _, err := CreateWCOW(context.Background(), opts) + if err == nil { + t.Fatalf("CreateWCOW should fail when IsClone is true and TemplateConfig is not provided") + } +} diff --git a/internal/uvm/create_wcow.go b/internal/uvm/create_wcow.go index 02177f4fd5..51d609895c 100644 --- a/internal/uvm/create_wcow.go +++ b/internal/uvm/create_wcow.go @@ -28,6 +28,22 @@ type OptionsWCOW struct { *Options LayerFolders []string // Set of folders for base layers and scratch. Ordered from top most read-only through base read-only layer, followed by scratch + + // IsTemplate specifies if this UVM will be saved as a template in future. Setting + // this option will also enable some VSMB Options during UVM creation that allow + // template creation. + IsTemplate bool + + // IsClone specifies if this UVM should be created by cloning a template. If + // IsClone is true then a valid UVMTemplateConfig struct must be passed in the + // `TemplateConfig` field. + IsClone bool + + // TemplateConfig is only used during clone creation. If a uvm is + // being cloned then this TemplateConfig struct must be passed + // which holds all the information about the template from + // which this clone should be created. + TemplateConfig *UVMTemplateConfig } // NewDefaultOptionsWCOW creates the default options for a bootable version of @@ -43,83 +59,21 @@ func NewDefaultOptionsWCOW(id, owner string) *OptionsWCOW { } } -// CreateWCOW creates an HCS compute system representing a utility VM. 
-// -// WCOW Notes: -// - The scratch is always attached to SCSI 0:0 -// -func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error) { - ctx, span := trace.StartSpan(ctx, "uvm::CreateWCOW") - defer span.End() - defer func() { oc.SetSpanStatus(span, err) }() - - if opts.ID == "" { - g, err := guid.NewV4() - if err != nil { - return nil, err - } - opts.ID = g.String() - } +func (uvm *UtilityVM) startExternalGcsListener(ctx context.Context) error { + log.G(ctx).WithField("vmID", uvm.runtimeID).Debug("Using external GCS bridge") - span.AddAttributes(trace.StringAttribute(logfields.UVMID, opts.ID)) - log.G(ctx).WithField("options", fmt.Sprintf("%+v", opts)).Debug("uvm::CreateWCOW options") - - uvm := &UtilityVM{ - id: opts.ID, - owner: opts.Owner, - operatingSystem: "windows", - scsiControllerCount: 1, - vsmbDirShares: make(map[string]*VSMBShare), - vsmbFileShares: make(map[string]*VSMBShare), - vpciDevices: make(map[string]*VPCIDevice), - physicallyBacked: !opts.AllowOvercommit, - devicesPhysicallyBacked: opts.FullyPhysicallyBacked, - cpuGroupID: opts.CPUGroupID, - } - - defer func() { - if err != nil { - uvm.Close() - } - }() - - if err := verifyOptions(ctx, opts); err != nil { - return nil, errors.Wrap(err, errBadUVMOpts.Error()) - } - - uvmFolder, err := uvmfolder.LocateUVMFolder(ctx, opts.LayerFolders) + l, err := winio.ListenHvsock(&winio.HvsockAddr{ + VMID: uvm.runtimeID, + ServiceID: gcs.WindowsGcsHvsockServiceID, + }) if err != nil { - return nil, fmt.Errorf("failed to locate utility VM folder from layer folders: %s", err) - } - - // TODO: BUGBUG Remove this. @jhowardmsft - // It should be the responsiblity of the caller to do the creation and population. - // - Update runhcs too (vm.go). - // - Remove comment in function header - // - Update tests that rely on this current behaviour. - // Create the RW scratch in the top-most layer folder, creating the folder if it doesn't already exist. - scratchFolder := opts.LayerFolders[len(opts.LayerFolders)-1] - - // Create the directory if it doesn't exist - if _, err := os.Stat(scratchFolder); os.IsNotExist(err) { - if err := os.MkdirAll(scratchFolder, 0777); err != nil { - return nil, fmt.Errorf("failed to create utility VM scratch folder: %s", err) - } - } - - // Create sandbox.vhdx in the scratch folder based on the template, granting the correct permissions to it - scratchPath := filepath.Join(scratchFolder, "sandbox.vhdx") - if _, err := os.Stat(scratchPath); os.IsNotExist(err) { - if err := wcow.CreateUVMScratch(ctx, uvmFolder, scratchFolder, uvm.id); err != nil { - return nil, fmt.Errorf("failed to create scratch: %s", err) - } - } else { - // Sandbox.vhdx exists, just need to grant vm access to it. 
- if err := wclayer.GrantVmAccess(ctx, uvm.id, scratchPath); err != nil { - return nil, errors.Wrap(err, "failed to grant vm access to scratch") - } + return err } + uvm.gcListener = l + return nil +} +func prepareConfigDoc(ctx context.Context, uvm *UtilityVM, opts *OptionsWCOW, uvmFolder string) (*hcsschema.ComputeSystem, error) { processorTopology, err := processorinfo.HostProcessorInfo(ctx) if err != nil { return nil, fmt.Errorf("failed to get host processor information: %s", err) @@ -200,16 +154,6 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error }, }, Devices: &hcsschema.Devices{ - Scsi: map[string]hcsschema.Scsi{ - "0": { - Attachments: map[string]hcsschema.Attachment{ - "0": { - Path: scratchPath, - Type_: "VirtualDisk", - }, - }, - }, - }, HvSocket: &hcsschema.HvSocket2{ HvSocketConfig: &hcsschema.HvSocketSystemConfig{ // Allow administrators and SYSTEM to bind to vsock sockets @@ -234,10 +178,144 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error } } - uvm.scsiLocations[0][0] = &SCSIMount{ - vm: uvm, - HostPath: doc.VirtualMachine.Devices.Scsi["0"].Attachments["0"].Path, - refCount: 1, + return doc, nil +} + +// CreateWCOW creates an HCS compute system representing a utility VM. +// The HCS Compute system can either be created from scratch or can be cloned from a +// template. +// +// WCOW Notes: +// - The scratch is always attached to SCSI 0:0 +// +func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error) { + ctx, span := trace.StartSpan(ctx, "uvm::CreateWCOW") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + + if opts.ID == "" { + g, err := guid.NewV4() + if err != nil { + return nil, err + } + opts.ID = g.String() + } + + span.AddAttributes(trace.StringAttribute(logfields.UVMID, opts.ID)) + log.G(ctx).WithField("options", fmt.Sprintf("%+v", opts)).Debug("uvm::CreateWCOW options") + + uvm := &UtilityVM{ + id: opts.ID, + owner: opts.Owner, + operatingSystem: "windows", + scsiControllerCount: 1, + vsmbDirShares: make(map[string]*VSMBShare), + vsmbFileShares: make(map[string]*VSMBShare), + vpciDevices: make(map[string]*VPCIDevice), + physicallyBacked: !opts.AllowOvercommit, + devicesPhysicallyBacked: opts.FullyPhysicallyBacked, + cpuGroupID: opts.CPUGroupID, + } + + defer func() { + if err != nil { + uvm.Close() + } + }() + + if err := verifyOptions(ctx, opts); err != nil { + return nil, errors.Wrap(err, errBadUVMOpts.Error()) + } + + uvmFolder, err := uvmfolder.LocateUVMFolder(ctx, opts.LayerFolders) + if err != nil { + return nil, fmt.Errorf("failed to locate utility VM folder from layer folders: %s", err) + } + + // TODO: BUGBUG Remove this. @jhowardmsft + // It should be the responsiblity of the caller to do the creation and population. + // - Update runhcs too (vm.go). + // - Remove comment in function header + // - Update tests that rely on this current behaviour. + // Create the RW scratch in the top-most layer folder, creating the folder if it doesn't already exist. 
+ scratchFolder := opts.LayerFolders[len(opts.LayerFolders)-1] + + // Create the directory if it doesn't exist + if _, err := os.Stat(scratchFolder); os.IsNotExist(err) { + if err := os.MkdirAll(scratchFolder, 0777); err != nil { + return nil, fmt.Errorf("failed to create utility VM scratch folder: %s", err) + } + } + + doc, err := prepareConfigDoc(ctx, uvm, opts, uvmFolder) + if err != nil { + return nil, fmt.Errorf("error in preparing config doc: %s", err) + } + + if !opts.IsClone { + // Create sandbox.vhdx in the scratch folder based on the template, granting the correct permissions to it + scratchPath := filepath.Join(scratchFolder, "sandbox.vhdx") + if _, err := os.Stat(scratchPath); os.IsNotExist(err) { + if err := wcow.CreateUVMScratch(ctx, uvmFolder, scratchFolder, uvm.id); err != nil { + return nil, fmt.Errorf("failed to create scratch: %s", err) + } + } else { + // Sandbox.vhdx exists, just need to grant vm access to it. + if err := wclayer.GrantVmAccess(ctx, uvm.id, scratchPath); err != nil { + return nil, errors.Wrap(err, "failed to grant vm access to scratch") + } + } + + doc.VirtualMachine.Devices.Scsi = map[string]hcsschema.Scsi{ + "0": { + Attachments: map[string]hcsschema.Attachment{ + "0": { + Path: scratchPath, + Type_: "VirtualDisk", + }, + }, + }, + } + + uvm.scsiLocations[0][0] = &SCSIMount{ + vm: uvm, + HostPath: doc.VirtualMachine.Devices.Scsi["0"].Attachments["0"].Path, + refCount: 1, + } + } else { + doc.VirtualMachine.RestoreState = &hcsschema.RestoreState{} + doc.VirtualMachine.RestoreState.TemplateSystemId = opts.TemplateConfig.UVMID + + for _, cloneableResource := range opts.TemplateConfig.Resources { + err = cloneableResource.Clone(ctx, uvm, &cloneData{ + doc: doc, + scratchFolder: scratchFolder, + uvmID: opts.ID, + }) + if err != nil { + return nil, fmt.Errorf("failed while cloning: %s", err) + } + } + + // we add default clone namespace for each clone. Include it here. 
+ if uvm.namespaces == nil { + uvm.namespaces = make(map[string]*namespaceInfo) + } + uvm.namespaces[DEFAULT_CLONE_NETWORK_NAMESPACE_ID] = &namespaceInfo{ + nics: make(map[string]*nicInfo), + } + + uvm.IsClone = true + } + + // Add appropriate VSMB share options if this UVM needs to be saved as a template + if opts.IsTemplate { + for _, share := range doc.VirtualMachine.Devices.VirtualSmb.Shares { + share.Options.PseudoDirnotify = true + share.Options.NoLocks = true + share.Options.NoDirectmap = true + } + uvm.IsTemplate = true } fullDoc, err := mergemaps.MergeJSON(doc, ([]byte)(opts.AdditionHCSDocumentJSON)) @@ -250,16 +328,11 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error return nil, fmt.Errorf("error while creating the compute system: %s", err) } + // All clones MUST use external gcs connection if opts.ExternalGuestConnection { - log.G(ctx).WithField("vmID", uvm.runtimeID).Debug("Using external GCS bridge") - l, err := winio.ListenHvsock(&winio.HvsockAddr{ - VMID: uvm.runtimeID, - ServiceID: gcs.WindowsGcsHvsockServiceID, - }) - if err != nil { + if err = uvm.startExternalGcsListener(ctx); err != nil { return nil, err } - uvm.gcListener = l } return uvm, nil diff --git a/internal/uvm/scsi.go b/internal/uvm/scsi.go index 4444066e21..71fca379ba 100644 --- a/internal/uvm/scsi.go +++ b/internal/uvm/scsi.go @@ -1,11 +1,17 @@ package uvm import ( + "bytes" "context" + "encoding/gob" "fmt" + "io/ioutil" + "os" + "path/filepath" "strconv" "github.com/Microsoft/go-winio/pkg/security" + "github.com/Microsoft/hcsshim/internal/copyfile" "github.com/Microsoft/hcsshim/internal/guestrequest" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/requesttype" @@ -32,6 +38,8 @@ const ( VMAccessTypeIndividual ) +const scsiCurrentSerialVersionID = 1 + var ( ErrNoAvailableLocation = fmt.Errorf("no available location") ErrNotAttached = fmt.Errorf("not attached") @@ -67,6 +75,12 @@ type SCSIMount struct { // read-only layers. As RO layers are shared, we perform ref-counting. isLayer bool refCount uint32 + // specifies if this is a readonly layer + readOnly bool + // "VirtualDisk" or "PassThru" disk attachment type. + attachmentType string + // serialization ID + serialVersionID uint32 } // RefCount returns the current refcount for the SCSI mount. @@ -76,37 +90,43 @@ func (sm *SCSIMount) RefCount() uint32 { func (sm *SCSIMount) logFormat() logrus.Fields { return logrus.Fields{ - "HostPath": sm.HostPath, - "UVMPath": sm.UVMPath, - "isLayer": sm.isLayer, - "refCount": sm.refCount, - "Controller": sm.Controller, - "LUN": sm.LUN, + "HostPath": sm.HostPath, + "UVMPath": sm.UVMPath, + "isLayer": sm.isLayer, + "refCount": sm.refCount, + "Controller": sm.Controller, + "LUN": sm.LUN, + "SerialVersionID": sm.serialVersionID, + } +} + +func newSCSIMount(uvm *UtilityVM, hostPath, uvmPath, attachmentType string, refCount uint32, controller int, lun int32, readOnly bool) *SCSIMount { + return &SCSIMount{ + vm: uvm, + HostPath: hostPath, + UVMPath: uvmPath, + refCount: refCount, + Controller: controller, + LUN: int32(lun), + readOnly: readOnly, + attachmentType: attachmentType, + serialVersionID: scsiCurrentSerialVersionID, } } // allocateSCSISlot finds the next available slot on the // SCSI controllers associated with a utility VM to use. 
// Lock must be held when calling this function -func (uvm *UtilityVM) allocateSCSISlot(ctx context.Context, hostPath string, uvmPath string) (*SCSIMount, error) { +func (uvm *UtilityVM) allocateSCSISlot(ctx context.Context) (int, int, error) { for controller, luns := range uvm.scsiLocations { for lun, sm := range luns { // If sm is nil, we have found an open slot so we allocate a new SCSIMount if sm == nil { - uvm.scsiLocations[controller][lun] = &SCSIMount{ - vm: uvm, - HostPath: hostPath, - UVMPath: uvmPath, - refCount: 1, - Controller: controller, - LUN: int32(lun), - } - log.G(ctx).WithFields(uvm.scsiLocations[controller][lun].logFormat()).Debug("allocated SCSI mount") - return uvm.scsiLocations[controller][lun], nil + return controller, lun, nil } } } - return nil, ErrNoAvailableLocation + return -1, -1, ErrNoAvailableLocation } func (uvm *UtilityVM) deallocateSCSIMount(ctx context.Context, sm *SCSIMount) { @@ -233,7 +253,7 @@ func (uvm *UtilityVM) AddSCSIPhysicalDisk(ctx context.Context, hostPath, uvmPath // // Returns result from calling modify with the given scsi mount func (uvm *UtilityVM) addSCSIActual(ctx context.Context, hostPath, uvmPath, attachmentType string, readOnly bool, vmAccess VMAccessType) (sm *SCSIMount, err error) { - sm, existed, err := uvm.allocateSCSIMount(ctx, hostPath, uvmPath, vmAccess) + sm, existed, err := uvm.allocateSCSIMount(ctx, readOnly, hostPath, uvmPath, attachmentType, vmAccess) if err != nil { return nil, err } @@ -299,9 +319,9 @@ func (uvm *UtilityVM) addSCSIActual(ctx context.Context, hostPath, uvmPath, atta // device or allocates a new one if not already present. // Returns the resulting *SCSIMount, a bool indicating if the scsi device was already present, // and error if any. -func (uvm *UtilityVM) allocateSCSIMount(ctx context.Context, hostPath, uvmPath string, vmAccess VMAccessType) (*SCSIMount, bool, error) { +func (uvm *UtilityVM) allocateSCSIMount(ctx context.Context, readOnly bool, hostPath, uvmPath, attachmentType string, vmAccess VMAccessType) (*SCSIMount, bool, error) { // Ensure the utility VM has access - err := uvm.grantAccess(ctx, hostPath, vmAccess) + err := grantAccess(ctx, uvm.id, hostPath, vmAccess) if err != nil { return nil, false, errors.Wrapf(err, "failed to grant VM access for SCSI mount") } @@ -316,11 +336,16 @@ func (uvm *UtilityVM) allocateSCSIMount(ctx context.Context, hostPath, uvmPath s return sm, true, nil } - sm, err := uvm.allocateSCSISlot(ctx, hostPath, uvmPath) + controller, lun, err := uvm.allocateSCSISlot(ctx) if err != nil { return nil, false, err } - return sm, false, nil + + uvm.scsiLocations[controller][lun] = newSCSIMount(uvm, hostPath, uvmPath, attachmentType, 1, controller, int32(lun), readOnly) + log.G(ctx).WithFields(uvm.scsiLocations[controller][lun].logFormat()).Debug("allocated SCSI mount") + + return uvm.scsiLocations[controller][lun], false, nil + } // GetScsiUvmPath returns the guest mounted path of a SCSI drive. 
@@ -337,13 +362,160 @@ func (uvm *UtilityVM) GetScsiUvmPath(ctx context.Context, hostPath string) (stri } // grantAccess helper function to grant access to a file for the vm or vm group -func (uvm *UtilityVM) grantAccess(ctx context.Context, hostPath string, vmAccess VMAccessType) error { +func grantAccess(ctx context.Context, uvmID string, hostPath string, vmAccess VMAccessType) error { switch vmAccess { case VMAccessTypeGroup: log.G(ctx).WithField("path", hostPath).Debug("granting vm group access") return security.GrantVmGroupAccess(hostPath) case VMAccessTypeIndividual: - return wclayer.GrantVmAccess(ctx, uvm.id, hostPath) + return wclayer.GrantVmAccess(ctx, uvmID, hostPath) + } + return nil +} + +var _ = (Cloneable)(&SCSIMount{}) + +// GobEncode serializes the SCSIMount struct +func (sm *SCSIMount) GobEncode() ([]byte, error) { + var buf bytes.Buffer + encoder := gob.NewEncoder(&buf) + errMsgFmt := "failed to encode SCSIMount: %s" + // encode only the fields that can be safely deserialized. + if err := encoder.Encode(sm.serialVersionID); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.HostPath); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.UVMPath); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.Controller); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.LUN); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.readOnly); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(sm.attachmentType); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + return buf.Bytes(), nil +} + +// GobDecode deserializes the SCSIMount struct into the struct on which this is called +// (i.e the sm pointer) +func (sm *SCSIMount) GobDecode(data []byte) error { + buf := bytes.NewBuffer(data) + decoder := gob.NewDecoder(buf) + errMsgFmt := "failed to decode SCSIMount: %s" + // fields should be decoded in the same order in which they were encoded. + if err := decoder.Decode(&sm.serialVersionID); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if sm.serialVersionID != scsiCurrentSerialVersionID { + return fmt.Errorf("Serialized version of SCSIMount: %d doesn't match with the current version: %d", sm.serialVersionID, scsiCurrentSerialVersionID) + } + if err := decoder.Decode(&sm.HostPath); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.UVMPath); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.Controller); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.LUN); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.readOnly); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&sm.attachmentType); err != nil { + return fmt.Errorf(errMsgFmt, err) } return nil } + +// Clone function creates a clone of the SCSIMount `sm` and adds the cloned SCSIMount to +// the uvm `vm`. If `sm` is read only then it is simply added to the `vm`. But if it is a +// writeable mount(e.g a scratch layer) then a copy of it is made and that copy is added +// to the `vm`. 
+func (sm *SCSIMount) Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error { + var ( + dstVhdPath string = sm.HostPath + err error + dir string + conStr string = fmt.Sprintf("%d", sm.Controller) + lunStr string = fmt.Sprintf("%d", sm.LUN) + ) + + if !sm.readOnly { + // Copy this scsi disk + // TODO(ambarve): This is a writeable SCSI mount. It can either be the + // scratch VHD of the UVM or it can be a SCSI mount that belongs to some + // container which is being automatically cloned here as a part of the UVM + // cloning process. We will receive a request for creation of this + // container later on which will specify the storage path for this + // container. However, that storage location is not available now so we + // just use the storage of the uvm instead. Find a better way for handling + // this. The problem with this approach is that the scratch VHD of the container + // will not be automatically cleaned up after the container exits. It will stay + // there as long as the UVM keeps running. + + // For the scratch VHD of the VM (always attached at Controller:0, LUN:0) + // clone it in the scratch folder + dir = cd.scratchFolder + if sm.Controller != 0 || sm.LUN != 0 { + dir, err = ioutil.TempDir(cd.scratchFolder, fmt.Sprintf("clone-mount-%d-%d", sm.Controller, sm.LUN)) + if err != nil { + return fmt.Errorf("error while creating directory for scsi mounts of clone vm: %s", err) + } + } + + // copy the VHDX + dstVhdPath = filepath.Join(dir, filepath.Base(sm.HostPath)) + log.G(ctx).WithFields(logrus.Fields{ + "source hostPath": sm.HostPath, + "controller": sm.Controller, + "LUN": sm.LUN, + "destination hostPath": dstVhdPath, + }).Debug("Creating a clone of SCSI mount") + + if err = copyfile.CopyFile(ctx, sm.HostPath, dstVhdPath, true); err != nil { + return err + } + + if err = grantAccess(ctx, cd.uvmID, dstVhdPath, VMAccessTypeIndividual); err != nil { + os.Remove(dstVhdPath) + return err + } + } + + if cd.doc.VirtualMachine.Devices.Scsi == nil { + cd.doc.VirtualMachine.Devices.Scsi = map[string]hcsschema.Scsi{} + } + + if _, ok := cd.doc.VirtualMachine.Devices.Scsi[conStr]; !ok { + cd.doc.VirtualMachine.Devices.Scsi[conStr] = hcsschema.Scsi{ + Attachments: map[string]hcsschema.Attachment{}, + } + } + + cd.doc.VirtualMachine.Devices.Scsi[conStr].Attachments[lunStr] = hcsschema.Attachment{ + Path: dstVhdPath, + Type_: sm.attachmentType, + } + + clonedScsiMount := newSCSIMount(vm, dstVhdPath, sm.UVMPath, sm.attachmentType, 1, sm.Controller, sm.LUN, sm.readOnly) + + vm.scsiLocations[sm.Controller][sm.LUN] = clonedScsiMount + + return nil +} + +func (sm *SCSIMount) GetSerialVersionID() uint32 { + return scsiCurrentSerialVersionID +} diff --git a/internal/uvm/start.go b/internal/uvm/start.go index c970a2fca7..7d8a8459e8 100644 --- a/internal/uvm/start.go +++ b/internal/uvm/start.go @@ -240,7 +240,7 @@ func (uvm *UtilityVM) Start(ctx context.Context) (err error) { Log: log.G(ctx).WithField(logfields.UVMID, uvm.id), IoListen: gcs.HvsockIoListen(uvm.runtimeID), } - uvm.gc, err = gcc.Connect(ctx) + uvm.gc, err = gcc.Connect(ctx, !uvm.IsClone) if err != nil { return err } diff --git a/internal/uvm/types.go b/internal/uvm/types.go index 5fcb323ba6..7d25ba2f0e 100644 --- a/internal/uvm/types.go +++ b/internal/uvm/types.go @@ -116,4 +116,10 @@ type UtilityVM struct { // cpuGroupID is the ID of the cpugroup on the host that this UVM is assigned to cpuGroupID string + + // specifies if this UVM is created to be saved as a template + IsTemplate bool + + // specifies if this UVM is cloned from a template
+ IsClone bool } diff --git a/internal/uvm/vsmb.go b/internal/uvm/vsmb.go index 9ecbbb1c6d..201c7bb3ed 100644 --- a/internal/uvm/vsmb.go +++ b/internal/uvm/vsmb.go @@ -1,7 +1,9 @@ package uvm import ( + "bytes" "context" + "encoding/gob" "fmt" "os" "path/filepath" @@ -17,23 +19,27 @@ import ( "golang.org/x/sys/windows" ) -const vsmbSharePrefix = `\\?\VMSMB\VSMB-{dcc079ae-60ba-4d07-847c-3493609c0870}\` +const ( + vsmbSharePrefix = `\\?\VMSMB\VSMB-{dcc079ae-60ba-4d07-847c-3493609c0870}\` + vsmbCurrentSerialVersionID = 1 +) // VSMBShare contains the host path for a Vsmb Mount type VSMBShare struct { // UVM the resource belongs to - vm *UtilityVM - HostPath string - refCount uint32 - name string - allowedFiles []string - guestPath string - readOnly bool + vm *UtilityVM + HostPath string + refCount uint32 + name string + allowedFiles []string + guestPath string + options hcsschema.VirtualSmbShareOptions + serialVersionID uint32 } // Release frees the resources of the corresponding vsmb Mount func (vsmb *VSMBShare) Release(ctx context.Context) error { - if err := vsmb.vm.RemoveVSMB(ctx, vsmb.HostPath, vsmb.readOnly); err != nil { + if err := vsmb.vm.RemoveVSMB(ctx, vsmb.HostPath, vsmb.options.ReadOnly); err != nil { return fmt.Errorf("failed to remove VSMB share: %s", err) } return nil @@ -172,10 +178,11 @@ func (uvm *UtilityVM) AddVSMB(ctx context.Context, hostPath string, options *hcs shareName := "s" + strconv.FormatUint(uvm.vsmbCounter, 16) share = &VSMBShare{ - vm: uvm, - name: shareName, - guestPath: vsmbSharePrefix + shareName, - readOnly: options.ReadOnly, + vm: uvm, + name: shareName, + guestPath: vsmbSharePrefix + shareName, + HostPath: hostPath, + serialVersionID: vsmbCurrentSerialVersionID, } } newAllowedFiles := share.allowedFiles @@ -211,6 +218,7 @@ func (uvm *UtilityVM) AddVSMB(ctx context.Context, hostPath string, options *hcs share.allowedFiles = newAllowedFiles share.refCount++ + share.options = *options m[shareKey] = share return share, nil } @@ -287,10 +295,107 @@ func (uvm *UtilityVM) GetVSMBUvmPath(ctx context.Context, hostPath string, readO return filepath.Join(share.guestPath, f), nil } -// getVSMBShareKey returns a string key which encapsulates the information that -// is used to look up an existing VSMB share. If a share is being added, but -// there is an existing share with the same key, the existing share will be used -// instead (and its ref count incremented). +var _ = (Cloneable)(&VSMBShare{}) + +// GobEncode serializes the VSMBShare struct +func (vsmb *VSMBShare) GobEncode() ([]byte, error) { + var buf bytes.Buffer + encoder := gob.NewEncoder(&buf) + errMsgFmt := "failed to encode VSMBShare: %s" + // encode only the fields that can be safely deserialized. 
+ if err := encoder.Encode(vsmb.serialVersionID); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.HostPath); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.name); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.allowedFiles); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.guestPath); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + if err := encoder.Encode(vsmb.options); err != nil { + return nil, fmt.Errorf(errMsgFmt, err) + } + return buf.Bytes(), nil +} + +// GobDecode deserializes the VSMBShare struct into the struct on which this is called +// (i.e the vsmb pointer) +func (vsmb *VSMBShare) GobDecode(data []byte) error { + buf := bytes.NewBuffer(data) + decoder := gob.NewDecoder(buf) + errMsgFmt := "failed to decode VSMBShare: %s" + // fields should be decoded in the same order in which they were encoded. + // And verify the serialVersionID first + if err := decoder.Decode(&vsmb.serialVersionID); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if vsmb.serialVersionID != vsmbCurrentSerialVersionID { + return fmt.Errorf("Serialized version of VSMBShare %d doesn't match with the current version %d", vsmb.serialVersionID, vsmbCurrentSerialVersionID) + } + if err := decoder.Decode(&vsmb.HostPath); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&vsmb.name); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&vsmb.allowedFiles); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&vsmb.guestPath); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + if err := decoder.Decode(&vsmb.options); err != nil { + return fmt.Errorf(errMsgFmt, err) + } + return nil +} + +// Clone creates a clone of the VSMBShare `vsmb` and adds that clone to the uvm `vm`. To +// clone VSMB share we just need to add it into the config doc of that VM and increase the +// vsmb counter. +func (vsmb *VSMBShare) Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error { + cd.doc.VirtualMachine.Devices.VirtualSmb.Shares = append(cd.doc.VirtualMachine.Devices.VirtualSmb.Shares, hcsschema.VirtualSmbShare{ + Name: vsmb.name, + Path: vsmb.HostPath, + Options: &vsmb.options, + AllowedFiles: vsmb.allowedFiles, + }) + vm.vsmbCounter++ + + clonedVSMB := &VSMBShare{ + vm: vm, + HostPath: vsmb.HostPath, + refCount: 1, + name: vsmb.name, + options: vsmb.options, + allowedFiles: vsmb.allowedFiles, + guestPath: vsmb.guestPath, + serialVersionID: vsmbCurrentSerialVersionID, + } + + if vsmb.options.RestrictFileAccess { + vm.vsmbFileShares[vsmb.HostPath] = clonedVSMB + } else { + vm.vsmbDirShares[vsmb.HostPath] = clonedVSMB + } + + return nil +} + +// getVSMBShareKey returns a string key which encapsulates the information that is used to +// look up an existing VSMB share. If a share is being added, but there is an existing +// share with the same key, the existing share will be used instead (and its ref count +// incremented). func getVSMBShareKey(hostPath string, readOnly bool) string { return fmt.Sprintf("%v-%v", hostPath, readOnly) } + +func (vsmb *VSMBShare) GetSerialVersionID() uint32 { + return vsmbCurrentSerialVersionID +} From 97ca218702b6bb447cfc2697def6b89d2289e1ed Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Thu, 11 Jun 2020 00:03:50 -0700 Subject: [PATCH 3/4] Add support for creating network namespaces inside cloned uvms. 
This is one of the many small PRs that enable the support for late cloning. This commit adds the set of functions required for adding network namespace and network endpoints to cloned UVMs. Signed-off-by: Amit Barve --- cmd/containerd-shim-runhcs-v1/pod.go | 17 +------- internal/hcsoci/create.go | 12 +----- internal/hcsoci/hcsdoc_wcow.go | 9 +++- internal/hcsoci/network.go | 62 ++++++++++++++++++++++++++++ internal/uvm/create_wcow.go | 1 - internal/uvm/network.go | 45 ++++++++++++++++---- 6 files changed, 109 insertions(+), 37 deletions(-) diff --git a/cmd/containerd-shim-runhcs-v1/pod.go b/cmd/containerd-shim-runhcs-v1/pod.go index 2d89b0d8ea..952671a31e 100644 --- a/cmd/containerd-shim-runhcs-v1/pod.go +++ b/cmd/containerd-shim-runhcs-v1/pod.go @@ -152,21 +152,8 @@ func createPod(ctx context.Context, events publisher, req *task.CreateTaskReques // isolated. Process isolated WCOW gets the namespace endpoints // automatically. if parent != nil { - nsid := "" - if s.Windows != nil && s.Windows.Network != nil { - nsid = s.Windows.Network.NetworkNamespace - } - - if nsid != "" { - endpoints, err := hcsoci.GetNamespaceEndpoints(ctx, nsid) - if err != nil { - return nil, err - } - err = parent.AddNetNS(ctx, nsid) - if err != nil { - return nil, err - } - err = parent.AddEndpointsToNS(ctx, nsid, endpoints) + if s.Windows != nil && s.Windows.Network != nil && s.Windows.Network.NetworkNamespace != "" { + err = hcsoci.SetupNetworkNamespace(ctx, parent, s.Windows.Network.NetworkNamespace) if err != nil { return nil, err } diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go index e425316298..2283d3843b 100644 --- a/internal/hcsoci/create.go +++ b/internal/hcsoci/create.go @@ -149,20 +149,10 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C // container but not a workload container in a sandbox that inherits // the namespace. if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox { - endpoints, err := GetNamespaceEndpoints(ctx, coi.actualNetworkNamespace) + err = SetupNetworkNamespace(ctx, coi.HostingSystem, coi.actualNetworkNamespace) if err != nil { return nil, r, err } - err = coi.HostingSystem.AddNetNS(ctx, coi.actualNetworkNamespace) - if err != nil { - return nil, r, err - } - err = coi.HostingSystem.AddEndpointsToNS(ctx, coi.actualNetworkNamespace, endpoints) - if err != nil { - // Best effort clean up the NS - coi.HostingSystem.RemoveNetNS(ctx, coi.actualNetworkNamespace) - return nil, r, err - } r.SetAddedNetNSToVM(true) } } diff --git a/internal/hcsoci/hcsdoc_wcow.go b/internal/hcsoci/hcsdoc_wcow.go index 4e3a06852f..01d8b5c361 100644 --- a/internal/hcsoci/hcsdoc_wcow.go +++ b/internal/hcsoci/hcsdoc_wcow.go @@ -182,7 +182,14 @@ func createWindowsContainerDocument(ctx context.Context, coi *createOptionsInter v2Container.Networking = &hcsschema.Networking{} v1.EndpointList = coi.Spec.Windows.Network.EndpointList - v2Container.Networking.Namespace = coi.actualNetworkNamespace + + // Use the reserved network namespace for containers created inside + // cloned or template UVMs. 
+ if coi.HostingSystem.IsTemplate || coi.HostingSystem.IsClone { + v2Container.Networking.Namespace = uvm.DEFAULT_CLONE_NETWORK_NAMESPACE_ID + } else { + v2Container.Networking.Namespace = coi.actualNetworkNamespace + } v1.AllowUnqualifiedDNSQuery = coi.Spec.Windows.Network.AllowUnqualifiedDNSQuery v2Container.Networking.AllowUnqualifiedDnsQuery = v1.AllowUnqualifiedDNSQuery diff --git a/internal/hcsoci/network.go b/internal/hcsoci/network.go index d33bd6b838..dd0ddca374 100644 --- a/internal/hcsoci/network.go +++ b/internal/hcsoci/network.go @@ -3,6 +3,7 @@ package hcsoci import ( "context" + "github.com/Microsoft/hcsshim/hcn" "github.com/Microsoft/hcsshim/internal/hns" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" @@ -68,3 +69,64 @@ func GetNamespaceEndpoints(ctx context.Context, netNS string) ([]*hns.HNSEndpoin } return endpoints, nil } + +// Network namespace setup is a bit different for templates and clones. +// For templates and clones we use a special network namespace ID. +// Details about this can be found in the Networking section of the late-clone wiki page. +// +// In this function we take the namespace ID of the namespace that was created for this +// UVM. We hot add the namespace (with the default ID if this is a template). We get the +// endpoints associated with this namespace and then hot add those endpoints (by changing +// their namespace IDs to the default ID if it is a template). +func SetupNetworkNamespace(ctx context.Context, hostingSystem *uvm.UtilityVM, nsid string) error { + nsidInsideUVM := nsid + if hostingSystem.IsTemplate || hostingSystem.IsClone { + nsidInsideUVM = uvm.DEFAULT_CLONE_NETWORK_NAMESPACE_ID + } + + // Query endpoints with actual nsid + endpoints, err := GetNamespaceEndpoints(ctx, nsid) + if err != nil { + return err + } + + // Add the network namespace inside the UVM if it is not a clone. (Clones will + // inherit the namespace from the template) + if !hostingSystem.IsClone { + // Get the namespace struct from the actual nsid. + hcnNamespace, err := hcn.GetNamespaceByID(nsid) + if err != nil { + return err + } + + // All templates should have a special NSID so that it + // will be easier to debug. Override it here. + if hostingSystem.IsTemplate { + hcnNamespace.Id = nsidInsideUVM + } + + if err = hostingSystem.AddNetNS(ctx, hcnNamespace); err != nil { + return err + } + } + + // If adding a network endpoint to clones or a template, override the nsid associated + // with it.
+ if hostingSystem.IsClone || hostingSystem.IsTemplate { + // replace nsid for each endpoint + for _, ep := range endpoints { + ep.Namespace = &hns.Namespace{ + ID: nsidInsideUVM, + } + } + } + + if err = hostingSystem.AddEndpointsToNS(ctx, nsidInsideUVM, endpoints); err != nil { + // Best effort clean up the NS + if removeErr := hostingSystem.RemoveNetNS(ctx, nsidInsideUVM); removeErr != nil { + log.G(ctx).Warn(removeErr) + } + return err + } + return nil +} diff --git a/internal/uvm/create_wcow.go b/internal/uvm/create_wcow.go index 51d609895c..40a53b72fb 100644 --- a/internal/uvm/create_wcow.go +++ b/internal/uvm/create_wcow.go @@ -304,7 +304,6 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error uvm.namespaces[DEFAULT_CLONE_NETWORK_NAMESPACE_ID] = &namespaceInfo{ nics: make(map[string]*nicInfo), } - uvm.IsClone = true } diff --git a/internal/uvm/network.go b/internal/uvm/network.go index 2b0769d7af..4be39676f9 100644 --- a/internal/uvm/network.go +++ b/internal/uvm/network.go @@ -56,13 +56,16 @@ func (endpoints *NetworkEndpoints) Release(ctx context.Context) error { return nil } -// AddNetNS adds network namespace inside the guest. +// AddNetNS adds a network namespace inside the guest without actually querying for the +// namespace by its ID. It uses the given namespace struct as it is in the guest request. +// This function is mostly used when we need to override the values inside the namespace +// struct returned by GetNamespaceByID. For most use cases AddNetNSByID is more appropriate. // -// If a namespace with `id` already exists returns `ErrNetNSAlreadyAttached`. -func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error { +// If a namespace with the same id already exists returns `ErrNetNSAlreadyAttached`. +func (uvm *UtilityVM) AddNetNS(ctx context.Context, hcnNamespace *hcn.HostComputeNamespace) error { uvm.m.Lock() defer uvm.m.Unlock() - if _, ok := uvm.namespaces[id]; ok { + if _, ok := uvm.namespaces[hcnNamespace.Id]; ok { return ErrNetNSAlreadyAttached } @@ -70,10 +73,6 @@ func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error { // Add a Guest Network namespace. On LCOW we add the adapters // dynamically. if uvm.operatingSystem == "windows" { - hcnNamespace, err := hcn.GetNamespaceByID(id) - if err != nil { - return err - } guestNamespace := hcsschema.ModifySettingRequest{ GuestRequest: guestrequest.GuestRequest{ ResourceType: guestrequest.ResourceTypeNetworkNamespace, @@ -90,12 +89,28 @@ func (uvm *UtilityVM) AddNetNS(ctx context.Context, id string) error { if uvm.namespaces == nil { uvm.namespaces = make(map[string]*namespaceInfo) } - uvm.namespaces[id] = &namespaceInfo{ + uvm.namespaces[hcnNamespace.Id] = &namespaceInfo{ nics: make(map[string]*nicInfo), } return nil } +// AddNetNSByID finds the namespace with the given `id` and adds that +// network namespace inside the guest. +// +// If a namespace with `id` already exists returns `ErrNetNSAlreadyAttached`. +func (uvm *UtilityVM) AddNetNSByID(ctx context.Context, id string) error { + hcnNamespace, err := hcn.GetNamespaceByID(id) + if err != nil { + return err + } + + if err = uvm.AddNetNS(ctx, hcnNamespace); err != nil { + return err + } + return nil +} + // AddEndpointsToNS adds all unique `endpoints` to the network namespace // matching `id`. On failure does not roll back any previously successfully // added endpoints.
@@ -314,3 +329,15 @@ func (uvm *UtilityVM) removeNIC(ctx context.Context, id guid.GUID, endpoint *hns } return nil } + +// Removes all NICs added to this uvm. +func (uvm *UtilityVM) RemoveAllNICs(ctx context.Context) error { + for _, ns := range uvm.namespaces { + for _, ninfo := range ns.nics { + if err := uvm.removeNIC(ctx, ninfo.ID, ninfo.Endpoint); err != nil { + return err + } + } + } + return nil +} From be4c8e19ebff1f202d44c1a90a41506473c52601 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Mon, 17 Aug 2020 21:37:02 -0700 Subject: [PATCH 4/4] Add support for creating late clones via hcsshim This is one of the many small PRs that enable the support for late cloning.This commit adds the set of functions that expose the late cloning functionality from hcsshim and adds new annotations for clients to use the late cloning feature. Signed-off-by: Amit Barve --- cmd/containerd-shim-runhcs-v1/clone.go | 50 ++++ cmd/containerd-shim-runhcs-v1/exec_clone.go | 67 +++++ cmd/containerd-shim-runhcs-v1/exec_hcs.go | 13 +- cmd/containerd-shim-runhcs-v1/pod.go | 22 +- cmd/containerd-shim-runhcs-v1/task_hcs.go | 167 ++++++++++- .../task_wcow_podsandbox.go | 5 + internal/clone/registry.go | 168 +++++++++++ internal/gcs/container.go | 17 ++ internal/hcsoci/clone.go | 50 ++++ internal/hcsoci/create.go | 269 +++++++++++++++--- internal/hcsoci/hcsdoc_wcow.go | 115 ++++---- internal/hcsoci/resources_wcow.go | 84 +++--- internal/layers/layers.go | 3 + internal/oci/uvm.go | 66 +++++ internal/uvm/clone.go | 34 ++- internal/uvm/create.go | 42 ++- internal/uvm/create_lcow.go | 1 + internal/uvm/create_wcow.go | 12 +- internal/uvm/scsi.go | 26 +- internal/uvm/types.go | 8 + internal/uvm/vsmb.go | 22 ++ 21 files changed, 1073 insertions(+), 168 deletions(-) create mode 100644 cmd/containerd-shim-runhcs-v1/clone.go create mode 100644 cmd/containerd-shim-runhcs-v1/exec_clone.go create mode 100644 internal/clone/registry.go create mode 100644 internal/hcsoci/clone.go diff --git a/cmd/containerd-shim-runhcs-v1/clone.go b/cmd/containerd-shim-runhcs-v1/clone.go new file mode 100644 index 0000000000..221aabacab --- /dev/null +++ b/cmd/containerd-shim-runhcs-v1/clone.go @@ -0,0 +1,50 @@ +package main + +import ( + "context" + + "github.com/Microsoft/hcsshim/internal/clone" + "github.com/Microsoft/hcsshim/internal/uvm" +) + +// saveAsTemplate saves the UVM and container inside it as a template and also stores the +// relevant information in the registry so that clones can be created from this template. +// Every cloned uvm gets its own NIC and we do not want to create clones of a template +// which still has a NIC attached to it. So remove the NICs attached to the template uvm +// before saving it. +// Similar to the NIC scenario we do not want to create clones from a template with an +// active GCS connection so close the GCS connection too. 
+func saveAsTemplate(ctx context.Context, templateTask *hcsTask) (err error) { + var utc *uvm.UVMTemplateConfig + var templateConfig *clone.TemplateConfig + + if err = templateTask.host.RemoveAllNICs(ctx); err != nil { + return err + } + + if err = templateTask.host.CloseGCSConnection(); err != nil { + return err + } + + utc, err = templateTask.host.GenerateTemplateConfig() + if err != nil { + return err + } + + templateConfig = &clone.TemplateConfig{ + TemplateUVMID: utc.UVMID, + TemplateUVMResources: utc.Resources, + TemplateUVMCreateOpts: utc.CreateOpts, + TemplateContainerID: templateTask.id, + TemplateContainerSpec: *templateTask.taskSpec, + } + + if err = clone.SaveTemplateConfig(ctx, templateConfig); err != nil { + return err + } + + if err = templateTask.host.SaveAsTemplate(ctx); err != nil { + return err + } + return nil +} diff --git a/cmd/containerd-shim-runhcs-v1/exec_clone.go b/cmd/containerd-shim-runhcs-v1/exec_clone.go new file mode 100644 index 0000000000..078149f140 --- /dev/null +++ b/cmd/containerd-shim-runhcs-v1/exec_clone.go @@ -0,0 +1,67 @@ +package main + +import ( + "context" + + "github.com/Microsoft/hcsshim/internal/cmd" + "github.com/Microsoft/hcsshim/internal/cow" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/uvm" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +func newClonedExec( + ctx context.Context, + events publisher, + tid string, + host *uvm.UtilityVM, + c cow.Container, + id, bundle string, + isWCOW bool, + spec *specs.Process, + io cmd.UpstreamIO) *clonedExec { + log.G(ctx).WithFields(logrus.Fields{ + "tid": tid, + "eid": id, // Init exec ID is always same as Task ID + "bundle": bundle, + }).Debug("newClonedExec") + + he := &hcsExec{ + events: events, + tid: tid, + host: host, + c: c, + id: id, + bundle: bundle, + isWCOW: isWCOW, + spec: spec, + io: io, + processDone: make(chan struct{}), + state: shimExecStateCreated, + exitStatus: 255, // By design for non-exited process status. + exited: make(chan struct{}), + } + + ce := &clonedExec{ + he, + } + go he.waitForContainerExit() + return ce +} + +var _ = (shimExec)(&clonedExec{}) + +// clonedExec inherits from hcsExec. The only difference between these two is that +// on starting a clonedExec it doesn't attempt to start the container even if the +// exec is the init process. This is because in case of clonedExec the container is +// already running inside the pod. +type clonedExec struct { + *hcsExec +} + +func (ce *clonedExec) Start(ctx context.Context) (err error) { + // A cloned exec should never initialize the container as it should + // already be running. + return ce.startInternal(ctx, false) +} diff --git a/cmd/containerd-shim-runhcs-v1/exec_hcs.go b/cmd/containerd-shim-runhcs-v1/exec_hcs.go index 728ef51c10..7c2115c2cc 100644 --- a/cmd/containerd-shim-runhcs-v1/exec_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/exec_hcs.go @@ -181,7 +181,7 @@ func (he *hcsExec) Status() *task.StateResponse { } } -func (he *hcsExec) Start(ctx context.Context) (err error) { +func (he *hcsExec) startInternal(ctx context.Context, initializeContainer bool) (err error) { he.sl.Lock() defer he.sl.Unlock() if he.state != shimExecStateCreated { @@ -192,8 +192,7 @@ func (he *hcsExec) Start(ctx context.Context) (err error) { he.exitFromCreatedL(ctx, 1) } }() - if he.id == he.tid { - // This is the init exec. 
We need to start the container itself + if initializeContainer { err = he.c.Start(ctx) if err != nil { return err @@ -257,6 +256,12 @@ func (he *hcsExec) Start(ctx context.Context) (err error) { return nil } +func (he *hcsExec) Start(ctx context.Context) (err error) { + // If he.id == he.tid then this is the init exec. + // We need to initialize the container itself before starting this exec. + return he.startInternal(ctx, he.id == he.tid) +} + func (he *hcsExec) Kill(ctx context.Context, signal uint32) error { he.sl.Lock() defer he.sl.Unlock() @@ -414,6 +419,8 @@ func (he *hcsExec) exitFromCreatedL(ctx context.Context, status int) { // // 6. Close `he.exited` channel to unblock any waiters who might have called // `Create`/`Wait`/`Start` which is a valid pattern. +// +// 7. Finally, save the UVM and this container as a template if specified. func (he *hcsExec) waitForExit() { ctx, span := trace.StartSpan(context.Background(), "hcsExec::waitForExit") defer span.End() diff --git a/cmd/containerd-shim-runhcs-v1/pod.go b/cmd/containerd-shim-runhcs-v1/pod.go index 952671a31e..6e341b83fb 100644 --- a/cmd/containerd-shim-runhcs-v1/pod.go +++ b/cmd/containerd-shim-runhcs-v1/pod.go @@ -228,6 +228,16 @@ func (p *pod) ID() string { return p.id } +func (p *pod) GetCloneAnnotations(ctx context.Context, s *specs.Spec) (bool, string, error) { + isTemplate, templateID, err := oci.ParseCloneAnnotations(ctx, s) + if err != nil { + return false, "", err + } else if (isTemplate || templateID != "") && p.host == nil { + return false, "", fmt.Errorf("save as template and creating clones is only supported for hyper-v isolated containers") + } + return isTemplate, templateID, nil +} + func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (_ shimTask, err error) { if req.ID == p.id { return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists", req.ID) @@ -270,7 +280,17 @@ func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *sp sid) } - st, err := newHcsTask(ctx, p.events, p.host, false, req, s) + _, templateID, err := p.GetCloneAnnotations(ctx, s) + if err != nil { + return nil, err + } + + var st shimTask + if templateID != "" { + st, err = newClonedHcsTask(ctx, p.events, p.host, false, req, s, templateID) + } else { + st, err = newHcsTask(ctx, p.events, p.host, false, req, s) + } if err != nil { return nil, err } diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go index 2a34aa0f03..676224d47d 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go @@ -124,6 +124,7 @@ func newHcsTask( }).Debug("newHcsTask") owner := filepath.Base(os.Args[0]) + isTemplate := oci.ParseAnnotationsSaveAsTemplate(ctx, s) io, err := cmd.NewUpstreamIO(ctx, req.ID, req.Stdout, req.Stderr, req.Stdin, req.Terminal) if err != nil { @@ -163,14 +164,16 @@ func newHcsTask( } ht := &hcsTask{ - events: events, - id: req.ID, - isWCOW: oci.IsWCOW(s), - c: system, - cr: resources, - ownsHost: ownsParent, - host: parent, - closed: make(chan struct{}), + events: events, + id: req.ID, + isWCOW: oci.IsWCOW(s), + c: system, + cr: resources, + ownsHost: ownsParent, + host: parent, + closed: make(chan struct{}), + taskSpec: s, + isTemplate: isTemplate, } ht.init = newHcsExec( ctx, @@ -191,9 +194,121 @@ func newHcsTask( // handle this case. go ht.waitForHostExit() } + + // In the normal case the `Signal` call from the caller killed this task's + // init process. 
Or the init process ran to completion - this will mostly + // happen when we are creating a template and want to wait for init process + // to finish before we save the template. In such cases do not tear down the + // container after init exits - because we need the container in the template + go ht.waitInitExit(!isTemplate) + + // Publish the created event + ht.events.publishEvent( + ctx, + runtime.TaskCreateEventTopic, + &eventstypes.TaskCreate{ + ContainerID: req.ID, + Bundle: req.Bundle, + Rootfs: req.Rootfs, + IO: &eventstypes.TaskIO{ + Stdin: req.Stdin, + Stdout: req.Stdout, + Stderr: req.Stderr, + Terminal: req.Terminal, + }, + Checkpoint: "", + Pid: uint32(ht.init.Pid()), + }) + return ht, nil +} + +// newClonedTask creates a container within `parent`. The parent must be already cloned +// from a template and hence this container must already be present inside that parent. +// This function simply creates the go wrapper around the container that is already +// running inside the cloned parent. +// This task MAY own the UVM that it is running in but as of now the cloning feature is +// only used for WCOW hyper-V isolated containers and for WCOW, the wcowPodSandboxTask +// owns that UVM. +func newClonedHcsTask( + ctx context.Context, + events publisher, + parent *uvm.UtilityVM, + ownsParent bool, + req *task.CreateTaskRequest, + s *specs.Spec, + templateID string) (_ shimTask, err error) { + log.G(ctx).WithFields(logrus.Fields{ + "tid": req.ID, + "ownsParent": ownsParent, + "templateid": templateID, + }).Debug("newClonedHcsTask") + + owner := filepath.Base(os.Args[0]) + + if parent.OS() != "windows" { + return nil, fmt.Errorf("cloned task can only be created inside a windows host") + } + + io, err := cmd.NewNpipeIO(ctx, req.Stdin, req.Stdout, req.Stderr, req.Terminal) + if err != nil { + return nil, err + } + + var netNS string + if s.Windows != nil && + s.Windows.Network != nil { + netNS = s.Windows.Network.NetworkNamespace + } + + // This is a cloned task. Use the templateid as the ID of the container here + // because that's the ID of this container inside the UVM. + opts := hcsoci.CreateOptions{ + ID: templateID, + Owner: owner, + Spec: s, + HostingSystem: parent, + NetworkNamespace: netNS, + } + system, resources, err := hcsoci.CloneContainer(ctx, &opts) + if err != nil { + return nil, err + } + + ht := &hcsTask{ + events: events, + id: req.ID, + isWCOW: oci.IsWCOW(s), + c: system, + cr: resources, + ownsHost: ownsParent, + host: parent, + closed: make(chan struct{}), + templateID: templateID, + taskSpec: s, + isTemplate: false, + } + ht.init = newClonedExec( + ctx, + events, + req.ID, + parent, + system, + req.ID, + req.Bundle, + ht.isWCOW, + s.Process, + io) + + if parent != nil { + // We have a parent UVM. Listen for its exit and forcibly close this + // task. This is not expected but in the event of a UVM crash we need to + // handle this case. + go ht.waitForHostExit() + } + // In the normal case the `Signal` call from the caller killed this task's // init process. - go ht.waitInitExit() + go ht.waitInitExit(true) // Publish the created event ht.events.publishEvent( @@ -270,6 +385,22 @@ type hcsTask struct { // closeHostOnce is used to close `host`. This will only be used if // `ownsHost==true` and `host != nil`. closeHostOnce sync.Once + + // templateID represents the id of the template container from which this container + // is cloned. 
The parent UVM (inside which this container is running) identifies this + // container with it's original id (i.e the id that was assigned to this container + // at the time of template creation i.e the templateID). Hence, every request that + // is sent to the GCS must actually use templateID to reference this container. + // A non-empty templateID specifies that this task was cloned. + templateID string + + // if isTemplate is true then this container will be saved as a template as soon + // as its init process exits. Note: templateID and isTemplate are mutually exclusive. + // i.e isTemplate can not be true when templateID is not empty. + isTemplate bool + + // taskSpec represents the spec/configuration for this task. + taskSpec *specs.Spec } func (ht *hcsTask) ID() string { @@ -466,7 +597,7 @@ func (ht *hcsTask) Wait() *task.StateResponse { return ht.init.Wait() } -func (ht *hcsTask) waitInitExit() { +func (ht *hcsTask) waitInitExit(destroyContainer bool) { ctx, span := trace.StartSpan(context.Background(), "hcsTask::waitInitExit") defer span.End() span.AddAttributes(trace.StringAttribute("tid", ht.id)) @@ -474,8 +605,20 @@ func (ht *hcsTask) waitInitExit() { // Wait for it to exit on its own ht.init.Wait() - // Close the host and event the exit - ht.close(ctx) + if destroyContainer { + // Close the host and event the exit + ht.close(ctx) + } else { + // Close the container's host, but do not close or terminate the container itself + ht.closeHost(ctx) + } + + if ht.isTemplate { + // Save the host as a template + if err := saveAsTemplate(ctx, ht); err != nil { + log.G(ctx).WithError(err).Error("failed to save as template") + } + } } // waitForHostExit waits for the host virtual machine to exit. Once exited diff --git a/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go b/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go index 7085e8c03f..122c2bba6e 100644 --- a/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go +++ b/cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go @@ -7,6 +7,7 @@ import ( "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats" + "github.com/Microsoft/hcsshim/internal/clone" "github.com/Microsoft/hcsshim/internal/cmd" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/shimdiag" @@ -170,6 +171,10 @@ func (wpst *wcowPodSandboxTask) close(ctx context.Context) { if err := wpst.host.Close(); err != nil { log.G(ctx).WithError(err).Error("failed host vm shutdown") } + // cleanup template state if any exists + if err := clone.RemoveSavedTemplateConfig(wpst.host.ID()); err != nil { + log.G(ctx).WithError(err).Error("failed to cleanup template config state for vm") + } } // Send the `init` exec exit notification always. exit := wpst.init.Status() diff --git a/internal/clone/registry.go b/internal/clone/registry.go new file mode 100644 index 0000000000..ed5ef3dce4 --- /dev/null +++ b/internal/clone/registry.go @@ -0,0 +1,168 @@ +package clone + +import ( + "bytes" + "context" + "encoding/gob" + "fmt" + + "github.com/Microsoft/hcsshim/internal/regstate" + "github.com/Microsoft/hcsshim/internal/uvm" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + configRoot = "LateClone" + configKey = "UVMConfig" + templateConfigCurrentSerialVersionID = 1 +) + +// TemplateConfig struct maintains all of the information about a template. This includes +// the information for both the template container and the template UVM. 
This struct is +// serialized and stored in the registry and hence is version controlled. +// Note: Update the `templateConfigCurrentSerialVersionID` when this structure definition +// is changed. +type TemplateConfig struct { + SerialVersionID uint32 + TemplateUVMID string + TemplateUVMResources []uvm.Cloneable + TemplateUVMCreateOpts uvm.OptionsWCOW + TemplateContainerID string + // Below we store the container spec for the template container so that when + // cloning containers we can verify that a different spec is not provided for the + // cloned container. + TemplateContainerSpec specs.Spec +} + +// When encoding interfaces gob requires us to register the struct types that we will be +// using under those interfaces. This registration needs to happen on both sides i.e the +// side which encodes the data (i.e the shim process of the template) and the side which +// decodes the data (i.e the shim process of the clone). +// Go init function: https://golang.org/doc/effective_go.html#init +func init() { + // Register the pointer to structs because that is what is being stored. + gob.Register(&uvm.VSMBShare{}) + gob.Register(&uvm.SCSIMount{}) +} + +func encodeTemplateConfig(templateConfig *TemplateConfig) ([]byte, error) { + var buf bytes.Buffer + + encoder := gob.NewEncoder(&buf) + if err := encoder.Encode(templateConfig); err != nil { + return nil, fmt.Errorf("error while encoding template config: %s", err) + } + return buf.Bytes(), nil +} + +func decodeTemplateConfig(encodedBytes []byte) (*TemplateConfig, error) { + var templateConfig TemplateConfig + + reader := bytes.NewReader(encodedBytes) + decoder := gob.NewDecoder(reader) + if err := decoder.Decode(&templateConfig); err != nil { + return nil, fmt.Errorf("error while decoding template config: %s", err) + } + return &templateConfig, nil +} + +// loadPersistedUVMConfig loads a persisted config from the registry that matches the given ID +// If not found returns `regstate.NotFoundError` +func loadPersistedUVMConfig(id string) ([]byte, error) { + sk, err := regstate.Open(configRoot, false) + if err != nil { + return nil, err + } + defer sk.Close() + + var encodedConfig []byte + if err := sk.Get(id, configKey, &encodedConfig); err != nil { + return nil, err + } + return encodedConfig, nil +} + +// storePersistedUVMConfig stores the given config to the registry. +// If the store fails returns the store error. +func storePersistedUVMConfig(id string, encodedConfig []byte) error { + sk, err := regstate.Open(configRoot, false) + if err != nil { + return err + } + defer sk.Close() + + if err := sk.Create(id, configKey, encodedConfig); err != nil { + return err + } + return nil +} + +// removePersistedUVMConfig removes any persisted state associated with this config. If the config +// is not found in the registery `Remove` returns no error. +func removePersistedUVMConfig(id string) error { + sk, err := regstate.Open(configRoot, false) + if err != nil { + if regstate.IsNotFoundError(err) { + return nil + } + return err + } + defer sk.Close() + + if err := sk.Remove(id); err != nil { + if regstate.IsNotFoundError(err) { + return nil + } + return err + } + return nil +} + +// Saves all the information required to create a clone from the template +// of this container into the registry. 
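The init registration above matters because TemplateUVMResources holds values behind the uvm.Cloneable interface; gob can only round-trip an interface value if both the encoding process (the template's shim) and the decoding process (the clone's shim) have registered the concrete pointer types. Below is a small sketch of the round-trip, written as if it lived in this package; the function name and the literal values are illustrative only.

// exampleRoundTrip is an illustrative sketch of encoding and decoding a
// TemplateConfig. It relies on the gob.Register calls in init above, without
// which encoding the Cloneable slice would fail.
func exampleRoundTrip() error {
	in := &TemplateConfig{
		SerialVersionID:      templateConfigCurrentSerialVersionID,
		TemplateUVMID:        "example-template-uvm", // placeholder value
		TemplateUVMResources: []uvm.Cloneable{&uvm.VSMBShare{}, &uvm.SCSIMount{}},
	}

	encoded, err := encodeTemplateConfig(in)
	if err != nil {
		return err
	}

	out, err := decodeTemplateConfig(encoded)
	if err != nil {
		return err
	}
	// out.TemplateUVMID now equals in.TemplateUVMID and the resource slice has
	// been reconstructed using the registered concrete types.
	_ = out
	return nil
}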
+func SaveTemplateConfig(ctx context.Context, templateConfig *TemplateConfig) error { + _, err := loadPersistedUVMConfig(templateConfig.TemplateUVMID) + if !regstate.IsNotFoundError(err) { + return fmt.Errorf("parent VM(ID: %s) config shouldn't exit in registry (%s)", templateConfig.TemplateUVMID, err) + } + + // set the serial version before encoding + templateConfig.SerialVersionID = templateConfigCurrentSerialVersionID + + encodedBytes, err := encodeTemplateConfig(templateConfig) + if err != nil { + return fmt.Errorf("failed to encode template config: %s", err) + } + + if err := storePersistedUVMConfig(templateConfig.TemplateUVMID, encodedBytes); err != nil { + return fmt.Errorf("failed to store encoded template config: %s", err) + } + + return nil +} + +// Removes all the state associated with the template with given ID +// If there is no state associated with this ID then the function simply returns without +// doing anything. +func RemoveSavedTemplateConfig(id string) error { + return removePersistedUVMConfig(id) +} + +// Retrieves the UVMTemplateConfig for the template with given ID from the registry. +func FetchTemplateConfig(ctx context.Context, id string) (*TemplateConfig, error) { + encodedBytes, err := loadPersistedUVMConfig(id) + if err != nil { + return nil, fmt.Errorf("failed to fetch encoded template config: %s", err) + } + + templateConfig, err := decodeTemplateConfig(encodedBytes) + if err != nil { + return nil, fmt.Errorf("failed to decode template config: %s", err) + } + + if templateConfig.SerialVersionID != templateConfigCurrentSerialVersionID { + return nil, fmt.Errorf("serialized version of TemplateConfig: %d doesn't match with the current version: %d", templateConfig.SerialVersionID, templateConfigCurrentSerialVersionID) + } + + return templateConfig, nil +} diff --git a/internal/gcs/container.go b/internal/gcs/container.go index 218d138f00..8e4ca07905 100644 --- a/internal/gcs/container.go +++ b/internal/gcs/container.go @@ -61,6 +61,23 @@ func (gc *GuestConnection) CreateContainer(ctx context.Context, cid string, conf return c, nil } +// CloneContainer just creates the wrappers and sets up notification requests for a +// container that is already running inside the UVM (after cloning). +func (gc *GuestConnection) CloneContainer(ctx context.Context, cid string) (_ *Container, err error) { + c := &Container{ + gc: gc, + id: cid, + notifyCh: make(chan struct{}), + closeCh: make(chan struct{}), + } + err = gc.requestNotify(cid, c.notifyCh) + if err != nil { + return nil, err + } + go c.waitBackground() + return c, nil +} + // OS returns the operating system of the container, "linux" or "windows". func (c *Container) OS() string { return c.gc.os diff --git a/internal/hcsoci/clone.go b/internal/hcsoci/clone.go new file mode 100644 index 0000000000..4d26f21d0b --- /dev/null +++ b/internal/hcsoci/clone.go @@ -0,0 +1,50 @@ +// +build windows + +package hcsoci + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/cow" + "github.com/Microsoft/hcsshim/internal/requesttype" + hcsschema "github.com/Microsoft/hcsshim/internal/schema2" +) + +const ( + MappedDirectoryResourcePath = "Container/MappedDirectories" + MappedPipeResourcePath = "Container/MappedPipes" +) + +// Usually mounts specified in the container config are added in the container doc +// that is passed along with the container creation reuqest. 
However, for cloned containers +// we don't send any create container request so we must add the mounts one by one by +// doing Modify requests to that container. +func addMountsToClone(ctx context.Context, c cow.Container, mounts *mountsConfig) error { + // TODO(ambarve) : Find out if there is a way to send request for all the mounts + // at the same time to save time + for _, md := range mounts.mdsv2 { + requestDocument := &hcsschema.ModifySettingRequest{ + RequestType: requesttype.Add, + ResourcePath: MappedDirectoryResourcePath, + Settings: md, + } + err := c.Modify(ctx, requestDocument) + if err != nil { + return fmt.Errorf("error while adding mapped directory (%s) to the container: %s", md.HostPath, err) + } + } + + for _, mp := range mounts.mpsv2 { + requestDocument := &hcsschema.ModifySettingRequest{ + RequestType: requesttype.Add, + ResourcePath: MappedPipeResourcePath, + Settings: mp, + } + err := c.Modify(ctx, requestDocument) + if err != nil { + return fmt.Errorf("error while adding mapped pipe (%s) to the container: %s", mp.HostPath, err) + } + } + return nil +} diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go index 2283d3843b..f87adb1e99 100644 --- a/internal/hcsoci/create.go +++ b/internal/hcsoci/create.go @@ -11,6 +11,7 @@ import ( "strconv" "github.com/Microsoft/go-winio/pkg/guid" + "github.com/Microsoft/hcsshim/internal/clone" "github.com/Microsoft/hcsshim/internal/cow" "github.com/Microsoft/hcsshim/internal/hcs" "github.com/Microsoft/hcsshim/internal/log" @@ -65,26 +66,133 @@ type createOptionsInternal struct { actualOwner string // Owner for the container actualNetworkNamespace string ccgState *hcsschema.ContainerCredentialGuardState // Container Credential Guard information to be attached to HCS container document + isTemplate bool // Are we going to save this container as a template + templateID string // Template ID of the template from which this container is being cloned } -// CreateContainer creates a container. It can cope with a wide variety of -// scenarios, including v1 HCS schema calls, as well as more complex v2 HCS schema -// calls. Note we always return the resources that have been allocated, even in the -// case of an error. This provides support for the debugging option not to -// release the resources on failure, so that the client can make the necessary -// call to release resources that have been allocated as part of calling this function. -func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { +// compares two slices of strings and returns true if they are same, returns false otherwise. +// The elements in the slices don't have to be in the same order for them to be equal. +func cmpSlices(s1, s2 []string) bool { + equal := (len(s1) == len(s2)) + for i := 0; equal && i < len(s1); i++ { + found := false + for j := 0; !found && j < len(s2); j++ { + found = (s1[i] == s2[j]) + } + equal = equal && found + } + return equal +} + +// Compares to mount structs and returns true if they are equal, returns false otherwise. 
+func compareMounts(m1, m2 specs.Mount) bool { + return cmpSlices(m1.Options, m2.Options) && (m1.Source == m2.Source) && (m1.Destination == m2.Destination) && (m1.Type == m2.Type) +} + +// verifyCloneContainerSpecs compares the container creation spec provided during the template container +// creation and the spec provided during cloned container creation and checks that all the fields match +// (except for the certain fields that are allowed to be different). +func verifyCloneContainerSpecs(templateSpec, cloneSpec *specs.Spec) error { + // Following fields can be different in the template and clone specs. + // 1. Process + // 2. Annotations - Only the template/cloning related annotations can be different. + // 3. Windows.LayerFolders - Only the last i.e scratch layer can be different. + + if templateSpec.Version != cloneSpec.Version { + return fmt.Errorf("OCI Runtime Spec version of template (%s) doesn't match with the Spec version of clone (%s)", templateSpec.Version, cloneSpec.Version) + } + + // for annotations check that the values of memory & cpu annotations are same + if templateSpec.Annotations[oci.AnnotationContainerMemorySizeInMB] != cloneSpec.Annotations[oci.AnnotationContainerMemorySizeInMB] { + return fmt.Errorf("memory size limit for template and clone containers can not be different") + } + if templateSpec.Annotations[oci.AnnotationContainerProcessorCount] != cloneSpec.Annotations[oci.AnnotationContainerProcessorCount] { + return fmt.Errorf("processor count for template and clone containers can not be different") + } + if templateSpec.Annotations[oci.AnnotationContainerProcessorLimit] != cloneSpec.Annotations[oci.AnnotationContainerProcessorLimit] { + return fmt.Errorf("processor limit for template and clone containers can not be different") + } + + // LayerFolders should be identical except for the last element. + if !cmpSlices(templateSpec.Windows.LayerFolders[:len(templateSpec.Windows.LayerFolders)-1], cloneSpec.Windows.LayerFolders[:len(cloneSpec.Windows.LayerFolders)-1]) { + return fmt.Errorf("layers provided for template container and clone container don't match. Check the image specified in container config") + } + + if templateSpec.Windows.HyperV != cloneSpec.Windows.HyperV { + return fmt.Errorf("HyperV spec for template and clone containers can not be different") + } + + if templateSpec.Windows.Network.AllowUnqualifiedDNSQuery != cloneSpec.Windows.Network.AllowUnqualifiedDNSQuery { + return fmt.Errorf("different values for allow unqualified DNS query can not be provided for template and clones") + } + if templateSpec.Windows.Network.NetworkSharedContainerName != cloneSpec.Windows.Network.NetworkSharedContainerName { + return fmt.Errorf("different network shared name can not be provided for template and clones") + } + if !cmpSlices(templateSpec.Windows.Network.DNSSearchList, cloneSpec.Windows.Network.DNSSearchList) { + return fmt.Errorf("different DNS search list can not be provided for template and clones") + } + return nil +} + +func validateContainerConfig(ctx context.Context, coi *createOptionsInternal) error { + + if coi.HostingSystem != nil && coi.HostingSystem.IsTemplate && !coi.isTemplate { + return fmt.Errorf("only a template container can be created inside a template pod. 
Any other combination is not valid") + } + + if coi.HostingSystem != nil && coi.templateID != "" && !coi.HostingSystem.IsClone { + return fmt.Errorf("A container can not be cloned inside a non cloned POD") + } + + if coi.templateID != "" { + // verify that the configurations provided for the template for + // this clone are same. + tc, err := clone.FetchTemplateConfig(ctx, coi.HostingSystem.TemplateID) + if err != nil { + return fmt.Errorf("config validation failed : %s", err) + } + if err := verifyCloneContainerSpecs(&tc.TemplateContainerSpec, coi.Spec); err != nil { + return err + } + } + + if coi.HostingSystem != nil && coi.HostingSystem.IsTemplate { + if len(coi.Spec.Windows.Devices) != 0 { + return fmt.Errorf("Mapped Devices are not supported for template containers") + } + + if _, ok := coi.Spec.Windows.CredentialSpec.(string); ok { + return fmt.Errorf("gMSA specifications are not supported for template containers") + } + + if coi.Spec.Windows.Servicing { + return fmt.Errorf("template containers can't be started in servicing mode") + } + + // check that no mounts are specified. + if len(coi.Spec.Mounts) > 0 { + return fmt.Errorf("user specified mounts are not permitted for template containers") + } + } + return nil +} + +func initializeCreateOptions(ctx context.Context, createOptions *CreateOptions) (*createOptionsInternal, error) { coi := &createOptionsInternal{ CreateOptions: createOptions, actualID: createOptions.ID, actualOwner: createOptions.Owner, } + if coi.Spec == nil { + return nil, fmt.Errorf("Spec must be supplied") + } + // Defaults if omitted by caller. if coi.actualID == "" { g, err := guid.NewV4() if err != nil { - return nil, nil, err + return nil, err } coi.actualID = g.String() } @@ -92,10 +200,6 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C coi.actualOwner = filepath.Base(os.Args[0]) } - if coi.Spec == nil { - return nil, nil, fmt.Errorf("Spec must be supplied") - } - if coi.HostingSystem != nil { // By definition, a hosting system can only be supplied for a v2 Xenon. coi.actualSchemaVersion = schemaversion.SchemaV21() @@ -103,10 +207,64 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C coi.actualSchemaVersion = schemaversion.DetermineSchemaVersion(coi.SchemaVersion) } + coi.isTemplate = oci.ParseAnnotationsSaveAsTemplate(ctx, createOptions.Spec) + coi.templateID = oci.ParseAnnotationsTemplateID(ctx, createOptions.Spec) + log.G(ctx).WithFields(logrus.Fields{ "options": fmt.Sprintf("%+v", createOptions), "schema": coi.actualSchemaVersion, - }).Debug("hcsshim::CreateContainer") + }).Debug("hcsshim::initializeCreateOptions") + + return coi, nil +} + +// configureSandboxNetwork creates a new network namespace for the pod (sandbox) +// if required and then adds that namespace to the pod. +func configureSandboxNetwork(ctx context.Context, coi *createOptionsInternal, r *resources.Resources) error { + if coi.NetworkNamespace != "" { + r.SetNetNS(coi.NetworkNamespace) + } else { + err := createNetworkNamespace(ctx, coi, r) + if err != nil { + return err + } + } + coi.actualNetworkNamespace = r.NetNS() + + if coi.HostingSystem != nil { + ct, _, err := oci.GetSandboxTypeAndID(coi.Spec.Annotations) + if err != nil { + return err + } + // Only add the network namespace to a standalone or sandbox + // container but not a workload container in a sandbox that inherits + // the namespace. 
+ if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox { + if err = SetupNetworkNamespace(ctx, coi.HostingSystem, coi.actualNetworkNamespace); err != nil { + return err + } + r.SetAddedNetNSToVM(true) + } + } + + return nil +} + +// CreateContainer creates a container. It can cope with a wide variety of +// scenarios, including v1 HCS schema calls, as well as more complex v2 HCS schema +// calls. Note we always return the resources that have been allocated, even in the +// case of an error. This provides support for the debugging option not to +// release the resources on failure, so that the client can make the necessary +// call to release resources that have been allocated as part of calling this function. +func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { + coi, err := initializeCreateOptions(ctx, createOptions) + if err != nil { + return nil, nil, err + } + + if err := validateContainerConfig(ctx, coi); err != nil { + return nil, nil, fmt.Errorf("container config validation failed: %s", err) + } r := resources.NewContainerResources(createOptions.ID) defer func() { @@ -130,31 +288,10 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C if coi.Spec.Windows != nil && coi.Spec.Windows.Network != nil && schemaversion.IsV21(coi.actualSchemaVersion) { + err = configureSandboxNetwork(ctx, coi, r) + if err != nil { + return nil, r, fmt.Errorf("failure while creating namespace for container: %s", err) - if coi.NetworkNamespace != "" { - r.SetNetNS(coi.NetworkNamespace) - } else { - err := createNetworkNamespace(ctx, coi, r) - if err != nil { - return nil, r, err - } - } - coi.actualNetworkNamespace = r.NetNS() - if coi.HostingSystem != nil { - ct, _, err := oci.GetSandboxTypeAndID(coi.Spec.Annotations) - if err != nil { - return nil, r, err - } - // Only add the network namespace to a standalone or sandbox - // container but not a workload container in a sandbox that inherits - // the namespace. - if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox { - err = SetupNetworkNamespace(ctx, coi.HostingSystem, coi.actualNetworkNamespace) - if err != nil { - return nil, r, err - } - r.SetAddedNetNSToVM(true) - } } } @@ -224,6 +361,64 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C return system, r, nil } +// CloneContainer is similar to CreateContainer but it does not add layers or namespace like +// CreateContainer does. Also, instead of sending create container request it sends a modify +// request to an existing container. CloneContainer only works for WCOW. 
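Because CloneContainer (below) mirrors CreateContainer's signature, a caller only has to branch on whether a template ID was supplied; resource tracking and cleanup stay the same on both paths. A hedged caller-side sketch follows; the helper name createOrClone is hypothetical, and reusing the template ID as the container ID mirrors what the shim's cloned-task path does.

package example

import (
	"context"

	"github.com/Microsoft/hcsshim/internal/cow"
	"github.com/Microsoft/hcsshim/internal/hcsoci"
	"github.com/Microsoft/hcsshim/internal/resources"
)

// createOrClone is a hypothetical helper: it clones when a template ID is
// given and falls back to a regular create otherwise.
func createOrClone(ctx context.Context, opts *hcsoci.CreateOptions, templateID string) (cow.Container, *resources.Resources, error) {
	if templateID != "" {
		// The container already runs inside the cloned UVM under its
		// template-time ID, so address it by that ID and let CloneContainer
		// attach to it and hot-add its mounts.
		opts.ID = templateID
		return hcsoci.CloneContainer(ctx, opts)
	}
	return hcsoci.CreateContainer(ctx, opts)
}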
+func CloneContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.Container, _ *resources.Resources, err error) { + coi, err := initializeCreateOptions(ctx, createOptions) + if err != nil { + return nil, nil, err + } + + if err := validateContainerConfig(ctx, coi); err != nil { + return nil, nil, err + } + + if coi.Spec.Windows == nil || coi.HostingSystem == nil { + return nil, nil, fmt.Errorf("CloneContainer is only supported for Hyper-v isolated WCOW ") + } + + r := resources.NewContainerResources(createOptions.ID) + defer func() { + if err != nil { + if !coi.DoNotReleaseResourcesOnFailure { + resources.ReleaseResources(ctx, r, coi.HostingSystem, true) + } + } + }() + + if coi.HostingSystem != nil { + n := coi.HostingSystem.ContainerCounter() + if coi.Spec.Linux != nil { + r.SetContainerRootInUVM(fmt.Sprintf(lcowRootInUVM, createOptions.ID)) + } else { + r.SetContainerRootInUVM(fmt.Sprintf(wcowRootInUVM, strconv.FormatUint(n, 16))) + } + } + + if err = setupMounts(ctx, coi, r); err != nil { + return nil, r, err + } + + mounts, err := createMountsConfig(ctx, coi) + if err != nil { + return nil, r, err + } + + c, err := coi.HostingSystem.CloneContainer(ctx, coi.actualID) + if err != nil { + return nil, r, err + } + + // Everything that is usually added to the container during the createContainer + // request (via the gcsDocument) must be hot added here. + if err := addMountsToClone(ctx, c, mounts); err != nil { + return nil, r, err + } + + return c, r, nil +} + // isV2Xenon returns true if the create options are for a HCS schema V2 xenon container // with a hosting VM func (coi *createOptionsInternal) isV2Xenon() bool { diff --git a/internal/hcsoci/hcsdoc_wcow.go b/internal/hcsoci/hcsdoc_wcow.go index 01d8b5c361..a89992f256 100644 --- a/internal/hcsoci/hcsdoc_wcow.go +++ b/internal/hcsoci/hcsdoc_wcow.go @@ -23,6 +23,60 @@ import ( "github.com/sirupsen/logrus" ) +// A simple wrapper struct around the container mount configs that should be added to the +// container. +type mountsConfig struct { + mdsv1 []schema1.MappedDir + mpsv1 []schema1.MappedPipe + mdsv2 []hcsschema.MappedDirectory + mpsv2 []hcsschema.MappedPipe +} + +func createMountsConfig(ctx context.Context, coi *createOptionsInternal) (*mountsConfig, error) { + // Add the mounts as mapped directories or mapped pipes + // TODO: Mapped pipes to add in v2 schema. + var config mountsConfig + for _, mount := range coi.Spec.Mounts { + if mount.Type != "" { + return nil, fmt.Errorf("invalid container spec - Mount.Type '%s' must not be set", mount.Type) + } + if uvm.IsPipe(mount.Source) { + src, dst := uvm.GetContainerPipeMapping(coi.HostingSystem, mount) + config.mpsv1 = append(config.mpsv1, schema1.MappedPipe{HostPath: src, ContainerPipeName: dst}) + config.mpsv2 = append(config.mpsv2, hcsschema.MappedPipe{HostPath: src, ContainerPipeName: dst}) + } else { + readOnly := false + for _, o := range mount.Options { + if strings.ToLower(o) == "ro" { + readOnly = true + } + } + mdv1 := schema1.MappedDir{HostPath: mount.Source, ContainerPath: mount.Destination, ReadOnly: readOnly} + mdv2 := hcsschema.MappedDirectory{ContainerPath: mount.Destination, ReadOnly: readOnly} + if coi.HostingSystem == nil { + mdv2.HostPath = mount.Source + } else { + uvmPath, err := coi.HostingSystem.GetVSMBUvmPath(ctx, mount.Source, readOnly) + if err != nil { + if err == uvm.ErrNotAttached { + // It could also be a scsi mount. 
+ uvmPath, err = coi.HostingSystem.GetScsiUvmPath(ctx, mount.Source) + if err != nil { + return nil, err + } + } else { + return nil, err + } + } + mdv2.HostPath = uvmPath + } + config.mdsv1 = append(config.mdsv1, mdv1) + config.mdsv2 = append(config.mdsv2, mdv2) + } + } + return &config, nil +} + // createWindowsContainerDocument creates documents for passing to HCS or GCS to create // a container, both hosted and process isolated. It creates both v1 and v2 // container objects, WCOW only. The containers storage should have been mounted already. @@ -185,7 +239,7 @@ func createWindowsContainerDocument(ctx context.Context, coi *createOptionsInter // Use the reserved network namespace for containers created inside // cloned or template UVMs. - if coi.HostingSystem.IsTemplate || coi.HostingSystem.IsClone { + if coi.HostingSystem != nil && (coi.HostingSystem.IsTemplate || coi.HostingSystem.IsClone) { v2Container.Networking.Namespace = uvm.DEFAULT_CLONE_NETWORK_NAMESPACE_ID } else { v2Container.Networking.Namespace = coi.actualNetworkNamespace @@ -275,60 +329,17 @@ func createWindowsContainerDocument(ctx context.Context, coi *createOptionsInter } } - // Add the mounts as mapped directories or mapped pipes - // TODO: Mapped pipes to add in v2 schema. - var ( - mdsv1 []schema1.MappedDir - mpsv1 []schema1.MappedPipe - mdsv2 []hcsschema.MappedDirectory - mpsv2 []hcsschema.MappedPipe - ) - for _, mount := range coi.Spec.Mounts { - if mount.Type != "" { - return nil, nil, fmt.Errorf("invalid container spec - Mount.Type '%s' must not be set", mount.Type) - } - if uvm.IsPipe(mount.Source) { - src, dst := uvm.GetContainerPipeMapping(coi.HostingSystem, mount) - mpsv1 = append(mpsv1, schema1.MappedPipe{HostPath: src, ContainerPipeName: dst}) - mpsv2 = append(mpsv2, hcsschema.MappedPipe{HostPath: src, ContainerPipeName: dst}) - } else { - readOnly := false - for _, o := range mount.Options { - if strings.ToLower(o) == "ro" { - readOnly = true - } - } - mdv1 := schema1.MappedDir{HostPath: mount.Source, ContainerPath: mount.Destination, ReadOnly: readOnly} - mdv2 := hcsschema.MappedDirectory{ContainerPath: mount.Destination, ReadOnly: readOnly} - if coi.HostingSystem == nil { - mdv2.HostPath = mount.Source - } else { - uvmPath, err := coi.HostingSystem.GetVSMBUvmPath(ctx, mount.Source, readOnly) - if err != nil { - if err == uvm.ErrNotAttached { - // It could also be a scsi mount. 
- uvmPath, err = coi.HostingSystem.GetScsiUvmPath(ctx, mount.Source) - if err != nil { - return nil, nil, err - } - } else { - return nil, nil, err - } - } - mdv2.HostPath = uvmPath - } - mdsv1 = append(mdsv1, mdv1) - mdsv2 = append(mdsv2, mdv2) - } + mounts, err := createMountsConfig(ctx, coi) + if err != nil { + return nil, nil, err } - - v1.MappedDirectories = mdsv1 - v2Container.MappedDirectories = mdsv2 - if len(mpsv1) > 0 && osversion.Get().Build < osversion.RS3 { + v1.MappedDirectories = mounts.mdsv1 + v2Container.MappedDirectories = mounts.mdsv2 + if len(mounts.mpsv1) > 0 && osversion.Get().Build < osversion.RS3 { return nil, nil, fmt.Errorf("named pipe mounts are not supported on this version of Windows") } - v1.MappedPipes = mpsv1 - v2Container.MappedPipes = mpsv2 + v1.MappedPipes = mounts.mpsv1 + v2Container.MappedPipes = mounts.mpsv2 // add assigned devices to the container definition if err := parseAssignedDevices(ctx, coi, v2Container); err != nil { diff --git a/internal/hcsoci/resources_wcow.go b/internal/hcsoci/resources_wcow.go index 9ace171729..0ef3ea5482 100644 --- a/internal/hcsoci/resources_wcow.go +++ b/internal/hcsoci/resources_wcow.go @@ -60,6 +60,53 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r r.SetLayers(layers) } + if err := setupMounts(ctx, coi, r); err != nil { + return err + } + + if cs, ok := coi.Spec.Windows.CredentialSpec.(string); ok { + // Only need to create a CCG instance for v2 containers + if schemaversion.IsV21(coi.actualSchemaVersion) { + hypervisorIsolated := coi.HostingSystem != nil + ccgInstance, ccgResource, err := credentials.CreateCredentialGuard(ctx, coi.actualID, cs, hypervisorIsolated) + if err != nil { + return err + } + coi.ccgState = ccgInstance.CredentialGuard + r.Add(ccgResource) + if hypervisorIsolated { + // If hypervisor isolated we need to add an hvsocket service table entry + // By default HVSocket won't allow something inside the VM to connect + // back to a process on the host. We need to update the HVSocket service table + // to allow a connection to CCG.exe on the host, so that GMSA can function. + // We need to hot add this here because at UVM creation time we don't know what containers + // will be launched in the UVM, nonetheless if they will ask for GMSA. This is a workaround + // for the previous design requirement for CCG V2 where the service entry + // must be present in the UVM'S HCS document before being sent over as hot adding + // an HvSocket service was not possible. + hvSockConfig := ccgInstance.HvSocketConfig + if err := coi.HostingSystem.UpdateHvSocketService(ctx, hvSockConfig.ServiceId, hvSockConfig.ServiceConfig); err != nil { + return fmt.Errorf("failed to update hvsocket service: %s", err) + } + } + } + } + + if coi.HostingSystem != nil && coi.hasWindowsAssignedDevices() { + windowsDevices, closers, err := handleAssignedDevicesWindows(ctx, coi.HostingSystem, coi.Spec.Annotations, coi.Spec.Windows.Devices) + if err != nil { + return err + } + r.Add(closers...) + coi.Spec.Windows.Devices = windowsDevices + } + + return nil +} + +// setupMount adds the custom mounts requested in the container configuration of this +// request. +func setupMounts(ctx context.Context, coi *createOptionsInternal, r *resources.Resources) error { // Validate each of the mounts. If this is a V2 Xenon, we have to add them as // VSMB shares to the utility VM. For V1 Xenon and Argons, there's nothing for // us to do as it's done by HCS. 
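For the hosted (V2 Xenon) case described in the comment above, each validated mount source ends up as a VSMB share inside the utility VM, and template UVMs additionally rewrite the share options so the share remains usable across save/restore. A minimal sketch of that pattern, modeled on the layer-mounting change elsewhere in this series; the helper name shareIntoUVM is hypothetical.

package example

import (
	"context"

	"github.com/Microsoft/hcsshim/internal/uvm"
)

// shareIntoUVM is a hypothetical helper: it adds hostPath to the utility VM as
// a VSMB share, switching to the saveable option set when the UVM is a template.
func shareIntoUVM(ctx context.Context, vm *uvm.UtilityVM, hostPath string, readOnly bool) error {
	options := vm.DefaultVSMBOptions(readOnly)
	if vm.IsTemplate {
		// Template UVMs need options that survive being saved and later cloned.
		vm.SetSaveableVSMBOptions(options, options.ReadOnly)
	}
	_, err := vm.AddVSMB(ctx, hostPath, options)
	return err
}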
@@ -121,42 +168,5 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r } } - if cs, ok := coi.Spec.Windows.CredentialSpec.(string); ok { - // Only need to create a CCG instance for v2 containers - if schemaversion.IsV21(coi.actualSchemaVersion) { - hypervisorIsolated := coi.HostingSystem != nil - ccgInstance, ccgResource, err := credentials.CreateCredentialGuard(ctx, coi.actualID, cs, hypervisorIsolated) - if err != nil { - return err - } - coi.ccgState = ccgInstance.CredentialGuard - r.Add(ccgResource) - if hypervisorIsolated { - // If hypervisor isolated we need to add an hvsocket service table entry - // By default HVSocket won't allow something inside the VM to connect - // back to a process on the host. We need to update the HVSocket service table - // to allow a connection to CCG.exe on the host, so that GMSA can function. - // We need to hot add this here because at UVM creation time we don't know what containers - // will be launched in the UVM, nonetheless if they will ask for GMSA. This is a workaround - // for the previous design requirement for CCG V2 where the service entry - // must be present in the UVM'S HCS document before being sent over as hot adding - // an HvSocket service was not possible. - hvSockConfig := ccgInstance.HvSocketConfig - if err := coi.HostingSystem.UpdateHvSocketService(ctx, hvSockConfig.ServiceId, hvSockConfig.ServiceConfig); err != nil { - return fmt.Errorf("failed to update hvsocket service: %s", err) - } - } - } - } - - if coi.HostingSystem != nil && coi.hasWindowsAssignedDevices() { - windowsDevices, closers, err := handleAssignedDevicesWindows(ctx, coi.HostingSystem, coi.Spec.Annotations, coi.Spec.Windows.Devices) - if err != nil { - return err - } - r.Add(closers...) - coi.Spec.Windows.Devices = windowsDevices - } - return nil } diff --git a/internal/layers/layers.go b/internal/layers/layers.go index dcdce43617..c21234f8c0 100644 --- a/internal/layers/layers.go +++ b/internal/layers/layers.go @@ -125,6 +125,9 @@ func MountContainerLayers(ctx context.Context, layerFolders []string, guestRoot if uvm.OS() == "windows" { options := uvm.DefaultVSMBOptions(true) options.TakeBackupPrivilege = true + if uvm.IsTemplate { + uvm.SetSaveableVSMBOptions(options, options.ReadOnly) + } if _, err := uvm.AddVSMB(ctx, layerPath, options); err != nil { return "", fmt.Errorf("failed to add VSMB layer: %s", err) } diff --git a/internal/oci/uvm.go b/internal/oci/uvm.go index 9c8003cd25..3d57222804 100644 --- a/internal/oci/uvm.go +++ b/internal/oci/uvm.go @@ -3,10 +3,12 @@ package oci import ( "context" "errors" + "fmt" "strconv" "strings" runhcsopts "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" + "github.com/Microsoft/hcsshim/internal/clone" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" "github.com/Microsoft/hcsshim/internal/uvm" @@ -138,6 +140,21 @@ const ( // annotation used to specify the cpugroup ID that a UVM should be assigned to annotationCPUGroupID = "io.microsoft.virtualmachine.cpugroup.id" + + // SaveAsTemplate annotation must be used with a pod & container creation request. + // If this annotation is present in the request then it will save the UVM (pod) + // and the container(s) inside it as a template. However, this also means that this + // pod and the containers inside this pod will permananetly stay in the + // paused/templated state and can not be resumed again. 
+ annotationSaveAsTemplate = "io.microsoft.virtualmachine.saveastemplate" + + // This annotation should be used when creating a pod or a container from a template. + // When creating a pod from a template use the ID of the templated pod as the + // TemplateID and when creating a container use the ID of the templated container as + // the TemplateID. It is the client's responsibility to make sure that the sandbox + // within which a cloned container needs to be created must also be created from the + // same template. + annotationTemplateID = "io.microsoft.virtualmachine.templateid" ) // parseAnnotationsBool searches `a` for `key` and if found verifies that the @@ -328,6 +345,32 @@ func parseAnnotationsString(a map[string]string, key string, def string) string return def } +// ParseAnnotationsSaveAsTemplate searches for the boolean value which specifies +// if this create request should be considered as a template creation request. If value +// is found the returns the actual value, returns false otherwise. +func ParseAnnotationsSaveAsTemplate(ctx context.Context, s *specs.Spec) bool { + return parseAnnotationsBool(ctx, s.Annotations, annotationSaveAsTemplate, false) +} + +// ParseAnnotationsTemplateID searches for the templateID in the create request. If the +// value is found then returns the value otherwise returns the empty string. +func ParseAnnotationsTemplateID(ctx context.Context, s *specs.Spec) string { + return parseAnnotationsString(s.Annotations, annotationTemplateID, "") +} + +func ParseCloneAnnotations(ctx context.Context, s *specs.Spec) (isTemplate bool, templateID string, err error) { + templateID = ParseAnnotationsTemplateID(ctx, s) + isTemplate = ParseAnnotationsSaveAsTemplate(ctx, s) + if templateID != "" && isTemplate { + return false, "", fmt.Errorf("templateID and save as template flags can not be passed in the same request") + } + + if (isTemplate || templateID != "") && !IsWCOW(s) { + return false, "", fmt.Errorf("save as template and creating clones is only available for WCOW") + } + return +} + // handleAnnotationKernelDirectBoot handles parsing annotationKernelDirectBoot and setting // implied annotations from the result. func handleAnnotationKernelDirectBoot(ctx context.Context, a map[string]string, lopts *uvm.OptionsLCOW) { @@ -369,6 +412,26 @@ func handleAnnotationFullyPhysicallyBacked(ctx context.Context, a map[string]str } } +// handleCloneAnnotations handles parsing annotations related to template creation and cloning +// Since late cloning is only supported for WCOW this function only deals with WCOW options. +func handleCloneAnnotations(ctx context.Context, a map[string]string, wopts *uvm.OptionsWCOW) (err error) { + wopts.IsTemplate = parseAnnotationsBool(ctx, a, annotationSaveAsTemplate, false) + templateID := parseAnnotationsString(a, annotationTemplateID, "") + if templateID != "" { + tc, err := clone.FetchTemplateConfig(ctx, templateID) + if err != nil { + return err + } + wopts.TemplateConfig = &uvm.UVMTemplateConfig{ + UVMID: tc.TemplateUVMID, + CreateOpts: tc.TemplateUVMCreateOpts, + Resources: tc.TemplateUVMResources, + } + wopts.IsClone = true + } + return nil +} + // SpecToUVMCreateOpts parses `s` and returns either `*uvm.OptionsLCOW` or // `*uvm.OptionsWCOW`. 
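Taken together, the two annotations above are the entire client-facing surface of this feature: mark the template pod/container request with the save-as-template annotation, and mark the clone request with the template-ID annotation pointing at the template's ID. ParseCloneAnnotations rejects a request that sets both, and both are WCOW-only. Below is a hedged sketch of how a client might populate the OCI specs; the function name and the placeholder ID are illustrative.

package example

import specs "github.com/opencontainers/runtime-spec/specs-go"

// cloneSpecs is a hypothetical illustration of the annotations a client sets.
func cloneSpecs(templateID string) (templateSpec, cloneSpec *specs.Spec) {
	// Template request: the pod/container is saved once its init process exits
	// and stays permanently in the templated state afterwards.
	templateSpec = &specs.Spec{
		Annotations: map[string]string{
			"io.microsoft.virtualmachine.saveastemplate": "true",
		},
	}
	// Clone request: reference the previously created template by its ID. The
	// sandbox of a cloned container must itself be cloned from the same template.
	cloneSpec = &specs.Spec{
		Annotations: map[string]string{
			"io.microsoft.virtualmachine.templateid": templateID,
		},
	}
	return templateSpec, cloneSpec
}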
func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) (interface{}, error) { @@ -419,6 +482,9 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( wopts.DisableCompartmentNamespace = parseAnnotationsBool(ctx, s.Annotations, annotationDisableCompartmentNamespace, wopts.DisableCompartmentNamespace) wopts.CPUGroupID = parseAnnotationsString(s.Annotations, annotationCPUGroupID, wopts.CPUGroupID) handleAnnotationFullyPhysicallyBacked(ctx, s.Annotations, wopts) + if err := handleCloneAnnotations(ctx, s.Annotations, wopts); err != nil { + return nil, err + } return wopts, nil } return nil, errors.New("cannot create UVM opts spec is not LCOW or WCOW") diff --git a/internal/uvm/clone.go b/internal/uvm/clone.go index 7d25713e0e..5a7977b482 100644 --- a/internal/uvm/clone.go +++ b/internal/uvm/clone.go @@ -2,7 +2,9 @@ package uvm import ( "context" + "fmt" + "github.com/Microsoft/hcsshim/internal/cow" hcsschema "github.com/Microsoft/hcsshim/internal/schema2" "github.com/pkg/errors" ) @@ -50,7 +52,7 @@ type Cloneable interface { Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) error } -// cloneData contains all the information that might be required during cloning process of +// A struct to keep all the information that might be required during cloning process of // a resource. type cloneData struct { // doc spec for the clone @@ -68,15 +70,22 @@ type UVMTemplateConfig struct { UVMID string // Array of all resources that will be required while making a clone from this template Resources []Cloneable + // The OptionsWCOW used for template uvm creation + CreateOpts OptionsWCOW } // Captures all the information that is necessary to properly save this UVM as a template // and create clones from this template later. The struct returned by this method must be // later on made available while creating a clone from this template. -func (uvm *UtilityVM) GenerateTemplateConfig() *UVMTemplateConfig { +func (uvm *UtilityVM) GenerateTemplateConfig() (*UVMTemplateConfig, error) { + if _, ok := uvm.createOpts.(OptionsWCOW); !ok { + return nil, fmt.Errorf("template config can only be created for a WCOW uvm") + } + // Add all the SCSI Mounts and VSMB shares into the list of clones templateConfig := &UVMTemplateConfig{ - UVMID: uvm.ID(), + UVMID: uvm.ID(), + CreateOpts: uvm.createOpts.(OptionsWCOW), } for _, vsmbShare := range uvm.vsmbDirShares { @@ -95,13 +104,13 @@ func (uvm *UtilityVM) GenerateTemplateConfig() *UVMTemplateConfig { } } - return templateConfig + return templateConfig, nil } // Pauses the uvm and then saves it as a template. This uvm can not be restarted or used // after it is successfully saved. -// uvm must be in the paused state before we attempt to save it. save call will throw the -// VM in incorrect state exception if it is not in the paused state at the time of saving. +// uvm must be in the paused state before it can be saved as a template.save call will throw +// an incorrect uvm state exception if uvm is not in the paused state at the time of saving. 
func (uvm *UtilityVM) SaveAsTemplate(ctx context.Context) error { if err := uvm.hcsSystem.Pause(ctx); err != nil { return errors.Wrap(err, "error pausing the VM") @@ -115,3 +124,16 @@ func (uvm *UtilityVM) SaveAsTemplate(ctx context.Context) error { } return nil } + +// CloneContainer attaches back to a container that is already running inside the UVM +// because of the clone +func (uvm *UtilityVM) CloneContainer(ctx context.Context, id string) (cow.Container, error) { + if uvm.gc == nil { + return nil, fmt.Errorf("clone container cannot work without external GCS connection") + } + c, err := uvm.gc.CloneContainer(ctx, id) + if err != nil { + return nil, fmt.Errorf("failed to clone container %s: %s", id, err) + } + return c, nil +} diff --git a/internal/uvm/create.go b/internal/uvm/create.go index 5f69b015b2..47eda3cca9 100644 --- a/internal/uvm/create.go +++ b/internal/uvm/create.go @@ -87,6 +87,37 @@ type Options struct { CPUGroupID string } +// compares the create opts used during template creation with the create opts +// provided for clone creation. If they don't match (except for a few fields) +// then clone creation is failed. +func verifyCloneUvmCreateOpts(templateOpts, cloneOpts *OptionsWCOW) bool { + // Following fields can be different in the template and clone configurations. + // 1. the scratch layer path. i.e the last element of the LayerFolders path. + // 2. IsTemplate, IsClone and TemplateConfig variables. + // 3. ID + // 4. AdditionalHCSDocumentJSON + + // Save the original values of the fields that we want to ignore and replace them with + // the same values as that of the other object. So that we can simply use `==` operator. + templateIDBackup := templateOpts.ID + templateAdditionalJsonBackup := templateOpts.AdditionHCSDocumentJSON + templateOpts.ID = cloneOpts.ID + templateOpts.AdditionHCSDocumentJSON = cloneOpts.AdditionHCSDocumentJSON + + // We can't use `==` operator on structs which include slices in them. So compare the + // Layerfolders separately and then directly compare the Options struct. + result := (len(templateOpts.LayerFolders) == len(cloneOpts.LayerFolders)) + for i := 0; result && i < len(templateOpts.LayerFolders)-1; i++ { + result = result && (templateOpts.LayerFolders[i] == cloneOpts.LayerFolders[i]) + } + result = result && (*templateOpts.Options == *cloneOpts.Options) + + // set original values + templateOpts.ID = templateIDBackup + templateOpts.AdditionHCSDocumentJSON = templateAdditionalJsonBackup + return result +} + // Verifies that the final UVM options are correct and supported. 
func verifyOptions(ctx context.Context, options interface{}) error { switch opts := options.(type) { @@ -123,9 +154,18 @@ func verifyOptions(ctx context.Context, options interface{}) error { if len(opts.LayerFolders) < 2 { return errors.New("at least 2 LayerFolders must be supplied") } + if opts.IsClone && !verifyCloneUvmCreateOpts(&opts.TemplateConfig.CreateOpts, opts) { + return errors.New("clone configuration doesn't match with template configuration.") + } if opts.IsClone && opts.TemplateConfig == nil { return errors.New("template config can not be nil when creating clone") } + if opts.IsClone && !opts.ExternalGuestConnection { + return errors.New("External gcs connection can not be disabled for clones") + } + if opts.IsTemplate && opts.FullyPhysicallyBacked { + return errors.New("Template can not be created from a full physically backed UVM") + } } return nil } @@ -211,7 +251,7 @@ func (uvm *UtilityVM) Close() (err error) { } if err := uvm.CloseGCSConnection(); err != nil { - log.G(ctx).Errorf("close gcs connection failed: %f", err) + log.G(ctx).Errorf("close GCS connection failed: %s", err) } // outputListener will only be nil for a Create -> Stop without a Start. In diff --git a/internal/uvm/create_lcow.go b/internal/uvm/create_lcow.go index 1ce25e01f7..1b3308e0ba 100644 --- a/internal/uvm/create_lcow.go +++ b/internal/uvm/create_lcow.go @@ -168,6 +168,7 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error physicallyBacked: !opts.AllowOvercommit, devicesPhysicallyBacked: opts.FullyPhysicallyBacked, cpuGroupID: opts.CPUGroupID, + createOpts: opts, } defer func() { diff --git a/internal/uvm/create_wcow.go b/internal/uvm/create_wcow.go index 40a53b72fb..79bd93ec7e 100644 --- a/internal/uvm/create_wcow.go +++ b/internal/uvm/create_wcow.go @@ -215,6 +215,7 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error physicallyBacked: !opts.AllowOvercommit, devicesPhysicallyBacked: opts.FullyPhysicallyBacked, cpuGroupID: opts.CPUGroupID, + createOpts: *opts, } defer func() { @@ -277,11 +278,7 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error }, } - uvm.scsiLocations[0][0] = &SCSIMount{ - vm: uvm, - HostPath: doc.VirtualMachine.Devices.Scsi["0"].Attachments["0"].Path, - refCount: 1, - } + uvm.scsiLocations[0][0] = newSCSIMount(uvm, doc.VirtualMachine.Devices.Scsi["0"].Attachments["0"].Path, "", "", 1, 0, 0, false) } else { doc.VirtualMachine.RestoreState = &hcsschema.RestoreState{} doc.VirtualMachine.RestoreState.TemplateSystemId = opts.TemplateConfig.UVMID @@ -305,14 +302,13 @@ func CreateWCOW(ctx context.Context, opts *OptionsWCOW) (_ *UtilityVM, err error nics: make(map[string]*nicInfo), } uvm.IsClone = true + uvm.TemplateID = opts.TemplateConfig.UVMID } // Add appropriate VSMB share options if this UVM needs to be saved as a template if opts.IsTemplate { for _, share := range doc.VirtualMachine.Devices.VirtualSmb.Shares { - share.Options.PseudoDirnotify = true - share.Options.NoLocks = true - share.Options.NoDirectmap = true + uvm.SetSaveableVSMBOptions(share.Options, share.Options.ReadOnly) } uvm.IsTemplate = true } diff --git a/internal/uvm/scsi.go b/internal/uvm/scsi.go index 71fca379ba..a183204cb1 100644 --- a/internal/uvm/scsi.go +++ b/internal/uvm/scsi.go @@ -453,17 +453,21 @@ func (sm *SCSIMount) Clone(ctx context.Context, vm *UtilityVM, cd *cloneData) er ) if !sm.readOnly { - // Copy this scsi disk - // TODO(ambarve): This is a writeable SCSI mount. 
It can either be the - // scratch VHD of the UVM or it can be a SCSI mount that belongs to some - // container which is being automatically cloned here as a part of UVM - // cloning process. We will receive a request for creation of this - // container later on which will specify the storage path for this - // container. However, that storage location is not available now so we - // just use the storage of the uvm instead. Find a better way for handling - // this. Problem with this approach is that the scratch VHD of the container - // will not be automatically cleaned after container exits. It will stay - // there as long as the UVM keeps running. + // This is a writeable SCSI mount. It must be either the + // 1. scratch VHD of the UVM or + // 2. scratch VHD of the container. + // A user provided writable SCSI mount is not allowed on the template UVM + // or container and so this SCSI mount has to be the scratch VHD of the + // UVM or container. The container inside this UVM will automatically be + // cloned here when we are cloning the uvm itself. We will receive a + // request for creation of this container later and that request will + // specify the storage path for this container. However, that storage + // location is not available now so we just use the storage path of the + // uvm instead. + // TODO(ambarve): Find a better way for handling this. Problem with this + // approach is that the scratch VHD of the container will not be + // automatically cleaned after container exits. It will stay there as long + // as the UVM keeps running. // For the scratch VHD of the VM (always attached at Controller:0, LUN:0) // clone it in the scratch folder diff --git a/internal/uvm/types.go b/internal/uvm/types.go index 7d25ba2f0e..5f29166edf 100644 --- a/internal/uvm/types.go +++ b/internal/uvm/types.go @@ -122,4 +122,12 @@ type UtilityVM struct { // specifies if this UVM is a cloned from a template IsClone bool + + // ID of the template from which this clone was created. Only applies when IsClone + // is true + TemplateID string + + // The CreateOpts used to create this uvm. These can be either of type + // uvm.OptionsLCOW or uvm.OptionsWCOW + createOpts interface{} } diff --git a/internal/uvm/vsmb.go b/internal/uvm/vsmb.go index 201c7bb3ed..e15fb42554 100644 --- a/internal/uvm/vsmb.go +++ b/internal/uvm/vsmb.go @@ -60,6 +60,28 @@ func (uvm *UtilityVM) DefaultVSMBOptions(readOnly bool) *hcsschema.VirtualSmbSha return opts } +func (uvm *UtilityVM) SetSaveableVSMBOptions(opts *hcsschema.VirtualSmbShareOptions, readOnly bool) { + if readOnly { + opts.ShareRead = true + opts.CacheIo = true + opts.ReadOnly = true + opts.PseudoOplocks = true + opts.NoOplocks = false + } else { + // Using NoOpLocks can cause intermittent Access denied failures due to + // a VSMB bug that was fixed but not backported to RS5/19H1. + opts.ShareRead = false + opts.CacheIo = false + opts.ReadOnly = false + opts.PseudoOplocks = false + opts.NoOplocks = true + } + opts.NoLocks = true + opts.PseudoDirnotify = true + opts.NoDirectmap = true + return +} + // findVSMBShare finds a share by `hostPath`. If not found returns `ErrNotAttached`. func (uvm *UtilityVM) findVSMBShare(ctx context.Context, m map[string]*VSMBShare, shareKey string) (*VSMBShare, error) { share, ok := m[shareKey]