Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 45 additions & 9 deletions internal/layers/layers.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"path/filepath"

hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
"github.com/Microsoft/hcsshim/internal/hcserror"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/ospath"
"github.com/Microsoft/hcsshim/internal/uvm"
Expand Down Expand Up @@ -84,21 +85,56 @@ func MountContainerLayers(ctx context.Context, layerFolders []string, guestRoot,
}
path := layerFolders[len(layerFolders)-1]
rest := layerFolders[:len(layerFolders)-1]
if err := wclayer.ActivateLayer(ctx, path); err != nil {
return "", err
}
defer func() {
if err != nil {
_ = wclayer.DeactivateLayer(ctx, path)
// Simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manor on RS5 (ws2019) which led to some
// very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many
// parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths
// that bring about the errors we have observed aren't known.
//
// On 19h1+ this *shouldn't* be needed, but the logic is to break if everything succeeded so this is harmless and shouldn't need a version check.
var lErr error
for i := 0; i < 5; i++ {
lErr = func() (err error) {
if err := wclayer.ActivateLayer(ctx, path); err != nil {
return err
}

defer func() {
if err != nil {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't err need to be set to PrepareLayer result for the deferred DeactivateLayer to execute?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, if you have a named return value, e.g. (err error) then the return value of line 107 or the PrepareLayer call will get assigned to err after completion. So when defer runs it will have the return value of PrepareLayer to check against.

Here's a quick example: https://play.golang.org/p/cID3RHPwl88

_ = wclayer.DeactivateLayer(ctx, path)
}
}()

return wclayer.PrepareLayer(ctx, path, rest)
}()

if lErr != nil {
// Common errors seen from the RS5 behavior mentioned above is ERROR_NOT_READY and ERROR_DEVICE_NOT_CONNECTED. The former occurs when HCS
// tries to grab the volume path of the disk but it doesn't succeed, usually because the disk isn't actually mounted. DEVICE_NOT_CONNECTED
// has been observed after launching multiple containers in parallel on a machine under high load. This has also been observed to be a trigger
// for ERROR_NOT_READY as well.
if hcserr, ok := lErr.(*hcserror.HcsError); ok {
if hcserr.Err == windows.ERROR_NOT_READY || hcserr.Err == windows.ERROR_DEVICE_NOT_CONNECTED {
continue
}
}
// This was a failure case outside of the commonly known error conditions, don't retry here.
return "", lErr
}
}()

if err := wclayer.PrepareLayer(ctx, path, rest); err != nil {
return "", err
// No errors in layer setup, we can leave the loop
break
}
// If we got unlucky and ran into one of the two errors mentioned five times in a row and left the loop, we need to check
// the loop error here and fail also.
if lErr != nil {
return "", errors.Wrap(lErr, "layer retry loop failed")
}

// If any of the below fails, we want to detach the filter and unmount the disk.
defer func() {
if err != nil {
_ = wclayer.UnprepareLayer(ctx, path)
_ = wclayer.DeactivateLayer(ctx, path)
}
}()

Expand Down