From 3fcca0952ecdd5acd1cf092abf50f23c0cab3725 Mon Sep 17 00:00:00 2001 From: Daniel Canter Date: Mon, 2 Aug 2021 16:44:34 -0700 Subject: [PATCH 1/2] Add retry around wclayer operations for process isolated containers This change adds a simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manner on RS5 (ws2019) which led to some very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths that bring about the errors we have observed aren't known. On 19h1+ this retry loop shouldn't be needed, but the logic is to leave the loop if everything succeeded so this is harmless and shouldn't need a version check. Signed-off-by: Daniel Canter (cherry picked from commit 01b99119beb113ad3c5c4aa39f55b2e30c2951da) Signed-off-by: Daniel Canter --- internal/layers/layers.go | 55 ++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/internal/layers/layers.go b/internal/layers/layers.go index b5ab18a7ca..f3f358a67c 100644 --- a/internal/layers/layers.go +++ b/internal/layers/layers.go @@ -9,6 +9,7 @@ import ( "path/filepath" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" + "github.com/Microsoft/hcsshim/internal/hcserror" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/ospath" "github.com/Microsoft/hcsshim/internal/uvm" @@ -16,6 +17,7 @@ import ( "github.com/Microsoft/hcsshim/internal/wclayer" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "golang.org/x/sys/windows" ) // ImageLayers contains all the layers for an image. 
@@ -78,21 +80,56 @@ func MountContainerLayers(ctx context.Context, layerFolders []string, guestRoot } path := layerFolders[len(layerFolders)-1] rest := layerFolders[:len(layerFolders)-1] - if err := wclayer.ActivateLayer(ctx, path); err != nil { - return "", err - } - defer func() { - if err != nil { - _ = wclayer.DeactivateLayer(ctx, path) + // Simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manner on RS5 (ws2019) which led to some + // very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many + // parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths + // that bring about the errors we have observed aren't known. + // + // On 19h1+ this *shouldn't* be needed, but the logic is to break if everything succeeded so this is harmless and shouldn't need a version check. + var lErr error + for i := 0; i < 5; i++ { + lErr = func() (err error) { + if err := wclayer.ActivateLayer(ctx, path); err != nil { + return err + } + + defer func() { + if err != nil { + _ = wclayer.DeactivateLayer(ctx, path) + } + }() + + return wclayer.PrepareLayer(ctx, path, rest) + }() + + if lErr != nil { + // Common errors seen from the RS5 behavior mentioned above are ERROR_NOT_READY and ERROR_DEVICE_NOT_CONNECTED. The former occurs when HCS + // tries to grab the volume path of the disk but it doesn't succeed, usually because the disk isn't actually mounted. DEVICE_NOT_CONNECTED + // has been observed after launching multiple containers in parallel on a machine under high load. This has also been observed to be a trigger + // for ERROR_NOT_READY as well. 
+ if hcserr, ok := lErr.(*hcserror.HcsError); ok { + if hcserr.Err == windows.ERROR_NOT_READY || hcserr.Err == windows.ERROR_DEVICE_NOT_CONNECTED { + continue + } + } + // This was a failure case outside of the commonly known error conditions, don't retry here. + return "", lErr } - }() - if err := wclayer.PrepareLayer(ctx, path, rest); err != nil { - return "", err + // No errors in layer setup, we can leave the loop + break } + // If we got unlucky and ran into one of the two errors mentioned five times in a row and left the loop, we need to check + // the loop error here and fail also. + if lErr != nil { + return "", errors.Wrap(lErr, "layer retry loop failed") + } + + // If any of the below fails, we want to detach the filter and unmount the disk. defer func() { if err != nil { _ = wclayer.UnprepareLayer(ctx, path) + _ = wclayer.DeactivateLayer(ctx, path) } }() From 8d341a4750814aa261bf00e325050a95f193955f Mon Sep 17 00:00:00 2001 From: Daniel Canter Date: Wed, 18 Aug 2021 16:05:28 -0700 Subject: [PATCH 2/2] Add sleep before layer operation retries This change adds a small sleep before a re-attempt on layer operation failures. These failures should only happen on RS5 and the probable cause is the different way in which container loopback VHDs were mounted on this OS version. A theory of why things might go awry on RS5 is that some events from pnp get reported too late/early. If this diagnosis is correct, a small sleep might help to try and get things back into a "good" state before a reattempt. 
Signed-off-by: Daniel Canter (cherry picked from commit adc35b064429d1e80be4bf5022a0f8504a9197ae) Signed-off-by: Daniel Canter --- internal/layers/layers.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/layers/layers.go b/internal/layers/layers.go index f3f358a67c..77e34e8a0a 100644 --- a/internal/layers/layers.go +++ b/internal/layers/layers.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "path/filepath" + "time" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/hcserror" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/ospath" "github.com/Microsoft/hcsshim/internal/uvm" @@ -109,6 +110,9 @@ func MountContainerLayers(ctx context.Context, layerFolders []string, guestRoot // for ERROR_NOT_READY as well. if hcserr, ok := lErr.(*hcserror.HcsError); ok { if hcserr.Err == windows.ERROR_NOT_READY || hcserr.Err == windows.ERROR_DEVICE_NOT_CONNECTED { + // Sleep for a little before a re-attempt. A probable cause for these issues in the first place is events not getting + // reported in time so it might be good to give some time for things to "cool down" or get back to a known state. + time.Sleep(time.Millisecond * 100) continue } }