From 9d21ad6bef1ebc35cb1f959cf509794aaccfbbf5 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Tue, 17 Aug 2021 09:30:45 -0700 Subject: [PATCH 01/14] Start time synchronization service in opengcs Changes to the opengcs to start the chronyd service after UVM boots. Signed-off-by: Amit Barve --- cmd/gcs/main.go | 8 +++ internal/guest/commonutils/utilities.go | 80 +++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 7dacf5d5b8..1b1b6e6549 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -12,6 +12,7 @@ import ( "time" "github.com/Microsoft/hcsshim/internal/guest/bridge" + "github.com/Microsoft/hcsshim/internal/guest/commonutils" "github.com/Microsoft/hcsshim/internal/guest/kmsg" "github.com/Microsoft/hcsshim/internal/guest/runtime/hcsv2" "github.com/Microsoft/hcsshim/internal/guest/runtime/runc" @@ -248,6 +249,12 @@ func main() { oomFile := os.NewFile(oom, "cefd") defer oomFile.Close() + // time synchronization service + err = commonutils.StartTimeSyncService() + if err != nil { + logrus.WithError(err).Fatal("failed to start time synchronization service") + } + go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl) err = b.ListenAndServe(bridgeIn, bridgeOut) @@ -256,4 +263,5 @@ func main() { logrus.ErrorKey: err, }).Fatal("failed to serve gcs service") } + } diff --git a/internal/guest/commonutils/utilities.go b/internal/guest/commonutils/utilities.go index adcf70e6c2..c9a7a28a40 100644 --- a/internal/guest/commonutils/utilities.go +++ b/internal/guest/commonutils/utilities.go @@ -1,10 +1,19 @@ package commonutils import ( + "bufio" "encoding/json" + "fmt" "io" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" "github.com/Microsoft/hcsshim/internal/guest/gcserr" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" ) // UnmarshalJSONWithHresult unmarshals the given data into the given interface, and @@ -24,3 +33,74 @@ func DecodeJSONWithHresult(r io.Reader, v interface{}) error { } return nil } + +// StartTimeSyncService starts the `chronyd` deamon to keep the UVM time synchronized. We +// use a PTP device provided by the hypervisor as a source of correct time (instead of +// using a network server). We need to create a configuration file that configures chronyd +// to use the PTP device. The system can have multiple PTP devices so we identify the +// correct PTP device by verifying that the `clock_name` of that device is `hyperv`. +func StartTimeSyncService() error { + ptpClassDir, err := os.Open("/sys/class/ptp") + if err != nil { + return errors.Wrap(err, "failed to open PTP class directory") + } + + ptpDirList, err := ptpClassDir.Readdirnames(-1) + if err != nil { + return errors.Wrap(err, "failed to list PTP class directory") + } + + var ptpDirPath string + found := false + for _, ptpDirPath = range ptpDirList { + clockNameFilePath := filepath.Join(ptpClassDir.Name(), ptpDirPath, "clock_name") + clockNameFile, err := os.Open(clockNameFilePath) + if err != nil { + return errors.Wrapf(err, "failed to open clock name file at %s", clockNameFilePath) + } + // Expected clock name is `hyperv` so read first 6 chars and verify the + // name + expectedReadLen := len("hyperv") + buf := make([]byte, expectedReadLen) + fileReader := bufio.NewReader(clockNameFile) + nread, err := fileReader.Read(buf) + if err != nil { + return errors.Wrapf(err, "read file %s failed", clockNameFilePath) + } else if nread != expectedReadLen { + return errors.Wrapf(err, "read file %s returned %d bytes, expected %d", clockNameFilePath, nread, expectedReadLen) + } + clockName := string(buf) + if strings.EqualFold(clockName, "hyperv") { + found = true + break + } + } + + if !found { + return errors.Errorf("no PTP device found with name \"hyperv\"") + } + + // create chronyd config file + ptpDevPath := filepath.Join("/dev", filepath.Base(ptpDirPath)) + chronydConfigString := fmt.Sprintf("refclock PHC %s poll 3 dpoll -2 offset 0\n", ptpDevPath) + + chronydConfPath := "/tmp/chronyd.conf" + err = ioutil.WriteFile(chronydConfPath, []byte(chronydConfigString), 0644) + if err != nil { + return errors.Wrapf(err, "failed to create chronyd conf file %s", chronydConfPath) + } + + // start chronyd + chronydCmd := exec.Command("chronyd", "-f", chronydConfPath) + err = chronydCmd.Start() + if err != nil { + return errors.Wrap(err, "start chronyd command failed") + } + go func() { + waitErr := chronydCmd.Wait() + if waitErr != nil { + logrus.WithError(waitErr).Warn("chronyd command exited with error") + } + }() + return nil +} From 6decb3cf17ca5f6917f98fdcc8272efb4e1651ff Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Tue, 17 Aug 2021 15:49:53 -0700 Subject: [PATCH 02/14] Add annotation to enable/disable Time synchronization service. Signed-off-by: Amit Barve --- cmd/gcs/main.go | 9 ++++++--- internal/oci/uvm.go | 1 + internal/uvm/create_lcow.go | 6 ++++++ pkg/annotations/annotations.go | 4 ++++ test/cri-containerd/runpodsandbox_test.go | 15 +++++++++++++++ .../Microsoft/hcsshim/internal/oci/uvm.go | 1 + .../Microsoft/hcsshim/internal/uvm/create_lcow.go | 6 ++++++ .../hcsshim/pkg/annotations/annotations.go | 4 ++++ 8 files changed, 43 insertions(+), 3 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 1b1b6e6549..e5863a68f3 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -93,6 +93,7 @@ func main() { v4 := flag.Bool("v4", false, "enable the v4 protocol support and v2 schema") rootMemReserveBytes := flag.Uint64("root-mem-reserve-bytes", 75*1024*1024, "the amount of memory reserved for the orchestration, the rest will be assigned to containers") gcsMemLimitBytes := flag.Uint64("gcs-mem-limit-bytes", 50*1024*1024, "the maximum amount of memory the gcs can use") + disableTimeSync := flag.Bool("disableTimeSync", false, "If true do not run chronyd time synchronization service inside the UVM") flag.Usage = func() { fmt.Fprintf(os.Stderr, "\nUsage of %s:\n", os.Args[0]) @@ -250,9 +251,11 @@ func main() { defer oomFile.Close() // time synchronization service - err = commonutils.StartTimeSyncService() - if err != nil { - logrus.WithError(err).Fatal("failed to start time synchronization service") + if !(*disableTimeSync) { + err = commonutils.StartTimeSyncService() + if err != nil { + logrus.WithError(err).Fatal("failed to start time synchronization service") + } } go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) diff --git a/internal/oci/uvm.go b/internal/oci/uvm.go index a0b14d8948..289c740314 100644 --- a/internal/oci/uvm.go +++ b/internal/oci/uvm.go @@ -358,6 +358,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( lopts.SecurityPolicy = parseAnnotationsString(s.Annotations, annotations.SecurityPolicy, lopts.SecurityPolicy) lopts.KernelBootOptions = parseAnnotationsString(s.Annotations, annotations.KernelBootOptions, lopts.KernelBootOptions) lopts.ProcessDumpLocation = parseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, lopts.ProcessDumpLocation) + lopts.DisableTimeSyncService = parseAnnotationsBool(s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService) handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts) handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts) diff --git a/internal/uvm/create_lcow.go b/internal/uvm/create_lcow.go index dd8b68c3f5..98b0945bca 100644 --- a/internal/uvm/create_lcow.go +++ b/internal/uvm/create_lcow.go @@ -103,6 +103,7 @@ type OptionsLCOW struct { SecurityPolicyEnabled bool // Set when there is a security policy to apply on actual SNP hardware, use this rathen than checking the string length UseGuestStateFile bool // Use a vmgs file that contains a kernel and initrd, required for SNP GuestStateFile string // The vmgs file to load + DisableTimeSyncService bool // Disables the time synchronization service } // defaultLCOWOSBootFilesPath returns the default path used to locate the LCOW @@ -152,6 +153,7 @@ func NewDefaultOptionsLCOW(id, owner string) *OptionsLCOW { SecurityPolicyEnabled: false, SecurityPolicy: "", GuestStateFile: "", + DisableTimeSyncService: false, } if _, err := os.Stat(filepath.Join(opts.BootFilesPath, VhdFile)); err == nil { @@ -651,6 +653,10 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs initArgs += fmt.Sprintf(" -e %d", linuxLogVsockPort) } + if opts.DisableTimeSyncService { + opts.ExecCommandLine = fmt.Sprintf("%s --disableTimeSync", opts.ExecCommandLine) + } + initArgs += " " + opts.ExecCommandLine if opts.ProcessDumpLocation != "" { diff --git a/pkg/annotations/annotations.go b/pkg/annotations/annotations.go index 420243937c..4202c56aba 100644 --- a/pkg/annotations/annotations.go +++ b/pkg/annotations/annotations.go @@ -257,4 +257,8 @@ const ( // GuestStateFile specifies the path of the vmgs file to use if required. Only applies in SNP mode. GuestStateFile = "io.microsoft.virtualmachine.lcow.gueststatefile" + + // AnnotationDisableLCOWTimeSyncService is used to disable the chronyd time + // synchronization service inside the LCOW UVM. + DisableLCOWTimeSyncService = "io.microsoft.virtualmachine.lcow.timesync.disable" ) diff --git a/test/cri-containerd/runpodsandbox_test.go b/test/cri-containerd/runpodsandbox_test.go index d549f02592..cadaa308b3 100644 --- a/test/cri-containerd/runpodsandbox_test.go +++ b/test/cri-containerd/runpodsandbox_test.go @@ -1708,3 +1708,18 @@ func Test_RunPodSandbox_KernelOptions_LCOW(t *testing.T) { t.Fatalf("Expected number of hugepages to be 10. Got output instead: %d", numOfHugePages) } } + +func Test_RunPodSandbox_TimeSyncService(t *testing.T) { + requireFeatures(t, featureLCOW) + + pullRequiredLcowImages(t, []string{imageLcowK8sPause}) + + request := getRunPodSandboxRequest( + t, + lcowRuntimeHandler, + map[string]string{ + oci.AnnotationDisableLCOWTimeSyncService: "true", + }, + ) + runPodSandboxTest(t, request) +} diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go b/test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go index a0b14d8948..289c740314 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go @@ -358,6 +358,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( lopts.SecurityPolicy = parseAnnotationsString(s.Annotations, annotations.SecurityPolicy, lopts.SecurityPolicy) lopts.KernelBootOptions = parseAnnotationsString(s.Annotations, annotations.KernelBootOptions, lopts.KernelBootOptions) lopts.ProcessDumpLocation = parseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, lopts.ProcessDumpLocation) + lopts.DisableTimeSyncService = parseAnnotationsBool(s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService) handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts) handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts) diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go b/test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go index dd8b68c3f5..98b0945bca 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go @@ -103,6 +103,7 @@ type OptionsLCOW struct { SecurityPolicyEnabled bool // Set when there is a security policy to apply on actual SNP hardware, use this rathen than checking the string length UseGuestStateFile bool // Use a vmgs file that contains a kernel and initrd, required for SNP GuestStateFile string // The vmgs file to load + DisableTimeSyncService bool // Disables the time synchronization service } // defaultLCOWOSBootFilesPath returns the default path used to locate the LCOW @@ -152,6 +153,7 @@ func NewDefaultOptionsLCOW(id, owner string) *OptionsLCOW { SecurityPolicyEnabled: false, SecurityPolicy: "", GuestStateFile: "", + DisableTimeSyncService: false, } if _, err := os.Stat(filepath.Join(opts.BootFilesPath, VhdFile)); err == nil { @@ -651,6 +653,10 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs initArgs += fmt.Sprintf(" -e %d", linuxLogVsockPort) } + if opts.DisableTimeSyncService { + opts.ExecCommandLine = fmt.Sprintf("%s --disableTimeSync", opts.ExecCommandLine) + } + initArgs += " " + opts.ExecCommandLine if opts.ProcessDumpLocation != "" { diff --git a/test/vendor/github.com/Microsoft/hcsshim/pkg/annotations/annotations.go b/test/vendor/github.com/Microsoft/hcsshim/pkg/annotations/annotations.go index 420243937c..4202c56aba 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/pkg/annotations/annotations.go +++ b/test/vendor/github.com/Microsoft/hcsshim/pkg/annotations/annotations.go @@ -257,4 +257,8 @@ const ( // GuestStateFile specifies the path of the vmgs file to use if required. Only applies in SNP mode. GuestStateFile = "io.microsoft.virtualmachine.lcow.gueststatefile" + + // AnnotationDisableLCOWTimeSyncService is used to disable the chronyd time + // synchronization service inside the LCOW UVM. + DisableLCOWTimeSyncService = "io.microsoft.virtualmachine.lcow.timesync.disable" ) From 7bb933e099e30ac21b7fcbcf04d138aeadf2aed6 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 18 Aug 2021 12:38:35 -0700 Subject: [PATCH 03/14] TimeSync service inside LCOW UVM. Add test to verify both chronyd running & disabled cases. Minor fixes in chronyd startup code. Signed-off-by: Amit Barve --- internal/guest/commonutils/utilities.go | 12 ++- test/cri-containerd/runpodsandbox_test.go | 90 ++++++++++++++++++++++- 2 files changed, 94 insertions(+), 8 deletions(-) diff --git a/internal/guest/commonutils/utilities.go b/internal/guest/commonutils/utilities.go index c9a7a28a40..f115963f99 100644 --- a/internal/guest/commonutils/utilities.go +++ b/internal/guest/commonutils/utilities.go @@ -52,6 +52,7 @@ func StartTimeSyncService() error { var ptpDirPath string found := false + expectedClockName := "hyperv" for _, ptpDirPath = range ptpDirList { clockNameFilePath := filepath.Join(ptpClassDir.Name(), ptpDirPath, "clock_name") clockNameFile, err := os.Open(clockNameFilePath) @@ -60,24 +61,21 @@ func StartTimeSyncService() error { } // Expected clock name is `hyperv` so read first 6 chars and verify the // name - expectedReadLen := len("hyperv") - buf := make([]byte, expectedReadLen) + buf := make([]byte, len(expectedClockName)) fileReader := bufio.NewReader(clockNameFile) - nread, err := fileReader.Read(buf) + _, err = fileReader.Read(buf) if err != nil { return errors.Wrapf(err, "read file %s failed", clockNameFilePath) - } else if nread != expectedReadLen { - return errors.Wrapf(err, "read file %s returned %d bytes, expected %d", clockNameFilePath, nread, expectedReadLen) } clockName := string(buf) - if strings.EqualFold(clockName, "hyperv") { + if strings.EqualFold(clockName, expectedClockName) { found = true break } } if !found { - return errors.Errorf("no PTP device found with name \"hyperv\"") + return errors.Errorf("no PTP device found with name \"%s\"", expectedClockName) } // create chronyd config file diff --git a/test/cri-containerd/runpodsandbox_test.go b/test/cri-containerd/runpodsandbox_test.go index cadaa308b3..0e76bdcb2f 100644 --- a/test/cri-containerd/runpodsandbox_test.go +++ b/test/cri-containerd/runpodsandbox_test.go @@ -4,6 +4,7 @@ package cri_containerd import ( "bufio" + "bytes" "context" "fmt" "github.com/Microsoft/hcsshim/pkg/annotations" @@ -19,8 +20,10 @@ import ( "github.com/Microsoft/hcsshim/internal/hcs" "github.com/Microsoft/hcsshim/internal/lcow" "github.com/Microsoft/hcsshim/internal/processorinfo" + "github.com/Microsoft/hcsshim/internal/shimdiag" "github.com/Microsoft/hcsshim/osversion" testutilities "github.com/Microsoft/hcsshim/test/functional/utilities" + "github.com/containerd/containerd/log" runtime "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" ) @@ -1050,6 +1053,14 @@ func Test_RunPodSandbox_CPUGroup(t *testing.T) { } func createExt4VHD(ctx context.Context, t *testing.T, path string) { + // UVM related functions called below produce a lot debug logs. Set the logger + // output to Discard if verbose flag is not set. This way we can still capture + // these logs in a wpr session. + if !testing.Verbose() { + origLogOut := log.L.Logger.Out + log.L.Logger.SetOutput(io.Discard) + defer log.L.Logger.SetOutput(origLogOut) + } uvm := testutilities.CreateLCOWUVM(ctx, t, t.Name()+"-createExt4VHD") defer uvm.Close() @@ -1712,6 +1723,55 @@ func Test_RunPodSandbox_KernelOptions_LCOW(t *testing.T) { func Test_RunPodSandbox_TimeSyncService(t *testing.T) { requireFeatures(t, featureLCOW) + client := newTestRuntimeClient(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + pullRequiredLcowImages(t, []string{imageLcowK8sPause}) + + request := getRunPodSandboxRequest( + t, + lcowRuntimeHandler, + map[string]string{}, + ) + + podID := runPodSandbox(t, client, ctx, request) + defer removePodSandbox(t, client, ctx, podID) + defer stopPodSandbox(t, client, ctx, podID) + + shimName := fmt.Sprintf("k8s.io-%s", podID) + + shim, err := shimdiag.GetShim(shimName) + if err != nil { + t.Fatalf("failed to find shim %s: %s", shimName, err) + } + + psCmd := []string{"ps"} + shimClient := shimdiag.NewShimDiagClient(shim) + outBuf := bytes.Buffer{} + outw := bufio.NewWriter(&outBuf) + errBuf := bytes.Buffer{} + errw := bufio.NewWriter(&errBuf) + exitCode, err := execInHost(ctx, shimClient, psCmd, nil, outw, errw) + if err != nil { + t.Fatalf("failed to exec `%s` in the uvm with %s", psCmd[0], err) + } + if exitCode != 0 { + t.Fatalf("exec `%s` in the uvm failed with exit code: %d, std error: %s", psCmd[0], exitCode, errBuf.String()) + } + if !strings.Contains(outBuf.String(), "chronyd") { + t.Logf("standard output of exec %s is: %s\n", psCmd[0], outBuf.String()) + t.Fatalf("chronyd is not running inside the uvm") + } +} + +func Test_RunPodSandbox_DisableTimeSyncService(t *testing.T) { + requireFeatures(t, featureLCOW) + + client := newTestRuntimeClient(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + pullRequiredLcowImages(t, []string{imageLcowK8sPause}) request := getRunPodSandboxRequest( @@ -1721,5 +1781,33 @@ func Test_RunPodSandbox_TimeSyncService(t *testing.T) { oci.AnnotationDisableLCOWTimeSyncService: "true", }, ) - runPodSandboxTest(t, request) + + podID := runPodSandbox(t, client, ctx, request) + defer removePodSandbox(t, client, ctx, podID) + defer stopPodSandbox(t, client, ctx, podID) + + shimName := fmt.Sprintf("k8s.io-%s", podID) + + shim, err := shimdiag.GetShim(shimName) + if err != nil { + t.Fatalf("failed to find shim %s: %s", shimName, err) + } + + psCmd := []string{"ps"} + shimClient := shimdiag.NewShimDiagClient(shim) + outBuf := bytes.Buffer{} + outw := bufio.NewWriter(&outBuf) + errBuf := bytes.Buffer{} + errw := bufio.NewWriter(&errBuf) + exitCode, err := execInHost(ctx, shimClient, psCmd, nil, outw, errw) + if err != nil { + t.Fatalf("failed to exec `%s` in the uvm with %s", psCmd[0], err) + } + if exitCode != 0 { + t.Fatalf("exec `%s` in the uvm failed with exit code: %d, std error: %s", psCmd[0], exitCode, errBuf.String()) + } + if strings.Contains(outBuf.String(), "chronyd") { + t.Logf("standard output of exec %s is: %s\n", psCmd[0], outBuf.String()) + t.Fatalf("chronyd should not be running inside the uvm") + } } From 51af60487d4b8c8a26b6dab7f169d13314bde9fe Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 25 Aug 2021 09:22:18 -0700 Subject: [PATCH 04/14] Run Chronyd with restart monitor Signed-off-by: Amit Barve --- cmd/gcs/main.go | 99 ++++++++++++++++++++++++- internal/guest/commonutils/utilities.go | 78 ------------------- 2 files changed, 97 insertions(+), 80 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index e5863a68f3..014701a0d7 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -3,16 +3,19 @@ package main import ( + "bufio" "flag" "fmt" "io" "io/ioutil" "os" + "os/exec" + "path/filepath" + "strings" "syscall" "time" "github.com/Microsoft/hcsshim/internal/guest/bridge" - "github.com/Microsoft/hcsshim/internal/guest/commonutils" "github.com/Microsoft/hcsshim/internal/guest/kmsg" "github.com/Microsoft/hcsshim/internal/guest/runtime/hcsv2" "github.com/Microsoft/hcsshim/internal/guest/runtime/runc" @@ -21,6 +24,7 @@ import ( "github.com/containerd/cgroups" cgroupstats "github.com/containerd/cgroups/stats/v1" oci "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" "github.com/sirupsen/logrus" "go.opencensus.io/trace" ) @@ -82,6 +86,97 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre } } +// runWithRestartMonitor starts a command with given args and waits for it to exit. If the +// command exit code is non-zero the command is restarted with with some back off delay. +// Any stdout or stderr of the command will be split into lines and written as a log with +// logrus standard logger. This function must be called in a separate goroutine. +func runWithRestartMonitor(arg0 string, args ...string) { + restartDelay := time.Second * 5 + backoffFactor := 2 + for { + command := exec.Command(arg0, args...) + err := command.Start() + if err != nil { + logrus.WithFields(logrus.Fields{ + "error": err, + "command": command.Args, + }).Warn("restart monitor: start returns error") + } else { + waitErr := command.Wait() + if waitErr != nil { + logrus.WithFields(logrus.Fields{ + "error": waitErr, + "command": command.Args, + }).Warn("restart monitor: wait returns error") + } + } + time.Sleep(restartDelay * time.Duration(backoffFactor)) + restartDelay *= time.Duration(backoffFactor) + } + +} + +// StartTimeSyncService starts the `chronyd` deamon to keep the UVM time synchronized. We +// use a PTP device provided by the hypervisor as a source of correct time (instead of +// using a network server). We need to create a configuration file that configures chronyd +// to use the PTP device. The system can have multiple PTP devices so we identify the +// correct PTP device by verifying that the `clock_name` of that device is `hyperv`. +func StartTimeSyncService() error { + ptpClassDir, err := os.Open("/sys/class/ptp") + if err != nil { + return errors.Wrap(err, "failed to open PTP class directory") + } + + ptpDirList, err := ptpClassDir.Readdirnames(-1) + if err != nil { + return errors.Wrap(err, "failed to list PTP class directory") + } + + var ptpDirPath string + found := false + expectedClockName := "hyperv" + for _, ptpDirPath = range ptpDirList { + clockNameFilePath := filepath.Join(ptpClassDir.Name(), ptpDirPath, "clock_name") + clockNameFile, err := os.Open(clockNameFilePath) + if err != nil { + return errors.Wrapf(err, "failed to open clock name file at %s", clockNameFilePath) + } + // Expected clock name is `hyperv` so read first 6 chars and verify the + // name + buf := make([]byte, len(expectedClockName)) + fileReader := bufio.NewReader(clockNameFile) + _, err = fileReader.Read(buf) + if err != nil { + return errors.Wrapf(err, "read file %s failed", clockNameFilePath) + } + clockName := string(buf) + if strings.EqualFold(clockName, expectedClockName) { + found = true + break + } + } + + if !found { + return errors.Errorf("no PTP device found with name \"%s\"", expectedClockName) + } + + // create chronyd config file + ptpDevPath := filepath.Join("/dev", filepath.Base(ptpDirPath)) + chronydConfigString := fmt.Sprintf("refclock PHC %s poll 3 dpoll -2 offset 0\n", ptpDevPath) + + chronydConfPath := "/tmp/chronyd.conf" + err = ioutil.WriteFile(chronydConfPath, []byte(chronydConfigString), 0644) + if err != nil { + return errors.Wrapf(err, "failed to create chronyd conf file %s", chronydConfPath) + } + + // start chronyd. Do NOT start chronyd as daemon because creating a daemon + // involves double forking the restart monitor will attempt to restart chornyd + // after the first fork child exits. + go runWithRestartMonitor("chronyd", "-d", "-f", chronydConfPath) + return nil +} + func main() { startTime := time.Now() logLevel := flag.String("loglevel", "debug", "Logging Level: debug, info, warning, error, fatal, panic.") @@ -252,7 +347,7 @@ func main() { // time synchronization service if !(*disableTimeSync) { - err = commonutils.StartTimeSyncService() + err = StartTimeSyncService() if err != nil { logrus.WithError(err).Fatal("failed to start time synchronization service") } diff --git a/internal/guest/commonutils/utilities.go b/internal/guest/commonutils/utilities.go index f115963f99..adcf70e6c2 100644 --- a/internal/guest/commonutils/utilities.go +++ b/internal/guest/commonutils/utilities.go @@ -1,19 +1,10 @@ package commonutils import ( - "bufio" "encoding/json" - "fmt" "io" - "io/ioutil" - "os" - "os/exec" - "path/filepath" - "strings" "github.com/Microsoft/hcsshim/internal/guest/gcserr" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" ) // UnmarshalJSONWithHresult unmarshals the given data into the given interface, and @@ -33,72 +24,3 @@ func DecodeJSONWithHresult(r io.Reader, v interface{}) error { } return nil } - -// StartTimeSyncService starts the `chronyd` deamon to keep the UVM time synchronized. We -// use a PTP device provided by the hypervisor as a source of correct time (instead of -// using a network server). We need to create a configuration file that configures chronyd -// to use the PTP device. The system can have multiple PTP devices so we identify the -// correct PTP device by verifying that the `clock_name` of that device is `hyperv`. -func StartTimeSyncService() error { - ptpClassDir, err := os.Open("/sys/class/ptp") - if err != nil { - return errors.Wrap(err, "failed to open PTP class directory") - } - - ptpDirList, err := ptpClassDir.Readdirnames(-1) - if err != nil { - return errors.Wrap(err, "failed to list PTP class directory") - } - - var ptpDirPath string - found := false - expectedClockName := "hyperv" - for _, ptpDirPath = range ptpDirList { - clockNameFilePath := filepath.Join(ptpClassDir.Name(), ptpDirPath, "clock_name") - clockNameFile, err := os.Open(clockNameFilePath) - if err != nil { - return errors.Wrapf(err, "failed to open clock name file at %s", clockNameFilePath) - } - // Expected clock name is `hyperv` so read first 6 chars and verify the - // name - buf := make([]byte, len(expectedClockName)) - fileReader := bufio.NewReader(clockNameFile) - _, err = fileReader.Read(buf) - if err != nil { - return errors.Wrapf(err, "read file %s failed", clockNameFilePath) - } - clockName := string(buf) - if strings.EqualFold(clockName, expectedClockName) { - found = true - break - } - } - - if !found { - return errors.Errorf("no PTP device found with name \"%s\"", expectedClockName) - } - - // create chronyd config file - ptpDevPath := filepath.Join("/dev", filepath.Base(ptpDirPath)) - chronydConfigString := fmt.Sprintf("refclock PHC %s poll 3 dpoll -2 offset 0\n", ptpDevPath) - - chronydConfPath := "/tmp/chronyd.conf" - err = ioutil.WriteFile(chronydConfPath, []byte(chronydConfigString), 0644) - if err != nil { - return errors.Wrapf(err, "failed to create chronyd conf file %s", chronydConfPath) - } - - // start chronyd - chronydCmd := exec.Command("chronyd", "-f", chronydConfPath) - err = chronydCmd.Start() - if err != nil { - return errors.Wrap(err, "start chronyd command failed") - } - go func() { - waitErr := chronydCmd.Wait() - if waitErr != nil { - logrus.WithError(waitErr).Warn("chronyd command exited with error") - } - }() - return nil -} From ce87bc3630da5e3e9c4f07b8896b8994a8fb9bd2 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Mon, 25 Oct 2021 21:45:20 -0700 Subject: [PATCH 05/14] Force chronyd to step update time if difference is big Signed-off-by: Amit Barve --- cmd/gcs/main.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 014701a0d7..30a085f4c3 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -162,8 +162,8 @@ func StartTimeSyncService() error { // create chronyd config file ptpDevPath := filepath.Join("/dev", filepath.Base(ptpDirPath)) - chronydConfigString := fmt.Sprintf("refclock PHC %s poll 3 dpoll -2 offset 0\n", ptpDevPath) - + // chronyd config file take from: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/time-sync + chronydConfigString := fmt.Sprintf("refclock PHC %s poll 3 dpoll -2 offset 0 stratum 2\nmakestep 0.1 -1\n", ptpDevPath) chronydConfPath := "/tmp/chronyd.conf" err = ioutil.WriteFile(chronydConfPath, []byte(chronydConfigString), 0644) if err != nil { @@ -173,7 +173,7 @@ func StartTimeSyncService() error { // start chronyd. Do NOT start chronyd as daemon because creating a daemon // involves double forking the restart monitor will attempt to restart chornyd // after the first fork child exits. - go runWithRestartMonitor("chronyd", "-d", "-f", chronydConfPath) + go runWithRestartMonitor("chronyd", "-n", "-f", chronydConfPath) return nil } From 08b0447fb8cb342c214909302ec7b3e01081d7a5 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Mon, 8 Nov 2021 12:37:56 -0800 Subject: [PATCH 06/14] Fixes after rebase Signed-off-by: Amit Barve --- internal/oci/uvm.go | 2 +- test/cri-containerd/runpodsandbox_test.go | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/internal/oci/uvm.go b/internal/oci/uvm.go index 289c740314..4599c0b211 100644 --- a/internal/oci/uvm.go +++ b/internal/oci/uvm.go @@ -358,7 +358,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( lopts.SecurityPolicy = parseAnnotationsString(s.Annotations, annotations.SecurityPolicy, lopts.SecurityPolicy) lopts.KernelBootOptions = parseAnnotationsString(s.Annotations, annotations.KernelBootOptions, lopts.KernelBootOptions) lopts.ProcessDumpLocation = parseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, lopts.ProcessDumpLocation) - lopts.DisableTimeSyncService = parseAnnotationsBool(s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService) + lopts.DisableTimeSyncService = parseAnnotationsBool(ctx, s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService) handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts) handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts) diff --git a/test/cri-containerd/runpodsandbox_test.go b/test/cri-containerd/runpodsandbox_test.go index 0e76bdcb2f..4c96710545 100644 --- a/test/cri-containerd/runpodsandbox_test.go +++ b/test/cri-containerd/runpodsandbox_test.go @@ -7,7 +7,7 @@ import ( "bytes" "context" "fmt" - "github.com/Microsoft/hcsshim/pkg/annotations" + "io" "io/ioutil" "os" "path/filepath" @@ -22,6 +22,7 @@ import ( "github.com/Microsoft/hcsshim/internal/processorinfo" "github.com/Microsoft/hcsshim/internal/shimdiag" "github.com/Microsoft/hcsshim/osversion" + "github.com/Microsoft/hcsshim/pkg/annotations" testutilities "github.com/Microsoft/hcsshim/test/functional/utilities" "github.com/containerd/containerd/log" runtime "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" @@ -1727,13 +1728,11 @@ func Test_RunPodSandbox_TimeSyncService(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - pullRequiredLcowImages(t, []string{imageLcowK8sPause}) + pullRequiredLCOWImages(t, []string{imageLcowK8sPause}) request := getRunPodSandboxRequest( t, - lcowRuntimeHandler, - map[string]string{}, - ) + lcowRuntimeHandler) podID := runPodSandbox(t, client, ctx, request) defer removePodSandbox(t, client, ctx, podID) @@ -1772,14 +1771,15 @@ func Test_RunPodSandbox_DisableTimeSyncService(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - pullRequiredLcowImages(t, []string{imageLcowK8sPause}) + pullRequiredLCOWImages(t, []string{imageLcowK8sPause}) request := getRunPodSandboxRequest( t, lcowRuntimeHandler, - map[string]string{ - oci.AnnotationDisableLCOWTimeSyncService: "true", - }, + WithSandboxAnnotations( + map[string]string{ + annotations.DisableLCOWTimeSyncService: "true", + }), ) podID := runPodSandbox(t, client, ctx, request) From 48fdb0127bd1cb8958041230d1bcece30752375b Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Mon, 8 Nov 2021 12:47:02 -0800 Subject: [PATCH 07/14] go mod vendor & tidy Signed-off-by: Amit Barve --- test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go b/test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go index 289c740314..4599c0b211 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go @@ -358,7 +358,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( lopts.SecurityPolicy = parseAnnotationsString(s.Annotations, annotations.SecurityPolicy, lopts.SecurityPolicy) lopts.KernelBootOptions = parseAnnotationsString(s.Annotations, annotations.KernelBootOptions, lopts.KernelBootOptions) lopts.ProcessDumpLocation = parseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, lopts.ProcessDumpLocation) - lopts.DisableTimeSyncService = parseAnnotationsBool(s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService) + lopts.DisableTimeSyncService = parseAnnotationsBool(ctx, s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService) handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts) handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts) From 7e7eaf51c62215eeadc1fe4ce9bcfab58efc5e42 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 10 Nov 2021 10:26:47 -0800 Subject: [PATCH 08/14] Use backoff package instead of manually calculating backoffs Signed-off-by: Amit Barve --- cmd/gcs/main.go | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 30a085f4c3..8e01b4b89d 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -21,6 +21,7 @@ import ( "github.com/Microsoft/hcsshim/internal/guest/runtime/runc" "github.com/Microsoft/hcsshim/internal/guest/transport" "github.com/Microsoft/hcsshim/internal/oc" + "github.com/cenkalti/backoff/v4" "github.com/containerd/cgroups" cgroupstats "github.com/containerd/cgroups/stats/v1" oci "github.com/opencontainers/runtime-spec/specs-go" @@ -91,8 +92,9 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre // Any stdout or stderr of the command will be split into lines and written as a log with // logrus standard logger. This function must be called in a separate goroutine. func runWithRestartMonitor(arg0 string, args ...string) { - restartDelay := time.Second * 5 - backoffFactor := 2 + backoffSettings := backoff.NewExponentialBackOff() + // give after retrying for 10 mins + backoffSettings.MaxElapsedTime = time.Minute * 10 for { command := exec.Command(arg0, args...) err := command.Start() @@ -110,8 +112,14 @@ func runWithRestartMonitor(arg0 string, args ...string) { }).Warn("restart monitor: wait returns error") } } - time.Sleep(restartDelay * time.Duration(backoffFactor)) - restartDelay *= time.Duration(backoffFactor) + backOffTime := backoffSettings.NextBackOff() + if backOffTime == backoffSettings.Stop { + logrus.WithFields(logrus.Fields{ + "command": command.Args, + }).Warn("give up restarting after timeout") + return + } + time.Sleep(backOffTime) } } From f2ab351561db86bcd1bcddbad4a8e68a23ab747f Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 10 Nov 2021 16:09:07 -0800 Subject: [PATCH 09/14] Rename gcs cmdline params, use io.ReadFull instead of io.Read Minor other fixes. Signed-off-by: Amit Barve --- cmd/gcs/main.go | 29 +++++++++-------------------- internal/uvm/create_lcow.go | 2 +- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 8e01b4b89d..b2581d9795 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -3,7 +3,6 @@ package main import ( - "bufio" "flag" "fmt" "io" @@ -97,26 +96,18 @@ func runWithRestartMonitor(arg0 string, args ...string) { backoffSettings.MaxElapsedTime = time.Minute * 10 for { command := exec.Command(arg0, args...) - err := command.Start() + err := command.Run() if err != nil { logrus.WithFields(logrus.Fields{ "error": err, "command": command.Args, - }).Warn("restart monitor: start returns error") - } else { - waitErr := command.Wait() - if waitErr != nil { - logrus.WithFields(logrus.Fields{ - "error": waitErr, - "command": command.Args, - }).Warn("restart monitor: wait returns error") - } + }).Warn("restart monitor: run command returns error") } backOffTime := backoffSettings.NextBackOff() if backOffTime == backoffSettings.Stop { logrus.WithFields(logrus.Fields{ "command": command.Args, - }).Warn("give up restarting after timeout") + }).Error("give up restarting after timeout") return } time.Sleep(backOffTime) @@ -124,12 +115,12 @@ func runWithRestartMonitor(arg0 string, args ...string) { } -// StartTimeSyncService starts the `chronyd` deamon to keep the UVM time synchronized. We +// startTimeSyncService starts the `chronyd` deamon to keep the UVM time synchronized. We // use a PTP device provided by the hypervisor as a source of correct time (instead of // using a network server). We need to create a configuration file that configures chronyd // to use the PTP device. The system can have multiple PTP devices so we identify the // correct PTP device by verifying that the `clock_name` of that device is `hyperv`. -func StartTimeSyncService() error { +func startTimeSyncService() error { ptpClassDir, err := os.Open("/sys/class/ptp") if err != nil { return errors.Wrap(err, "failed to open PTP class directory") @@ -149,12 +140,11 @@ func StartTimeSyncService() error { if err != nil { return errors.Wrapf(err, "failed to open clock name file at %s", clockNameFilePath) } + defer clockNameFile.Close() // Expected clock name is `hyperv` so read first 6 chars and verify the // name buf := make([]byte, len(expectedClockName)) - fileReader := bufio.NewReader(clockNameFile) - _, err = fileReader.Read(buf) - if err != nil { + if _, err = io.ReadFull(clockNameFile, buf); err != nil { return errors.Wrapf(err, "read file %s failed", clockNameFilePath) } clockName := string(buf) @@ -196,7 +186,7 @@ func main() { v4 := flag.Bool("v4", false, "enable the v4 protocol support and v2 schema") rootMemReserveBytes := flag.Uint64("root-mem-reserve-bytes", 75*1024*1024, "the amount of memory reserved for the orchestration, the rest will be assigned to containers") gcsMemLimitBytes := flag.Uint64("gcs-mem-limit-bytes", 50*1024*1024, "the maximum amount of memory the gcs can use") - disableTimeSync := flag.Bool("disableTimeSync", false, "If true do not run chronyd time synchronization service inside the UVM") + disableTimeSync := flag.Bool("disable-time-sync", false, "If true do not run chronyd time synchronization service inside the UVM") flag.Usage = func() { fmt.Fprintf(os.Stderr, "\nUsage of %s:\n", os.Args[0]) @@ -355,8 +345,7 @@ func main() { // time synchronization service if !(*disableTimeSync) { - err = StartTimeSyncService() - if err != nil { + if err = startTimeSyncService(); err != nil { logrus.WithError(err).Fatal("failed to start time synchronization service") } } diff --git a/internal/uvm/create_lcow.go b/internal/uvm/create_lcow.go index 98b0945bca..a8ff8c7742 100644 --- a/internal/uvm/create_lcow.go +++ b/internal/uvm/create_lcow.go @@ -654,7 +654,7 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs } if opts.DisableTimeSyncService { - opts.ExecCommandLine = fmt.Sprintf("%s --disableTimeSync", opts.ExecCommandLine) + opts.ExecCommandLine = fmt.Sprintf("%s --disable-time-sync", opts.ExecCommandLine) } initArgs += " " + opts.ExecCommandLine From 836127de1cfbfb25633a345ad5cc5e159f1aaa58 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 10 Nov 2021 16:10:47 -0800 Subject: [PATCH 10/14] go mod vendor Signed-off-by: Amit Barve --- .../github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go b/test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go index 98b0945bca..a8ff8c7742 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go @@ -654,7 +654,7 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs } if opts.DisableTimeSyncService { - opts.ExecCommandLine = fmt.Sprintf("%s --disableTimeSync", opts.ExecCommandLine) + opts.ExecCommandLine = fmt.Sprintf("%s --disable-time-sync", opts.ExecCommandLine) } initArgs += " " + opts.ExecCommandLine From 10153526b9cc2a8d3c07c208fbd146fc27df63f0 Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 10 Nov 2021 16:23:11 -0800 Subject: [PATCH 11/14] Ignore err if file doesn't exist Signed-off-by: Amit Barve --- cmd/gcs/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index b2581d9795..a136df557a 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -137,7 +137,7 @@ func startTimeSyncService() error { for _, ptpDirPath = range ptpDirList { clockNameFilePath := filepath.Join(ptpClassDir.Name(), ptpDirPath, "clock_name") clockNameFile, err := os.Open(clockNameFilePath) - if err != nil { + if err != nil && !os.IsNotExist(err) { return errors.Wrapf(err, "failed to open clock name file at %s", clockNameFilePath) } defer clockNameFile.Close() From d3013ce4c3fbccd35fcb47bf467f0e971f1784dc Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 10 Nov 2021 21:15:55 -0800 Subject: [PATCH 12/14] Use ioutil.ReadFile to read clock_name file Signed-off-by: Amit Barve --- cmd/gcs/main.go | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index a136df557a..2595d3b99c 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -10,7 +10,6 @@ import ( "os" "os/exec" "path/filepath" - "strings" "syscall" "time" @@ -133,22 +132,16 @@ func startTimeSyncService() error { var ptpDirPath string found := false - expectedClockName := "hyperv" + // The file ends with a new line + expectedClockName := "hyperv\n" for _, ptpDirPath = range ptpDirList { clockNameFilePath := filepath.Join(ptpClassDir.Name(), ptpDirPath, "clock_name") - clockNameFile, err := os.Open(clockNameFilePath) + buf, err := ioutil.ReadFile(clockNameFilePath) if err != nil && !os.IsNotExist(err) { - return errors.Wrapf(err, "failed to open clock name file at %s", clockNameFilePath) + return errors.Wrapf(err, "failed to read clock name file at %s", clockNameFilePath) } - defer clockNameFile.Close() - // Expected clock name is `hyperv` so read first 6 chars and verify the - // name - buf := make([]byte, len(expectedClockName)) - if _, err = io.ReadFull(clockNameFile, buf); err != nil { - return errors.Wrapf(err, "read file %s failed", clockNameFilePath) - } - clockName := string(buf) - if strings.EqualFold(clockName, expectedClockName) { + + if string(buf) == expectedClockName { found = true break } From 0d29698757bd87b5c0ab4b756a3323dcf4fe846a Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Wed, 10 Nov 2021 21:17:22 -0800 Subject: [PATCH 13/14] minor fix Signed-off-by: Amit Barve --- cmd/gcs/main.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 2595d3b99c..6a198d22c4 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -95,8 +95,7 @@ func runWithRestartMonitor(arg0 string, args ...string) { backoffSettings.MaxElapsedTime = time.Minute * 10 for { command := exec.Command(arg0, args...) - err := command.Run() - if err != nil { + if err := command.Run(); err != nil { logrus.WithFields(logrus.Fields{ "error": err, "command": command.Args, From 2380361a65b5cfa68173c4d6c1add6c542f8a1bf Mon Sep 17 00:00:00 2001 From: Amit Barve Date: Fri, 12 Nov 2021 10:42:44 -0800 Subject: [PATCH 14/14] Remove incorrect usage of backoff.MaxElapsedTime Signed-off-by: Amit Barve --- cmd/gcs/main.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 6a198d22c4..eb6081ef1f 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -91,8 +91,9 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre // logrus standard logger. This function must be called in a separate goroutine. func runWithRestartMonitor(arg0 string, args ...string) { backoffSettings := backoff.NewExponentialBackOff() - // give after retrying for 10 mins - backoffSettings.MaxElapsedTime = time.Minute * 10 + // After we hit 10 min retry interval keep retrying after every 10 mins instead of + // continuing to increase retry interval. + backoffSettings.MaxInterval = time.Minute * 10 for { command := exec.Command(arg0, args...) if err := command.Run(); err != nil { @@ -102,12 +103,7 @@ func runWithRestartMonitor(arg0 string, args ...string) { }).Warn("restart monitor: run command returns error") } backOffTime := backoffSettings.NextBackOff() - if backOffTime == backoffSettings.Stop { - logrus.WithFields(logrus.Fields{ - "command": command.Args, - }).Error("give up restarting after timeout") - return - } + // since backoffSettings.MaxElapsedTime is set to 0 we will never receive backoff.Stop. time.Sleep(backOffTime) }