From d31d1fd150cb48ed7faa65cd96f24dc267eeafef Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Mon, 5 Jun 2023 16:51:31 +0200 Subject: [PATCH 1/4] restructure prerun --- pkg/admin/prerun/prerun.go | 67 +++++++++++++++++++++----------------- pkg/cmd/run.go | 13 +++++++- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/pkg/admin/prerun/prerun.go b/pkg/admin/prerun/prerun.go index 59e3da61c4..b0997c7640 100644 --- a/pkg/admin/prerun/prerun.go +++ b/pkg/admin/prerun/prerun.go @@ -8,7 +8,6 @@ import ( "strings" "github.com/openshift/microshift/pkg/admin/data" - "github.com/openshift/microshift/pkg/config" "github.com/openshift/microshift/pkg/util" "k8s.io/klog/v2" ) @@ -31,36 +30,49 @@ func (hi *HealthInfo) IsHealthy() bool { return hi.Health == "healthy" } -func Perform() error { +type PreRun struct { + dataManager data.Manager +} + +func New(dataManager data.Manager) *PreRun { + return &PreRun{ + dataManager: dataManager, + } +} + +func (pr *PreRun) Perform() error { health, err := getHealthInfo() if err != nil { if errors.Is(err, errHealthFileDoesNotExist) { klog.InfoS("Health file does not exist - skipping backup") return nil } - klog.ErrorS(err, "Failed to load health from disk") return err } - klog.InfoS("Loaded health info from the disk", "health", health) if isCurr, err := containsCurrentBootID(health.BootID); err != nil { return err } else if isCurr { - klog.InfoS("Health file contains current boot - skipping backup") + // This might happen if microshift is (re)started after greenboot finishes running. + // Green script will overwrite the health.json with + // current boot's ID, deployment ID, and health. + klog.InfoS("Health file contains current boot - skipping pre-run") return nil } - if !health.IsHealthy() { - klog.InfoS("System was not healthy - skipping backup") - return nil - } + klog.InfoS("Previous boot", "health", health.Health, "deploymentID", health.DeploymentID, "bootID", health.BootID) - dataManager, err := data.NewManager(config.BackupsDir) - if err != nil { - return err + if health.IsHealthy() { + return pr.backup(health) } - existingBackups, err := dataManager.GetBackupList() + return nil +} + +func (pr *PreRun) backup(health *HealthInfo) error { + klog.InfoS("Backing up the data for deployment", "deployment", health.DeploymentID) + + existingBackups, err := pr.dataManager.GetBackupList() if err != nil { return err } @@ -76,20 +88,28 @@ func Perform() error { return nil } - if err := dataManager.Backup(newBackupName); err != nil { + if err := pr.dataManager.Backup(newBackupName); err != nil { return err } - removeOldBackups(dataManager, backupsForDeployment) + pr.removeOldBackups(backupsForDeployment) return nil } +func (pr *PreRun) removeOldBackups(backups []data.BackupName) { + for _, b := range backups { + klog.InfoS("Removing older backup", "name", b) + if err := pr.dataManager.RemoveBackup(b); err != nil { + klog.ErrorS(err, "Failed to remove backup", "name", b) + } + } +} + func containsCurrentBootID(id string) (bool, error) { path := "/proc/sys/kernel/random/boot_id" content, err := os.ReadFile(path) if err != nil { - klog.ErrorS(err, "Failed to read file", "path", path) return false, fmt.Errorf("reading file %s failed: %w", path, err) } currentBootID := strings.ReplaceAll(strings.TrimSpace(string(content)), "-", "") @@ -107,14 +127,12 @@ func getHealthInfo() (*HealthInfo, error) { content, err := os.ReadFile(path) if err != nil { - klog.ErrorS(err, "Failed to read file", "path", path) - return nil, err + return nil, fmt.Errorf("reading file %s failed: %w", path, err) } health := &HealthInfo{} if err := json.Unmarshal(content, &health); err != nil { - klog.ErrorS(err, "Failed to unmarshal file to json", "content", string(content)) - return nil, err + return nil, fmt.Errorf("unmarshalling '%s' failed: %w", strings.TrimSpace(string(content)), err) } return health, nil } @@ -139,12 +157,3 @@ func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName } return false } - -func removeOldBackups(dataManager data.Manager, backups []data.BackupName) { - for _, b := range backups { - klog.InfoS("Removing older backup", "name", b) - if err := dataManager.RemoveBackup(b); err != nil { - klog.ErrorS(err, "Failed to remove backup", "name", b) - } - } -} diff --git a/pkg/cmd/run.go b/pkg/cmd/run.go index dab2d93d1f..e48ee378f2 100644 --- a/pkg/cmd/run.go +++ b/pkg/cmd/run.go @@ -10,6 +10,7 @@ import ( "time" "github.com/coreos/go-systemd/daemon" + "github.com/openshift/microshift/pkg/admin/data" "github.com/openshift/microshift/pkg/admin/prerun" "github.com/openshift/microshift/pkg/config" "github.com/openshift/microshift/pkg/controllers" @@ -77,13 +78,23 @@ func logConfig(cfg *config.Config) { } } +func performPrerun() error { + dataManager, err := data.NewManager(config.BackupsDir) + if err != nil { + return err + } + + return prerun.New(dataManager).Perform() +} + func RunMicroshift(cfg *config.Config) error { // fail early if we don't have enough privileges if os.Geteuid() > 0 { klog.Fatalf("MicroShift must be run privileged") } - if err := prerun.Perform(); err != nil { + if err := performPrerun(); err != nil { + klog.ErrorS(err, "Pre-run procedure failed") return err } From a6a03cff5a479c124cf6a40c0ab48e2552c28f00 Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Mon, 5 Jun 2023 18:51:10 +0200 Subject: [PATCH 2/4] restore data if previous boot was unhealthy --- pkg/admin/data/data_manager.go | 41 ++++++++++++++++++++--- pkg/admin/prerun/prerun.go | 60 +++++++++++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 6 deletions(-) diff --git a/pkg/admin/data/data_manager.go b/pkg/admin/data/data_manager.go index a2e8318f18..bfc8555f18 100644 --- a/pkg/admin/data/data_manager.go +++ b/pkg/admin/data/data_manager.go @@ -10,6 +10,7 @@ import ( "github.com/openshift/microshift/pkg/config" "github.com/openshift/microshift/pkg/util" + "k8s.io/apimachinery/pkg/util/rand" "k8s.io/klog/v2" ) @@ -91,7 +92,7 @@ func (dm *manager) Backup(name BackupName) error { dest := dm.GetBackupPath(name) - if err := copyDataDir(dest); err != nil { + if err := copy(config.DataDir, dest); err != nil { return err } @@ -99,12 +100,42 @@ func (dm *manager) Backup(name BackupName) error { return nil } -func (dm *manager) Restore(n BackupName) error { - return fmt.Errorf("Restore not implemented") +func (dm *manager) Restore(name BackupName) error { + klog.InfoS("Restoring the data", "storage", dm.storage, "name", name, "data", config.DataDir) + + if name == "" { + return &EmptyArgErr{"name"} + } + + if exists, err := dm.BackupExists(name); err != nil { + return fmt.Errorf("checking if backup %s exists failed: %w", name, err) + } else if !exists { + klog.ErrorS(nil, "Backup to restore does not exist", "name", name) + return fmt.Errorf("backup %s does not exist", name) + } + + tmp := fmt.Sprintf("%s.%s", config.DataDir, rand.String(8)) + klog.InfoS("Temporarily renaming data dir", "data", config.DataDir, "renamedTo", tmp) + if err := os.Rename(config.DataDir, tmp); err != nil { + return fmt.Errorf("renaming data dir failed: %w", err) + } + + src := dm.GetBackupPath(name) + if err := copy(src, config.DataDir); err != nil { + return err + } + + klog.InfoS("Removing temporary data dir", "path", tmp) + if err := os.RemoveAll(tmp); err != nil { + klog.ErrorS(err, "Failed to remove %s", tmp) + } + + klog.InfoS("Restore finished", "backup", src, "data", config.DataDir) + return nil } -func copyDataDir(dest string) error { - cmd := exec.Command("cp", append(cpArgs, config.DataDir, dest)...) //nolint:gosec +func copy(src, dest string) error { + cmd := exec.Command("cp", append(cpArgs, src, dest)...) //nolint:gosec klog.InfoS("Executing command", "cmd", cmd) var outb, errb bytes.Buffer diff --git a/pkg/admin/prerun/prerun.go b/pkg/admin/prerun/prerun.go index b0997c7640..8b0ed32b4b 100644 --- a/pkg/admin/prerun/prerun.go +++ b/pkg/admin/prerun/prerun.go @@ -1,10 +1,12 @@ package prerun import ( + "bytes" "encoding/json" "errors" "fmt" "os" + "os/exec" "strings" "github.com/openshift/microshift/pkg/admin/data" @@ -66,7 +68,7 @@ func (pr *PreRun) Perform() error { return pr.backup(health) } - return nil + return pr.restore() } func (pr *PreRun) backup(health *HealthInfo) error { @@ -97,6 +99,62 @@ func (pr *PreRun) backup(health *HealthInfo) error { return nil } +func (pr *PreRun) restore() error { + // TODO: Check if containers are already running (i.e. microshift.service was restarted)? + + currentDeploymentID, err := getCurrentDeploymentID() + if err != nil { + return err + } + klog.InfoS("Restoring data for current deployment", "deployID", currentDeploymentID) + + existingBackups, err := pr.dataManager.GetBackupList() + if err != nil { + return err + } + klog.InfoS("List of existing backups", "backups", existingBackups) + backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID) + + if len(backupsForDeployment) == 0 { + return fmt.Errorf("there is no backup to restore for current deployment (%s)", currentDeploymentID) + } + + if len(backupsForDeployment) > 1 { + // could happen during backing up when removing older backups failed + klog.InfoS("TODO: more than 1 backup, need to pick most recent one") + } + + return pr.dataManager.Restore(backupsForDeployment[0]) +} + +func getCurrentDeploymentID() (string, error) { + cmd := exec.Command("rpm-ostree", "status", "--jsonpath=$.deployments[0].id", "--booted") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("command %s failed: %s", strings.TrimSpace(cmd.String()), strings.TrimSpace(stderr.String())) + } + + ids := []string{} + if err := json.Unmarshal(stdout.Bytes(), &ids); err != nil { + return "", fmt.Errorf("unmarshalling '%s' to json failed: %w", strings.TrimSpace(stdout.String()), err) + } + + if len(ids) != 1 { + // this shouldn't happen if running on ostree system, but just in case + klog.ErrorS(nil, "Unexpected amount of deployments in rpm-ostree output", + "cmd", cmd.String(), + "stdout", strings.TrimSpace(stdout.String()), + "stderr", strings.TrimSpace(stderr.String()), + "unmarshalledIDs", ids) + return "", fmt.Errorf("rpm-ostree returned unexpected amount of deployment IDs: %d", len(ids)) + } + + return ids[0], nil +} + func (pr *PreRun) removeOldBackups(backups []data.BackupName) { for _, b := range backups { klog.InfoS("Removing older backup", "name", b) From 838f676ccedd02dc49999733c6ec6a5efece3ea6 Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 6 Jun 2023 09:35:33 +0200 Subject: [PATCH 3/4] handle failed restore --- pkg/admin/data/data_manager.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pkg/admin/data/data_manager.go b/pkg/admin/data/data_manager.go index bfc8555f18..d89ac58df8 100644 --- a/pkg/admin/data/data_manager.go +++ b/pkg/admin/data/data_manager.go @@ -10,7 +10,6 @@ import ( "github.com/openshift/microshift/pkg/config" "github.com/openshift/microshift/pkg/util" - "k8s.io/apimachinery/pkg/util/rand" "k8s.io/klog/v2" ) @@ -110,19 +109,28 @@ func (dm *manager) Restore(name BackupName) error { if exists, err := dm.BackupExists(name); err != nil { return fmt.Errorf("checking if backup %s exists failed: %w", name, err) } else if !exists { - klog.ErrorS(nil, "Backup to restore does not exist", "name", name) - return fmt.Errorf("backup %s does not exist", name) + return fmt.Errorf("failed to restore backup, %s does not exist", name) } - tmp := fmt.Sprintf("%s.%s", config.DataDir, rand.String(8)) + tmp := fmt.Sprintf("%s.saved", config.DataDir) klog.InfoS("Temporarily renaming data dir", "data", config.DataDir, "renamedTo", tmp) if err := os.Rename(config.DataDir, tmp); err != nil { - return fmt.Errorf("renaming data dir failed: %w", err) + return fmt.Errorf("renaming %s to %s failed: %w", config.DataDir, tmp, err) } src := dm.GetBackupPath(name) if err := copy(src, config.DataDir); err != nil { - return err + klog.ErrorS(err, "Copying backup failed - restoring previous data dir") + + if err := os.RemoveAll(config.DataDir); err != nil { + return fmt.Errorf("removing %s failed: %w", config.DataDir, err) + } + + if err := os.Rename(tmp, config.DataDir); err != nil { + return fmt.Errorf("renaming %s to %s failed: %w", tmp, config.DataDir, err) + } + + return fmt.Errorf("restoring backup %s failed: %w", src, err) } klog.InfoS("Removing temporary data dir", "path", tmp) From 8aa0ead5c3929c9fb5e9d806d7a84cb732b354d5 Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 6 Jun 2023 14:56:34 +0200 Subject: [PATCH 4/4] linter fixes --- pkg/admin/data/data_manager.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/admin/data/data_manager.go b/pkg/admin/data/data_manager.go index d89ac58df8..f69a67c0c5 100644 --- a/pkg/admin/data/data_manager.go +++ b/pkg/admin/data/data_manager.go @@ -91,7 +91,7 @@ func (dm *manager) Backup(name BackupName) error { dest := dm.GetBackupPath(name) - if err := copy(config.DataDir, dest); err != nil { + if err := copyPath(config.DataDir, dest); err != nil { return err } @@ -119,7 +119,7 @@ func (dm *manager) Restore(name BackupName) error { } src := dm.GetBackupPath(name) - if err := copy(src, config.DataDir); err != nil { + if err := copyPath(src, config.DataDir); err != nil { klog.ErrorS(err, "Copying backup failed - restoring previous data dir") if err := os.RemoveAll(config.DataDir); err != nil { @@ -135,14 +135,14 @@ func (dm *manager) Restore(name BackupName) error { klog.InfoS("Removing temporary data dir", "path", tmp) if err := os.RemoveAll(tmp); err != nil { - klog.ErrorS(err, "Failed to remove %s", tmp) + klog.ErrorS(err, "Failed to remove path", "path", tmp) } klog.InfoS("Restore finished", "backup", src, "data", config.DataDir) return nil } -func copy(src, dest string) error { +func copyPath(src, dest string) error { cmd := exec.Command("cp", append(cpArgs, src, dest)...) //nolint:gosec klog.InfoS("Executing command", "cmd", cmd)