USHIFT-1084 Restore backup if previous was unhealthy #1881
Changes from all commits, package `prerun`:

```diff
@@ -1,14 +1,15 @@
 package prerun
 
 import (
+	"bytes"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"os"
+	"os/exec"
 	"strings"
 
 	"github.com/openshift/microshift/pkg/admin/data"
 	"github.com/openshift/microshift/pkg/config"
 	"github.com/openshift/microshift/pkg/util"
 	"k8s.io/klog/v2"
 )
```
```diff
@@ -31,36 +32,49 @@ func (hi *HealthInfo) IsHealthy() bool {
 	return hi.Health == "healthy"
 }
 
-func Perform() error {
+type PreRun struct {
+	dataManager data.Manager
+}
+
+func New(dataManager data.Manager) *PreRun {
+	return &PreRun{
+		dataManager: dataManager,
+	}
+}
+
+func (pr *PreRun) Perform() error {
 	health, err := getHealthInfo()
 	if err != nil {
 		if errors.Is(err, errHealthFileDoesNotExist) {
 			klog.InfoS("Health file does not exist - skipping backup")
 			return nil
 		}
 		klog.ErrorS(err, "Failed to load health from disk")
 		return err
 	}
 	klog.InfoS("Loaded health info from the disk", "health", health)
 
 	if isCurr, err := containsCurrentBootID(health.BootID); err != nil {
 		return err
 	} else if isCurr {
-		klog.InfoS("Health file contains current boot - skipping backup")
+		// This might happen if microshift is (re)started after greenboot finishes running.
+		// Green script will overwrite the health.json with
+		// current boot's ID, deployment ID, and health.
+		klog.InfoS("Health file contains current boot - skipping pre-run")
 		return nil
 	}
 
-	if !health.IsHealthy() {
-		klog.InfoS("System was not healthy - skipping backup")
-		return nil
-	}
 	klog.InfoS("Previous boot", "health", health.Health, "deploymentID", health.DeploymentID, "bootID", health.BootID)
 
-	dataManager, err := data.NewManager(config.BackupsDir)
-	if err != nil {
-		return err
+	if health.IsHealthy() {
+		return pr.backup(health)
```
Member (Author): As implementation grows, I expect this to not be the end of the workflow.
```diff
 	}
 
-	existingBackups, err := dataManager.GetBackupList()
+	return pr.restore()
+}
+
+func (pr *PreRun) backup(health *HealthInfo) error {
+	klog.InfoS("Backing up the data for deployment", "deployment", health.DeploymentID)
+
+	existingBackups, err := pr.dataManager.GetBackupList()
 	if err != nil {
 		return err
 	}
```
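Not part of the diff: a minimal sketch of how a caller might wire up the new `PreRun` type, assuming the data manager is still built from `config.BackupsDir` as in the pre-refactor code. The surrounding function and the `pkg/admin/prerun` import path are assumptions, not taken from this PR.

```go
package example

import (
	"github.com/openshift/microshift/pkg/admin/data"
	"github.com/openshift/microshift/pkg/admin/prerun" // import path assumed
	"github.com/openshift/microshift/pkg/config"
)

// runPreRun is a hypothetical call site; only New() and Perform() come from this PR.
func runPreRun() error {
	dataManager, err := data.NewManager(config.BackupsDir)
	if err != nil {
		return err
	}
	return prerun.New(dataManager).Perform()
}
```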
```diff
@@ -76,20 +90,84 @@ func Perform() error {
 		return nil
 	}
 
-	if err := dataManager.Backup(newBackupName); err != nil {
+	if err := pr.dataManager.Backup(newBackupName); err != nil {
 		return err
 	}
 
-	removeOldBackups(dataManager, backupsForDeployment)
+	pr.removeOldBackups(backupsForDeployment)
 
 	return nil
 }
 
+func (pr *PreRun) restore() error {
+	// TODO: Check if containers are already running (i.e. microshift.service was restarted)?
```
Member (Author): Situation: the system was unhealthy, so it restored the backup, but then crashed (or exited due to an IP change or something else) and was restarted before the red/green script had a chance to update `health.json`. At this point some Pods might already be running, so should we protect against restoring in such a case? Or maybe it's not a problem.

Contributor: As a first iteration, let's assume it's safe to restore from the same backup multiple times.
```diff
+
+	currentDeploymentID, err := getCurrentDeploymentID()
+	if err != nil {
+		return err
+	}
+	klog.InfoS("Restoring data for current deployment", "deployID", currentDeploymentID)
+
+	existingBackups, err := pr.dataManager.GetBackupList()
+	if err != nil {
+		return err
+	}
+	klog.InfoS("List of existing backups", "backups", existingBackups)
+	backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID)
+
+	if len(backupsForDeployment) == 0 {
+		return fmt.Errorf("there is no backup to restore for current deployment (%s)", currentDeploymentID)
+	}
+
+	if len(backupsForDeployment) > 1 {
+		// could happen during backing up when removing older backups failed
+		klog.InfoS("TODO: more than 1 backup, need to pick most recent one")
+	}
+
+	return pr.dataManager.Restore(backupsForDeployment[0])
+}
+
+func getCurrentDeploymentID() (string, error) {
+	cmd := exec.Command("rpm-ostree", "status", "--jsonpath=$.deployments[0].id", "--booted")
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		return "", fmt.Errorf("command %s failed: %s", strings.TrimSpace(cmd.String()), strings.TrimSpace(stderr.String()))
+	}
+
+	ids := []string{}
+	if err := json.Unmarshal(stdout.Bytes(), &ids); err != nil {
+		return "", fmt.Errorf("unmarshalling '%s' to json failed: %w", strings.TrimSpace(stdout.String()), err)
+	}
+
+	if len(ids) != 1 {
+		// this shouldn't happen if running on ostree system, but just in case
+		klog.ErrorS(nil, "Unexpected amount of deployments in rpm-ostree output",
+			"cmd", cmd.String(),
+			"stdout", strings.TrimSpace(stdout.String()),
+			"stderr", strings.TrimSpace(stderr.String()),
+			"unmarshalledIDs", ids)
+		return "", fmt.Errorf("rpm-ostree returned unexpected amount of deployment IDs: %d", len(ids))
+	}
+
+	return ids[0], nil
+}
+
+func (pr *PreRun) removeOldBackups(backups []data.BackupName) {
+	for _, b := range backups {
+		klog.InfoS("Removing older backup", "name", b)
+		if err := pr.dataManager.RemoveBackup(b); err != nil {
+			klog.ErrorS(err, "Failed to remove backup", "name", b)
+		}
+	}
+}
 
 func containsCurrentBootID(id string) (bool, error) {
 	path := "/proc/sys/kernel/random/boot_id"
 	content, err := os.ReadFile(path)
 	if err != nil {
 		klog.ErrorS(err, "Failed to read file", "path", path)
 		return false, fmt.Errorf("reading file %s failed: %w", path, err)
 	}
 	currentBootID := strings.ReplaceAll(strings.TrimSpace(string(content)), "-", "")
```
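An aside, not from the PR: judging from the unmarshalling in `getCurrentDeploymentID`, the `--jsonpath` query is expected to print a JSON array of matches containing exactly one deployment ID. A tiny illustration of that parsing step with a made-up ID:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Made-up stand-in for the rpm-ostree --jsonpath output; the real value
	// would be the booted deployment's ID.
	stdout := []byte(`["rhel-1a2b3c4d5e6f.0"]`)

	ids := []string{}
	if err := json.Unmarshal(stdout, &ids); err != nil {
		panic(err)
	}
	fmt.Println(len(ids), ids[0]) // 1 rhel-1a2b3c4d5e6f.0
}
```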
```diff
@@ -107,14 +185,12 @@ func getHealthInfo() (*HealthInfo, error) {
 
 	content, err := os.ReadFile(path)
 	if err != nil {
-		klog.ErrorS(err, "Failed to read file", "path", path)
-		return nil, err
+		return nil, fmt.Errorf("reading file %s failed: %w", path, err)
 	}
 
 	health := &HealthInfo{}
 	if err := json.Unmarshal(content, &health); err != nil {
-		klog.ErrorS(err, "Failed to unmarshal file to json", "content", string(content))
-		return nil, err
+		return nil, fmt.Errorf("unmarshalling '%s' failed: %w", strings.TrimSpace(string(content)), err)
 	}
 	return health, nil
 }
```
```diff
@@ -139,12 +215,3 @@ func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName
 	}
 	return false
 }
-
-func removeOldBackups(dataManager data.Manager, backups []data.BackupName) {
-	for _, b := range backups {
-		klog.InfoS("Removing older backup", "name", b)
-		if err := dataManager.RemoveBackup(b); err != nil {
-			klog.ErrorS(err, "Failed to remove backup", "name", b)
-		}
-	}
-}
```
Review thread on Restore():

- This is asymmetric to Backup() (there it's the caller's responsibility to ensure the backup dir does not exist). Should we make it the caller's responsibility to ensure /var/lib/microshift does not exist before restoring from a backup?
- IIUC, /var/lib/microshift should never(?) exist before executing the Restore business logic, so it'd be an invariant of Restore(). If that's the case, IMO it should be checked by Restore(), not its caller. Ditto for Backup().
- Restore should essentially be … If the `cp` fails, then recovery is … I think you want all of that logic managed from one function to ensure all of the error-handling paths that involve recovery are covered. Are there files that only exist in /var/lib/microshift (and not in the backup) that we need to keep, like history metadata?
- So it looks like the current impl is okay (despite the asymmetry with Backup()). History/health metadata is in microshift-backups/.
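For illustration only, a sketch of the recovery-aware restore flow suggested in the thread above. This is not code from this PR; the function, the `.saved` path, and the use of `cp --archive` are assumptions.

```go
package example

import (
	"fmt"
	"os"
	"os/exec"
)

// restoreWithRecovery is a hypothetical helper: move the current data aside,
// copy the backup into place, and put the original data back if the copy fails.
func restoreWithRecovery(backupPath, dataDir string) error {
	saved := dataDir + ".saved" // assumed holding location for the current data

	// Move the current data out of the way so it can be recovered on failure.
	if err := os.Rename(dataDir, saved); err != nil {
		return fmt.Errorf("moving %s aside failed: %w", dataDir, err)
	}

	// Copy the backup into place; --archive attempts to preserve attributes.
	if out, err := exec.Command("cp", "--archive", backupPath, dataDir).CombinedOutput(); err != nil {
		// Recovery path: restore the original data before reporting the failure.
		if rerr := os.Rename(saved, dataDir); rerr != nil {
			return fmt.Errorf("restore failed: %v (output: %s); recovery also failed: %w", err, out, rerr)
		}
		return fmt.Errorf("restore failed, original data put back: %w", err)
	}

	return os.RemoveAll(saved)
}
```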