Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 44 additions & 5 deletions pkg/admin/data/data_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,20 +91,59 @@ func (dm *manager) Backup(name BackupName) error {

dest := dm.GetBackupPath(name)

if err := copyDataDir(dest); err != nil {
if err := copyPath(config.DataDir, dest); err != nil {
return err
}

klog.InfoS("Backup finished", "backup", dest, "data", config.DataDir)
return nil
}

func (dm *manager) Restore(n BackupName) error {
return fmt.Errorf("Restore not implemented")
func (dm *manager) Restore(name BackupName) error {
klog.InfoS("Restoring the data", "storage", dm.storage, "name", name, "data", config.DataDir)

if name == "" {
return &EmptyArgErr{"name"}
}

if exists, err := dm.BackupExists(name); err != nil {
return fmt.Errorf("checking if backup %s exists failed: %w", name, err)
} else if !exists {
return fmt.Errorf("failed to restore backup, %s does not exist", name)
}

tmp := fmt.Sprintf("%s.saved", config.DataDir)
klog.InfoS("Temporarily renaming data dir", "data", config.DataDir, "renamedTo", tmp)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is asymmetric with Backup() (there it's the caller's responsibility to ensure the backup dir does not exist).
Should we make it the caller's responsibility to ensure /var/lib/microshift does not exist before restoring from a backup?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, /var/lib/microshift should never(?) exist before executing the Restore business logic, so it'd be an invariant of Restore(). If that's the case, IMO it should be checked by Restore(), not its caller. Ditto for Backup().

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Restore should essentially be

mv /var/lib/microshift /var/lib/microshift.save
cp /var/lib/microshift-backups/whatever /var/lib/microshift (using the same flags used when doing the backup)
rm -rf /var/lib/microshift.save

If the cp fails, then recovery is

rm -rf /var/lib/microshift
mv /var/lib/microshift.save /var/lib/microshift

I think you want all of that logic managed from 1 function to ensure all of the error handling paths that involve recovery are covered.

Are there files that only exist in /var/lib/microshift (and not in the backup) that we need to keep, like history metadata?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it looks like current impl is okay (despite asymmetry with Backup()). History/health metadata is in microshift-backups/

if err := os.Rename(config.DataDir, tmp); err != nil {
return fmt.Errorf("renaming %s to %s failed: %w", config.DataDir, tmp, err)
}

src := dm.GetBackupPath(name)
if err := copyPath(src, config.DataDir); err != nil {
klog.ErrorS(err, "Copying backup failed - restoring previous data dir")

if err := os.RemoveAll(config.DataDir); err != nil {
return fmt.Errorf("removing %s failed: %w", config.DataDir, err)
}

if err := os.Rename(tmp, config.DataDir); err != nil {
return fmt.Errorf("renaming %s to %s failed: %w", tmp, config.DataDir, err)
}

return fmt.Errorf("restoring backup %s failed: %w", src, err)
}

klog.InfoS("Removing temporary data dir", "path", tmp)
if err := os.RemoveAll(tmp); err != nil {
klog.ErrorS(err, "Failed to remove path", "path", tmp)
}

klog.InfoS("Restore finished", "backup", src, "data", config.DataDir)
return nil
}

func copyDataDir(dest string) error {
cmd := exec.Command("cp", append(cpArgs, config.DataDir, dest)...) //nolint:gosec
func copyPath(src, dest string) error {
cmd := exec.Command("cp", append(cpArgs, src, dest)...) //nolint:gosec
klog.InfoS("Executing command", "cmd", cmd)

var outb, errb bytes.Buffer
Expand Down
125 changes: 96 additions & 29 deletions pkg/admin/prerun/prerun.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
package prerun

import (
"bytes"
"encoding/json"
"errors"
"fmt"
"os"
"os/exec"
"strings"

"github.com/openshift/microshift/pkg/admin/data"
"github.com/openshift/microshift/pkg/config"
"github.com/openshift/microshift/pkg/util"
"k8s.io/klog/v2"
)
Expand All @@ -31,36 +32,49 @@ func (hi *HealthInfo) IsHealthy() bool {
return hi.Health == "healthy"
}

func Perform() error {
// PreRun holds the dependencies of the pre-run procedure.
type PreRun struct {
	// dataManager is the data.Manager that PreRun's methods call to list,
	// create, restore, and remove backups.
	dataManager data.Manager
}

// New returns a PreRun whose backup-related operations are delegated to the
// given dataManager.
func New(dataManager data.Manager) *PreRun {
	pr := new(PreRun)
	pr.dataManager = dataManager
	return pr
}

func (pr *PreRun) Perform() error {
health, err := getHealthInfo()
if err != nil {
if errors.Is(err, errHealthFileDoesNotExist) {
klog.InfoS("Health file does not exist - skipping backup")
return nil
}
klog.ErrorS(err, "Failed to load health from disk")
return err
}
klog.InfoS("Loaded health info from the disk", "health", health)

if isCurr, err := containsCurrentBootID(health.BootID); err != nil {
return err
} else if isCurr {
klog.InfoS("Health file contains current boot - skipping backup")
// This might happen if microshift is (re)started after greenboot finishes running.
// Green script will overwrite the health.json with
// current boot's ID, deployment ID, and health.
klog.InfoS("Health file contains current boot - skipping pre-run")
return nil
}

if !health.IsHealthy() {
klog.InfoS("System was not healthy - skipping backup")
return nil
}
klog.InfoS("Previous boot", "health", health.Health, "deploymentID", health.DeploymentID, "bootID", health.BootID)

dataManager, err := data.NewManager(config.BackupsDir)
if err != nil {
return err
if health.IsHealthy() {
return pr.backup(health)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the implementation grows, I expect this to not be the end of the workflow.
For example, when rolling back from a healthy system, we want to take a backup, but then we want to restore another one (compatible with the rollback's deployID). Storage migration is another example.
We'll get there.

}

existingBackups, err := dataManager.GetBackupList()
return pr.restore()
}

func (pr *PreRun) backup(health *HealthInfo) error {
klog.InfoS("Backing up the data for deployment", "deployment", health.DeploymentID)

existingBackups, err := pr.dataManager.GetBackupList()
if err != nil {
return err
}
Expand All @@ -76,20 +90,84 @@ func Perform() error {
return nil
}

if err := dataManager.Backup(newBackupName); err != nil {
if err := pr.dataManager.Backup(newBackupName); err != nil {
return err
}

removeOldBackups(dataManager, backupsForDeployment)
pr.removeOldBackups(backupsForDeployment)

return nil
}

func (pr *PreRun) restore() error {
// TODO: Check if containers are already running (i.e. microshift.service was restarted)?
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider the situation where the system was unhealthy, so it restored the backup, but then crashed (or exited due to an IP change or something else) and was restarted before the red/green script had a chance to update health.json: it would restore again.

At this point some Pods might already be running, so should we protect against restoring in such a case? Or maybe it's not a problem.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a first iteration, let's assume it's safe to restore from the same backup multiple times.


currentDeploymentID, err := getCurrentDeploymentID()
if err != nil {
return err
}
klog.InfoS("Restoring data for current deployment", "deployID", currentDeploymentID)

existingBackups, err := pr.dataManager.GetBackupList()
if err != nil {
return err
}
klog.InfoS("List of existing backups", "backups", existingBackups)
backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID)

if len(backupsForDeployment) == 0 {
return fmt.Errorf("there is no backup to restore for current deployment (%s)", currentDeploymentID)
}

if len(backupsForDeployment) > 1 {
// could happen during backing up when removing older backups failed
klog.InfoS("TODO: more than 1 backup, need to pick most recent one")
}

return pr.dataManager.Restore(backupsForDeployment[0])
}

// getCurrentDeploymentID runs `rpm-ostree status --booted` with a JSONPath
// filter selecting the first deployment's ID and returns that ID.
//
// The command's stdout is expected to be a JSON array holding exactly one
// string. An error is returned when the command cannot be run or exits
// unsuccessfully, when stdout cannot be unmarshalled into a string slice, or
// when the slice does not contain exactly one element.
func getCurrentDeploymentID() (string, error) {
	cmd := exec.Command("rpm-ostree", "status", "--jsonpath=$.deployments[0].id", "--booted")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Wrap the underlying error as well as stderr: when the binary is
		// missing or cannot be started nothing is written to stderr, so the
		// cause would otherwise be lost, and callers could not inspect the
		// failure with errors.Is / errors.As.
		return "", fmt.Errorf("command %s failed: %s: %w",
			strings.TrimSpace(cmd.String()), strings.TrimSpace(stderr.String()), err)
	}

	var ids []string
	if err := json.Unmarshal(stdout.Bytes(), &ids); err != nil {
		return "", fmt.Errorf("unmarshalling '%s' to json failed: %w", strings.TrimSpace(stdout.String()), err)
	}

	if len(ids) != 1 {
		// this shouldn't happen if running on ostree system, but just in case
		klog.ErrorS(nil, "Unexpected amount of deployments in rpm-ostree output",
			"cmd", cmd.String(),
			"stdout", strings.TrimSpace(stdout.String()),
			"stderr", strings.TrimSpace(stderr.String()),
			"unmarshalledIDs", ids)
		return "", fmt.Errorf("rpm-ostree returned unexpected amount of deployment IDs: %d", len(ids))
	}

	return ids[0], nil
}

// removeOldBackups asks the data manager to remove each of the given backups.
// Removal is best effort: a failure is logged and the remaining backups are
// still processed.
func (pr *PreRun) removeOldBackups(backups []data.BackupName) {
	for i := range backups {
		name := backups[i]
		klog.InfoS("Removing older backup", "name", name)

		err := pr.dataManager.RemoveBackup(name)
		if err == nil {
			continue
		}
		klog.ErrorS(err, "Failed to remove backup", "name", name)
	}
}

func containsCurrentBootID(id string) (bool, error) {
path := "/proc/sys/kernel/random/boot_id"
content, err := os.ReadFile(path)
if err != nil {
klog.ErrorS(err, "Failed to read file", "path", path)
return false, fmt.Errorf("reading file %s failed: %w", path, err)
}
currentBootID := strings.ReplaceAll(strings.TrimSpace(string(content)), "-", "")
Expand All @@ -107,14 +185,12 @@ func getHealthInfo() (*HealthInfo, error) {

content, err := os.ReadFile(path)
if err != nil {
klog.ErrorS(err, "Failed to read file", "path", path)
return nil, err
return nil, fmt.Errorf("reading file %s failed: %w", path, err)
}

health := &HealthInfo{}
if err := json.Unmarshal(content, &health); err != nil {
klog.ErrorS(err, "Failed to unmarshal file to json", "content", string(content))
return nil, err
return nil, fmt.Errorf("unmarshalling '%s' failed: %w", strings.TrimSpace(string(content)), err)
}
return health, nil
}
Expand All @@ -139,12 +215,3 @@ func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName
}
return false
}

// removeOldBackups removes every backup in backups through dataManager.
// A failed removal is only logged; the loop continues with the next backup.
func removeOldBackups(dataManager data.Manager, backups []data.BackupName) {
	for idx := 0; idx < len(backups); idx++ {
		backup := backups[idx]
		klog.InfoS("Removing older backup", "name", backup)

		if rmErr := dataManager.RemoveBackup(backup); rmErr != nil {
			klog.ErrorS(rmErr, "Failed to remove backup", "name", backup)
		}
	}
}
13 changes: 12 additions & 1 deletion pkg/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

"github.com/coreos/go-systemd/daemon"
"github.com/openshift/microshift/pkg/admin/data"
"github.com/openshift/microshift/pkg/admin/prerun"
"github.com/openshift/microshift/pkg/config"
"github.com/openshift/microshift/pkg/controllers"
Expand Down Expand Up @@ -77,13 +78,23 @@ func logConfig(cfg *config.Config) {
}
}

// performPrerun creates a data manager from config.BackupsDir and uses it to
// run the pre-run procedure. It returns the error from creating the manager,
// or otherwise whatever the pre-run procedure returns.
func performPrerun() error {
	mgr, mgrErr := data.NewManager(config.BackupsDir)
	if mgrErr != nil {
		return mgrErr
	}

	preRun := prerun.New(mgr)
	return preRun.Perform()
}

func RunMicroshift(cfg *config.Config) error {
// fail early if we don't have enough privileges
if os.Geteuid() > 0 {
klog.Fatalf("MicroShift must be run privileged")
}

if err := prerun.Perform(); err != nil {
if err := performPrerun(); err != nil {
klog.ErrorS(err, "Pre-run procedure failed")
return err
}

Expand Down