From 1d6d44feb220fb5911ecc70a356f8771d3914d1d Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:05:31 +0200 Subject: [PATCH 1/6] data manager: remove microshift data func --- pkg/admin/data/data_manager.go | 12 ++++++++++++ pkg/admin/data/types.go | 2 ++ 2 files changed, 14 insertions(+) diff --git a/pkg/admin/data/data_manager.go b/pkg/admin/data/data_manager.go index 35f54ffe12..d35ccc4cfa 100644 --- a/pkg/admin/data/data_manager.go +++ b/pkg/admin/data/data_manager.go @@ -171,6 +171,18 @@ func (dm *manager) Restore(name BackupName) error { return nil } +func (dm *manager) RemoveData() error { + klog.InfoS("Starting MicroShift data removal") + + err := os.RemoveAll(config.DataDir) + if err != nil { + return fmt.Errorf("failed to remove MicroShift data: %w", err) + } + + klog.InfoS("Removed MicroShift data") + return nil +} + func copyPath(src, dest string) error { cmd := exec.Command("cp", append(cpArgs, src, dest)...) //nolint:gosec klog.InfoS("Starting copy", "cmd", cmd) diff --git a/pkg/admin/data/types.go b/pkg/admin/data/types.go index ecabc3df53..8c4165625c 100644 --- a/pkg/admin/data/types.go +++ b/pkg/admin/data/types.go @@ -23,4 +23,6 @@ type Manager interface { GetBackupPath(BackupName) string GetBackupList() ([]BackupName, error) RemoveBackup(BackupName) error + + RemoveData() error } From cd23d03d9b75a445cdb4f066bfc69fdc4a5a6c77 Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:06:34 +0200 Subject: [PATCH 2/6] special backup name suffix for unhealthy --- pkg/admin/prerun/health.go | 8 +++++++- pkg/admin/prerun/health_test.go | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 pkg/admin/prerun/health_test.go diff --git a/pkg/admin/prerun/health.go b/pkg/admin/prerun/health.go index 4b83b823c8..74782688d5 100644 --- a/pkg/admin/prerun/health.go +++ b/pkg/admin/prerun/health.go @@ -23,7 +23,13 @@ type HealthInfo struct { } func (hi *HealthInfo) BackupName() data.BackupName { - return data.BackupName(fmt.Sprintf("%s_%s", hi.DeploymentID, hi.BootID)) + name := fmt.Sprintf("%s_%s", hi.DeploymentID, hi.BootID) + + if hi.IsHealthy() { + return data.BackupName(name) + } + + return data.BackupName(fmt.Sprintf("%s_unhealthy", name)) } func (hi *HealthInfo) IsHealthy() bool { diff --git a/pkg/admin/prerun/health_test.go b/pkg/admin/prerun/health_test.go new file mode 100644 index 0000000000..f015ed1e6f --- /dev/null +++ b/pkg/admin/prerun/health_test.go @@ -0,0 +1,36 @@ +package prerun + +import ( + "testing" + + "github.com/openshift/microshift/pkg/admin/data" + "github.com/stretchr/testify/assert" +) + +func Test_BackupName(t *testing.T) { + testData := []struct { + healthInfo HealthInfo + expectedBackupName string + }{ + { + healthInfo: HealthInfo{ + Health: "healthy", + DeploymentID: "did", + BootID: "bid", + }, + expectedBackupName: "did_bid", + }, + { + healthInfo: HealthInfo{ + Health: "unhealthy", + DeploymentID: "did", + BootID: "bid", + }, + expectedBackupName: "did_bid_unhealthy", + }, + } + + for _, td := range testData { + assert.Equal(t, data.BackupName(td.expectedBackupName), td.healthInfo.BackupName()) + } +} From 659dca388b2e1e9a639a3d25d69338bb09a1dd02 Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:20:15 +0200 Subject: [PATCH 3/6] don't consider unhealthy backup for restore --- pkg/admin/prerun/prerun.go | 33 
+++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/pkg/admin/prerun/prerun.go b/pkg/admin/prerun/prerun.go index 648be42861..19b6dae29d 100644 --- a/pkg/admin/prerun/prerun.go +++ b/pkg/admin/prerun/prerun.go @@ -210,9 +210,10 @@ func (pr *PreRun) backup(health *HealthInfo) error { // get list of already existing backups for deployment ID persisted in health file // after creating backup, the list will be used to remove older backups // (so only the most recent one for specific deployment is kept) - backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, health.DeploymentID) + allBackupsForDeployment := getBackupsForTheDeployment(existingBackups, health.DeploymentID) + healthyBackupsForDeployment := getOnlyHealthyBackups(allBackupsForDeployment) - if backupAlreadyExists(backupsForDeployment, newBackupName) { + if backupAlreadyExists(healthyBackupsForDeployment, newBackupName) { klog.InfoS("Skipping backup: Backup already exists", "deploymentID", health.DeploymentID, "name", newBackupName, @@ -224,7 +225,9 @@ func (pr *PreRun) backup(health *HealthInfo) error { return fmt.Errorf("failed to create new backup %q: %w", newBackupName, err) } - pr.removeOldBackups(backupsForDeployment) + // if making a new backup, remove all old backups for the deployment + // including unhealthy ones + pr.removeOldBackups(allBackupsForDeployment) klog.InfoS("Finished backup", "deploymentID", health.DeploymentID, @@ -343,16 +346,26 @@ func getCurrentBootID() (string, error) { return strings.ReplaceAll(strings.TrimSpace(string(content)), "-", ""), nil } -func getExistingBackupsForTheDeployment(existingBackups []data.BackupName, deployID string) []data.BackupName { - existingDeploymentBackups := make([]data.BackupName, 0) - - for _, existingBackup := range existingBackups { - if strings.HasPrefix(string(existingBackup), deployID) { - existingDeploymentBackups = append(existingDeploymentBackups, existingBackup) +func filterBackups(backups []data.BackupName, predicate func(data.BackupName) bool) []data.BackupName { + out := make([]data.BackupName, 0, len(backups)) + for _, backup := range backups { + if predicate(backup) { + out = append(out, backup) } } + return out +} + +func getOnlyHealthyBackups(backups []data.BackupName) []data.BackupName { + return filterBackups(backups, func(bn data.BackupName) bool { + return !strings.HasSuffix(string(bn), "unhealthy") + }) +} - return existingDeploymentBackups +func getBackupsForTheDeployment(backups []data.BackupName, deployID string) []data.BackupName { + return filterBackups(backups, func(bn data.BackupName) bool { + return strings.HasPrefix(string(bn), deployID) + }) } func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName) bool { From 2c3e037d9f27950b74cba29a0d96ad37ee53e02f Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:28:41 +0200 Subject: [PATCH 4/6] handle fdo --- pkg/admin/prerun/prerun.go | 135 ++++++++++++++++++++++++++++++------- 1 file changed, 112 insertions(+), 23 deletions(-) diff --git a/pkg/admin/prerun/prerun.go b/pkg/admin/prerun/prerun.go index 19b6dae29d..468cd83313 100644 --- a/pkg/admin/prerun/prerun.go +++ b/pkg/admin/prerun/prerun.go @@ -160,11 +160,6 @@ func (pr *PreRun) regularPrerun() error { return nil } - // TODO: We may end up needing to split this if statement into - // functions, but for now let's just tell the linter not to apply - // the rule. 
-	//
-	//nolint:nestif
 	if health.IsHealthy() {
 		klog.Info("Previous boot was healthy")
 		if err := pr.backup(health); err != nil {
@@ -175,11 +170,12 @@
 			klog.Info("Current and previously booted deployments are different")
 			return pr.handleDeploymentSwitch(currentDeploymentID)
 		}
-	} else {
-		klog.Info("Previous boot was not healthy")
-		if err = pr.restore(); err != nil {
-			return fmt.Errorf("failed to restore during pre-run: %w", err)
-		}
+		return nil
+	}
+
+	klog.Info("Previous boot was not healthy")
+	if err = pr.handleUnhealthy(health); err != nil {
+		return fmt.Errorf("failed to handle unhealthy data during pre-run: %w", err)
 	}
 
 	return nil
@@ -236,9 +232,9 @@ func (pr *PreRun) backup(health *HealthInfo) error {
 	return nil
 }
 
-func (pr *PreRun) restore() error {
+func (pr *PreRun) handleUnhealthy(health *HealthInfo) error {
 	// TODO: Check if containers are already running (i.e. microshift.service was restarted)?
-	klog.Info("Preparing to restore")
+	klog.Info("Handling previously unhealthy system")
 
 	currentDeploymentID, err := getCurrentDeploymentID()
 	if err != nil {
@@ -253,24 +249,73 @@
 		"currentDeploymentID", currentDeploymentID,
 		"backups", existingBackups,
 	)
-	backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID)
-
-	if len(backupsForDeployment) == 0 {
-		return fmt.Errorf("there is no backup to restore for current deployment %q", currentDeploymentID)
-	}
+	allBackupsForDeployment := getBackupsForTheDeployment(existingBackups, currentDeploymentID)
+	healthyBackupsForDeployment := getOnlyHealthyBackups(allBackupsForDeployment)
 
-	if len(backupsForDeployment) > 1 {
+	if len(healthyBackupsForDeployment) > 1 {
 		// could happen during backing up when removing older backups failed
 		klog.InfoS("TODO: more than 1 backup, need to pick most recent one")
 	}
 
-	err = pr.dataManager.Restore(backupsForDeployment[0])
+	if len(healthyBackupsForDeployment) > 0 {
+		err = pr.dataManager.Restore(healthyBackupsForDeployment[0])
+		if err != nil {
+			return fmt.Errorf("failed to restore backup: %w", err)
+		}
+		klog.Info("Finished handling unhealthy system")
+		return nil
+	}
+
+	klog.InfoS("There is no backup to restore for current deployment - trying to restore backup for rollback deployment")
+	rollbackDeployID, err := getRollbackDeploymentID()
 	if err != nil {
-		return fmt.Errorf("failed to restore backup: %w", err)
+		return err
 	}
 
-	klog.Info("Finished restore")
-	return nil
+	if rollbackDeployID == "" {
+		// No backup for current deployment and there is no rollback deployment.
+		// This could be an unhealthy system that was manually rebooted to
+		// remediate the situation - let's not interfere: no backup, no restore, just proceed.
+ klog.InfoS("System has no rollback but health.json suggests system was rebooted - skipping prerun") + return nil + } + + klog.InfoS("Obtained rollback deployment", + "rollback-deployment-id", rollbackDeployID, + "current-deployment-id", currentDeploymentID, + "health-deployment-id", health.DeploymentID) + + if health.DeploymentID == rollbackDeployID { + return fmt.Errorf("deployment ID stored in health.json is the same as rollback's" + + " - MicroShift should not be updated from unhealthy system") + } + + if health.DeploymentID == currentDeploymentID { + backupsForRollback := getBackupsForTheDeployment(existingBackups, rollbackDeployID) + if len(backupsForRollback) == 0 { + // This could happen if current deployment is unhealthy and rollback didn't run MicroShift + klog.InfoS("There is no backup for rollback deployment as well - removing existing data for clean start") + return pr.dataManager.RemoveData() + } + + // There is no backup for current deployment, but there is a backup for the rollback. + // Let's restore it and act like it's first boot of newly staged deployment + klog.InfoS("Restoring backup for a rollback deployment to perform migration and try starting again", + "backup-name", backupsForRollback[0]) + if err := pr.dataManager.Restore(backupsForRollback[0]); err != nil { + return fmt.Errorf("failed to restore backup: %w", err) + } + return nil + } + + // DeployID in health.json is neither booted nor rollback deployment, + // so current deployment was staged over deployment without MicroShift + // but MicroShift data exists (created by another deployment that rolled back). + klog.InfoS("Deployment in health metadata is neither currently booted nor rollback deployment - backing up, then removing existing data for clean start") + if err := pr.backup(health); err != nil { + klog.ErrorS(err, "Failed to backup data of unhealthy system - ignoring") + } + return pr.dataManager.RemoveData() } func (pr *PreRun) handleDeploymentSwitch(currentDeploymentID string) error { @@ -281,7 +326,7 @@ func (pr *PreRun) handleDeploymentSwitch(currentDeploymentID string) error { if err != nil { return fmt.Errorf("failed to determine the existing backups: %w", err) } - backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID) + backupsForDeployment := getBackupsForTheDeployment(existingBackups, currentDeploymentID) if len(backupsForDeployment) > 0 { klog.Info("Backup exists for current deployment - restoring") @@ -376,3 +421,47 @@ func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName } return false } + +func getRollbackDeploymentID() (string, error) { + cmd := exec.Command("rpm-ostree", "status", "--json") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("%q failed: %s: %w", cmd, stderr.String(), err) + } + + type deploy struct { + ID string `json:"id"` + Booted bool `json:"booted"` + } + type statusOutput struct { + Deployments []deploy `json:"deployments"` + } + + var status statusOutput + if err := json.Unmarshal(stdout.Bytes(), &status); err != nil { + return "", fmt.Errorf("failed to unmarshal %q: %w", cmd, err) + } + + if len(status.Deployments) == 0 { + return "", fmt.Errorf("unexpected amount (0) of deployments from rpm-ostree status output") + } + + if len(status.Deployments) == 1 { + return "", nil + } + + afterBooted := false + for _, d := range status.Deployments { + if afterBooted { + return d.ID, nil + } + + if d.Booted { + 
+			afterBooted = true
+		}
+	}
+
+	return "", fmt.Errorf("could not find rollback deployment in %v", status)
+}

From e51829f443fd350c9496790bf5d687462b054607 Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <305846+pmtk@users.noreply.github.com>
Date: Tue, 11 Jul 2023 14:31:23 +0200
Subject: [PATCH 5/6] fdo.robot changes to make it shorter and more robust

---
 test/resources/libostree.py    | 10 ++++++++++
 test/resources/ostree.resource | 20 +++++++++++++++++++-
 test/suites-ostree/fdo.robot   |  1 +
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/test/resources/libostree.py b/test/resources/libostree.py
index 7c95c4418b..cba07ae4a7 100644
--- a/test/resources/libostree.py
+++ b/test/resources/libostree.py
@@ -142,3 +142,13 @@ def cleanup_rpm_ostree() -> None:
 
 def create_agent_config(cfg: str) -> None:
     remote_sudo(f"echo '{cfg}' | sudo tee /var/lib/microshift-test-agent.json")
+
+
+def write_greenboot_microshift_wait_timeout(seconds: int) -> None:
+    remote_sudo(
+        f"echo 'MICROSHIFT_WAIT_TIMEOUT_SEC={seconds}' | sudo tee /etc/greenboot/greenboot.conf"
+    )
+
+
+def remove_greenboot_microshift_wait_timeout() -> None:
+    remote_sudo("rm /etc/greenboot/greenboot.conf")
diff --git a/test/resources/ostree.resource b/test/resources/ostree.resource
index 93fee38415..4ce164d0fa 100644
--- a/test/resources/ostree.resource
+++ b/test/resources/ostree.resource
@@ -35,12 +35,15 @@ Deploy Commit Expecting A Rollback
     ${initial_deploy_id}=    Get Booted Deployment Id
     ${deploy_id}=    Rebase System    ${ref}
     TestAgent.Write
+    Write Greenboot Microshift Wait Timeout    60
 
     Reboot MicroShift Host
 
     Log To Console    "Failing ref deployed - waiting for system to roll back"
     Wait Until Keyword Succeeds    20m    15s
     ...    Current Deployment Should Be    ${initial_deploy_id}
-    Log To Console    "System rolled back - deploying target ref"
+
+    Log To Console    "System rolled back"
+    Remove Greenboot Microshift Wait Timeout
 
 Deploy Commit Not Expecting A Rollback
     [Documentation]    Deploys given ref and configures test agent for failing greenboot.
@@ -69,3 +72,18 @@ System Is Running Right Ref And Healthy
     Should Be Equal As Strings    ${expected_deploy}    ${current_deploy}
 
     Greenboot Health Check Exited
     System Should Be Healthy
+
+Mask Grub Boot Success Timer
+    [Documentation]    Effectively disable a timer that sets boot_success
+    ...    to 1 two minutes after user login.
+    ...    It impacts tests because grub's script to decrement boot_counter
+    ...    works only if boot_success is 0 (see /etc/grub.d/08_fallback_counting),
+    ...    resulting in more greenboot-induced reboots than requested
+    ...    and the system needing much more time to roll back.
+    ${out}    ${err}    ${rc}=    Execute Command
+    ...    systemctl --user mask grub-boot-success.timer
+    ...    return_stdout=True
+    ...    return_stderr=True
+    ...    return_rc=True
+
+    Should Be Equal As Integers    ${rc}    0
diff --git a/test/suites-ostree/fdo.robot b/test/suites-ostree/fdo.robot
index 597f87425c..09e2ddff54 100644
--- a/test/suites-ostree/fdo.robot
+++ b/test/suites-ostree/fdo.robot
@@ -28,6 +28,7 @@ FIDO Onboarding Device
     ...    MicroShift data, unhealthy stored in health file,
     ...    and a deployment gap (no-microshift rollback).
+    Mask Grub Boot Success Timer
     System Should Not Feature MicroShift
     TestAgent.Add Action For Next Deployment    every    fail_greenboot
     Deploy Commit Expecting A Rollback    ${FAILING_REF}

From abf2103b3d04e1bc845e182c32109038549b663d Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <305846+pmtk@users.noreply.github.com>
Date: Tue, 11 Jul 2023 17:04:49 +0200
Subject: [PATCH 6/6] account for greenboot's rpm-ostree rollback

This happens when boot_counter falls to the "rollback value": grub then
simply boots another boot entry. After the system starts, greenboot
notices boot_counter is -1 and executes `rpm-ostree rollback` to make
the currently booted deployment the first (default) one on the list.
---
 test/resources/libostree.py    |  8 ++++++++
 test/resources/ostree.resource | 10 ++++++++++
 2 files changed, 18 insertions(+)

diff --git a/test/resources/libostree.py b/test/resources/libostree.py
index cba07ae4a7..0d5997eaa0 100644
--- a/test/resources/libostree.py
+++ b/test/resources/libostree.py
@@ -152,3 +152,11 @@ def write_greenboot_microshift_wait_timeout(seconds: int) -> None:
 
 def remove_greenboot_microshift_wait_timeout() -> None:
     remote_sudo("rm /etc/greenboot/greenboot.conf")
+
+
+def no_transaction_in_progress() -> None:
+    stdout = remote_sudo("rpm-ostree status --json")
+    status = DataFormats.json_parse(stdout)
+    key = "transaction"
+    transaction_in_progress = key in status and status[key] is not None
+    BuiltIn().should_not_be_true(transaction_in_progress)
diff --git a/test/resources/ostree.resource b/test/resources/ostree.resource
index 4ce164d0fa..eae6241bc7 100644
--- a/test/resources/ostree.resource
+++ b/test/resources/ostree.resource
@@ -87,3 +87,13 @@ Mask Grub Boot Success Timer
     ...    return_rc=True
 
     Should Be Equal As Integers    ${rc}    0
+
+Wait For Transaction To End
+    [Documentation]    Wait for any ostree transaction to end.
+    ...    When grub boots the previous deployment due to a greenboot failure,
+    ...    the ostree status is updated by greenboot running `rpm-ostree rollback`,
+    ...    so the test must wait until that transaction is over before staging
+    ...    a new deployment.
+
+    Wait Until Keyword Succeeds    2m    15s
+    ...    No Transaction In Progress
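
Note on PATCH 4: the rollback detection depends on `rpm-ostree status --json` listing deployments in boot order, so the entry directly after the booted one is the rollback candidate. Below is a minimal, self-contained Go sketch of that scan for experimenting outside the patch. The sample JSON payload and its deployment IDs are made up for illustration; only the `id` and `booted` fields that the patch actually unmarshals are modeled, and the exec/stderr plumbing from the real getRollbackDeploymentID is omitted.

package main

import (
	"encoding/json"
	"fmt"
)

type deploy struct {
	ID     string `json:"id"`
	Booted bool   `json:"booted"`
}

type statusOutput struct {
	Deployments []deploy `json:"deployments"`
}

// rollbackDeploymentID mirrors the scan in getRollbackDeploymentID:
// it returns the ID of the deployment listed directly after the booted
// one, "" when there is only a single deployment (no rollback exists),
// and an error when the booted deployment is last or missing.
func rollbackDeploymentID(status statusOutput) (string, error) {
	if len(status.Deployments) == 0 {
		return "", fmt.Errorf("no deployments in rpm-ostree status output")
	}
	if len(status.Deployments) == 1 {
		return "", nil
	}
	afterBooted := false
	for _, d := range status.Deployments {
		if afterBooted {
			return d.ID, nil
		}
		if d.Booted {
			afterBooted = true
		}
	}
	return "", fmt.Errorf("could not find rollback deployment in %v", status)
}

func main() {
	// Hypothetical `rpm-ostree status --json` payload: the booted
	// deployment first, the rollback candidate second. IDs are made up.
	raw := `{"deployments":[
		{"id":"rhel-aaaa.0","booted":true},
		{"id":"rhel-bbbb.0","booted":false}
	]}`

	var status statusOutput
	if err := json.Unmarshal([]byte(raw), &status); err != nil {
		panic(err)
	}

	id, err := rollbackDeploymentID(status)
	fmt.Println(id, err) // expected output: rhel-bbbb.0 <nil>
}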