12 changes: 12 additions & 0 deletions pkg/admin/data/data_manager.go
@@ -171,6 +171,18 @@ func (dm *manager) Restore(name BackupName) error {
    return nil
}

func (dm *manager) RemoveData() error {
    klog.InfoS("Starting MicroShift data removal")

    err := os.RemoveAll(config.DataDir)
    if err != nil {
        return fmt.Errorf("failed to remove MicroShift data: %w", err)
    }

    klog.InfoS("Removed MicroShift data")
    return nil
}

func copyPath(src, dest string) error {
    cmd := exec.Command("cp", append(cpArgs, src, dest)...) //nolint:gosec
    klog.InfoS("Starting copy", "cmd", cmd)
2 changes: 2 additions & 0 deletions pkg/admin/data/types.go
@@ -23,4 +23,6 @@ type Manager interface {
    GetBackupPath(BackupName) string
    GetBackupList() ([]BackupName, error)
    RemoveBackup(BackupName) error

    RemoveData() error
}
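For illustration, a minimal sketch of how a caller might combine the new `RemoveData` with the existing listing methods. The `wipeIfNoBackups` helper is hypothetical and not part of this PR; it only assumes the `data.Manager` interface and string-based `data.BackupName` shown in this diff:

```go
package example

import (
	"fmt"
	"strings"

	"github.com/openshift/microshift/pkg/admin/data"
)

// wipeIfNoBackups is a hypothetical helper (not in the PR): it removes
// MicroShift data only when no backup exists for the given deployment ID.
func wipeIfNoBackups(dm data.Manager, deployID string) error {
	backups, err := dm.GetBackupList()
	if err != nil {
		return fmt.Errorf("failed to list backups: %w", err)
	}
	for _, b := range backups {
		// Backup names are prefixed with the deployment ID (see prerun.go).
		if strings.HasPrefix(string(b), deployID) {
			return nil // a backup exists, keep the data
		}
	}
	// Nothing to fall back to - wipe the data directory for a clean start.
	return dm.RemoveData()
}
```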
8 changes: 7 additions & 1 deletion pkg/admin/prerun/health.go
@@ -23,7 +23,13 @@ type HealthInfo struct {
}

func (hi *HealthInfo) BackupName() data.BackupName {
    return data.BackupName(fmt.Sprintf("%s_%s", hi.DeploymentID, hi.BootID))
    name := fmt.Sprintf("%s_%s", hi.DeploymentID, hi.BootID)

    if hi.IsHealthy() {
        return data.BackupName(name)
    }

    return data.BackupName(fmt.Sprintf("%s_unhealthy", name))
}

func (hi *HealthInfo) IsHealthy() bool {
36 changes: 36 additions & 0 deletions pkg/admin/prerun/health_test.go
@@ -0,0 +1,36 @@
package prerun

import (
    "testing"

    "github.com/openshift/microshift/pkg/admin/data"
    "github.com/stretchr/testify/assert"
)

func Test_BackupName(t *testing.T) {
    testData := []struct {
        healthInfo         HealthInfo
        expectedBackupName string
    }{
        {
            healthInfo: HealthInfo{
                Health:       "healthy",
                DeploymentID: "did",
                BootID:       "bid",
            },
            expectedBackupName: "did_bid",
        },
        {
            healthInfo: HealthInfo{
                Health:       "unhealthy",
                DeploymentID: "did",
                BootID:       "bid",
            },
            expectedBackupName: "did_bid_unhealthy",
        },
    }

    for _, td := range testData {
        assert.Equal(t, data.BackupName(td.expectedBackupName), td.healthInfo.BackupName())
    }
}
168 changes: 135 additions & 33 deletions pkg/admin/prerun/prerun.go
@@ -160,11 +160,6 @@ func (pr *PreRun) regularPrerun() error {
        return nil
    }

    // TODO: We may end up needing to split this if statement into
    // functions, but for now let's just tell the linter not to apply
    // the rule.
    //
    //nolint:nestif
    if health.IsHealthy() {
        klog.Info("Previous boot was healthy")
        if err := pr.backup(health); err != nil {
@@ -175,11 +170,12 @@
            klog.Info("Current and previously booted deployments are different")
            return pr.handleDeploymentSwitch(currentDeploymentID)
        }
    } else {
        klog.Info("Previous boot was not healthy")
        if err = pr.restore(); err != nil {
            return fmt.Errorf("failed to restore during pre-run: %w", err)
        }
        return nil
    }

    klog.Info("Previous boot was not healthy")
    if err = pr.handleUnhealthy(health); err != nil {
        return fmt.Errorf("failed to handle unhealthy data during pre-run: %w", err)
    }

    return nil
Expand Down Expand Up @@ -210,9 +206,10 @@ func (pr *PreRun) backup(health *HealthInfo) error {
// get list of already existing backups for deployment ID persisted in health file
// after creating backup, the list will be used to remove older backups
// (so only the most recent one for specific deployment is kept)
backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, health.DeploymentID)
allBackupsForDeployment := getBackupsForTheDeployment(existingBackups, health.DeploymentID)
healthyBackupsForDeployment := getOnlyHealthyBackups(allBackupsForDeployment)

if backupAlreadyExists(backupsForDeployment, newBackupName) {
if backupAlreadyExists(healthyBackupsForDeployment, newBackupName) {
klog.InfoS("Skipping backup: Backup already exists",
"deploymentID", health.DeploymentID,
"name", newBackupName,
@@ -224,7 +221,9 @@
        return fmt.Errorf("failed to create new backup %q: %w", newBackupName, err)
    }

    pr.removeOldBackups(backupsForDeployment)
    // if making a new backup, remove all old backups for the deployment
    // including unhealthy ones
    pr.removeOldBackups(allBackupsForDeployment)

    klog.InfoS("Finished backup",
        "deploymentID", health.DeploymentID,
@@ -233,9 +232,9 @@
    return nil
}

func (pr *PreRun) restore() error {
func (pr *PreRun) handleUnhealthy(health *HealthInfo) error {
    // TODO: Check if containers are already running (i.e. microshift.service was restarted)?
    klog.Info("Preparing to restore")
    klog.Info("Handling previously unhealthy system")

    currentDeploymentID, err := getCurrentDeploymentID()
    if err != nil {
@@ -250,24 +249,73 @@
        "currentDeploymentID", currentDeploymentID,
        "backups", existingBackups,
    )
    backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID)

    if len(backupsForDeployment) == 0 {
        return fmt.Errorf("there is no backup to restore for current deployment %q", currentDeploymentID)
    }
    allBackupsForDeployment := getBackupsForTheDeployment(existingBackups, currentDeploymentID)
    healthyBackupsForDeployment := getOnlyHealthyBackups(allBackupsForDeployment)

    if len(backupsForDeployment) > 1 {
    if len(healthyBackupsForDeployment) > 1 {
        // could happen during backing up when removing older backups failed
        klog.InfoS("TODO: more than 1 backup, need to pick most recent one")
    }

    err = pr.dataManager.Restore(backupsForDeployment[0])
    if len(healthyBackupsForDeployment) > 0 {
        err = pr.dataManager.Restore(healthyBackupsForDeployment[0])
        if err != nil {
            return fmt.Errorf("failed to restore backup: %w", err)
        }
        klog.Info("Finished handling unhealthy system")
        return nil
    }

    klog.InfoS("There is no backup to restore for current deployment - trying to restore backup for rollback deployment")
    rollbackDeployID, err := getRollbackDeploymentID()
    if err != nil {
        return fmt.Errorf("failed to restore backup: %w", err)
        return err
    }

    klog.Info("Finished restore")
    return nil
    if rollbackDeployID == "" {
        // No backup for current deployment and there is no rollback deployment.
        // This could be an unhealthy system that was manually rebooted to
        // remediate the situation - let's not interfere: no backup, no restore, just proceed.
        klog.InfoS("System has no rollback but health.json suggests system was rebooted - skipping prerun")
        return nil
    }

    klog.InfoS("Obtained rollback deployment",
        "rollback-deployment-id", rollbackDeployID,
        "current-deployment-id", currentDeploymentID,
        "health-deployment-id", health.DeploymentID)

    if health.DeploymentID == rollbackDeployID {
        return fmt.Errorf("deployment ID stored in health.json is the same as rollback's" +
            " - MicroShift should not be updated from unhealthy system")
    }

    if health.DeploymentID == currentDeploymentID {
        backupsForRollback := getBackupsForTheDeployment(existingBackups, rollbackDeployID)
        if len(backupsForRollback) == 0 {
            // This could happen if current deployment is unhealthy and rollback didn't run MicroShift
            klog.InfoS("There is no backup for rollback deployment as well - removing existing data for clean start")
            return pr.dataManager.RemoveData()
        }

        // There is no backup for current deployment, but there is a backup for the rollback.
        // Let's restore it and act like it's the first boot of a newly staged deployment
        klog.InfoS("Restoring backup for a rollback deployment to perform migration and try starting again",
"backup-name", backupsForRollback[0])
if err := pr.dataManager.Restore(backupsForRollback[0]); err != nil {
return fmt.Errorf("failed to restore backup: %w", err)
}
return nil
}

// DeployID in health.json is neither booted nor rollback deployment,
// so current deployment was staged over deployment without MicroShift
// but MicroShift data exists (created by another deployment that rolled back).
klog.InfoS("Deployment in health metadata is neither currently booted nor rollback deployment - backing up, then removing existing data for clean start")
if err := pr.backup(health); err != nil {
klog.ErrorS(err, "Failed to backup data of unhealthy system - ignoring")
}
return pr.dataManager.RemoveData()
}
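The branching above is easier to audit when condensed into a pure decision function. The sketch below is illustrative only (the `decide` function and its names are invented, not part of the PR); its inputs correspond to values the real code derives from health.json, the backup list, and rpm-ostree status, and its cases mirror `handleUnhealthy` one-to-one:

```go
package example

// action labels the possible outcomes of handleUnhealthy, for illustration.
type action string

const (
	restoreCurrentBackup  action = "restore healthy backup of booted deployment"
	skipPrerun            action = "no rollback deployment - leave data untouched"
	failUnhealthyUpdate   action = "error: do not update from an unhealthy system"
	restoreRollbackBackup action = "restore rollback's backup and retry migration"
	removeData            action = "no usable backup - remove data for clean start"
	backupThenRemoveData  action = "back up unknown deployment's data, then remove it"
)

// decide mirrors handleUnhealthy's control flow, checked top to bottom.
func decide(haveHealthyCurrentBackup bool, rollbackID, currentID, healthID string, haveRollbackBackup bool) action {
	switch {
	case haveHealthyCurrentBackup:
		return restoreCurrentBackup
	case rollbackID == "":
		return skipPrerun
	case healthID == rollbackID:
		return failUnhealthyUpdate
	case healthID == currentID && haveRollbackBackup:
		return restoreRollbackBackup
	case healthID == currentID:
		return removeData
	default:
		return backupThenRemoveData
	}
}
```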

func (pr *PreRun) handleDeploymentSwitch(currentDeploymentID string) error {
@@ -278,7 +326,7 @@ func (pr *PreRun) handleDeploymentSwitch(currentDeploymentID string) error {
    if err != nil {
        return fmt.Errorf("failed to determine the existing backups: %w", err)
    }
    backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID)
    backupsForDeployment := getBackupsForTheDeployment(existingBackups, currentDeploymentID)

    if len(backupsForDeployment) > 0 {
        klog.Info("Backup exists for current deployment - restoring")
@@ -343,16 +391,26 @@ func getCurrentBootID() (string, error) {
    return strings.ReplaceAll(strings.TrimSpace(string(content)), "-", ""), nil
}

func getExistingBackupsForTheDeployment(existingBackups []data.BackupName, deployID string) []data.BackupName {
    existingDeploymentBackups := make([]data.BackupName, 0)

    for _, existingBackup := range existingBackups {
        if strings.HasPrefix(string(existingBackup), deployID) {
            existingDeploymentBackups = append(existingDeploymentBackups, existingBackup)
func filterBackups(backups []data.BackupName, predicate func(data.BackupName) bool) []data.BackupName {
    out := make([]data.BackupName, 0, len(backups))
    for _, backup := range backups {
        if predicate(backup) {
            out = append(out, backup)
        }
    }
    return out
}

    return existingDeploymentBackups
func getOnlyHealthyBackups(backups []data.BackupName) []data.BackupName {
    return filterBackups(backups, func(bn data.BackupName) bool {
        return !strings.HasSuffix(string(bn), "unhealthy")
    })
}

func getBackupsForTheDeployment(backups []data.BackupName, deployID string) []data.BackupName {
    return filterBackups(backups, func(bn data.BackupName) bool {
        return strings.HasPrefix(string(bn), deployID)
    })
}

func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName) bool {
@@ -363,3 +421,47 @@ func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName) bool {
    }
    return false
}
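Combined with the `_unhealthy` suffix introduced in `health.go`, the two filters compose: first narrow by deployment ID, then drop backups of boots that ended unhealthy. The test below is illustrative only (it is not part of the PR), but it exercises the helpers exactly as the pre-run logic does:

```go
package prerun

import (
	"testing"

	"github.com/openshift/microshift/pkg/admin/data"
	"github.com/stretchr/testify/assert"
)

// Illustrative only (not in the PR): composing the backup filters.
func Test_FilterHelpersCompose(t *testing.T) {
	backups := []data.BackupName{
		"did1_bid1",
		"did1_bid2_unhealthy",
		"did2_bid3",
	}

	// Keep only backups whose name starts with the deployment ID.
	forDeployment := getBackupsForTheDeployment(backups, "did1")
	assert.Equal(t, []data.BackupName{"did1_bid1", "did1_bid2_unhealthy"}, forDeployment)

	// Then discard backups taken after an unhealthy boot.
	healthyOnly := getOnlyHealthyBackups(forDeployment)
	assert.Equal(t, []data.BackupName{"did1_bid1"}, healthyOnly)
}
```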

func getRollbackDeploymentID() (string, error) {
    cmd := exec.Command("rpm-ostree", "status", "--json")
    var stdout, stderr bytes.Buffer
    cmd.Stdout = &stdout
    cmd.Stderr = &stderr
    if err := cmd.Run(); err != nil {
        return "", fmt.Errorf("%q failed: %s: %w", cmd, stderr.String(), err)
    }

    type deploy struct {
        ID     string `json:"id"`
        Booted bool   `json:"booted"`
    }
    type statusOutput struct {
        Deployments []deploy `json:"deployments"`
    }

    var status statusOutput
    if err := json.Unmarshal(stdout.Bytes(), &status); err != nil {
        return "", fmt.Errorf("failed to unmarshal %q: %w", cmd, err)
    }

    if len(status.Deployments) == 0 {
        return "", fmt.Errorf("unexpected amount (0) of deployments from rpm-ostree status output")
    }

    if len(status.Deployments) == 1 {
        return "", nil
    }

    afterBooted := false
    for _, d := range status.Deployments {
        if afterBooted {
            return d.ID, nil
        }

        if d.Booted {
            afterBooted = true
        }
    }

    return "", fmt.Errorf("could not find rollback deployment in %v", status)
}
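`getRollbackDeploymentID` relies on `rpm-ostree status --json` listing deployments in boot order, so the entry immediately after the booted one is the rollback. A standalone sketch of the same scan follows; the JSON payload and deployment IDs are fabricated for illustration and do not come from this PR:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Fabricated example payload; a real one comes from `rpm-ostree status --json`.
const sample = `{
  "deployments": [
    {"id": "rhel-abc111", "booted": true},
    {"id": "rhel-def222", "booted": false}
  ]
}`

func main() {
	var status struct {
		Deployments []struct {
			ID     string `json:"id"`
			Booted bool   `json:"booted"`
		} `json:"deployments"`
	}
	if err := json.Unmarshal([]byte(sample), &status); err != nil {
		panic(err)
	}

	// Same scan as the PR: report the deployment that follows the booted one.
	afterBooted := false
	for _, d := range status.Deployments {
		if afterBooted {
			fmt.Println("rollback deployment:", d.ID) // prints rhel-def222
			return
		}
		if d.Booted {
			afterBooted = true
		}
	}
	fmt.Println("no rollback deployment found")
}
```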
18 changes: 18 additions & 0 deletions test/resources/libostree.py
@@ -142,3 +142,21 @@ def cleanup_rpm_ostree() -> None:

def create_agent_config(cfg: str) -> None:
    remote_sudo(f"echo '{cfg}' | sudo tee /var/lib/microshift-test-agent.json")


def write_greenboot_microshift_wait_timeout(seconds: int) -> None:
    remote_sudo(
        f"echo 'MICROSHIFT_WAIT_TIMEOUT_SEC={seconds}' | sudo tee /etc/greenboot/greenboot.conf"
    )


def remove_greenboot_microshift_wait_timeout() -> None:
    remote_sudo("rm /etc/greenboot/greenboot.conf")


def no_transaction_in_progress() -> None:
    stdout = remote_sudo("rpm-ostree status --json")
    status = DataFormats.json_parse(stdout)
    key = "transaction"
    transaction_in_progress = key in status and status[key] is not None
    BuiltIn().should_not_be_true(transaction_in_progress)
30 changes: 29 additions & 1 deletion test/resources/ostree.resource
@@ -35,12 +35,15 @@ Deploy Commit Expecting A Rollback
    ${initial_deploy_id}=    Get Booted Deployment Id
    ${deploy_id}=    Rebase System    ${ref}
    TestAgent.Write
    Write Greenboot Microshift Wait Timeout    60
    Reboot MicroShift Host

    Log To Console    "Failing ref deployed - waiting for system to roll back"
    Wait Until Keyword Succeeds    20m    15s
    ...    Current Deployment Should Be    ${initial_deploy_id}
    Log To Console    "System rolled back - deploying target ref"

    Log To Console    "System rolled back"
    Remove Greenboot Microshift Wait Timeout

Deploy Commit Not Expecting A Rollback
    [Documentation]    Deploys given ref and configures test agent for failing greenboot.
@@ -69,3 +72,28 @@ System Is Running Right Ref And Healthy
    Should Be Equal As Strings    ${expected_deploy}    ${current_deploy}
    Greenboot Health Check Exited
    System Should Be Healthy

Mask Grub Boot Success Timer
    [Documentation]    Effectively disable a timer that sets boot_success
    ...    to 1 two minutes after user login.
    ...    It impacts tests because grub's script to decrement boot_counter
    ...    works only if boot_success is 0 (see /etc/grub.d/08_fallback_counting),
    ...    resulting in more than the requested amount of redboot-induced reboots
    ...    and the system needing much more time to roll back.
    ${out}    ${err}    ${rc}=    Execute Command
    ...    systemctl --user mask grub-boot-success.timer
    ...    return_stdout=True
    ...    return_stderr=True
    ...    return_rc=True

    Should Be Equal As Integers    ${rc}    0

Wait For Transaction To End
    [Documentation]    Wait for any ostree transaction to end.
    ...    When grub boots the previous deployment due to greenboot failure,
    ...    ostree status is updated by greenboot running `rpm-ostree rollback`,
    ...    so the test must wait until that transaction is over before staging
    ...    a new deployment.

    Wait Until Keyword Succeeds    2m    15s
    ...    No Transaction In Progress
1 change: 1 addition & 0 deletions test/suites-ostree/fdo.robot
@@ -28,6 +28,7 @@ FIDO Onboarding Device
    ...    MicroShift data, unhealthy stored in health file,
    ...    and a deployment gap (no-microshift rollback).

    Mask Grub Boot Success Timer
    System Should Not Feature MicroShift
    TestAgent.Add Action For Next Deployment    every    fail_greenboot
    Deploy Commit Expecting A Rollback    ${FAILING_REF}