From 1d6d44feb220fb5911ecc70a356f8771d3914d1d Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:05:31 +0200 Subject: [PATCH 1/6] data manager: remove microshift data func --- pkg/admin/data/data_manager.go | 12 ++++++++++++ pkg/admin/data/types.go | 2 ++ 2 files changed, 14 insertions(+) diff --git a/pkg/admin/data/data_manager.go b/pkg/admin/data/data_manager.go index 35f54ffe12..d35ccc4cfa 100644 --- a/pkg/admin/data/data_manager.go +++ b/pkg/admin/data/data_manager.go @@ -171,6 +171,18 @@ func (dm *manager) Restore(name BackupName) error { return nil } +func (dm *manager) RemoveData() error { + klog.InfoS("Starting MicroShift data removal") + + err := os.RemoveAll(config.DataDir) + if err != nil { + return fmt.Errorf("failed to remove MicroShift data: %w", err) + } + + klog.InfoS("Removed MicroShift data") + return nil +} + func copyPath(src, dest string) error { cmd := exec.Command("cp", append(cpArgs, src, dest)...) //nolint:gosec klog.InfoS("Starting copy", "cmd", cmd) diff --git a/pkg/admin/data/types.go b/pkg/admin/data/types.go index ecabc3df53..8c4165625c 100644 --- a/pkg/admin/data/types.go +++ b/pkg/admin/data/types.go @@ -23,4 +23,6 @@ type Manager interface { GetBackupPath(BackupName) string GetBackupList() ([]BackupName, error) RemoveBackup(BackupName) error + + RemoveData() error } From cd23d03d9b75a445cdb4f066bfc69fdc4a5a6c77 Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:06:34 +0200 Subject: [PATCH 2/6] special backup name suffix for unhealthy --- pkg/admin/prerun/health.go | 8 +++++++- pkg/admin/prerun/health_test.go | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 pkg/admin/prerun/health_test.go diff --git a/pkg/admin/prerun/health.go b/pkg/admin/prerun/health.go index 4b83b823c8..74782688d5 100644 --- a/pkg/admin/prerun/health.go +++ b/pkg/admin/prerun/health.go @@ -23,7 +23,13 @@ type HealthInfo struct { } func (hi *HealthInfo) BackupName() data.BackupName { - return data.BackupName(fmt.Sprintf("%s_%s", hi.DeploymentID, hi.BootID)) + name := fmt.Sprintf("%s_%s", hi.DeploymentID, hi.BootID) + + if hi.IsHealthy() { + return data.BackupName(name) + } + + return data.BackupName(fmt.Sprintf("%s_unhealthy", name)) } func (hi *HealthInfo) IsHealthy() bool { diff --git a/pkg/admin/prerun/health_test.go b/pkg/admin/prerun/health_test.go new file mode 100644 index 0000000000..f015ed1e6f --- /dev/null +++ b/pkg/admin/prerun/health_test.go @@ -0,0 +1,36 @@ +package prerun + +import ( + "testing" + + "github.com/openshift/microshift/pkg/admin/data" + "github.com/stretchr/testify/assert" +) + +func Test_BackupName(t *testing.T) { + testData := []struct { + healthInfo HealthInfo + expectedBackupName string + }{ + { + healthInfo: HealthInfo{ + Health: "healthy", + DeploymentID: "did", + BootID: "bid", + }, + expectedBackupName: "did_bid", + }, + { + healthInfo: HealthInfo{ + Health: "unhealthy", + DeploymentID: "did", + BootID: "bid", + }, + expectedBackupName: "did_bid_unhealthy", + }, + } + + for _, td := range testData { + assert.Equal(t, data.BackupName(td.expectedBackupName), td.healthInfo.BackupName()) + } +} From 659dca388b2e1e9a639a3d25d69338bb09a1dd02 Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:20:15 +0200 Subject: [PATCH 3/6] don't consider unhealthy backup for restore --- pkg/admin/prerun/prerun.go | 33 
+++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/pkg/admin/prerun/prerun.go b/pkg/admin/prerun/prerun.go index 648be42861..19b6dae29d 100644 --- a/pkg/admin/prerun/prerun.go +++ b/pkg/admin/prerun/prerun.go @@ -210,9 +210,10 @@ func (pr *PreRun) backup(health *HealthInfo) error { // get list of already existing backups for deployment ID persisted in health file // after creating backup, the list will be used to remove older backups // (so only the most recent one for specific deployment is kept) - backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, health.DeploymentID) + allBackupsForDeployment := getBackupsForTheDeployment(existingBackups, health.DeploymentID) + healthyBackupsForDeployment := getOnlyHealthyBackups(allBackupsForDeployment) - if backupAlreadyExists(backupsForDeployment, newBackupName) { + if backupAlreadyExists(healthyBackupsForDeployment, newBackupName) { klog.InfoS("Skipping backup: Backup already exists", "deploymentID", health.DeploymentID, "name", newBackupName, @@ -224,7 +225,9 @@ func (pr *PreRun) backup(health *HealthInfo) error { return fmt.Errorf("failed to create new backup %q: %w", newBackupName, err) } - pr.removeOldBackups(backupsForDeployment) + // if making a new backup, remove all old backups for the deployment + // including unhealthy ones + pr.removeOldBackups(allBackupsForDeployment) klog.InfoS("Finished backup", "deploymentID", health.DeploymentID, @@ -343,16 +346,26 @@ func getCurrentBootID() (string, error) { return strings.ReplaceAll(strings.TrimSpace(string(content)), "-", ""), nil } -func getExistingBackupsForTheDeployment(existingBackups []data.BackupName, deployID string) []data.BackupName { - existingDeploymentBackups := make([]data.BackupName, 0) - - for _, existingBackup := range existingBackups { - if strings.HasPrefix(string(existingBackup), deployID) { - existingDeploymentBackups = append(existingDeploymentBackups, existingBackup) +func filterBackups(backups []data.BackupName, predicate func(data.BackupName) bool) []data.BackupName { + out := make([]data.BackupName, 0, len(backups)) + for _, backup := range backups { + if predicate(backup) { + out = append(out, backup) } } + return out +} + +func getOnlyHealthyBackups(backups []data.BackupName) []data.BackupName { + return filterBackups(backups, func(bn data.BackupName) bool { + return !strings.HasSuffix(string(bn), "unhealthy") + }) +} - return existingDeploymentBackups +func getBackupsForTheDeployment(backups []data.BackupName, deployID string) []data.BackupName { + return filterBackups(backups, func(bn data.BackupName) bool { + return strings.HasPrefix(string(bn), deployID) + }) } func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName) bool { From 2c3e037d9f27950b74cba29a0d96ad37ee53e02f Mon Sep 17 00:00:00 2001 From: Patryk Matuszak <305846+pmtk@users.noreply.github.com> Date: Tue, 11 Jul 2023 14:28:41 +0200 Subject: [PATCH 4/6] handle fdo --- pkg/admin/prerun/prerun.go | 135 ++++++++++++++++++++++++++++++------- 1 file changed, 112 insertions(+), 23 deletions(-) diff --git a/pkg/admin/prerun/prerun.go b/pkg/admin/prerun/prerun.go index 19b6dae29d..468cd83313 100644 --- a/pkg/admin/prerun/prerun.go +++ b/pkg/admin/prerun/prerun.go @@ -160,11 +160,6 @@ func (pr *PreRun) regularPrerun() error { return nil } - // TODO: We may end up needing to split this if statement into - // functions, but for now let's just tell the linter not to apply - // the rule. 
-	//
-	//nolint:nestif
 	if health.IsHealthy() {
 		klog.Info("Previous boot was healthy")
 		if err := pr.backup(health); err != nil {
@@ -175,11 +170,12 @@
 			klog.Info("Current and previously booted deployments are different")
 			return pr.handleDeploymentSwitch(currentDeploymentID)
 		}
-	} else {
-		klog.Info("Previous boot was not healthy")
-		if err = pr.restore(); err != nil {
-			return fmt.Errorf("failed to restore during pre-run: %w", err)
-		}
+		return nil
+	}
+
+	klog.Info("Previous boot was not healthy")
+	if err = pr.handleUnhealthy(health); err != nil {
+		return fmt.Errorf("failed to handle unhealthy data during pre-run: %w", err)
 	}
 
 	return nil
@@ -236,9 +232,9 @@ func (pr *PreRun) backup(health *HealthInfo) error {
 	return nil
 }
 
-func (pr *PreRun) restore() error {
+func (pr *PreRun) handleUnhealthy(health *HealthInfo) error {
 	// TODO: Check if containers are already running (i.e. microshift.service was restarted)?
-	klog.Info("Preparing to restore")
+	klog.Info("Handling previously unhealthy system")
 
 	currentDeploymentID, err := getCurrentDeploymentID()
 	if err != nil {
@@ -253,24 +249,73 @@
 		"currentDeploymentID", currentDeploymentID,
 		"backups", existingBackups,
 	)
-	backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID)
-
-	if len(backupsForDeployment) == 0 {
-		return fmt.Errorf("there is no backup to restore for current deployment %q", currentDeploymentID)
-	}
+	allBackupsForDeployment := getBackupsForTheDeployment(existingBackups, currentDeploymentID)
+	healthyBackupsForDeployment := getOnlyHealthyBackups(allBackupsForDeployment)
 
-	if len(backupsForDeployment) > 1 {
+	if len(healthyBackupsForDeployment) > 1 {
 		// could happen during backing up when removing older backups failed
 		klog.InfoS("TODO: more than 1 backup, need to pick most recent one")
 	}
 
-	err = pr.dataManager.Restore(backupsForDeployment[0])
+	if len(healthyBackupsForDeployment) > 0 {
+		err = pr.dataManager.Restore(healthyBackupsForDeployment[0])
+		if err != nil {
+			return fmt.Errorf("failed to restore backup: %w", err)
+		}
+		klog.Info("Finished handling unhealthy system")
+		return nil
+	}
+
+	klog.InfoS("There is no backup to restore for current deployment - trying to restore backup for rollback deployment")
+	rollbackDeployID, err := getRollbackDeploymentID()
 	if err != nil {
-		return fmt.Errorf("failed to restore backup: %w", err)
+		return err
 	}
 
-	klog.Info("Finished restore")
-	return nil
+	if rollbackDeployID == "" {
+		// No backup for current deployment and there is no rollback deployment.
+		// This could be an unhealthy system that was manually rebooted to
+		// remediate the situation - let's not interfere: no backup, no restore, just proceed.
+ klog.InfoS("System has no rollback but health.json suggests system was rebooted - skipping prerun") + return nil + } + + klog.InfoS("Obtained rollback deployment", + "rollback-deployment-id", rollbackDeployID, + "current-deployment-id", currentDeploymentID, + "health-deployment-id", health.DeploymentID) + + if health.DeploymentID == rollbackDeployID { + return fmt.Errorf("deployment ID stored in health.json is the same as rollback's" + + " - MicroShift should not be updated from unhealthy system") + } + + if health.DeploymentID == currentDeploymentID { + backupsForRollback := getBackupsForTheDeployment(existingBackups, rollbackDeployID) + if len(backupsForRollback) == 0 { + // This could happen if current deployment is unhealthy and rollback didn't run MicroShift + klog.InfoS("There is no backup for rollback deployment as well - removing existing data for clean start") + return pr.dataManager.RemoveData() + } + + // There is no backup for current deployment, but there is a backup for the rollback. + // Let's restore it and act like it's first boot of newly staged deployment + klog.InfoS("Restoring backup for a rollback deployment to perform migration and try starting again", + "backup-name", backupsForRollback[0]) + if err := pr.dataManager.Restore(backupsForRollback[0]); err != nil { + return fmt.Errorf("failed to restore backup: %w", err) + } + return nil + } + + // DeployID in health.json is neither booted nor rollback deployment, + // so current deployment was staged over deployment without MicroShift + // but MicroShift data exists (created by another deployment that rolled back). + klog.InfoS("Deployment in health metadata is neither currently booted nor rollback deployment - backing up, then removing existing data for clean start") + if err := pr.backup(health); err != nil { + klog.ErrorS(err, "Failed to backup data of unhealthy system - ignoring") + } + return pr.dataManager.RemoveData() } func (pr *PreRun) handleDeploymentSwitch(currentDeploymentID string) error { @@ -281,7 +326,7 @@ func (pr *PreRun) handleDeploymentSwitch(currentDeploymentID string) error { if err != nil { return fmt.Errorf("failed to determine the existing backups: %w", err) } - backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID) + backupsForDeployment := getBackupsForTheDeployment(existingBackups, currentDeploymentID) if len(backupsForDeployment) > 0 { klog.Info("Backup exists for current deployment - restoring") @@ -376,3 +421,47 @@ func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName } return false } + +func getRollbackDeploymentID() (string, error) { + cmd := exec.Command("rpm-ostree", "status", "--json") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("%q failed: %s: %w", cmd, stderr.String(), err) + } + + type deploy struct { + ID string `json:"id"` + Booted bool `json:"booted"` + } + type statusOutput struct { + Deployments []deploy `json:"deployments"` + } + + var status statusOutput + if err := json.Unmarshal(stdout.Bytes(), &status); err != nil { + return "", fmt.Errorf("failed to unmarshal %q: %w", cmd, err) + } + + if len(status.Deployments) == 0 { + return "", fmt.Errorf("unexpected amount (0) of deployments from rpm-ostree status output") + } + + if len(status.Deployments) == 1 { + return "", nil + } + + afterBooted := false + for _, d := range status.Deployments { + if afterBooted { + return d.ID, nil + } + + if d.Booted { + 
+			afterBooted = true
+		}
+	}
+
+	return "", fmt.Errorf("could not find rollback deployment in %v", status)
+}

From e51829f443fd350c9496790bf5d687462b054607 Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <305846+pmtk@users.noreply.github.com>
Date: Tue, 11 Jul 2023 14:31:23 +0200
Subject: [PATCH 5/6] fdo.robot changes to make it shorter and more robust

---
 test/resources/libostree.py    | 10 ++++++++++
 test/resources/ostree.resource | 20 +++++++++++++++++++-
 test/suites-ostree/fdo.robot   |  1 +
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/test/resources/libostree.py b/test/resources/libostree.py
index 7c95c4418b..cba07ae4a7 100644
--- a/test/resources/libostree.py
+++ b/test/resources/libostree.py
@@ -142,3 +142,13 @@ def cleanup_rpm_ostree() -> None:
 
 def create_agent_config(cfg: str) -> None:
     remote_sudo(f"echo '{cfg}' | sudo tee /var/lib/microshift-test-agent.json")
+
+
+def write_greenboot_microshift_wait_timeout(seconds: int) -> None:
+    remote_sudo(
+        f"echo 'MICROSHIFT_WAIT_TIMEOUT_SEC={seconds}' | sudo tee /etc/greenboot/greenboot.conf"
+    )
+
+
+def remove_greenboot_microshift_wait_timeout() -> None:
+    remote_sudo("rm /etc/greenboot/greenboot.conf")
diff --git a/test/resources/ostree.resource b/test/resources/ostree.resource
index 93fee38415..4ce164d0fa 100644
--- a/test/resources/ostree.resource
+++ b/test/resources/ostree.resource
@@ -35,12 +35,15 @@ Deploy Commit Expecting A Rollback
     ${initial_deploy_id}=    Get Booted Deployment Id
     ${deploy_id}=    Rebase System    ${ref}
     TestAgent.Write
+    Write Greenboot Microshift Wait Timeout    60
 
     Reboot MicroShift Host
 
     Log To Console    "Failing ref deployed - waiting for system to roll back"
     Wait Until Keyword Succeeds    20m    15s
     ...    Current Deployment Should Be    ${initial_deploy_id}
-    Log To Console    "System rolled back - deploying target ref"
+
+    Log To Console    "System rolled back"
+    Remove Greenboot Microshift Wait Timeout
 
 Deploy Commit Not Expecting A Rollback
     [Documentation]    Deploys given ref and configures test agent for failing greenboot.
@@ -69,3 +72,18 @@ System Is Running Right Ref And Healthy
     Should Be Equal As Strings    ${expected_deploy}    ${current_deploy}
 
     Greenboot Health Check Exited
     System Should Be Healthy
+
+Mask Grub Boot Success Timer
+    [Documentation]    Effectively disable a timer that sets boot_success
+    ...    to 1 two minutes after user login.
+    ...    It impacts tests because grub's script to decrement boot_counter
+    ...    works only if boot_success is 0 (see /etc/grub.d/08_fallback_counting),
+    ...    resulting in more greenboot-induced reboots than requested
+    ...    and the system needing much more time to roll back.
+    ${out}    ${err}    ${rc}=    Execute Command
+    ...    systemctl --user mask grub-boot-success.timer
+    ...    return_stdout=True
+    ...    return_stderr=True
+    ...    return_rc=True
+
+    Should Be Equal As Integers    ${rc}    0
diff --git a/test/suites-ostree/fdo.robot b/test/suites-ostree/fdo.robot
index 597f87425c..09e2ddff54 100644
--- a/test/suites-ostree/fdo.robot
+++ b/test/suites-ostree/fdo.robot
@@ -28,6 +28,7 @@ FIDO Onboarding Device
     ...    MicroShift data, unhealthy stored in health file,
     ...    and a deployment gap (no-microshift rollback).
+    Mask Grub Boot Success Timer
     System Should Not Feature MicroShift
     TestAgent.Add Action For Next Deployment    every    fail_greenboot
     Deploy Commit Expecting A Rollback    ${FAILING_REF}

From abf2103b3d04e1bc845e182c32109038549b663d Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <305846+pmtk@users.noreply.github.com>
Date: Tue, 11 Jul 2023 17:04:49 +0200
Subject: [PATCH 6/6] account for greenboot's rpm-ostree rollback

This happens when boot_counter falls to the "rollback value": grub then
simply boots another boot entry. After the system starts, greenboot
notices boot_counter is -1 and executes `rpm-ostree rollback` to make
the currently booted deployment the first (default) one on the list.
---
 test/resources/libostree.py    |  8 ++++++++
 test/resources/ostree.resource | 10 ++++++++++
 2 files changed, 18 insertions(+)

diff --git a/test/resources/libostree.py b/test/resources/libostree.py
index cba07ae4a7..0d5997eaa0 100644
--- a/test/resources/libostree.py
+++ b/test/resources/libostree.py
@@ -152,3 +152,11 @@ def write_greenboot_microshift_wait_timeout(seconds: int) -> None:
 
 def remove_greenboot_microshift_wait_timeout() -> None:
     remote_sudo("rm /etc/greenboot/greenboot.conf")
+
+
+def no_transaction_in_progress() -> None:
+    stdout = remote_sudo("rpm-ostree status --json")
+    status = DataFormats.json_parse(stdout)
+    key = "transaction"
+    transaction_in_progress = key in status and status[key] is not None
+    BuiltIn().should_not_be_true(transaction_in_progress)
diff --git a/test/resources/ostree.resource b/test/resources/ostree.resource
index 4ce164d0fa..eae6241bc7 100644
--- a/test/resources/ostree.resource
+++ b/test/resources/ostree.resource
@@ -87,3 +87,13 @@ Mask Grub Boot Success Timer
     ...    return_rc=True
 
     Should Be Equal As Integers    ${rc}    0
+
+Wait For Transaction To End
+    [Documentation]    Wait for any ostree transaction to end.
+    ...    When grub boots the previous deployment due to a greenboot failure,
+    ...    the ostree status is updated by greenboot running `rpm-ostree rollback`,
+    ...    so the test must wait until that transaction is over before staging
+    ...    a new deployment.
+
+    Wait Until Keyword Succeeds    2m    15s
+    ...    No Transaction In Progress
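
Note on PATCH 4: the rollback detection depends on `rpm-ostree status --json` listing deployments in boot order, so the entry directly after the booted one is the rollback candidate. Below is a minimal, self-contained Go sketch of that scan for experimenting outside the patch. The sample JSON payload and its deployment IDs are made up for illustration; only the `id` and `booted` fields that the patch actually unmarshals are modeled, and the exec/stderr plumbing from the real getRollbackDeploymentID is omitted.

package main

import (
	"encoding/json"
	"fmt"
)

type deploy struct {
	ID     string `json:"id"`
	Booted bool   `json:"booted"`
}

type statusOutput struct {
	Deployments []deploy `json:"deployments"`
}

// rollbackDeploymentID mirrors the scan in getRollbackDeploymentID:
// it returns the ID of the deployment listed directly after the booted
// one, "" when there is only a single deployment (no rollback exists),
// and an error when the booted deployment is last or missing.
func rollbackDeploymentID(status statusOutput) (string, error) {
	if len(status.Deployments) == 0 {
		return "", fmt.Errorf("no deployments in rpm-ostree status output")
	}
	if len(status.Deployments) == 1 {
		return "", nil
	}
	afterBooted := false
	for _, d := range status.Deployments {
		if afterBooted {
			return d.ID, nil
		}
		if d.Booted {
			afterBooted = true
		}
	}
	return "", fmt.Errorf("could not find rollback deployment in %v", status)
}

func main() {
	// Hypothetical `rpm-ostree status --json` payload: the booted
	// deployment first, the rollback candidate second. IDs are made up.
	raw := `{"deployments":[
		{"id":"rhel-aaaa.0","booted":true},
		{"id":"rhel-bbbb.0","booted":false}
	]}`

	var status statusOutput
	if err := json.Unmarshal([]byte(raw), &status); err != nil {
		panic(err)
	}

	id, err := rollbackDeploymentID(status)
	fmt.Println(id, err) // expected output: rhel-bbbb.0 <nil>
}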