12 changes: 12 additions & 0 deletions pkg/admin/data/data_manager.go
@@ -171,6 +171,18 @@ func (dm *manager) Restore(name BackupName) error {
    return nil
}

func (dm *manager) RemoveData() error {
    klog.InfoS("Starting MicroShift data removal")

    err := os.RemoveAll(config.DataDir)
    if err != nil {
        return fmt.Errorf("failed to remove MicroShift data: %w", err)
    }

    klog.InfoS("Removed MicroShift data")
    return nil
}

func copyPath(src, dest string) error {
    cmd := exec.Command("cp", append(cpArgs, src, dest)...) //nolint:gosec
    klog.InfoS("Starting copy", "cmd", cmd)
2 changes: 2 additions & 0 deletions pkg/admin/data/types.go
@@ -23,4 +23,6 @@ type Manager interface {
    GetBackupPath(BackupName) string
    GetBackupList() ([]BackupName, error)
    RemoveBackup(BackupName) error

    RemoveData() error
}
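For illustration, a minimal sketch of how a caller might combine the new `RemoveData` with the existing listing methods. The `wipeIfNoBackups` helper is hypothetical and not part of this PR; it only assumes the `data.Manager` interface and string-based `data.BackupName` shown in this diff:

```go
package example

import (
	"fmt"
	"strings"

	"github.com/openshift/microshift/pkg/admin/data"
)

// wipeIfNoBackups is a hypothetical helper (not in the PR): it removes
// MicroShift data only when no backup exists for the given deployment ID.
func wipeIfNoBackups(dm data.Manager, deployID string) error {
	backups, err := dm.GetBackupList()
	if err != nil {
		return fmt.Errorf("failed to list backups: %w", err)
	}
	for _, b := range backups {
		// Backup names are prefixed with the deployment ID (see prerun.go).
		if strings.HasPrefix(string(b), deployID) {
			return nil // a backup exists, keep the data
		}
	}
	// Nothing to fall back to - wipe the data directory for a clean start.
	return dm.RemoveData()
}
```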
8 changes: 7 additions & 1 deletion pkg/admin/prerun/health.go
@@ -23,7 +23,13 @@ type HealthInfo struct {
}

func (hi *HealthInfo) BackupName() data.BackupName {
    return data.BackupName(fmt.Sprintf("%s_%s", hi.DeploymentID, hi.BootID))
    name := fmt.Sprintf("%s_%s", hi.DeploymentID, hi.BootID)

    if hi.IsHealthy() {
        return data.BackupName(name)
    }

    return data.BackupName(fmt.Sprintf("%s_unhealthy", name))
}

func (hi *HealthInfo) IsHealthy() bool {
36 changes: 36 additions & 0 deletions pkg/admin/prerun/health_test.go
@@ -0,0 +1,36 @@
package prerun

import (
    "testing"

    "github.com/openshift/microshift/pkg/admin/data"
    "github.com/stretchr/testify/assert"
)

func Test_BackupName(t *testing.T) {
    testData := []struct {
        healthInfo         HealthInfo
        expectedBackupName string
    }{
        {
            healthInfo: HealthInfo{
                Health:       "healthy",
                DeploymentID: "did",
                BootID:       "bid",
            },
            expectedBackupName: "did_bid",
        },
        {
            healthInfo: HealthInfo{
                Health:       "unhealthy",
                DeploymentID: "did",
                BootID:       "bid",
            },
            expectedBackupName: "did_bid_unhealthy",
        },
    }

    for _, td := range testData {
        assert.Equal(t, data.BackupName(td.expectedBackupName), td.healthInfo.BackupName())
    }
}
168 changes: 135 additions & 33 deletions pkg/admin/prerun/prerun.go
@@ -160,11 +160,6 @@ func (pr *PreRun) regularPrerun() error {
        return nil
    }

    // TODO: We may end up needing to split this if statement into
    // functions, but for now let's just tell the linter not to apply
    // the rule.
    //
    //nolint:nestif
    if health.IsHealthy() {
        klog.Info("Previous boot was healthy")
        if err := pr.backup(health); err != nil {
@@ -175,11 +170,12 @@
            klog.Info("Current and previously booted deployments are different")
            return pr.handleDeploymentSwitch(currentDeploymentID)
        }
    } else {
        klog.Info("Previous boot was not healthy")
        if err = pr.restore(); err != nil {
            return fmt.Errorf("failed to restore during pre-run: %w", err)
        }
        return nil
    }

    klog.Info("Previous boot was not healthy")
    if err = pr.handleUnhealthy(health); err != nil {
        return fmt.Errorf("failed to handle unhealthy data during pre-run: %w", err)
    }

    return nil
Expand Down Expand Up @@ -210,9 +206,10 @@ func (pr *PreRun) backup(health *HealthInfo) error {
// get list of already existing backups for deployment ID persisted in health file
// after creating backup, the list will be used to remove older backups
// (so only the most recent one for specific deployment is kept)
backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, health.DeploymentID)
allBackupsForDeployment := getBackupsForTheDeployment(existingBackups, health.DeploymentID)
healthyBackupsForDeployment := getOnlyHealthyBackups(allBackupsForDeployment)

if backupAlreadyExists(backupsForDeployment, newBackupName) {
if backupAlreadyExists(healthyBackupsForDeployment, newBackupName) {
klog.InfoS("Skipping backup: Backup already exists",
"deploymentID", health.DeploymentID,
"name", newBackupName,
@@ -224,7 +221,9 @@
        return fmt.Errorf("failed to create new backup %q: %w", newBackupName, err)
    }

    pr.removeOldBackups(backupsForDeployment)
    // if making a new backup, remove all old backups for the deployment
    // including unhealthy ones
    pr.removeOldBackups(allBackupsForDeployment)

    klog.InfoS("Finished backup",
        "deploymentID", health.DeploymentID,
@@ -233,9 +232,9 @@
    return nil
}

func (pr *PreRun) restore() error {
func (pr *PreRun) handleUnhealthy(health *HealthInfo) error {
    // TODO: Check if containers are already running (i.e. microshift.service was restarted)?
    klog.Info("Preparing to restore")
    klog.Info("Handling previously unhealthy system")

    currentDeploymentID, err := getCurrentDeploymentID()
    if err != nil {
@@ -250,24 +249,73 @@
        "currentDeploymentID", currentDeploymentID,
        "backups", existingBackups,
    )
    backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID)

    if len(backupsForDeployment) == 0 {
        return fmt.Errorf("there is no backup to restore for current deployment %q", currentDeploymentID)
    }
    allBackupsForDeployment := getBackupsForTheDeployment(existingBackups, currentDeploymentID)
    healthyBackupsForDeployment := getOnlyHealthyBackups(allBackupsForDeployment)

    if len(backupsForDeployment) > 1 {
    if len(healthyBackupsForDeployment) > 1 {
        // could happen during backing up when removing older backups failed
        klog.InfoS("TODO: more than 1 backup, need to pick most recent one")
    }

    err = pr.dataManager.Restore(backupsForDeployment[0])
    if len(healthyBackupsForDeployment) > 0 {
        err = pr.dataManager.Restore(healthyBackupsForDeployment[0])
        if err != nil {
            return fmt.Errorf("failed to restore backup: %w", err)
        }
        klog.Info("Finished handling unhealthy system")
        return nil
    }

    klog.InfoS("There is no backup to restore for current deployment - trying to restore backup for rollback deployment")
    rollbackDeployID, err := getRollbackDeploymentID()
    if err != nil {
        return fmt.Errorf("failed to restore backup: %w", err)
        return err
    }

    klog.Info("Finished restore")
    return nil
    if rollbackDeployID == "" {
        // No backup for current deployment and there is no rollback deployment.
        // This could be an unhealthy system that was manually rebooted to
        // remediate the situation - let's not interfere: no backup, no restore, just proceed.
        klog.InfoS("System has no rollback but health.json suggests system was rebooted - skipping prerun")
        return nil
    }

    klog.InfoS("Obtained rollback deployment",
        "rollback-deployment-id", rollbackDeployID,
        "current-deployment-id", currentDeploymentID,
        "health-deployment-id", health.DeploymentID)

    if health.DeploymentID == rollbackDeployID {
        return fmt.Errorf("deployment ID stored in health.json is the same as rollback's" +
            " - MicroShift should not be updated from unhealthy system")
    }

    if health.DeploymentID == currentDeploymentID {
        backupsForRollback := getBackupsForTheDeployment(existingBackups, rollbackDeployID)
        if len(backupsForRollback) == 0 {
            // This could happen if current deployment is unhealthy and rollback didn't run MicroShift
            klog.InfoS("There is no backup for rollback deployment as well - removing existing data for clean start")
            return pr.dataManager.RemoveData()
        }

        // There is no backup for current deployment, but there is a backup for the rollback.
        // Let's restore it and act like it's the first boot of a newly staged deployment
        klog.InfoS("Restoring backup for a rollback deployment to perform migration and try starting again",
"backup-name", backupsForRollback[0])
if err := pr.dataManager.Restore(backupsForRollback[0]); err != nil {
return fmt.Errorf("failed to restore backup: %w", err)
}
return nil
}

// DeployID in health.json is neither booted nor rollback deployment,
// so current deployment was staged over deployment without MicroShift
// but MicroShift data exists (created by another deployment that rolled back).
klog.InfoS("Deployment in health metadata is neither currently booted nor rollback deployment - backing up, then removing existing data for clean start")
if err := pr.backup(health); err != nil {
klog.ErrorS(err, "Failed to backup data of unhealthy system - ignoring")
}
return pr.dataManager.RemoveData()
}
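The branching above is easier to audit when condensed into a pure decision function. The sketch below is illustrative only (the `decide` function and its names are invented, not part of the PR); its inputs correspond to values the real code derives from health.json, the backup list, and rpm-ostree status, and its cases mirror `handleUnhealthy` one-to-one:

```go
package example

// action labels the possible outcomes of handleUnhealthy, for illustration.
type action string

const (
	restoreCurrentBackup  action = "restore healthy backup of booted deployment"
	skipPrerun            action = "no rollback deployment - leave data untouched"
	failUnhealthyUpdate   action = "error: do not update from an unhealthy system"
	restoreRollbackBackup action = "restore rollback's backup and retry migration"
	removeData            action = "no usable backup - remove data for clean start"
	backupThenRemoveData  action = "back up unknown deployment's data, then remove it"
)

// decide mirrors handleUnhealthy's control flow, checked top to bottom.
func decide(haveHealthyCurrentBackup bool, rollbackID, currentID, healthID string, haveRollbackBackup bool) action {
	switch {
	case haveHealthyCurrentBackup:
		return restoreCurrentBackup
	case rollbackID == "":
		return skipPrerun
	case healthID == rollbackID:
		return failUnhealthyUpdate
	case healthID == currentID && haveRollbackBackup:
		return restoreRollbackBackup
	case healthID == currentID:
		return removeData
	default:
		return backupThenRemoveData
	}
}
```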

func (pr *PreRun) handleDeploymentSwitch(currentDeploymentID string) error {
@@ -278,7 +326,7 @@ func (pr *PreRun) handleDeploymentSwitch(currentDeploymentID string) error {
    if err != nil {
        return fmt.Errorf("failed to determine the existing backups: %w", err)
    }
    backupsForDeployment := getExistingBackupsForTheDeployment(existingBackups, currentDeploymentID)
    backupsForDeployment := getBackupsForTheDeployment(existingBackups, currentDeploymentID)

    if len(backupsForDeployment) > 0 {
        klog.Info("Backup exists for current deployment - restoring")
@@ -343,16 +391,26 @@ func getCurrentBootID() (string, error) {
    return strings.ReplaceAll(strings.TrimSpace(string(content)), "-", ""), nil
}

func getExistingBackupsForTheDeployment(existingBackups []data.BackupName, deployID string) []data.BackupName {
    existingDeploymentBackups := make([]data.BackupName, 0)

    for _, existingBackup := range existingBackups {
        if strings.HasPrefix(string(existingBackup), deployID) {
            existingDeploymentBackups = append(existingDeploymentBackups, existingBackup)
func filterBackups(backups []data.BackupName, predicate func(data.BackupName) bool) []data.BackupName {
    out := make([]data.BackupName, 0, len(backups))
    for _, backup := range backups {
        if predicate(backup) {
            out = append(out, backup)
        }
    }
    return out
}

    return existingDeploymentBackups
func getOnlyHealthyBackups(backups []data.BackupName) []data.BackupName {
    return filterBackups(backups, func(bn data.BackupName) bool {
        return !strings.HasSuffix(string(bn), "unhealthy")
    })
}

func getBackupsForTheDeployment(backups []data.BackupName, deployID string) []data.BackupName {
    return filterBackups(backups, func(bn data.BackupName) bool {
        return strings.HasPrefix(string(bn), deployID)
    })
}

func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName) bool {
@@ -363,3 +421,47 @@ func backupAlreadyExists(existingBackups []data.BackupName, name data.BackupName) bool {
    }
    return false
}
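Combined with the `_unhealthy` suffix introduced in `health.go`, the two filters compose: first narrow by deployment ID, then drop backups of boots that ended unhealthy. The test below is illustrative only (it is not part of the PR), but it exercises the helpers exactly as the pre-run logic does:

```go
package prerun

import (
	"testing"

	"github.com/openshift/microshift/pkg/admin/data"
	"github.com/stretchr/testify/assert"
)

// Illustrative only (not in the PR): composing the backup filters.
func Test_FilterHelpersCompose(t *testing.T) {
	backups := []data.BackupName{
		"did1_bid1",
		"did1_bid2_unhealthy",
		"did2_bid3",
	}

	// Keep only backups whose name starts with the deployment ID.
	forDeployment := getBackupsForTheDeployment(backups, "did1")
	assert.Equal(t, []data.BackupName{"did1_bid1", "did1_bid2_unhealthy"}, forDeployment)

	// Then discard backups taken after an unhealthy boot.
	healthyOnly := getOnlyHealthyBackups(forDeployment)
	assert.Equal(t, []data.BackupName{"did1_bid1"}, healthyOnly)
}
```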

func getRollbackDeploymentID() (string, error) {
    cmd := exec.Command("rpm-ostree", "status", "--json")
    var stdout, stderr bytes.Buffer
    cmd.Stdout = &stdout
    cmd.Stderr = &stderr
    if err := cmd.Run(); err != nil {
        return "", fmt.Errorf("%q failed: %s: %w", cmd, stderr.String(), err)
    }

    type deploy struct {
        ID     string `json:"id"`
        Booted bool   `json:"booted"`
    }
    type statusOutput struct {
        Deployments []deploy `json:"deployments"`
    }

    var status statusOutput
    if err := json.Unmarshal(stdout.Bytes(), &status); err != nil {
        return "", fmt.Errorf("failed to unmarshal %q: %w", cmd, err)
    }

    if len(status.Deployments) == 0 {
        return "", fmt.Errorf("unexpected amount (0) of deployments from rpm-ostree status output")
    }

    if len(status.Deployments) == 1 {
        return "", nil
    }

    afterBooted := false
    for _, d := range status.Deployments {
        if afterBooted {
            return d.ID, nil
        }

        if d.Booted {
            afterBooted = true
        }
    }

    return "", fmt.Errorf("could not find rollback deployment in %v", status)
}
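`getRollbackDeploymentID` relies on `rpm-ostree status --json` listing deployments in boot order, so the entry immediately after the booted one is the rollback. A standalone sketch of the same scan follows; the JSON payload and deployment IDs are fabricated for illustration and do not come from this PR:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Fabricated example payload; a real one comes from `rpm-ostree status --json`.
const sample = `{
  "deployments": [
    {"id": "rhel-abc111", "booted": true},
    {"id": "rhel-def222", "booted": false}
  ]
}`

func main() {
	var status struct {
		Deployments []struct {
			ID     string `json:"id"`
			Booted bool   `json:"booted"`
		} `json:"deployments"`
	}
	if err := json.Unmarshal([]byte(sample), &status); err != nil {
		panic(err)
	}

	// Same scan as the PR: report the deployment that follows the booted one.
	afterBooted := false
	for _, d := range status.Deployments {
		if afterBooted {
			fmt.Println("rollback deployment:", d.ID) // prints rhel-def222
			return
		}
		if d.Booted {
			afterBooted = true
		}
	}
	fmt.Println("no rollback deployment found")
}
```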
18 changes: 18 additions & 0 deletions test/resources/libostree.py
@@ -142,3 +142,21 @@ def cleanup_rpm_ostree() -> None:

def create_agent_config(cfg: str) -> None:
    remote_sudo(f"echo '{cfg}' | sudo tee /var/lib/microshift-test-agent.json")


def write_greenboot_microshift_wait_timeout(seconds: int) -> None:
    remote_sudo(
        f"echo 'MICROSHIFT_WAIT_TIMEOUT_SEC={seconds}' | sudo tee /etc/greenboot/greenboot.conf"
    )


def remove_greenboot_microshift_wait_timeout() -> None:
    remote_sudo("rm /etc/greenboot/greenboot.conf")


def no_transaction_in_progress() -> None:
    stdout = remote_sudo("rpm-ostree status --json")
    status = DataFormats.json_parse(stdout)
    key = "transaction"
    transaction_in_progress = key in status and status[key] is not None
    BuiltIn().should_not_be_true(transaction_in_progress)
30 changes: 29 additions & 1 deletion test/resources/ostree.resource
@@ -35,12 +35,15 @@ Deploy Commit Expecting A Rollback
    ${initial_deploy_id}=    Get Booted Deployment Id
    ${deploy_id}=    Rebase System    ${ref}
    TestAgent.Write
    Write Greenboot Microshift Wait Timeout    60
    Reboot MicroShift Host

    Log To Console    "Failing ref deployed - waiting for system to roll back"
    Wait Until Keyword Succeeds    20m    15s
    ...    Current Deployment Should Be    ${initial_deploy_id}
    Log To Console    "System rolled back - deploying target ref"

    Log To Console    "System rolled back"
    Remove Greenboot Microshift Wait Timeout

Deploy Commit Not Expecting A Rollback
    [Documentation]    Deploys given ref and configures test agent for failing greenboot.
@@ -69,3 +72,28 @@ System Is Running Right Ref And Healthy
    Should Be Equal As Strings    ${expected_deploy}    ${current_deploy}
    Greenboot Health Check Exited
    System Should Be Healthy

Mask Grub Boot Success Timer
    [Documentation]    Effectively disable a timer that sets boot_success
    ...    to 1 two minutes after user login.
    ...    It impacts tests because grub's script to decrement boot_counter
    ...    works only if boot_success is 0 (see /etc/grub.d/08_fallback_counting),
    ...    resulting in more than the requested amount of redboot-induced reboots
    ...    and the system needing much more time to roll back.
    ${out}    ${err}    ${rc}=    Execute Command
    ...    systemctl --user mask grub-boot-success.timer
    ...    return_stdout=True
    ...    return_stderr=True
    ...    return_rc=True

    Should Be Equal As Integers    ${rc}    0

Wait For Transaction To End
    [Documentation]    Wait for any ostree transaction to end.
    ...    When grub boots the previous deployment due to greenboot failure,
    ...    ostree status is updated by greenboot running `rpm-ostree rollback`,
    ...    so the test must wait until that transaction is over before staging
    ...    a new deployment.

    Wait Until Keyword Succeeds    2m    15s
    ...    No Transaction In Progress
1 change: 1 addition & 0 deletions test/suites-ostree/fdo.robot
@@ -28,6 +28,7 @@ FIDO Onboarding Device
    ...    MicroShift data, unhealthy stored in health file,
    ...    and a deployment gap (no-microshift rollback).

    Mask Grub Boot Success Timer
    System Should Not Feature MicroShift
    TestAgent.Add Action For Next Deployment    every    fail_greenboot
    Deploy Commit Expecting A Rollback    ${FAILING_REF}