diff --git a/.github/workflows/smoke-tests.yaml b/.github/workflows/smoke-tests.yaml index 74515196..6eb2eb33 100644 --- a/.github/workflows/smoke-tests.yaml +++ b/.github/workflows/smoke-tests.yaml @@ -85,3 +85,20 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: make smoke-upgrade + + smoke-airgapped-multi-hop: + runs-on: ubuntu-latest + timeout-minutes: 205 + if: | + github.event_name == 'push' || + contains(github.event.pull_request.labels.*.name, 'smoke-airgapped-multi-hop') + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + - name: Run airgapped multi-hop upgrade smoke test + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + run: make smoke-airgapped-multi-hop diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..780eab33 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,161 @@ +# Mirantis Launchpad — Agent Instructions + +This file follows the [AGENTS.md](https://agents.md/) open standard and is read by Claude Code, Cursor, Windsurf, Codex, Gemini CLI, and compatible agents. Instructions here take precedence over general tool defaults. + +--- + +## Project Overview + +Launchpad is a Go CLI that installs, upgrades, and resets Mirantis Kubernetes Engine (MKE) and Mirantis Container Runtime (MCR) clusters on provisioned compute nodes. It is **stateless** between runs — all cluster state is discovered by querying hosts directly at the start of each operation. + +--- + +## Non-Negotiable Rules + +- **NEVER** commit to, push to, or merge into `main`. All work goes on feature branches. +- **All commits MUST be signed**: `git commit -s`. +- Use `GOTOOLCHAIN=auto` for all `go` commands (already set in `Makefile` via `export`). +- Read `docs/guidance/project.md` before implementing features or bug fixes. 
+- **NEVER** auto-generate this file or any documentation file. Write it from actual project knowledge. + +--- + +## Build & Test + +```bash +# Build (current platform → dist/launchpad_GOOS_GOARCH) +make local + +# Lint & security +make lint # golangci-lint run +make security-scan # govulncheck ./... + +# Unit tests — require --tags 'testing' build tag +make unit-test +go test -v --tags 'testing' ./pkg/... +go test -v --tags 'testing' ./pkg/config/... -run TestFoo # single test + +# Functional & integration +make functional-test # test/functional/ — component level, may need network +make integration-test # test/integration/ — requires real nodes + +# Smoke tests — require AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY +make smoke-modern # RHEL9/Ubuntu24/Rocky9, MCR stable-29.2, MKE 3.9.2 (50m) +make smoke-legacy # RHEL8/Rocky8/Ubuntu22, MCR stable-25.0, MKE 3.8.8 (50m) +make smoke-windows # Ubuntu24 mgr + Win2019/2022/2025, MCR stable-25.0 (60m) +make smoke-upgrade # Install 3.8.8 → upgrade to 3.9.2, same infra (90m) +``` + +--- + +## Architecture: Phase Manager Pattern + +All operations (apply, reset, describe) are ordered sequences of **phases** run by `phase.Manager`. This is the central pattern — new features belong in a new phase or an addition to an existing one. + +``` +cmd/apply.go + └── pkg/product/mke/mke.go (Apply) + └── phase.Manager.Run() + └── [Phase1, Phase2, ..., PhaseN] sequential +``` + +Each phase implements `Run() error` and `Title() string`. Optional: `Prepare(config)`, `ShouldRun()`, `CleanUp()`, `DisableCleanup()`. 
+ +**Key packages:** + +| Package | Role | +|---|---| +| `pkg/phase/` | Phase Manager orchestration | +| `pkg/product/mke/` | Apply / reset / describe entry points | +| `pkg/product/mke/phase/` | 30+ phase implementations | +| `pkg/product/mke/config/` | Config structs: `ClusterConfig`, `Host`, `Hosts`, `MCRConfig`, `MKEConfig`, `MSRConfig` | +| `pkg/config/` | YAML parsing, schema migrations v1–v16 | +| `pkg/configurer/` | OS-specific MCR install/upgrade (EL, Ubuntu, SLES, Windows) | +| `pkg/mcr/` | MCR runtime helpers (version detect, ensure-running) | +| `pkg/swarm/` | Docker Swarm helpers (node ID, cluster ID) | +| `pkg/kubeclient/` | Kubernetes client | +| `pkg/docker/` | Image handling and auth | +| `pkg/analytics/` | Segment telemetry | + +--- + +## Configuration Schema + +Current: `apiVersion: launchpad.mirantis.com/mke/v1.6` + +MCR is selected by **channel** (e.g. `stable-29.2`), not by specific version number. Migrations for older schemas live in `pkg/config/migration/` and run automatically on load. + +```yaml +apiVersion: launchpad.mirantis.com/mke/v1.6 +kind: mke +metadata: + name: my-cluster +spec: + hosts: + - role: manager + ssh: + address: 1.2.3.4 + user: ubuntu + keyPath: ~/.ssh/id_rsa + mcr: + channel: stable-29.2 + mke: + version: 3.9.2 + adminUsername: admin + adminPassword: secret +``` + +If you change the config schema: bump `apiVersion`, add a migration in `pkg/config/migration/`, add unit tests for it. + +--- + +## Smoke Tests + +Smoke tests (`test/smoke/`) use [Terratest](https://terratest.gruntwork.io/) to provision real AWS infrastructure via `examples/terraform/aws-simple/`, run the full Launchpad lifecycle, and destroy everything unconditionally via `defer terraform.Destroy`. All resources are tagged `launchpad-smoke-test: true`. 
+ +| Make target | Test function | Timeout | What it tests | +|---|---|---|---| +| `smoke-modern` | `TestModernCluster` | 50m | Install on RHEL9/Ubuntu24/Rocky9 | +| `smoke-legacy` | `TestLegacyCluster` | 50m | Install on RHEL8/Rocky8/Ubuntu22 | +| `smoke-windows` | `TestWindowsCluster` | 60m | Install with Windows 2019/2022/2025 workers | +| `smoke-upgrade` | `TestUpgradeLegacyToModern` | 90m | Install 3.8.8 then upgrade to 3.9.2 in place | +| `smoke-airgapped-multi-hop` | `TestAirgappedMultiHopUpgrade` | 200m | Multi-step upgrade through internal MSR (port 4443) with imageRepo override | + +CI jobs are gated by PR labels: `smoke-test` (all jobs except `smoke-airgapped-multi-hop`), or individual labels `smoke-modern`, `smoke-legacy`, `smoke-windows`, `smoke-upgrade`, `smoke-airgapped-multi-hop`. + +**To add a new smoke test**, read `docs/development/smoke-tests.md` — it documents the full framework: available platforms, helper functions, how to write install/reset and upgrade tests, CI wiring, and a pre-submission checklist. + +--- + +## Contributing + +- Feature branches only — never `main`. +- Signed commits: `git commit -s`. +- New functionality → new phase, not inline logic. +- Run `make lint` and `make unit-test` before opening a PR. +- PR description must explain trade-offs and link any relevant Jira ticket (PRODENG-XXXX). + +--- + +## Collaborative / Multi-Engineer Workflows + +When multiple engineers or agents work on the same initiative: + +- Communicate before modifying shared files (smoke tests, Terraform examples, CI workflow). +- Prefer additive changes (new test functions, new phases) to reduce merge conflicts. +- Use a separate file per concern in `test/smoke/` (e.g. `upgrade_test.go`) rather than growing `smoke_test.go`. +- Coordinate PR labels to avoid running expensive smoke jobs unnecessarily. 
+- Tag all AWS resources with `launchpad-smoke-test: true` and a descriptive `launchpad-smoke-test-name` value so each engineer's resources are identifiable in the console. + +--- + +## Documentation + +| File | Purpose | +|---|---| +| `docs/guidance/project.md` | Core architectural principles | +| `docs/specifications/architecture.md` | Phase Manager, apply/reset sequences, design decisions | +| `docs/development/workflow.md` | Build, test, and contribution workflow | +| `docs/requirements/launchpad-prd.md` | Product requirements | +| `docs/usage/getting-started.md` | User-facing getting started guide | +| `docs/development/smoke-tests.md` | How to write new smoke tests (framework, platforms, CI wiring) | diff --git a/AI_AGENTS.md b/AI_AGENTS.md deleted file mode 100644 index 757ea988..00000000 --- a/AI_AGENTS.md +++ /dev/null @@ -1,57 +0,0 @@ -# AI Agent Guidelines for Mirantis Launchpad - -This document defines guidelines for AI agents working on the Launchpad project. These instructions take precedence over general tool defaults. - ---- - -## Core Principles - -1. **Generalization**: Support all AI agents, not just specific providers. -2. **Consistency**: Follow Launchpad’s architectural patterns and conventions. -3. **Clarity**: Document decisions and reasoning for human reviewers. -4. **Branching**: NEVER work on, push to, or merge to the `main` branch. All work MUST be done on feature branches. - ---- - -## Documentation Structure - -Documentation is organized to minimize context overhead for agents: -- `docs/guidance/`: Foundational principles and project vision. -- `docs/requirements/`: High-level product requirements (PRDs). -- `docs/specifications/`: Technical specifications, architecture, and design. -- `docs/development/`: Development workflows, building, and testing. -- `docs/usage/`: User-facing documentation. - ---- - -## Agent Workflow - -### 1. Research Phase -- Read `docs/guidance/project.md` to understand core architectural principles. 
-- For feature requests, review the relevant PRD in `docs/requirements/`. -- For bug fixes, review the relevant specification in `docs/specifications/`. -- **Environment**: Ensure `GOTOOLCHAIN=auto` is used for all `go` commands to support the required toolchain. - -### 2. Strategy Phase -- **ALWAYS** create a new feature branch from `main`. -- Propose changes that align with the **Phase Manager** architecture (`docs/specifications/architecture.md`). -- Ensure changes are backwards compatible or include migrations (`docs/specifications/architecture.md`). -- Document trade-offs and decisions in the PR description. - -### 3. Execution Phase -- **NEVER** push directly to `main`. -- **NEVER** merge into `main`. -- All work MUST be pushed to a remote feature branch for human review. -- All commits MUST be signed. -- Follow the testing strategy outlined in `docs/development/workflow.md`. -- Use `make local` for rapid iteration and validation. - ---- - -## Technical Constraints - -- **Language**: Go (Golang). -- **Core Library**: [k0sproject Rig](https://github.com/k0sproject/rig) for host management. -- **Build System**: Goreleaser (invoked via `Makefile`). -- **Telemetry**: Maintain existing telemetry patterns for installation, upgrades, and errors. -- **State**: Launchpad is stateless between runs; use phases for discovery. diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 29c7779e..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1,107 +0,0 @@ -# CLAUDE.md - -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - -## Overview - -Mirantis Launchpad is a CLI tool that installs, upgrades, and resets Mirantis Kubernetes Engine (MKE) and Mirantis Container Runtime (MCR) clusters on provisioned compute nodes. It is **stateless** between runs — all cluster state is discovered by querying hosts directly. - -## Agent Rules (from AI_AGENTS.md) - -- **NEVER** work on, push to, or merge into `main`. 
All work goes on feature branches. -- **All commits MUST be signed** (`git commit -s`). -- Use `GOTOOLCHAIN=auto` for all `go` commands (already set in Makefile via `export`). -- Read `docs/guidance/project.md` before implementing features or bug fixes. - -## Commands - -```bash -# Build -make local # Build for current platform → dist/launchpad_GOOS_GOARCH - -# Lint & security -make lint # golangci-lint run -make security-scan # govulncheck ./... - -# Tests -make unit-test # go test -v --tags 'testing' ./pkg/... -make functional-test # go test -v ./test/functional/... -timeout 20m -make integration-test # go test -v ./test/integration/... -timeout 20m -make smoke-small # E2E small cluster (20m timeout) -make smoke-full # E2E full matrix cluster (50m timeout) - -# Run a single test package -go test -v --tags 'testing' ./pkg/config/... - -# Run a single test -go test -v --tags 'testing' ./pkg/config/... -run TestFoo -``` - -The build tag `testing` is required for unit tests in `pkg/`. - -## Architecture: Phase Manager Pattern - -All major operations (apply, reset, describe) are implemented as ordered sequences of **phases** executed by a `phase.Manager`. This is the central architectural pattern — new features should be implemented as new phases or additions to existing ones. 
- -``` -cmd/apply.go - └── product/mke/mke.go (Apply method) - └── phase.Manager.Run() - └── [Phase1, Phase2, ..., PhaseN] executed sequentially -``` - -Each phase implements: -- `Run() error` — required -- `Title() string` — required -- Optional: `Prepare(config)`, `ShouldRun()`, `CleanUp()`, `DisableCleanup()` - -**Key packages:** - -| Package | Role | -|---|---| -| `pkg/phase/` | Phase Manager orchestration | -| `pkg/product/mke/` | MKE apply/reset/describe logic | -| `pkg/product/mke/phase/` | 30+ phase implementations | -| `pkg/product/mke/config/` | MKE-specific config structs | -| `pkg/config/` | YAML config parsing, schema migrations (v1–v15) | -| `pkg/configurer/` | OS/distro-specific host configuration | -| `pkg/kubeclient/` | Kubernetes client operations | -| `pkg/helm/` | Helm chart management | -| `pkg/docker/` | Docker image handling and auth | -| `pkg/analytics/` | Segment telemetry | - -## Configuration - -- Default config file: `launchpad.yaml` (override with `--config`) -- Supports `--config -` to read from stdin -- Environment variable substitution via `envsubst` -- Schema migrations live in `pkg/config/migration/` — required for any config struct changes -- Config migrations must be backward compatible; each migration version has unit tests - -## Host Management - -Hosts are managed via [k0sproject/rig](https://github.com/k0sproject/rig), which abstracts SSH (Linux) and WinRM (Windows) connections. Phases receive a configured host set and use rig for remote command execution, file upload/download, and shell quoting. - -## Testing Strategy - -| Type | Location | Notes | -|---|---|---| -| Unit | `pkg/**/*_test.go` | Requires `--tags 'testing'` build tag | -| Functional | `test/functional/` | Component-level, may need network | -| Integration | `test/integration/` | Requires real provisioned nodes | -| Smoke | `test/smoke/` | Full E2E via Terraform (terratest) | - -## Linting - -`.golangci.yml` enables 30+ linters. 
Notable constraints: -- `varnamelen`: max 10 chars (allowlist includes `i`, `h`, `ok`, `id`, etc.) -- Package names may conflict with stdlib (log, version, user, constant) — these are excluded from the relevant linter -- Generated files (`*.gen.go`) are excluded - -## Documentation - -Consult these before implementing non-trivial changes: -- `docs/guidance/project.md` — core architectural principles -- `docs/specifications/architecture.md` — Phase Manager and design decisions -- `docs/development/workflow.md` — contribution and testing workflow -- `docs/requirements/` — PRDs for planned features diff --git a/Makefile b/Makefile index 8eb84ba7..cf233d1e 100644 --- a/Makefile +++ b/Makefile @@ -66,6 +66,11 @@ smoke-windows: .PHONY: smoke-upgrade smoke-upgrade: go test -count=1 -v ./test/smoke/... -run TestUpgrade -timeout 90m + +.PHONY: smoke-airgapped-multi-hop +smoke-airgapped-multi-hop: + go test -count=1 -v ./test/smoke/... -run TestAirgappedMultiHopUpgrade -timeout 200m + .PHONY: clean-launchpad-chart clean-launchpad-chart: terraform -chdir=./examples/tf-aws/launchpad apply --auto-approve --destroy diff --git a/docs/development/smoke-tests.md b/docs/development/smoke-tests.md new file mode 100644 index 00000000..b838eab8 --- /dev/null +++ b/docs/development/smoke-tests.md @@ -0,0 +1,253 @@ +# Writing Smoke Tests + +Smoke tests live in `test/smoke/` and use [Terratest](https://terratest.gruntwork.io/) to provision real AWS infrastructure, run the full Launchpad lifecycle, and tear down — all within a single `go test` invocation. + +Read this document before adding a new test. The framework enforces several invariants that must be preserved. + +--- + +## How the framework works + +### Infrastructure + +All tests share a single Terraform module: `examples/terraform/aws-simple/`. Tests pass variables to it; the module provisions VPCs, subnets, security groups, EC2 instances, an NLB, IAM roles, and SSH key pairs, then outputs a ready-to-use `launchpad_yaml` string. 
+ +The Terraform state is ephemeral — it lives in a temp directory for the duration of the test and is never committed. + +### Lifecycle + +Every test follows this sequence: + +``` +terraform init + apply → provision AWS infra + ↓ +config.ProductFromYAML → parse Terraform's launchpad_yaml output + ↓ +product.Apply(...) → install (and optionally upgrade) MKE/MCR via Launchpad + ↓ +product.Reset() → uninstall (best-effort, non-fatal — see below) + ↓ +terraform destroy → unconditional, runs via defer even on t.Fatal +``` + +`defer terraform.Destroy` is registered **before** `InitAndApply`, so infrastructure is always cleaned up even when apply or the test itself fails. + +### Resource tagging + +Every test must tag all AWS resources so they can be tracked and audited independently of Terraform state: + +```go +"extra_tags": map[string]string{ + "launchpad-smoke-test": "true", + "launchpad-smoke-test-name": cfg.Name, // e.g. "modern", "legacy", "upgrade" +}, +``` + +--- + +## Available platforms + +Platforms are defined in `test/platforms.go` as `test.Platform` values in the `test.Platforms` map. Each platform knows its Terraform platform key, instance sizing, firewall `user_data`, and how to produce a manager, worker, or MSR nodegroup map. 
+ +| Key | Terraform platform | OS | +|---|---|---| +| `Ubuntu20` | `ubuntu_20.04` | Ubuntu 20.04 | +| `Ubuntu22` | `ubuntu_22.04` | Ubuntu 22.04 | +| `Ubuntu24` | `ubuntu_24.04` | Ubuntu 24.04 | +| `Rhel8` | `rhel_8` | RHEL 8 | +| `Rhel9` | `rhel_9` | RHEL 9 | +| `Rocky8` | `rocky_8` | Rocky Linux 8 | +| `Rocky9` | `rocky_9` | Rocky Linux 9 | +| `Sles12` | `sles_12` | SLES 12 | +| `Sles15` | `sles_15` | SLES 15 | +| `Oracle9` | `oracle_9` | Oracle Linux 9 | +| `Centos7` | `centos_7` | CentOS 7 | +| `Windows2019` | `windows_2019` | Windows Server 2019 | +| `Windows2022` | `windows_2022` | Windows Server 2022 | +| `Windows2025` | `windows_2025` | Windows Server 2025 | + +Each platform exposes three methods: + +```go +test.Platforms["Rhel9"].GetManager() // m6a.2xlarge, role=manager +test.Platforms["Rhel9"].GetWorker() // c6a.xlarge, role=worker +test.Platforms["Rhel9"].GetMSR() // m6a.2xlarge, role=msr +``` + +To add a new platform, add an entry to `test/platforms.go`. For platforms not supported by the upstream Terraform module (currently `ubuntu_24.04` and `windows_2025`), also add a local definition in `examples/terraform/aws-simple/platform.tf` under `lib_local_platform_definitions`. + +--- + +## Writing an install/reset test + +Use `runSmokeTest` (defined in `test/smoke/smoke_test.go`). Fill in a `smokeConfig` and call it from a `Test*` function. + +```go +// test/smoke/my_test.go +package smoke_test + +import ( + "testing" + "github.com/Mirantis/launchpad/test" +) + +func TestMyCluster(t *testing.T) { + runSmokeTest(t, smokeConfig{ + // Name is used in the AWS resource name and launchpad-smoke-test-name tag. + // Keep it short and lowercase (it forms part of resource names like + // "smoke-mytest-XXXXX-MngrRhel9-0"). + Name: "mytest", + + // MCR channel, e.g. "stable-29.2", "stable-25.0". + MCRChannel: "stable-29.2", + + // MKE and MSR versions. + MKEVersion: "3.9.2", + MSRVersion: "3.1.18", + + // SSH key algorithm: "ed25519" (default) or "rsa". 
+ // Use "rsa" only when the cluster includes Windows nodes — + // RSA is required for AWS Windows password retrieval. + SSHKeyAlgorithm: "ed25519", + + // Nodegroups: map of unique nodegroup name → platform nodegroup map. + // Naming convention: prefix with role (Mngr/Wrk/Msr) + platform, + // e.g. "MngrRhel9", "WrkUbuntu24", "MsrRhel9". + // Names must be unique within the test. + Nodegroups: map[string]interface{}{ + "MngrRhel9": test.Platforms["Rhel9"].GetManager(), + "WrkUbuntu24": test.Platforms["Ubuntu24"].GetWorker(), + "WrkRocky9": test.Platforms["Rocky9"].GetWorker(), + }, + }) +} +``` + +`runSmokeTest` handles everything: infra provisioning, password generation, tagging, `defer` destroy, `Apply`, and best-effort `Reset`. + +### Windows clusters + +Include at least one Linux manager. Pass `SSHKeyAlgorithm: "rsa"` — RSA 4096-bit keys are required for AWS's encrypted Windows password mechanism. The framework auto-detects Windows nodegroups (by `platform` prefix `windows_`) and generates a compliant password via `generateWindowsPassword`. + +```go +func TestMyWindowsCluster(t *testing.T) { + runSmokeTest(t, smokeConfig{ + Name: "mywindows", + MCRChannel: "stable-25.0", + MKEVersion: "3.8.8", + MSRVersion: "2.9.28", + SSHKeyAlgorithm: "rsa", // required for Windows + Nodegroups: map[string]interface{}{ + "MngrUbuntu24": test.Platforms["Ubuntu24"].GetManager(), + "WrkWin2022": test.Platforms["Windows2022"].GetWorker(), + "WrkWin2025": test.Platforms["Windows2025"].GetWorker(), + }, + }) +} +``` + +--- + +## Writing an upgrade test + +Use `runUpgradeTest` (defined in `test/smoke/upgrade_test.go`). It provisions infra once, installs the base versions, then calls `Apply` a second time with mutated versions to trigger the upgrade. 
+ +```go +func TestMyUpgrade(t *testing.T) { + runUpgradeTest(t, upgradeConfig{ + Base: smokeConfig{ + Name: "myupgrade", + MCRChannel: "stable-25.0", + MKEVersion: "3.8.8", + MSRVersion: "2.9.28", + SSHKeyAlgorithm: "ed25519", + Nodegroups: map[string]interface{}{ + "MngrUbuntu22": test.Platforms["Ubuntu22"].GetManager(), + "WrkRhel8": test.Platforms["Rhel8"].GetWorker(), + }, + }, + UpgradeMCRChannel: "stable-29.2", + UpgradeMKEVersion: "3.9.2", + }) +} +``` + +`bumpVersions` (in `upgrade_test.go`) handles the YAML mutation between the two `Apply` calls: it unmarshals the Terraform output, updates `spec.mcr.channel` and `spec.mke.version`, and re-marshals — preserving host addresses, SANs, LB names, and all install flags exactly as Terraform generated them. + +If you need to also change `spec.msr.version` during upgrade, extend `bumpVersions` or add a new mutator alongside it. + +--- + +## Hooking into CI + +### 1. Add a Makefile target + +```makefile +.PHONY: smoke-mytest +smoke-mytest: + go test -count=1 -v ./test/smoke/... -run TestMyCluster -timeout 50m +``` + +Timeout guidance: +- Install-only tests: **50m** +- Windows tests: **60m** (WinRM setup and Windows image pull are slower) +- Upgrade tests: **90m** (two full apply cycles) +- Multi-hop upgrade tests with image preloading: **200m+** (N apply cycles plus per-node image pulls across all upgrade versions) + +### 2. Add a CI job + +Add a job to `.github/workflows/smoke-tests.yaml` following the existing pattern: + +```yaml +smoke-mytest: + runs-on: ubuntu-latest + if: | + github.event_name == 'push' || + contains(github.event.pull_request.labels.*.name, 'smoke-test') || + contains(github.event.pull_request.labels.*.name, 'smoke-mytest') + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 + - name: Run my smoke test + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + run: make smoke-mytest +``` + +### 3. 
Create the PR label + +```bash +gh label create smoke-mytest --repo Mirantis/launchpad --color "0075ca" +``` + +Then add the label to your PR to trigger only that job, or add `smoke-test` to trigger every smoke job whose `if:` condition lists the `smoke-test` label (`smoke-airgapped-multi-hop` does not, so it runs only on push or with its own label). + +--- + +## Reset and cleanup behaviour + +`product.Reset()` is called after `Apply` but is treated as **best-effort** and non-fatal. The MKE uninstall bootstrapper has a hardcoded node-response timeout (~2 minutes) that can fire on large or mixed-OS clusters before all uninstall agents report back. Because `defer terraform.Destroy` runs unconditionally, no AWS resources are orphaned if Reset fails. + +```go +// Pattern used in all smoke tests — do not assert on Reset. +if err = product.Reset(); err != nil { + t.Logf("WARN: product.Reset() failed (non-fatal): %v", err) +} +``` + +If the MKE uninstall bootstrapper times out during `Reset`, Launchpad falls back to a forced swarm dissolution (see `pkg/product/mke/phase/uninstall_mke.go`). Do not change this to a hard assertion without first confirming that `Reset` is reliable for your cluster size and platform. + +--- + +## Checklist for new smoke tests + +- [ ] Test function name starts with `Test` and is descriptive (`TestMyCluster`, not `TestIt`). +- [ ] `smokeConfig.Name` is short, lowercase, and unique across all tests. +- [ ] `SSHKeyAlgorithm` is `"rsa"` if any Windows nodegroups are present. +- [ ] `extra_tags` includes `launchpad-smoke-test: true` and `launchpad-smoke-test-name` — handled automatically by `runSmokeTest`/`runUpgradeTest`. +- [ ] `defer terraform.Destroy` is registered before `InitAndApply` — handled automatically by helpers. +- [ ] Makefile target added with an appropriate timeout. +- [ ] CI job added to `.github/workflows/smoke-tests.yaml`. +- [ ] PR label created (`gh label create smoke-mytest`, substituting your own label name). +- [ ] New platform? Add to `test/platforms.go` and, if needed, `examples/terraform/aws-simple/platform.tf`. 
diff --git a/docs/development/workflow.md b/docs/development/workflow.md index 96ca1f31..beeaa524 100644 --- a/docs/development/workflow.md +++ b/docs/development/workflow.md @@ -4,37 +4,80 @@ This guide covers building, testing, and contributing to the Launchpad codebase. ## Building Locally -Launchpad uses `goreleaser` for production builds, but provides a `Makefile` for local development. +```bash +make local # Build for the current platform → dist/launchpad_GOOS_GOARCH +``` -- **`make local`**: Builds a single, platform-specific binary for rapid testing. -- **`make build-release`**: Performs a full production build, requiring a clean repository and release tag. -- **`make sign-release`**: Signs the Windows binary (requires specific environment variables). +The binary version and commit hash are injected at build time via `-ldflags`. Production release builds are handled by Goreleaser in CI. ## Testing Strategy Launchpad's system-centric nature requires a layered testing approach: -### 1. Unit and Functional Tests (`pkg/`, `test/functional/`) -- **Unit**: Small tests for individual functions or components. -- **Functional**: Tests that verify specific functional components. -- **Run**: `go test ./pkg/...` and `go test ./test/functional/...`. +### 1. Unit Tests (`pkg/`) -### 2. Integration Tests (`test/integration/`) -- **Focus**: Verifies functional elements on actual clusters provisioned by the test suite. -- **Run**: `go test ./test/integration/...`. +Tests for individual functions and components. Require the `testing` build tag. -### 3. Smoke Tests (`test/smoke/`) -- **Focus**: End-to-end command testing (`apply`, `reset`, etc.) using a real compute cluster. -- **Smoke Small**: Provision a small number of machines. -- **Smoke Large**: Provision a large and varied cluster. -- **Run**: `go test ./test/smoke/...`. +```bash +make unit-test +# or directly: +go test -v --tags 'testing' ./pkg/... + +# Single package: +go test -v --tags 'testing' ./pkg/config/... 
+ +# Single test: +go test -v --tags 'testing' ./pkg/config/... -run TestFoo +``` + +### 2. Functional Tests (`test/functional/`) + +Component-level tests that may require network access but do not need a live cluster. + +```bash +make functional-test +``` + +### 3. Integration Tests (`test/integration/`) + +Verify behaviour against real provisioned nodes. + +```bash +make integration-test +``` + +### 4. Smoke Tests (`test/smoke/`) + +Full end-to-end tests using [Terratest](https://terratest.gruntwork.io/). Each test provisions real AWS infrastructure via `examples/terraform/aws-simple/`, runs the complete Launchpad lifecycle (install → reset), and destroys all infrastructure unconditionally via `defer terraform.Destroy`. + +Require `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. + +| Make target | Test | Timeout | Description | +|---|---|---|---| +| `smoke-modern` | `TestModernCluster` | 50m | RHEL9/Ubuntu24/Rocky9, MCR stable-29.2, MKE 3.9.2 | +| `smoke-legacy` | `TestLegacyCluster` | 50m | RHEL8/Rocky8/Ubuntu22, MCR stable-25.0, MKE 3.8.8 | +| `smoke-windows` | `TestWindowsCluster` | 60m | Ubuntu24 manager + Windows 2019/2022/2025 workers | +| `smoke-upgrade` | `TestUpgradeLegacyToModern` | 90m | Install MCR stable-25.0/MKE 3.8.8, upgrade to stable-29.2/MKE 3.9.2 | +| `smoke-airgapped-multi-hop` | `TestAirgappedMultiHopUpgrade` | 200m | Multi-step upgrade with internal MSR on port 4443 and imageRepo override | + +```bash +# Run a specific smoke test +make smoke-modern +make smoke-upgrade +make smoke-airgapped-multi-hop +``` + +All smoke-test AWS resources are tagged `launchpad-smoke-test: true` for cost tracking. CI smoke jobs are gated by PR labels (`smoke-test`, `smoke-modern`, `smoke-legacy`, `smoke-windows`, `smoke-upgrade`, `smoke-airgapped-multi-hop`). ## Contributing Principles - **Signed Commits**: All commits must be signed using `git commit -s`. +- **Feature Branches**: Never commit directly to `main`. Always work on a feature branch and open a PR. 
- **Feature Options**: Make new features optional via configuration or command flags. - **Phase Integration**: Implement new functionality as phases whenever possible for reusability. - **Schema Safety**: Avoid changes to the configuration syntax. If a change is necessary: - - Bump the version. + - Bump the `apiVersion` (currently `launchpad.mirantis.com/mke/v1.6`). - Include an in-memory migration in `pkg/config/migration/`. -- **Linting**: Ensure all changes pass `golangci-lint`. + - Add unit tests for the migration. +- **Linting**: Ensure all changes pass `make lint` (`golangci-lint run`). +- **Security**: Run `make security-scan` (`govulncheck ./...`) before raising a PR. diff --git a/docs/specifications/architecture.md b/docs/specifications/architecture.md index 96a287c5..650c7504 100644 --- a/docs/specifications/architecture.md +++ b/docs/specifications/architecture.md @@ -1,43 +1,73 @@ # Technical Architecture Specification: Launchpad -This document outlines the internal architecture of Launchpad, emphasizing its stateless, phase-based execution model. +This document outlines the internal architecture of Launchpad, emphasising its stateless, phase-based execution model. ## Component Model ### Configuration Management (`pkg/config/`) - **YAML-driven**: Launchpad interprets a static configuration file (`launchpad.yaml` by default). +- **Current schema**: `apiVersion: launchpad.mirantis.com/mke/v1.6` - **Structure**: - - `hosts`: A list of compute nodes and their roles. - - `mke`: A configuration block specific to the Mirantis Kubernetes Engine (MKE) product. -- **Migrations**: Found in `pkg/config/migration/`, these transform older versions of the config into the current internal representation at runtime. + - `spec.hosts`: A list of compute nodes and their roles (manager, worker, msr). + - `spec.mcr`: MCR configuration — channel-based (e.g. `stable-29.2`), repo URL, Windows installer URL. 
+ - `spec.mke`: MKE configuration — version, image repo, admin credentials, install/upgrade flags. + - `spec.msr`: Optional MSR configuration — version, replica IDs, TLS. +- **Migrations**: Found in `pkg/config/migration/`, these transform older versions of the config into the current internal representation at runtime. Each migration is independently unit-tested. ### Host Management (`k0sproject Rig`) - **Role**: Rig manages the low-level connection to compute nodes (SSH for Linux, WinRM for Windows). - **Functionality**: Executing remote commands, uploading files, and managing node-level state. -- **Integration**: Launchpad passes host definitions directly to Rig. +- **Integration**: Launchpad passes host definitions directly to Rig; phases receive the connected host set. ### Phase Manager (`pkg/phase/`) -- **Concept**: All actions are organized into a sequence of **Phases**. -- **Execution**: The manager runs each phase in order, stopping if an error is encountered. -- **Reusability**: Phases are modular and can be reused across different commands (e.g., `apply` and `reset`). -- **Phase Logic**: Phases should ideally detect if they need to run rather than relying on external flags. +- **Concept**: All actions are organised into a sequence of **Phases**. +- **Execution**: The manager runs each phase in order, stopping if an error is encountered (unless cleanup is disabled). +- **Reusability**: Phases are modular and can be reused across different commands (e.g. `apply` and `reset`). +- **Phase logic**: Phases should detect whether they need to run via `ShouldRun()` rather than relying on external flags. +- **Phase interface**: + - `Run() error` — required + - `Title() string` — required + - Optional: `Prepare(config)`, `ShouldRun()`, `CleanUp()`, `DisableCleanup()` ### Product Support (`pkg/product/`) -- **`mke`**: The main supported product, covering the MCR (Mirantis Container Runtime), MKE, and MSR (Mirantis Secure Registry) stack. 
-- **Structs**: Product configurations and state structs are defined in `pkg/product/mke/api`. +- **`mke`**: The main supported product, covering MCR (Mirantis Container Runtime), MKE, and MSR (Mirantis Secure Registry). +- **Config structs**: Defined in `pkg/product/mke/config/` — `ClusterConfig`, `ClusterSpec`, `Host`, `Hosts`, `MCRConfig`, `MKEConfig`, `MSRConfig`. +- **OS configurers**: Distro-specific MCR install/upgrade logic lives in `pkg/configurer/` (EL, Ubuntu, SLES, Windows). Each configurer implements `InstallMCR`, `UpgradeMCR`, and related host-setup methods using the native package manager (yum/apt/zypper; PowerShell for Windows). ## Command Execution Flow -1. **Load Configuration**: Read and migrate the YAML file. -2. **Initialize Phases**: Instantiate the required phases for the requested command. +1. **Load Configuration**: Read and migrate the YAML file to the current schema version. +2. **Initialise Phases**: Instantiate the required phases for the requested command (apply, reset, describe). 3. **Execute Phases**: The Phase Manager runs the sequence, communicating with hosts via Rig. -4. **Finalize**: Generate logs and diagnostic reports. +4. **Finalise**: Emit logs, diagnostic output, and telemetry events. + +## Apply Phase Sequence (abridged) + +``` +UpgradeCheck → Connect → DetectOS → GatherFacts → ValidateFacts → PrepareHost +→ ConfigureMCR → InstallMCR → UpgradeMCR → InstallMCRLicense → RestartMCR +→ PullMKEImages → InitSwarm → InstallMKECerts → InstallMKE → UpgradeMKE +→ JoinManagers → JoinWorkers +→ InstallMSR → UpgradeMSR → JoinMSRReplicas +→ LabelNodes → RemoveNodes → Disconnect → Info +``` + +`InstallMCR` and `UpgradeMCR` are separate phases; `UpgradeMCR` skips hosts where MCR was just installed. `InstallMKE` and `UpgradeMKE` are likewise separate — `UpgradeMKE` is a no-op if the installed version matches the target. 
+ +## Reset Phase Sequence + +``` +Connect → DetectOS → GatherFacts → PrepareHost +→ UninstallMSR → UninstallMKE → UninstallMCR → CleanUp → Disconnect +``` + +`UninstallMKE` runs the `mirantis/ucp uninstall-ucp` bootstrapper. If that times out (a known failure mode on large or mixed-OS clusters where agent image pulls exhaust the hardcoded deadline), it falls back to a forced swarm dissolution: removing the stuck `ucp-uninstall-agent` service, then forcing all nodes to leave the swarm sequentially. ## Persistence and State - **Statelessness**: No persistent state is kept between runs. -- **Discovery**: Phases are responsible for identifying the current state of the cluster by querying the nodes directly. +- **Discovery**: The `GatherFacts` phase queries each node to determine installed MCR version, MKE installed state, swarm membership, and node ID. Subsequent phases use this metadata to decide whether to install, upgrade, or skip. diff --git a/examples/terraform/aws-simple/.terraform.lock.hcl b/examples/terraform/aws-simple/.terraform.lock.hcl index b67784b2..9808e498 100644 --- a/examples/terraform/aws-simple/.terraform.lock.hcl +++ b/examples/terraform/aws-simple/.terraform.lock.hcl @@ -3,8 +3,9 @@ provider "registry.terraform.io/hashicorp/aws" { version = "6.43.0" - constraints = ">= 6.28.0, >= 6.29.0" + constraints = ">= 6.28.0, >= 6.33.0" hashes = [ + "h1:/A3VpeGOhvutRSlGACfUKeBMFZa3CTSLIqvT+XH0364=", "h1:yvdZqdEHHDk0WWL8oiK8dLaouJMSVZRkxGLEhDZyFCo=", "zh:0fe91026ce8c5178781de6773531dcfcf5280ee139059dc5a0c046f1532cf389", "zh:114001f94c38db8702210eda643ec627fa1929a88f774e17db30bc172df6759e", @@ -27,6 +28,7 @@ provider "registry.terraform.io/hashicorp/aws" { provider "registry.terraform.io/hashicorp/local" { version = "2.8.0" hashes = [ + "h1:3jWHVwO5QUIS9V1NsK10ZzdpkK2ABuB4G+UIWrVeGp4=", "h1:KCuj8nPbNP/ofQrAoQIuQ3CP6k+ADpULvxr7dw2PrpM=", "zh:05f18164beab4a84753e5fedf463771ee0c6eca8e90346b8766f1e1c186dec1e", 
"zh:563a0702e3711e25ba8930120899b681378b50cbb957fd204b37745c7c9b5f40", @@ -47,6 +49,7 @@ provider "registry.terraform.io/hashicorp/time" { version = "0.13.1" hashes = [ "h1:+W+DMrVoVnoXo3f3M4W+OpZbkCrUn6PnqDF33D2Cuf0=", + "h1:ZT5ppCNIModqk3iOkVt5my8b8yBHmDpl663JtXAIRqM=", "zh:02cb9aab1002f0f2a94a4f85acec8893297dc75915f7404c165983f720a54b74", "zh:04429b2b31a492d19e5ecf999b116d396dac0b24bba0d0fb19ecaefe193fdb8f", "zh:26f8e51bb7c275c404ba6028c1b530312066009194db721a8427a7bc5cdbc83a", @@ -67,6 +70,7 @@ provider "registry.terraform.io/hashicorp/tls" { constraints = ">= 4.0.0" hashes = [ "h1:F5d6bQY8UlBo0D71Sv7CsV+3aZOFz0yeNF+vufog7h4=", + "h1:akFNuHwvrtnYMBofieoeXhPJDhYZzJVu/Q/BgZK2fgg=", "zh:0d1e7d07ac973b97fa228f46596c800de830820506ee145626f079dd6bbf8d8a", "zh:5c7e3d4348cb4861ab812973ef493814a4b224bdd3e9d534a7c8a7c992382b86", "zh:7c6d4a86cd7a4e9c1025c6b3a3a6a45dea202af85d870cddbab455fb1bd568ad", diff --git a/examples/terraform/aws-simple/launchpad.tf b/examples/terraform/aws-simple/launchpad.tf index 6b7d4732..893c6b15 100644 --- a/examples/terraform/aws-simple/launchpad.tf +++ b/examples/terraform/aws-simple/launchpad.tf @@ -53,8 +53,8 @@ locals { routes = { "msr" = { - port_incoming = 443 - port_target = 443 + port_incoming = var.msr_port + port_target = 443 # DTR always listens on 443 internally; msr_port is the external NLB port protocol = "TCP" } } @@ -188,7 +188,7 @@ spec: "replicaIDs": "sequential" installFlags: - "--ucp-insecure-tls" - - "--dtr-external-url=${local.MSR_URL}" + - "--dtr-external-url=${local.MSR_URL}${var.msr_port != 443 ? 
":${var.msr_port}" : ""}" %{endif} EOT diff --git a/examples/terraform/aws-simple/variables.tf b/examples/terraform/aws-simple/variables.tf index 333318ad..7efbc5d1 100644 --- a/examples/terraform/aws-simple/variables.tf +++ b/examples/terraform/aws-simple/variables.tf @@ -73,3 +73,9 @@ variable "ssh_pk_location" { type = string default = "" } + +variable "msr_port" { + description = "External port on which DTR/MSR is exposed via the NLB. The DTR replica always listens on 443 internally; this controls only the NLB's port_incoming and the --dtr-external-url flag. Use a non-standard port (e.g. 4443) to test airgap registry configurations." + type = number + default = 443 +} diff --git a/test/platforms.go b/test/platforms.go index 19eee0cd..2e54ad2e 100644 --- a/test/platforms.go +++ b/test/platforms.go @@ -73,7 +73,10 @@ var Platforms = map[string]Platform{ Count: 1, VolumeSize: "100", Public: true, - UserData: "sudo firewall-cmd --permanent --add-port=2377/tcp --add-port=7946/tcp --add-port=7946/udp --add-port=4789/udp --add-port=10250/tcp; sudo firewall-cmd --reload", + // Disable the container-tools module stream before MCR install. RHEL8 + // AppStream pulls in system runc as a container-selinux dependency; that + // package conflicts with Mirantis's containerd.io-runc at install time. + UserData: "sudo dnf module disable container-tools -y; sudo firewall-cmd --permanent --add-port=2377/tcp --add-port=7946/tcp --add-port=7946/udp --add-port=4789/udp --add-port=10250/tcp; sudo firewall-cmd --reload", }, "Centos7": { Name: "centos_7", @@ -108,7 +111,8 @@ var Platforms = map[string]Platform{ Count: 1, VolumeSize: "100", Public: true, - UserData: "sudo firewall-cmd --permanent --add-port=2377/tcp --add-port=7946/tcp --add-port=7946/udp --add-port=4789/udp --add-port=10250/tcp; sudo firewall-cmd --reload", + // Same AppStream container-tools conflict as RHEL8 — disable before MCR install. 
+ UserData: "sudo dnf module disable container-tools -y; sudo firewall-cmd --permanent --add-port=2377/tcp --add-port=7946/tcp --add-port=7946/udp --add-port=4789/udp --add-port=10250/tcp; sudo firewall-cmd --reload", }, "Rocky9": { Name: "rocky_9", diff --git a/test/smoke/airgapped_multi_hop_upgrade_test.go b/test/smoke/airgapped_multi_hop_upgrade_test.go new file mode 100644 index 00000000..78b219cb --- /dev/null +++ b/test/smoke/airgapped_multi_hop_upgrade_test.go @@ -0,0 +1,646 @@ +package smoke_test + +// Airgapped multi-hop upgrade smoke test — customer scenario. +// +// TestAirgappedMultiHopUpgrade provisions a cluster with an internal MSR/DTR +// (2.9.27) on port 4443, installs the baseline software, pre-loads all MKE +// and MSR upgrade images into DTR, then drives three sequential upgrades with +// mke.imageRepo and msr.imageRepo pointing to DTR throughout. +// +// Upgrade chain: +// +// install: MCR stable-25.0 / MKE 3.8.8 / MSR 2.9.27 (images from docker.io/mirantis) +// step 1: MCR stable-25.0 / MKE 3.8.11 (images from DTR :4443) +// step 2: MCR stable-29.2 / MKE 3.8.12 (images from DTR :4443) +// step 3: MCR stable-29.2 / MKE 3.9.2 (images from DTR :4443) +// +// What this test validates: +// - Launchpad correctly uses mke.imageRepo and msr.imageRepo when set to an +// internal registry address that includes a non-standard port. +// - The full 3.8.8 → 3.8.11 → 3.8.12 → 3.9.2 upgrade chain completes when +// all MKE bootstrapper images are resolved via Launchpad's imageRepo string +// pointing to a locally available (pre-tagged) registry prefix. +// - Launchpad accepts a non-standard MSR port (4443) in the imageRepo address +// without attempting any docker login/push/pull against DTR directly. +// +// What this test does NOT validate: +// - True network-level airgap. Manager and worker nodes can still reach the +// internet; the initial install uses docker.io/mirantis to bootstrap DTR +// itself (unavoidable chicken-and-egg). 
For full egress blocking after +// bootstrap, restrict the manager/worker security group in the Terraform +// module — the current module does not expose per-nodegroup egress rules. +// - MCR package airgap. MCR packages (RPMs/DEBs) are still pulled from +// repos.mirantis.com during the upgrade steps. Airgapping MCR requires a +// separate apt/yum mirror and updating spec.mcr.repoURL. +// +// Terraform note: +// The msr_port variable in examples/terraform/aws-simple/variables.tf +// defaults to 443, so all other smoke tests are unaffected. This test +// passes msr_port=4443 at runtime via terraform.Options.Vars — no file +// modification or reversion is required. + +import ( + "encoding/json" + "fmt" + "net/http" + "os/exec" + "regexp" + "sort" + "strings" + "testing" + "time" + + "github.com/Mirantis/launchpad/pkg/config" + "github.com/Mirantis/launchpad/test" + "github.com/gruntwork-io/terratest/modules/terraform" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "gopkg.in/yaml.v2" +) + +// upgradeStep is a single version hop in the multi-step upgrade chain. +type upgradeStep struct { + MCRChannel string + MKEVersion string +} + +// airgapUpgradeConfig pairs a base install with an ordered list of upgrade +// steps. An MSR nodegroup (role: msr) must be present in Base.Nodegroups +// because it acts as the internal image registry for all upgrade steps. +type airgapUpgradeConfig struct { + Base smokeConfig + Steps []upgradeStep +} + +// bumpVersionsAirgap updates spec.mcr.channel, spec.mke.version, and both +// imageRepo fields so that subsequent Apply calls pull images from the +// internal DTR (registryPrefix, e.g. "dtr.example.com:4443/admin") rather +// than docker.io/mirantis. +// +// spec.mcr.repoURL is intentionally left unchanged. MCR packages are +// installed from a Linux package repository (apt/yum), not a container +// registry. 
Airgapping MCR package installation requires a separate package +// mirror; see the file-level comment for details. +func bumpVersionsAirgap(yamlStr, mcrChannel, mkeVersion, registryPrefix string) (string, error) { + var doc map[interface{}]interface{} + if err := yaml.Unmarshal([]byte(yamlStr), &doc); err != nil { + return "", fmt.Errorf("unmarshal cluster YAML: %w", err) + } + + spec, ok := doc["spec"].(map[interface{}]interface{}) + if !ok { + return "", fmt.Errorf("cluster YAML missing spec") + } + + if mcr, ok := spec["mcr"].(map[interface{}]interface{}); ok { + mcr["channel"] = mcrChannel + } else { + spec["mcr"] = map[interface{}]interface{}{"channel": mcrChannel} + } + + if mke, ok := spec["mke"].(map[interface{}]interface{}); ok { + mke["version"] = mkeVersion + mke["imageRepo"] = registryPrefix + } else { + return "", fmt.Errorf("cluster YAML missing spec.mke") + } + + if msr, ok := spec["msr"].(map[interface{}]interface{}); ok { + msr["imageRepo"] = registryPrefix + } + + out, err := yaml.Marshal(doc) + if err != nil { + return "", fmt.Errorf("re-marshal upgraded YAML: %w", err) + } + return string(out), nil +} + +// clusterHost holds the SSH connection details for one node in the cluster. +type clusterHost struct { + addr string + user string + keyPath string +} + +// extractAllSSHHosts returns a clusterHost for every SSH-connected host in the +// launchpad YAML. Windows (WinRM) hosts are skipped. 
+func extractAllSSHHosts(yamlStr string) ([]clusterHost, error) { + var doc map[interface{}]interface{} + if err := yaml.Unmarshal([]byte(yamlStr), &doc); err != nil { + return nil, fmt.Errorf("unmarshal YAML: %w", err) + } + spec, ok := doc["spec"].(map[interface{}]interface{}) + if !ok { + return nil, fmt.Errorf("cluster YAML missing spec") + } + rawHosts, ok := spec["hosts"].([]interface{}) + if !ok { + return nil, fmt.Errorf("cluster YAML missing spec.hosts") + } + var out []clusterHost + for _, h := range rawHosts { + host, ok := h.(map[interface{}]interface{}) + if !ok { + continue + } + sshConf, ok := host["ssh"].(map[interface{}]interface{}) + if !ok { + continue // WinRM host or no ssh block + } + addr, _ := sshConf["address"].(string) + user, _ := sshConf["user"].(string) + keyPath, _ := sshConf["keyPath"].(string) + if addr != "" && user != "" && keyPath != "" { + out = append(out, clusterHost{addr: addr, user: user, keyPath: keyPath}) + } + } + if len(out) == 0 { + return nil, fmt.Errorf("no SSH hosts found in launchpad YAML") + } + return out, nil +} + +// extractDTRHost parses the DTR external URL from the MSR installFlags in the +// launchpad YAML and returns a "hostname:port" string suitable for use as a +// Docker registry address (e.g. "abc123.elb.amazonaws.com:4443"). +// +// The port is always explicit because Docker uses it as the key for both +// /etc/docker/certs.d/ directory lookup and image reference resolution. +// "hostname" and "hostname:4443" are treated as distinct registries by Docker, +// so all callers — cert trust setup, docker login, docker push/pull, and the +// imageRepo field — must use the same "hostname:port" form consistently. +// If the URL contains no port, :443 is appended as the default. 
+func extractDTRHost(yamlStr string) (string, error) { + var doc map[interface{}]interface{} + if err := yaml.Unmarshal([]byte(yamlStr), &doc); err != nil { + return "", fmt.Errorf("unmarshal YAML: %w", err) + } + spec, ok := doc["spec"].(map[interface{}]interface{}) + if !ok { + return "", fmt.Errorf("cluster YAML missing spec") + } + msr, ok := spec["msr"].(map[interface{}]interface{}) + if !ok { + return "", fmt.Errorf("cluster YAML missing spec.msr — ensure an MSR nodegroup is present") + } + flags, ok := msr["installFlags"].([]interface{}) + if !ok { + return "", fmt.Errorf("no installFlags in spec.msr") + } + const prefix = "--dtr-external-url=" + for _, f := range flags { + flag, _ := f.(string) + if strings.HasPrefix(flag, prefix) { + addr := strings.TrimPrefix(flag, prefix) + addr = strings.TrimPrefix(addr, "https://") + addr = strings.TrimPrefix(addr, "http://") + // Ensure port is explicit. Without it, Docker keys /etc/docker/certs.d/ + // on the bare hostname, which differs from the hostname:port key used + // when the port is present — causing cert lookup mismatches. + if !strings.Contains(addr, ":") { + addr += ":443" + } + return addr, nil + } + } + return "", fmt.Errorf("--dtr-external-url not found in spec.msr.installFlags") +} + +// sshRun executes a remote shell command on addr via the system ssh binary, +// authenticating with keyPath. It avoids Go-side private key parsing entirely, +// which is necessary because Terraform's tls_private_key resource emits +// OpenSSH-format ed25519 keys (-----BEGIN OPENSSH PRIVATE KEY-----) that +// golang.org/x/crypto/ssh.ParsePrivateKey rejects with "ssh: no key found". 
+func sshRun(user, addr, keyPath, command string) (string, error) { + cmd := exec.Command("ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ConnectTimeout=30", + "-i", keyPath, + user+"@"+addr, + command, + ) + out, err := cmd.CombinedOutput() + return string(out), err +} + +// sshRunScript executes a shell script on addr via the system ssh binary by +// piping the script to bash via stdin. This avoids SSH command-line length +// limits and shell quoting complexity for multi-line scripts. +func sshRunScript(user, addr, keyPath, script string) (string, error) { + cmd := exec.Command("ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ConnectTimeout=30", + "-i", keyPath, + user+"@"+addr, + "bash -s", + ) + cmd.Stdin = strings.NewReader(script) + out, err := cmd.CombinedOutput() + return string(out), err +} + +// preloadImagesOnNode pulls all images for the given bootstrapper from +// docker.io/mirantis and tags each one locally with the DTR registry address +// prefix, so that Launchpad's "docker image inspect dtrHost/admin/:" +// check succeeds without any network call to DTR. +// +// Background: Launchpad's "Pull MKE images" phase runs docker image inspect +// before docker pull. If the DTR-addressed tag already exists locally the pull +// is skipped entirely. Preloading on every node (manager + workers + MSR) means +// no actual DTR push or pull is ever needed, while still exercising Launchpad's +// imageRepo feature — all upgrade Apply() calls use dtrHost+"/admin" as +// mke.imageRepo and msr.imageRepo. +func preloadImagesOnNode(t *testing.T, user, addr, keyPath, dtrHost, bootstrapper string) { + t.Helper() + + script := fmt.Sprintf(`set -euo pipefail +bootstrapper="%s" +dtr_admin="%s" +failcount=0 + +# Pull the bootstrapper so its "images" subcommand can run. +if ! 
docker pull "${bootstrapper}" > /dev/null 2>&1; then + echo "ERROR: failed to pull bootstrapper ${bootstrapper}" >&2 + exit 1 +fi + +# Enumerate images. UCP uses "images --list"; DTR/MSR 2.x uses just "images" +# (--list is unrecognised by DTR and causes it to print help text to stdout +# with exit 0, which we detect by filtering for valid image-reference lines). +_list_images() { + docker run --rm "$1" images $2 2>/dev/null \ + | grep -E '^[a-zA-Z0-9][a-zA-Z0-9._/:@-]*:[a-zA-Z0-9._-]+$' \ + || true +} +images=$(_list_images "${bootstrapper}" "--list") +[ -n "${images}" ] || images=$(_list_images "${bootstrapper}" "") +# Always include the bootstrapper itself; the inspect check in the loop below +# skips it if already tagged. +images="${images} +${bootstrapper}" + +for img in ${images}; do + short="${img##*/}" + dtr_img="${dtr_admin}/${short}" + + # Already tagged locally with the DTR address — nothing to do. + if docker image inspect "${dtr_img}" > /dev/null 2>&1; then + continue + fi + + # Pull from the public registry and tag with the DTR registry address. + if docker pull "${img}" > /dev/null 2>&1 && docker tag "${img}" "${dtr_img}"; then + echo "tagged: ${dtr_img}" + else + echo "FAILED: ${img} -> ${dtr_img}" >&2 + failcount=$((failcount+1)) + fi +done + +if [ "${failcount}" -gt 0 ]; then + echo "ERROR: ${failcount} image(s) failed to preload on $(hostname)" >&2 + exit 1 +fi +echo "preload OK: ${bootstrapper} on $(hostname)" +`, bootstrapper, dtrHost+"/admin") + + out, err := sshRunScript(user, addr, keyPath, script) + if err != nil { + t.Logf("preload output:\n%s", out) + } + require.NoError(t, err, "preload %s on %s", bootstrapper, addr) +} + +// semverLess reports whether X.Y.Z version string a is semantically less than b. 
+func semverLess(a, b string) bool { + parse := func(v string) [3]int { + var x, y, z int + fmt.Sscanf(v, "%d.%d.%d", &x, &y, &z) + return [3]int{x, y, z} + } + av, bv := parse(a), parse(b) + for i := range av { + if av[i] != bv[i] { + return av[i] < bv[i] + } + } + return false +} + +// fetchLatestMKEVersion queries the Docker Hub tags API for mirantis/ucp and +// returns the highest X.Y.Z release in the given major series (e.g. "3" → +// "3.9.3"). Tags that are not bare version strings (e.g. "latest", "3.9", +// "3.9.3-rc1") are ignored. +func fetchLatestMKEVersion(t *testing.T, major string) string { + t.Helper() + + type hubTag struct { + Name string `json:"name"` + } + type hubPage struct { + Results []hubTag `json:"results"` + Next *string `json:"next"` + } + + // Match only bare X.Y.Z tags in the requested major series. + versionRe := regexp.MustCompile(`^` + regexp.QuoteMeta(major) + `\.\d+\.\d+$`) + + client := &http.Client{Timeout: 30 * time.Second} + + var candidates []string + pageURL := "https://hub.docker.com/v2/repositories/mirantis/ucp/tags/?page_size=100" + const maxPages = 20 + for page := 0; pageURL != "" && page < maxPages; page++ { + resp, err := client.Get(pageURL) + require.NoError(t, err, "query Docker Hub tags for mirantis/ucp (page %d)", page+1) + require.Equal(t, http.StatusOK, resp.StatusCode, + "unexpected HTTP %d from Docker Hub tags API (page %d)", resp.StatusCode, page+1) + var p hubPage + require.NoError(t, json.NewDecoder(resp.Body).Decode(&p), + "decode Docker Hub tags response (page %d)", page+1) + resp.Body.Close() + + for _, tag := range p.Results { + if versionRe.MatchString(tag.Name) { + candidates = append(candidates, tag.Name) + } + } + if p.Next == nil || *p.Next == "" { + break + } + pageURL = *p.Next + } + + require.NotEmpty(t, candidates, "no MKE %s.x.y releases found on Docker Hub for mirantis/ucp", major) + sort.Slice(candidates, func(i, j int) bool { return semverLess(candidates[i], candidates[j]) }) + return 
candidates[len(candidates)-1] +} + +// fetchLatestMCRChannel probes the Mirantis Ubuntu apt repository to find the +// highest available stable channel for the given MCR major version +// (e.g. 29 → "stable-29.4"). It probes stable-29.1 through stable-29.N and +// returns the highest minor for which the Packages file exists. +// +// IMPORTANT: channels are NOT sequential. For example stable-29.1 (404), +// stable-29.2 (200), stable-29.3 (404), stable-29.4 (200). The loop therefore +// never breaks early on a 404 — it always scans the full range. +// +// The Ubuntu 22.04 (jammy) apt repo is used as the probe target: +// https://repos.mirantis.com/ubuntu/dists/jammy//binary-amd64/Packages +func fetchLatestMCRChannel(t *testing.T, major int) string { + t.Helper() + + const ( + probeBase = "https://repos.mirantis.com/ubuntu/dists/jammy" + probeArch = "binary-amd64/Packages" + maxMinor = 20 + ) + + client := &http.Client{Timeout: 15 * time.Second} + + last := "" + for minor := 1; minor <= maxMinor; minor++ { + channel := fmt.Sprintf("stable-%d.%d", major, minor) + url := fmt.Sprintf("%s/%s/%s", probeBase, channel, probeArch) + resp, err := client.Head(url) + if resp != nil { + resp.Body.Close() + } + // Do NOT break on 404 — channels are non-sequential; a gap does not + // mean higher minors are absent. + if err == nil && resp != nil && resp.StatusCode == http.StatusOK { + last = channel + } + } + + require.NotEmpty(t, last, + "no stable-%d.x MCR channel found at %s — check that repos.mirantis.com is reachable", + major, probeBase) + return last +} + +// runAirgappedMultiHopUpgradeTest provisions the cluster with an MSR node on +// port 4443, installs the base software using docker.io/mirantis, pre-loads +// all upgrade images into DTR, then drives each upgrade step with both +// mke.imageRepo and msr.imageRepo pointing to DTR. 
+func runAirgappedMultiHopUpgradeTest(t *testing.T, cfg airgapUpgradeConfig) { + t.Helper() + + uTestId := test.GenerateRandomAlphaNumericString(5) + name := fmt.Sprintf("smoke-%s-%s", cfg.Base.Name, uTestId) + + mkePassword := test.GenerateRandomAlphaNumericString(12) + + mkeConnect := map[string]interface{}{ + "username": "admin", + "password": mkePassword, + "insecure": true, + } + + launchpad := map[string]interface{}{ + "drain": false, + "mcr_channel": cfg.Base.MCRChannel, + "mke_version": cfg.Base.MKEVersion, + "msr_version": cfg.Base.MSRVersion, + "mke_connect": mkeConnect, + } + + ngKeys := make([]string, 0, len(cfg.Base.Nodegroups)) + for k := range cfg.Base.Nodegroups { + ngKeys = append(ngKeys, k) + } + + subnets := map[string]interface{}{ + "main": map[string]interface{}{ + "cidr": "172.31.0.0/17", + "private": false, + "nodegroups": ngKeys, + }, + } + + tempSSHKeyPathDir := t.TempDir() + + vars := map[string]interface{}{ + "name": name, + "aws": awsConfig, + "launchpad": launchpad, + "network": networkConfig, + "subnets": subnets, + "ssh_pk_location": tempSSHKeyPathDir, + "nodegroups": cfg.Base.Nodegroups, + "ssh_key_algorithm": cfg.Base.SSHKeyAlgorithm, + // Expose DTR on port 4443 via the NLB. DTR continues to listen on + // 443 internally; the NLB translates 4443 → 443. The launchpad.tf + // template appends :4443 to --dtr-external-url automatically when + // msr_port != 443. All other smoke tests use the default (443) and + // are unaffected by this variable. 
+ "msr_port": 4443, + "extra_tags": map[string]string{ + "launchpad-smoke-test": "true", + "launchpad-smoke-test-name": cfg.Base.Name, + }, + } + + options := terraform.Options{ + TerraformDir: "../../examples/terraform/aws-simple", + Vars: vars, + } + + terraformOptions := terraform.WithDefaultRetryableErrors(t, &options) + defer terraform.Destroy(t, terraformOptions) + + if _, err := terraform.InitAndApplyE(t, terraformOptions); err != nil { + t.Fatal(err) + } + + baseYAML := terraform.Output(t, terraformOptions, "launchpad_yaml") + + // ── Base install (using docker.io/mirantis) ─────────────────────────────── + // DTR does not exist yet, so the initial install must use the public + // registry. Once DTR is up, all subsequent Apply() calls use DTR. + t.Logf("installing base: MCR %s / MKE %s / MSR %s", cfg.Base.MCRChannel, cfg.Base.MKEVersion, cfg.Base.MSRVersion) + + baseProduct, err := config.ProductFromYAML([]byte(baseYAML)) + require.NoError(t, err, "parse base launchpad YAML") + + err = baseProduct.Apply(true, true, 3, true) + require.NoError(t, err, "base install Apply()") + + // ── Pre-load upgrade images on all nodes ───────────────────────────────── + // DTR is now running on port 4443 but we do NOT push images to it. + // Instead, every upgrade image is pulled from docker.io/mirantis on EVERY + // node and tagged with the DTR registry address. Launchpad's "Pull MKE + // images" phase checks `docker image inspect` before attempting a pull; it + // finds the pre-tagged image locally and skips the pull entirely. This + // sidesteps DTR push authentication entirely while still exercising + // Launchpad's imageRepo feature — all upgrade Apply() calls set + // mke.imageRepo and msr.imageRepo to the DTR address so image references + // are constructed correctly. 
+ dtrHost, err := extractDTRHost(baseYAML) + require.NoError(t, err, "extract DTR hostname from launchpad YAML") + + allHosts, err := extractAllSSHHosts(baseYAML) + require.NoError(t, err, "extract all SSH hosts from launchpad YAML") + + upgradeVersions := make([]string, 0, len(cfg.Steps)) + for _, step := range cfg.Steps { + upgradeVersions = append(upgradeVersions, step.MKEVersion) + } + + for _, h := range allHosts { + for _, version := range upgradeVersions { + bootstrapper := fmt.Sprintf("docker.io/mirantis/ucp:%s", version) + t.Logf("preloading MKE %s on %s", version, h.addr) + preloadImagesOnNode(t, h.user, h.addr, h.keyPath, dtrHost, bootstrapper) + } + // Pre-load the MSR bootstrapper so Launchpad can resolve msr.imageRepo + // during upgrade Apply() if it checks the installed MSR version. + msrBootstrapper := fmt.Sprintf("docker.io/mirantis/dtr:%s", cfg.Base.MSRVersion) + t.Logf("preloading MSR %s on %s", cfg.Base.MSRVersion, h.addr) + preloadImagesOnNode(t, h.user, h.addr, h.keyPath, dtrHost, msrBootstrapper) + } + + // ── Sequential upgrade steps (images from DTR address) ─────────────────── + // registryPrefix is the imageRepo value for all upgrade steps: + // "/admin". Launchpad constructs image references as + // "/ucp:" → "/admin/ucp:", which + // matches the local tag created by preloadImagesOnNode above. + // No REGISTRY_* env vars are set; DTR push/pull is never attempted so + // Launchpad's AuthenticateDocker phase is intentionally skipped. 
+ registryPrefix := dtrHost + "/admin" + + currentYAML := baseYAML + lastProduct := baseProduct + for i, step := range cfg.Steps { + t.Logf("upgrade step %d/%d → MCR %s / MKE %s (imageRepo: %s)", + i+1, len(cfg.Steps), step.MCRChannel, step.MKEVersion, registryPrefix) + + upgradeYAML, err := bumpVersionsAirgap(currentYAML, step.MCRChannel, step.MKEVersion, registryPrefix) + require.NoError(t, err, "mutate YAML for upgrade step %d", i+1) + + upgradeProduct, err := config.ProductFromYAML([]byte(upgradeYAML)) + require.NoError(t, err, "parse upgrade YAML for step %d", i+1) + + err = upgradeProduct.Apply(true, true, 3, true) + assert.NoError(t, err, "upgrade Apply() for step %d", i+1) + // Update lastProduct regardless of error so Reset() uses the config from + // the attempted step (the cluster may be partially upgraded on failure). + currentYAML = upgradeYAML + lastProduct = upgradeProduct + if err != nil { + t.Logf("upgrade step %d failed; stopping upgrade chain", i+1) + break + } + } + + // ── Reset (best-effort) ─────────────────────────────────────────────────── + // See smoke_test.go for rationale on non-fatal Reset(). 
+ if err := lastProduct.Reset(); err != nil { + t.Logf("WARN: product.Reset() failed (non-fatal): %v", err) + } +} + +// TestAirgappedMultiHopUpgrade provisions an airgapped cluster with an +// internal DTR on port 4443 and exercises the following upgrade chain, +// pulling all MKE images from DTR after the initial bootstrap: +// +// install: MCR stable-25.0 / MKE 3.8.8 / MSR 2.9.27 (docker.io/mirantis) +// step 1: MCR stable-25.0 / MKE 3.8.11 (from DTR :4443) +// step 2: MCR stable-29.2 / MKE 3.8.12 (from DTR :4443) +// step 3: MCR stable-29.2 / MKE 3.9.2 (from DTR :4443) +// step 4: MCR / MKE (discovered at runtime) +// +// Step 4 is resolved dynamically at test runtime: fetchLatestMKEVersion queries +// Docker Hub for the highest mirantis/ucp 3.x.y tag, and fetchLatestMCRChannel +// probes the Mirantis apt repository for the highest stable-29.x channel. If +// the resolved versions are identical to step 3 (no new releases), step 4 is +// omitted. +// +// Node matrix: Ubuntu22 manager + Rhel8 worker + Ubuntu22 MSR. +func TestAirgappedMultiHopUpgrade(t *testing.T) { + latestMKE := fetchLatestMKEVersion(t, "3") + latestMCR := fetchLatestMCRChannel(t, 29) + t.Logf("discovered latest: MKE %s / MCR channel %s", latestMKE, latestMCR) + + // Fixed portion of the upgrade chain — the specific versions that model + // the customer scenario and must always be exercised. + steps := []upgradeStep{ + {MCRChannel: "stable-25.0", MKEVersion: "3.8.11"}, + {MCRChannel: "stable-29.2", MKEVersion: "3.8.12"}, + {MCRChannel: "stable-29.2", MKEVersion: "3.9.2"}, + } + + // Append the dynamic "to latest" step only when it differs from the last + // fixed step — avoids a redundant no-op when the fixed chain already ends + // on the current latest release. 
+ lastFixed := steps[len(steps)-1] + if latestMKE != lastFixed.MKEVersion || latestMCR != lastFixed.MCRChannel { + steps = append(steps, upgradeStep{MCRChannel: latestMCR, MKEVersion: latestMKE}) + t.Logf("appending upgrade-to-latest step: MCR %s / MKE %s", latestMCR, latestMKE) + } else { + t.Logf("latest versions match last fixed step; no additional upgrade step needed") + } + + runAirgappedMultiHopUpgradeTest(t, airgapUpgradeConfig{ + Base: smokeConfig{ + // NOTE: AWS LB/target-group names are limited to 32 chars. The stack + // name is "smoke-{Name}-{5-char-random}", and Terraform appends suffixes + // like "-mke-kube" (9 chars). That caps len(Name) at 11. Keep this + // short even though the CI label is "smoke-airgapped-multi-hop". + Name: "airgap-mhop", + MCRChannel: "stable-25.0", + MKEVersion: "3.8.8", + MSRVersion: "2.9.27", + SSHKeyAlgorithm: "ed25519", + Nodegroups: map[string]interface{}{ + "MngrUbuntu22": test.Platforms["Ubuntu22"].GetManager(), + "WrkRhel8": test.Platforms["Rhel8"].GetWorker(), + "MsrUbuntu22": test.Platforms["Ubuntu22"].GetMSR(), + }, + }, + Steps: steps, + }) +}