diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index c3f666199..764239a2c 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -28,6 +28,11 @@ jobs: e2e-test: runs-on: linux-amd64-cpu4 if: ${{ github.event.workflow_run.conclusion == 'success' }} && ${{ github.event.workflow_run.event == 'push' }} + strategy: + matrix: + label: [default, legacy, dra, kernel] + name: E2E Test (${{ matrix.label }}) + steps: - name: Checkout code uses: actions/checkout@v4 @@ -42,9 +47,8 @@ jobs: run: | sudo apt-get update sudo apt-get install -y make - make ginkgo - - name: Run e2e tests + - name: Run e2e test for ${{ matrix.label }} env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -55,13 +59,12 @@ jobs: echo "${{ secrets.AWS_SSH_KEY }}" > "$e2e_ssh_key" chmod 600 "$e2e_ssh_key" export E2E_SSH_KEY="$e2e_ssh_key" - - make -f tests/Makefile test + make -f tests/Makefile test GINKGO_ARGS="--label-filter='${{ matrix.label }}'" - name: Archive Ginkgo logs uses: actions/upload-artifact@v4 with: - name: ginkgo-logs + name: ginkgo-logs-${{ matrix.label }} path: ginkgo.json retention-days: 15 diff --git a/.github/workflows/golang.yaml b/.github/workflows/golang.yaml index fc9c3182c..d5cde7528 100644 --- a/.github/workflows/golang.yaml +++ b/.github/workflows/golang.yaml @@ -75,7 +75,14 @@ jobs: with: go-version: ${{ needs.variables.outputs.GOLANG_VERSION }} - - run: make coverage + - name: Run unit tests and generate coverage report + run: make coverage + + - name: Upload to Coveralls + uses: coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + file: coverage.out build: name: Build diff --git a/Makefile b/Makefile index 3096d2813..c9dc333d7 100644 --- a/Makefile +++ b/Makefile @@ -86,8 +86,7 @@ test: go test -coverprofile=$(COVERAGE_FILE) ./pkg/... 
coverage: test - cat $(COVERAGE_FILE) | grep -v "_mock.go" > $(COVERAGE_FILE).no-mocks - go tool cover -func=$(COVERAGE_FILE).no-mocks + go tool cover -func=$(COVERAGE_FILE) mdlint: ${CONTAINER_RUN_CMD} \ diff --git a/README.md b/README.md index eea8f4257..92c39a8f5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Holodeck -> * Tech preview, under heavy development * +[![Latest Release](https://img.shields.io/github/v/release/NVIDIA/holodeck?label=latest%20release)](https://github.com/NVIDIA/holodeck/releases/latest) A tool for creating and managing GPU-ready Cloud test environments. @@ -13,6 +13,7 @@ A tool for creating and managing GPU-ready Cloud test environments. - [Commands Reference](docs/commands/) - [Contributing Guide](docs/contributing/) - [Examples](docs/examples/) +- [Latest Release](https://github.com/NVIDIA/holodeck/releases/latest) --- @@ -89,14 +90,6 @@ holodeck status holodeck dryrun -f ./examples/v1alpha1_environment.yaml ``` ---- - -## 📦 Supported Cuda-Drivers - -See [docs/prerequisites.md](docs/prerequisites.md#supported-cuda-drivers) for the full list and usage. - ---- - ## 📂 More - [Examples](docs/examples/) @@ -104,4 +97,4 @@ See [docs/prerequisites.md](docs/prerequisites.md#supported-cuda-drivers) for th --- -For more information, see the [docs/](docs/) directory. +For more information, see the [documentation](docs/README.md). diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..8fc423680 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,24 @@ +# Holodeck Documentation + +[![Latest Release](https://img.shields.io/github/v/release/NVIDIA/holodeck?label=latest%20release)](https://github.com/NVIDIA/holodeck/releases/latest) + +Welcome to the Holodeck documentation! Here you'll find everything you need to +get started, use, and contribute to Holodeck. + +## 📚 Sections + +- [Quick Start](quick-start.md): Get up and running with Holodeck in minutes.
+- [Prerequisites](prerequisites.md): What you need before you begin. +- [Commands Reference](commands/README.md): Detailed documentation for every + Holodeck CLI command. +- [Contributing Guide](contributing/README.md): How to contribute to Holodeck, + including coding standards and PR process. +- [Examples](examples/README.md): Example configuration files and usage scenarios. +- [Guides](guides/README.md): In-depth guides and tutorials for advanced usage. +- [Latest Release](https://github.com/NVIDIA/holodeck/releases/latest) + +--- + +For general information, see the [main README](../README.md). + +If you have questions or want to contribute, check out the [Contributing Guide](contributing/README.md)! diff --git a/docs/examples/README.md b/docs/examples/README.md index 5bfa30d39..baa832129 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -67,4 +67,4 @@ A sample kind cluster configuration for use with the kind installer. --- For more details on configuration options, see the -[Command Reference](../commands/) and [Quick Start Guide](../quick-start.md). +[Command Reference](../commands/README.md) and [Quick Start Guide](../quick-start.md). diff --git a/docs/guides/README.md b/docs/guides/README.md new file mode 100644 index 000000000..380d2ca31 --- /dev/null +++ b/docs/guides/README.md @@ -0,0 +1,10 @@ +# Guides + +This section is for in-depth guides and tutorials related to Holodeck. + +- If you are looking for step-by-step instructions or advanced usage, guides + will be listed here as they are added. +- To contribute a guide, simply add a new Markdown file to this folder and + update this README with a link. + +*No guides are available yet. 
Stay tuned!* diff --git a/docs/quick-start.md b/docs/quick-start.md index f590c2801..c27d9487c 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -71,5 +71,5 @@ holodeck delete - Check out the [Prerequisites](prerequisites.md) for detailed setup requirements -- Explore the [Command Reference](commands/) for all available commands +- Explore the [Command Reference](commands/README.md) for all available commands - See [Examples](../examples/) for more complex configurations diff --git a/pkg/provisioner/templates/kubernetes.go b/pkg/provisioner/templates/kubernetes.go index 4dd6106aa..4fd18755b 100644 --- a/pkg/provisioner/templates/kubernetes.go +++ b/pkg/provisioner/templates/kubernetes.go @@ -116,13 +116,13 @@ with_retry 10 20s kubectl --kubeconfig $KUBECONFIG wait --for=condition=establis # Apply custom resources with increased retry attempts with_retry 10 20s kubectl --kubeconfig $KUBECONFIG apply -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml -# Wait for cluster to be ready -with_retry 10 20s kubectl --kubeconfig $KUBECONFIG wait --for=condition=ready --timeout=300s nodes --all - # Make single-node cluster schedulable kubectl taint nodes --all node-role.kubernetes.io/control-plane:NoSchedule- kubectl label node --all node-role.kubernetes.io/worker= kubectl label node --all nvidia.com/holodeck.managed=true + +# Wait for cluster to be ready +with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=ready --timeout=300s nodes --all ` const KindTemplate = ` diff --git a/tests/aws_test.go b/tests/aws_test.go index ed0c66035..abe77a0f3 100644 --- a/tests/aws_test.go +++ b/tests/aws_test.go @@ -33,58 +33,41 @@ import ( "github.com/NVIDIA/holodeck/tests/common" ) +// Test configuration structure +type testConfig struct { + name string + filePath string + description string +} + +// Test state structure +type testState struct { + opts struct { + cachePath string + cachefile string + cfg 
v1alpha1.Environment + } + provider provider.Provider + log *logger.FunLogger + ctx context.Context +} + // AWSEnvironmentTests contains end-to-end tests for AWS environment provisioning and management. // These tests verify the complete lifecycle of AWS environments, including: // - Environment creation and validation // - Kubernetes cluster setup (when enabled) // - Resource provisioning and cleanup // - Configuration validation -var _ = Describe("AWS Environment", func() { - // Test configuration structure - type testConfig struct { - name string - filePath string - description string - } - - // Test state structure - type testState struct { - opts struct { - cachePath string - cachefile string - cfg v1alpha1.Environment - } - provider provider.Provider - log *logger.FunLogger - ctx context.Context - } +var _ = DescribeTable("AWS Environment E2E", + func(config testConfig) { + GinkgoWriter.Println("=== Starting test:", config.name, "===") - // Define test configurations - testConfigs := []testConfig{ - { - name: "Default AWS Test", - filePath: filepath.Join(packagePath, "data", "test_aws.yml"), - description: "Tests basic AWS environment setup with default configuration", - }, - { - name: "Legacy Kubernetes Test", - filePath: filepath.Join(packagePath, "data", "test_aws_legacy.yml"), - description: "Tests AWS environment with legacy Kubernetes version", - }, - { - name: "DRA Enabled Test", - filePath: filepath.Join(packagePath, "data", "test_aws_dra.yml"), - description: "Tests AWS environment with Dynamic Resource Allocation enabled", - }, - { - name: "Kernel Features Test", - filePath: filepath.Join(packagePath, "data", "test_aws_kernel.yml"), - description: "Tests AWS environment with kernel features enabled", - }, - } + // Generate a unique artifact directory for this test + uniqueID := common.GenerateUID() + artifactDir := filepath.Join(LogArtifactDir, config.name+"-"+uniqueID) + Expect(os.MkdirAll(artifactDir, 0750)).To(Succeed(), "Failed to create 
artifact directory") - // Shared setup function - setupTest := func(config testConfig) testState { + // Setup state := testState{ ctx: context.Background(), log: logger.NewLogger(), @@ -95,145 +78,104 @@ var _ = Describe("AWS Environment", func() { Expect(err).NotTo(HaveOccurred(), "Failed to read config file: %s", config.filePath) // Set unique name for the environment - cfg.Name = cfg.Name + "-" + common.GenerateUID() + cfg.Name = cfg.Name + "-" + uniqueID - // Setup cache directory and file - state.opts.cachePath = LogArtifactDir + // Setup unique cache file + state.opts.cachePath = artifactDir state.opts.cachefile = filepath.Join(state.opts.cachePath, cfg.Name+".yaml") - // Create cache directory if it doesn't exist - Expect(os.MkdirAll(state.opts.cachePath, 0750)).To(Succeed(), "Failed to create cache directory") - // Initialize provider state.provider, err = newProvider(state.log, cfg, state.opts.cachefile) Expect(err).NotTo(HaveOccurred(), "Failed to initialize provider") - state.opts.cfg = cfg - return state - } - // Shared cleanup function - cleanupTest := func(state testState) { - if !CurrentSpecReport().Failed() { - Expect(os.Remove(state.opts.cachefile)).To(Succeed(), "Failed to remove cache file") - } - } + // Cleanup: remove cache file and artifact dir if test passes + DeferCleanup(func() { + if !CurrentSpecReport().Failed() { + err := os.RemoveAll(artifactDir) + Expect(err).NotTo(HaveOccurred(), "Failed to remove artifact directory") + } + }) - // Run each test configuration sequentially to ensure proper resource management - // and avoid potential conflicts between concurrent test runs - for _, config := range testConfigs { - config := config // Create a new variable to avoid closure issues - When("testing "+config.name, Ordered, func() { - var state testState - - BeforeAll(func() { - state = setupTest(config) - }) - - AfterAll(func() { - cleanupTest(state) - }) - - Describe("Configuration Validation", func() { - When("validating the provider 
configuration", func() { - It("should validate the provider configuration", func() { - Expect(state.provider.DryRun()).To(Succeed(), "Provider validation failed") - }) - - It("should validate the provisioner configuration", func() { - Expect(provisioner.Dryrun(state.log, state.opts.cfg)).To(Succeed(), "Provisioner validation failed") - }) - }) - - When("validating the environment configuration", func() { - It("should have valid instance type", func() { - Expect(state.opts.cfg.Spec.Instance.Type).NotTo(BeEmpty(), "Instance type should not be empty") - }) - - It("should have valid region", func() { - Expect(state.opts.cfg.Spec.Instance.Region).NotTo(BeEmpty(), "Region should not be empty") - }) - - It("should have valid ingress IP ranges", func() { - Expect(state.opts.cfg.Spec.Instance.IngresIpRanges).NotTo(BeEmpty(), "Ingress IP ranges should not be empty") - }) - }) - }) - - Describe("Environment Management", func() { - When("creating the environment", func() { - AfterAll(func() { - // Ensure environment cleanup even if test fails - Expect(state.provider.Delete()).To(Succeed(), "Failed to delete environment") - }) - - It("should create the environment successfully", func() { - state.opts.cfg.Spec.PrivateKey = sshKey - state.opts.cfg.Spec.Username = "ubuntu" - Expect(state.provider.Create()).To(Succeed(), "Failed to create environment") - }) - - It("should have valid environment name", func() { - Expect(state.opts.cfg.Name).NotTo(BeEmpty(), "Environment name should not be empty") - }) - It("should provision the environment successfully", func() { - By("Reading the environment file") - env, err := jyaml.UnmarshalFromFile[v1alpha1.Environment](state.opts.cachefile) - Expect(err).NotTo(HaveOccurred(), "Failed to read environment file") - - var hostUrl string - for _, p := range env.Status.Properties { - if p.Name == aws.PublicDnsName { - hostUrl = p.Value - break - } - } - Expect(hostUrl).NotTo(BeEmpty(), "Host URL should not be empty") - - By("Provisioning the 
environment") - p, err := provisioner.New(state.log, state.opts.cfg.Spec.PrivateKey, state.opts.cfg.Spec.Username, hostUrl) - Expect(err).NotTo(HaveOccurred(), "Failed to create provisioner") - - // Ensure client is properly closed after test - defer func() { - if p.Client != nil { - // Try to create a new session to check if connection is alive - session, err := p.Client.NewSession() - if err == nil { - session.Close() // nolint:errcheck, gosec - // Connection is alive, close it - if err := p.Client.Close(); err != nil { - Expect(err).NotTo(HaveOccurred(), "Failed to close ssh client") - } - } - // If we get here, either the connection was already closed or we couldn't create a session - p.Client = nil - } - }() - - By("Running the provisioner") - Expect(p.Run(env)).To(Succeed(), "Failed to provision environment") - }) - }) - }) - - Describe("Kubernetes Configuration", func() { - When("kubernetes is enabled", func() { - BeforeEach(func() { - if state.opts.cfg.Spec.Kubernetes.KubernetesVersion == "" { - Skip("Skipping test: Kubernetes version not specified in environment file") - } - }) - - It("should have valid kubernetes version", func() { - Expect(state.opts.cfg.Spec.Kubernetes.KubernetesVersion).NotTo(BeEmpty(), "Kubernetes version should not be empty") - }) - - It("should have valid kubernetes installer", func() { - Expect(state.opts.cfg.Spec.Kubernetes.KubernetesInstaller).NotTo(BeEmpty(), "Kubernetes installer should not be empty") - }) - }) - }) + // --- Test logic (copied from original) --- + By("Configuration Validation") + Expect(state.provider.DryRun()).To(Succeed(), "Provider validation failed") + Expect(provisioner.Dryrun(state.log, state.opts.cfg)).To(Succeed(), "Provisioner validation failed") + Expect(state.opts.cfg.Spec.Instance.Type).NotTo(BeEmpty(), "Instance type should not be empty") + Expect(state.opts.cfg.Spec.Instance.Region).NotTo(BeEmpty(), "Region should not be empty") + 
Expect(state.opts.cfg.Spec.Instance.IngresIpRanges).NotTo(BeEmpty(), "Ingress IP ranges should not be empty") + + By("Environment Management") + // Ensure environment cleanup even if test fails + DeferCleanup(func() { + Expect(state.provider.Delete()).To(Succeed(), "Failed to delete environment") }) - } + + state.opts.cfg.Spec.PrivateKey = sshKey + state.opts.cfg.Spec.Username = "ubuntu" + Expect(state.provider.Create()).To(Succeed(), "Failed to create environment") + Expect(state.opts.cfg.Name).NotTo(BeEmpty(), "Environment name should not be empty") + + By("Provisioning the environment") + env, err := jyaml.UnmarshalFromFile[v1alpha1.Environment](state.opts.cachefile) + Expect(err).NotTo(HaveOccurred(), "Failed to read environment file") + var hostUrl string + for _, p := range env.Status.Properties { + if p.Name == aws.PublicDnsName { + hostUrl = p.Value + break + } + } + Expect(hostUrl).NotTo(BeEmpty(), "Host URL should not be empty") + p, err := provisioner.New(state.log, state.opts.cfg.Spec.PrivateKey, state.opts.cfg.Spec.Username, hostUrl) + Expect(err).NotTo(HaveOccurred(), "Failed to create provisioner") + defer func() { + if p.Client != nil { + session, err := p.Client.NewSession() + if err == nil { + session.Close() // nolint:errcheck, gosec + if err := p.Client.Close(); err != nil { + Expect(err).NotTo(HaveOccurred(), "Failed to close ssh client") + } + } + p.Client = nil + } + }() + Expect(p.Run(env)).To(Succeed(), "Failed to provision environment") + + By("Kubernetes Configuration") + if state.opts.cfg.Spec.Kubernetes.KubernetesVersion != "" { + Expect(state.opts.cfg.Spec.Kubernetes.KubernetesVersion).NotTo(BeEmpty(), "Kubernetes version should not be empty") + Expect(state.opts.cfg.Spec.Kubernetes.KubernetesInstaller).NotTo(BeEmpty(), "Kubernetes installer should not be empty") + } else { + Skip("Skipping test: Kubernetes version not specified in environment file") + } + + GinkgoWriter.Println("=== Finished test:", config.name, "===") + }, + 
Entry("Default AWS Test", testConfig{ + name: "Default AWS Test", + filePath: filepath.Join(packagePath, "data", "test_aws.yml"), + description: "Tests basic AWS environment setup with default configuration", + }, Label("default")), + Entry("Legacy Kubernetes Test", testConfig{ + name: "Legacy Kubernetes Test", + filePath: filepath.Join(packagePath, "data", "test_aws_legacy.yml"), + description: "Tests AWS environment with legacy Kubernetes version", + }, Label("legacy")), + Entry("DRA Enabled Test", testConfig{ + name: "DRA Enabled Test", + filePath: filepath.Join(packagePath, "data", "test_aws_dra.yml"), + description: "Tests AWS environment with Dynamic Resource Allocation enabled", + }, Label("dra")), + Entry("Kernel Features Test", testConfig{ + name: "Kernel Features Test", + filePath: filepath.Join(packagePath, "data", "test_aws_kernel.yml"), + description: "Tests AWS environment with kernel features enabled", + }, Label("kernel")), +) + +// NOTE: this hook does not make the table run in parallel +var _ = BeforeEach(func() { + GinkgoParallelNode() // Only returns the index of the current parallel process, and the result is discarded here; it does not enable parallelism. In Ginkgo v2 it is a deprecated alias of GinkgoParallelProcess(); parallel runs are requested from the CLI with `ginkgo -p` or `--procs=N`. })