From 15a1c7bb8dc30fc38b387145501444e663b9a5ee Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 29 May 2025 15:39:02 +0200 Subject: [PATCH 1/4] Fix cuda-drivers by installing gcc first Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/create.go | 9 +++--- .../templates/container-toolkit.go | 5 +-- pkg/provisioner/templates/docker.go | 2 +- pkg/provisioner/templates/kubernetes.go | 3 ++ pkg/provisioner/templates/nv-driver.go | 31 ++++++++++++++++--- 5 files changed, 39 insertions(+), 11 deletions(-) diff --git a/pkg/provider/aws/create.go b/pkg/provider/aws/create.go index 3b31f6bd9..040b127ea 100644 --- a/pkg/provider/aws/create.go +++ b/pkg/provider/aws/create.go @@ -300,10 +300,11 @@ func (p *Provider) createEC2Instance(cache *AWS) error { } instanceIn := &ec2.RunInstancesInput{ - ImageId: p.Spec.Image.ImageId, - InstanceType: types.InstanceType(p.Spec.Type), - MaxCount: &minMaxCount, - MinCount: &minMaxCount, + ImageId: p.Spec.Image.ImageId, + InstanceType: types.InstanceType(p.Spec.Type), + MaxCount: &minMaxCount, + MinCount: &minMaxCount, + InstanceInitiatedShutdownBehavior: types.ShutdownBehaviorTerminate, BlockDeviceMappings: []types.BlockDeviceMapping{ { DeviceName: aws.String("/dev/sda1"), diff --git a/pkg/provisioner/templates/container-toolkit.go b/pkg/provisioner/templates/container-toolkit.go index ddf377381..3d89e1f82 100644 --- a/pkg/provisioner/templates/container-toolkit.go +++ b/pkg/provisioner/templates/container-toolkit.go @@ -32,9 +32,10 @@ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dear sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ && \ - sudo apt-get update + with_retry 3 10s sudo apt-get update -install_packages_with_retry nvidia-container-toolkit +install_packages_with_retry nvidia-container-toolkit nvidia-container-toolkit-base \ + 
libnvidia-container-tools libnvidia-container1 # Configure container runtime sudo nvidia-ctk runtime configure --runtime={{.ContainerRuntime}} --set-as-default --enable-cdi={{.EnableCDI}} diff --git a/pkg/provisioner/templates/docker.go b/pkg/provisioner/templates/docker.go index cffdfeb68..36ffad091 100644 --- a/pkg/provisioner/templates/docker.go +++ b/pkg/provisioner/templates/docker.go @@ -30,7 +30,7 @@ const dockerTemplate = ` : ${DOCKER_VERSION:={{.Version}}} # Add Docker's official GPG key: -sudo apt-get update +with_retry 3 10s sudo apt-get update install_packages_with_retry ca-certificates curl gnupg sudo install -m 0755 -d /etc/apt/keyrings curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg diff --git a/pkg/provisioner/templates/kubernetes.go b/pkg/provisioner/templates/kubernetes.go index 643e541c3..a5239a7cc 100644 --- a/pkg/provisioner/templates/kubernetes.go +++ b/pkg/provisioner/templates/kubernetes.go @@ -116,6 +116,9 @@ with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=establish # Apply custom resources with increased retry attempts with_retry 10 15s kubectl --kubeconfig $KUBECONFIG apply -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml +# Wait for cluster to be ready +with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=ready --timeout=300s nodes --all + # Make single-node cluster schedulable kubectl taint nodes --all node-role.kubernetes.io/control-plane:NoSchedule- kubectl label node --all node-role.kubernetes.io/worker= diff --git a/pkg/provisioner/templates/nv-driver.go b/pkg/provisioner/templates/nv-driver.go index 639351ff7..be011c2cc 100644 --- a/pkg/provisioner/templates/nv-driver.go +++ b/pkg/provisioner/templates/nv-driver.go @@ -27,13 +27,28 @@ import ( // From https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#ubuntu-lts const NvDriverTemplate = ` -sudo 
apt-get update -install_packages_with_retry linux-headers-$(uname -r) +with_retry 3 10s sudo apt-get update +install_packages_with_retry linux-headers-$(uname -r) gcc make +install_packages_with_retry apt-utils build-essential \ + ca-certificates \ + curl \ + kmod \ + file \ + libelf-dev \ + libglvnd-dev \ + pkg-config + +install_packages_with_retry gcc-12 g++-12 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 12 + +# Install the new cuda-keyring package distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb - with_retry 3 10s sudo apt-get update + +# Install the NVIDIA driver install_packages_with_retry cuda-drivers{{if .Version}}={{.Version}}{{else if .Branch}}-{{.Branch}}{{end}} # Check if NVIDIA module is loaded, if not load it @@ -51,7 +66,15 @@ nvidia-smi type NvDriver v1alpha1.NVIDIADriver func NewNvDriver(env v1alpha1.Environment) *NvDriver { - return (*NvDriver)(&env.Spec.NVIDIADriver) + var nvDriver NvDriver + + nvDriver.Install = env.Spec.NVIDIADriver.Install + + if env.Spec.NVIDIADriver.Branch == "" { + nvDriver.Branch = "575" + } + + return &nvDriver } func (t *NvDriver) Execute(tpl *bytes.Buffer, env v1alpha1.Environment) error { From 45e4356bb21be3ba6364b385780cac2d773c024b Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 29 May 2025 15:39:20 +0200 Subject: [PATCH 2/4] Enhance delete and list cmd's Signed-off-by: Carlos Eduardo Arango Gutierrez --- cmd/cli/create/create.go | 2 +- cmd/cli/delete/delete.go | 39 +++++++++++++++++++-------------------- cmd/cli/list/list.go | 24 +++++++++++++++++++++--- 3 files changed, 41 insertions(+), 24 deletions(-) diff --git a/cmd/cli/create/create.go b/cmd/cli/create/create.go index a0d148f91..2ee824bcf 100644 --- 
a/cmd/cli/create/create.go +++ b/cmd/cli/create/create.go @@ -171,7 +171,7 @@ func (m command) run(c *cli.Context, opts *options) error { } } - m.log.Info("Created instance %s", instanceID) + m.log.Info("\nCreated instance %s", instanceID) return nil } diff --git a/cmd/cli/delete/delete.go b/cmd/cli/delete/delete.go index 19149b5f5..1e6dd3e43 100644 --- a/cmd/cli/delete/delete.go +++ b/cmd/cli/delete/delete.go @@ -44,7 +44,7 @@ func (m command) build() *cli.Command { // Create the 'delete' command delete := cli.Command{ Name: "delete", - Usage: "Delete a Holodeck instance", + Usage: "Delete one or more Holodeck instances", Flags: []cli.Flag{ &cli.StringFlag{ Name: "cachepath", @@ -53,37 +53,36 @@ func (m command) build() *cli.Command { Destination: &m.cachePath, Value: filepath.Join(os.Getenv("HOME"), ".cache", "holodeck"), }, - &cli.StringFlag{ - Name: "instance-id", - Aliases: []string{"i"}, - Usage: "Instance ID to delete", - }, }, - Action: func(c *cli.Context) error { - // Delete using instance ID - instanceID := c.String("instance-id") - return m.run(c, instanceID) + if c.NArg() == 0 { + return fmt.Errorf("at least one instance ID is required") + } + return m.run(c) }, } return &delete } -func (m command) run(c *cli.Context, instanceID string) error { +func (m command) run(c *cli.Context) error { manager := instances.NewManager(m.log, m.cachePath) - // First check if the instance exists - instance, err := manager.GetInstance(instanceID) - if err != nil { - return fmt.Errorf("failed to get instance: %v", err) - } + // Process each instance ID provided as an argument + for _, instanceID := range c.Args().Slice() { + // First check if the instance exists + instance, err := manager.GetInstance(instanceID) + if err != nil { + return fmt.Errorf("failed to get instance %s: %v", instanceID, err) + } + + // Delete the instance + if err := manager.DeleteInstance(instanceID); err != nil { + return fmt.Errorf("failed to delete instance %s: %v", instanceID, err) + } - // 
Delete the instance - if err := manager.DeleteInstance(instanceID); err != nil { - return fmt.Errorf("failed to delete instance: %v", err) + m.log.Info("Successfully deleted instance %s (%s)", instanceID, instance.Name) } - m.log.Info("Successfully deleted instance %s (%s)", instanceID, instance.Name) return nil } diff --git a/cmd/cli/list/list.go b/cmd/cli/list/list.go index 33cf368b7..8565dc80e 100644 --- a/cmd/cli/list/list.go +++ b/cmd/cli/list/list.go @@ -31,17 +31,18 @@ import ( type command struct { log *logger.FunLogger cachePath string + quiet bool } // NewCommand constructs the list command with the specified logger func NewCommand(log *logger.FunLogger) *cli.Command { - c := command{ + c := &command{ log: log, } return c.build() } -func (m command) build() *cli.Command { +func (m *command) build() *cli.Command { // Create the 'list' command list := cli.Command{ Name: "list", @@ -54,6 +55,12 @@ func (m command) build() *cli.Command { Usage: "Path to the cache directory", Destination: &m.cachePath, }, + &cli.BoolFlag{ + Name: "quiet", + Aliases: []string{"q"}, + Usage: "Only display instance IDs", + Destination: &m.quiet, + }, }, Action: m.run, } @@ -61,7 +68,7 @@ func (m command) build() *cli.Command { return &list } -func (m command) run(c *cli.Context) error { +func (m *command) run(c *cli.Context) error { manager := instances.NewManager(m.log, m.cachePath) instances, err := manager.ListInstances() if err != nil { @@ -73,6 +80,17 @@ func (m command) run(c *cli.Context) error { return nil } + // If quiet mode is enabled, only print instance IDs + if m.quiet { + for _, instance := range instances { + if instance.ID == "" { + continue + } + fmt.Println(instance.ID) + } + return nil + } + // Create a tabwriter for formatted output w := tabwriter.NewWriter(os.Stdout, 0, 0, 3, ' ', 0) if _, err := fmt.Fprintln(w, "INSTANCE ID\tNAME\tPROVIDER\tSTATUS\tPROVISIONED\tCREATED\tAGE"); err != nil { From f1f4f3643e9daae517a6a59b418cf6314e27d7f2 Mon Sep 17 00:00:00 
2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 29 May 2025 15:39:30 +0200 Subject: [PATCH 3/4] Add Docs Signed-off-by: Carlos Eduardo Arango Gutierrez --- .github/workflows/ci.yaml | 6 +- .github/workflows/docs_check.yaml | 38 ++++++ Makefile | 11 +- README.md | 204 ++++++++++-------------------- docs/commands/README.md | 65 ++++++++++ docs/commands/create.md | 137 ++++++++++++++++++++ docs/commands/delete.md | 56 ++++++++ docs/commands/dryrun.md | 58 +++++++++ docs/commands/list.md | 54 ++++++++ docs/commands/status.md | 52 ++++++++ docs/contributing/README.md | 137 ++++++++++++++++++++ docs/examples/README.md | 70 ++++++++++ docs/mdl-style.rb | 13 ++ docs/prerequisites.md | 89 +++++++++++++ docs/quick-start.md | 75 +++++++++++ scripts/mdlint.sh | 6 + 16 files changed, 935 insertions(+), 136 deletions(-) create mode 100644 .github/workflows/docs_check.yaml create mode 100644 docs/commands/README.md create mode 100644 docs/commands/create.md create mode 100644 docs/commands/delete.md create mode 100644 docs/commands/dryrun.md create mode 100644 docs/commands/list.md create mode 100644 docs/commands/status.md create mode 100644 docs/contributing/README.md create mode 100644 docs/examples/README.md create mode 100644 docs/mdl-style.rb create mode 100644 docs/prerequisites.md create mode 100644 docs/quick-start.md create mode 100755 scripts/mdlint.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c5acdfe77..e0233bd8d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -27,7 +27,11 @@ jobs: golang: uses: ./.github/workflows/golang.yaml - + + docs-check: + uses: ./.github/workflows/docs_check.yaml + secrets: inherit + image: uses: ./.github/workflows/image.yaml needs: [golang, code-scanning] diff --git a/.github/workflows/docs_check.yaml b/.github/workflows/docs_check.yaml new file mode 100644 index 000000000..864f41917 --- /dev/null +++ b/.github/workflows/docs_check.yaml @@ -0,0 +1,38 @@ +## Copyright (c) 2025, 
NVIDIA CORPORATION. All rights reserved.## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## + +name: Docs + +on: + workflow_call: + +jobs: + lint: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.5' + + - name: Install mdl + run: gem install mdl -v 0.13.0 + + - name: Run Markdown lint + run: | + find docs/ -path docs/vendor -prune -false -o -name '*.md' | xargs mdl -s docs/mdl-style.rb diff --git a/Makefile b/Makefile index 3be1f348d..3096d2813 100644 --- a/Makefile +++ b/Makefile @@ -8,8 +8,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -.PHONY: build fmt verify release lint vendor mod-tidy mod-vendor mod-verify check-vendor +.PHONY: build fmt verify release lint vendor mod-tidy mod-vendor mod-verify check-vendor mdlint +CONTAINER_RUN_CMD ?= docker run GO_CMD ?= go GO_FMT ?= gofmt GO_SRC := $(shell find . 
-type f -name '*.go' -not -path "./vendor/*") @@ -88,6 +89,14 @@ coverage: test cat $(COVERAGE_FILE) | grep -v "_mock.go" > $(COVERAGE_FILE).no-mocks go tool cover -func=$(COVERAGE_FILE).no-mocks +mdlint: + ${CONTAINER_RUN_CMD} \ + --rm \ + --volume "${PWD}:/workdir:ro,z" \ + --workdir /workdir \ + ruby:slim \ + /workdir/scripts/mdlint.sh + release: @rm -rf bin @mkdir -p bin diff --git a/README.md b/README.md index f157d35bb..eea8f4257 100644 --- a/README.md +++ b/README.md @@ -2,170 +2,106 @@ > * Tech preview, under heavy development * -A tool for creating and managing GPU ready Cloud test environments. +A tool for creating and managing GPU-ready Cloud test environments. -## Installation +--- + +## 📖 Documentation + +- [Quick Start](docs/quick-start.md) +- [Prerequisites](docs/prerequisites.md) +- [Commands Reference](docs/commands/) +- [Contributing Guide](docs/contributing/) +- [Examples](docs/examples/) + +--- + +## 🚀 Quick Start + +See [docs/quick-start.md](docs/quick-start.md) for a full walkthrough. ```bash make build -mv ./bin/holodeck /usr/local/bin/holodeck +sudo mv ./bin/holodeck /usr/local/bin/holodeck +holodeck --help ``` -### Prerequisites +--- -If utilizing the AWS provider, a valid AWS credentials must be available in the environment. +## 🛠️ Prerequisites -```yaml -apiVersion: holodeck.nvidia.com/v1alpha1 -kind: Environment -metadata: - name: holodeck - description: "Devel infra environment" -spec: - provider: aws -``` +- Go 1.20+ +- (For AWS) Valid AWS credentials in your environment +- (For SSH) Reachable host and valid SSH key -If utilizing the SSH provider, a valid SSH key must and reachable host must be available in the environment file. 
- -```yaml -apiVersion: holodeck.nvidia.com/v1alpha1 -kind: Environment -metadata: - name: holodeck - description: "Devel infra environment" -spec: - provider: aws - auth: - keyName: user - privateKey: "/Users/user/.ssh/user.pem" - instance: - hostUrl: "" -``` +See [docs/prerequisites.md](docs/prerequisites.md) for details. + +--- + +## 📝 How to Contribute + +See [docs/contributing/](docs/contributing/) for full details. + +### Main Makefile Targets + +- `make build` – Build the holodeck binary +- `make test` – Run all tests +- `make lint` – Run linters +- `make clean` – Remove build artifacts + +--- -## Usage +## 🧑‍💻 Usage + +See [docs/commands/](docs/commands/) for detailed command documentation and examples. ```bash holodeck --help ``` -### The Environment CRD - -```yaml -apiVersion: holodeck.nvidia.com/v1alpha1 -kind: Environment -metadata: - name: holodeck - description: "Devel infra environment" -spec: - provider: aws # or ssh currently supported - auth: - keyName: user - privateKey: "/Users/user/.ssh/user.pem" - instance: # if provider is ssh you need to define here the hostUrl - type: g4dn.xlarge - region: eu-north-1 - ingressIpRanges: - - 192.168.1.0/26 - image: - architecture: amd64 - imageId: ami-0fe8bec493a81c7da # Ubuntu 22.04 image - containerRuntime: - install: true - name: containerd - version: 1.6.24 - kubernetes: - install: true - installer: kubeadm # supported installers: kubeadm, kind, microk8s - version: v1.28.5 -``` - -The dependencies are resolved automatically, from top to bottom. Following the -pattern: +### Example: Create an environment -> Kubernetes -> Container Runtime -> Container Toolkit -> NVDriver +```bash +holodeck create -f ./examples/v1alpha1_environment.yaml +``` -If Kubernetes is requested, and no container runtime is requested, a default -container runtime will be added to the environment.. 
+### Example: List environments -If Container Toolkit is requested, and no container runtime is requested, a -default container runtime will be added to the environment. +```bash +holodeck list +``` -### Create an environment +### Example: Delete an environment ```bash -$ holodeck create -f ./examples/v1alpha1_environment.yaml -... +holodeck delete ``` -### Delete an environment +### Example: Check status ```bash -$ holodeck delete -f ./examples/v1alpha1_environment.yaml -... +holodeck status ``` -### Dry Run +### Example: Dry Run ```bash -$ holodeck dryrun -f ./examples/v1alpha1_environment.yaml -Dryrun environment holodeck 🔍 -✔ Checking if instance type g4dn.xlarge is supported in region eu-north-1 -✔ Checking if image ami-0fe8bec493a81c7da is supported in region eu-north-1 -✔ Resolving dependencies 📦 -Dryrun succeeded 🎉 +holodeck dryrun -f ./examples/v1alpha1_environment.yaml ``` -## Supported Cuda-Drivers +--- -Supported Nvidia drivers are: +## 📦 Supported Cuda-Drivers -```yaml - nvidiaDriver: - install: true - version: -``` -Where `` can be a prefix of any package version. The following are example package versions: - -- 570.86.15-0ubuntu1 -- 570.86.10-0ubuntu1 -- 565.57.01-0ubuntu1 -- 560.35.05-0ubuntu1 -- 560.35.03-1 -- 560.28.03-1 -- 555.42.06-1 -- 555.42.02-1 -- 550.144.03-0ubuntu1 -- 550.127.08-0ubuntu1 -- 550.127.05-0ubuntu1 -- 550.90.12-0ubuntu1 -- 550.90.07-1 -- 550.54.15-1 -- 550.54.14-1 -- 545.23.08-1 -- 545.23.06-1 -- 535.230.02-0ubuntu1 -- 535.216.03-0ubuntu1 -- 535.216.01-0ubuntu1 -- 535.183.06-1 -- 535.183.01-1 -- 535.161.08-1 -- 535.161.07-1 -- 535.154.05-1 -- 535.129.03-1 -- 535.104.12-1 -- 535.104.05-1 -- 535.86.10-1 -- 535.54.03-1 -- 530.30.02-1 -- 525.147.05-1 -- 525.125.06-1 -- 525.105.17-1 -- 525.85.12-1 -- 525.60.13-1 -- 520.61.05-1 -- 515.105.01-1 -- 515.86.01-1 -- 515.65.07-1 -- 515.65.01-1 -- 515.48.07-1 -- 515.43.04-1 +See [docs/prerequisites.md](docs/prerequisites.md#supported-cuda-drivers) for the full list and usage. 
+ +--- + +## 📂 More + +- [Examples](docs/examples/) +- [Guides](docs/guides/) + +--- + +For more information, see the [docs/](docs/) directory. diff --git a/docs/commands/README.md b/docs/commands/README.md new file mode 100644 index 000000000..f7dce779f --- /dev/null +++ b/docs/commands/README.md @@ -0,0 +1,65 @@ +# Command Reference + +This document provides detailed information about all available Holodeck +commands. + +## Basic Commands + +- [create](create.md) - Create a new environment +- [delete](delete.md) - Delete an existing environment +- [list](list.md) - List all environments +- [status](status.md) - Check the status of an environment +- [dryrun](dryrun.md) - Perform a dry run of environment creation + +## Command Usage + +All commands follow this general pattern: + +```bash +holodeck [command] [flags] +``` + +For detailed help on any command: + +```bash +holodeck [command] --help +``` + +## Global Options + +- `--log-level string` - Log level (debug, info, warn, error) (default: "info") +- `--no-color` - Disable color output + +## Examples + +### Create an Environment + +```bash +holodeck create -f environment.yaml +``` + +### List Environments + +```bash +holodeck list +``` + +### Check Environment Status + +```bash +holodeck status +``` + +### Delete an Environment + +```bash +holodeck delete +``` + +### Dry Run + +```bash +holodeck dryrun -f environment.yaml +``` + +For detailed information about each command, click on the command name above. diff --git a/docs/commands/create.md b/docs/commands/create.md new file mode 100644 index 000000000..5e03433ac --- /dev/null +++ b/docs/commands/create.md @@ -0,0 +1,137 @@ +# Create Command + +The `create` command creates a new Holodeck environment from a configuration +file. 
+ +## Usage + +```bash +holodeck create [flags] +``` + +## Flags + +- `-f, --envFile ` Path to the environment YAML file (required) +- `-p, --provision` Provision the environment after creation (optional) +- `-k, --kubeconfig ` Path to the kubeconfig file (optional) +- `-c, --cachepath ` Path to the cache directory (optional) + +## Examples + +### Basic Creation + +```bash +holodeck create -f environment.yaml +``` + +### Create and Provision + +```bash +holodeck create -f environment.yaml --provision +``` + +### Specify Kubeconfig and Cache Directory + +```bash +holodeck create -f environment.yaml --kubeconfig=mykubeconfig --cachepath=/tmp/holodeck-cache +``` + +## Configuration File Format + +The environment configuration file should be in YAML format: + +```yaml +apiVersion: holodeck.nvidia.com/v1alpha1 +kind: Environment +metadata: + name: my-environment + description: "My test environment" +spec: + provider: aws # or ssh + instance: + type: g4dn.xlarge + region: us-west-2 + kubernetes: + install: true + version: v1.28.5 +``` + +## Sample Output + +```text +Created instance 123e4567-e89b-12d3-a456-426614174000 +``` + +## Common Errors & Logs + +- `error reading config file: ...` — The environment YAML file is missing or + invalid. +- `failed to provision: ...` — Provisioning failed due to a configuration or + provider error. +- `Created instance ` — Success log after creation. 
+ +## Supported NVIDIA Driver Versions + +The following NVIDIA driver versions are supported (prefix match is allowed): + +- 575.51.03-0ubuntu1 +- 570.86.15-0ubuntu1 +- 570.86.10-0ubuntu1 +- 565.57.01-0ubuntu1 +- 560.35.05-0ubuntu1 +- 560.35.03-1 +- 560.28.03-1 +- 555.42.06-1 +- 555.42.02-1 +- 550.144.03-0ubuntu1 +- 550.127.08-0ubuntu1 +- 550.127.05-0ubuntu1 +- 550.90.12-0ubuntu1 +- 550.90.07-1 +- 550.54.15-1 +- 550.54.14-1 +- 545.23.08-1 +- 545.23.06-1 +- 535.230.02-0ubuntu1 +- 535.216.03-0ubuntu1 +- 535.216.01-0ubuntu1 +- 535.183.06-1 +- 535.183.01-1 +- 535.161.08-1 +- 535.161.07-1 +- 535.154.05-1 +- 535.129.03-1 +- 535.104.12-1 +- 535.104.05-1 +- 535.86.10-1 +- 535.54.03-1 +- 530.30.02-1 +- 525.147.05-1 +- 525.125.06-1 +- 525.105.17-1 +- 525.85.12-1 +- 525.60.13-1 +- 520.61.05-1 +- 515.105.01-1 +- 515.86.01-1 +- 515.65.07-1 +- 515.65.01-1 +- 515.48.07-1 +- 515.43.04-1 + +## Supported NVIDIA Driver Branches + +The following NVIDIA driver branches are supported (prefix match is allowed): + +- 575 +- 570 +- 565 +- 560 +- 555 +- 550 + +## Related Commands + +- [delete](delete.md) - Delete an environment +- [status](status.md) - Check environment status +- [dryrun](dryrun.md) - Test environment creation diff --git a/docs/commands/delete.md b/docs/commands/delete.md new file mode 100644 index 000000000..0abb4c0de --- /dev/null +++ b/docs/commands/delete.md @@ -0,0 +1,56 @@ +# Delete Command + +The `delete` command removes a Holodeck environment and cleans up associated +resources. 
+ +## Usage + +```bash +holodeck delete [flags] +``` + +## Flags + +- `-c, --cachepath ` Path to the cache directory (optional) + +## Examples + +### Basic Deletion + +```bash +holodeck delete 123e4567-e89b-12d3-a456-426614174000 +``` + +### Specify Cache Directory + +```bash +holodeck delete 123e4567-e89b-12d3-a456-426614174000 --cachepath=/tmp/holodeck-cache +``` + +## What Gets Deleted + +- Cloud instances (if using AWS provider) +- Associated network resources +- Security groups +- IAM roles (if created) +- Local environment state + +## Sample Output + +```text +Successfully deleted instance 123e4567-e89b-12d3-a456-426614174000 (my-env) +``` + +## Common Errors & Logs + +- `at least one instance ID is required` — You must provide an instance ID to delete. +- `failed to get instance : ...` — The specified instance ID does not exist + or cannot be found. +- `failed to delete instance : ...` — There was an error during deletion. +- `Successfully deleted instance ()` — Success log after deletion. + +## Related Commands + +- [create](create.md) - Create an environment +- [status](status.md) - Check environment status +- [list](list.md) - List all environments diff --git a/docs/commands/dryrun.md b/docs/commands/dryrun.md new file mode 100644 index 000000000..aa8e04673 --- /dev/null +++ b/docs/commands/dryrun.md @@ -0,0 +1,58 @@ +# Dry Run Command + +The `dryrun` command validates an environment configuration and simulates the +creation process without making any actual changes. 
+ +## Usage + +```bash +holodeck dryrun -f +``` + +## Flags + +- `-f, --envFile ` Path to the environment YAML file (required) + +## Examples + +### Basic Dry Run + +```bash +holodeck dryrun -f environment.yaml +``` + +## What Gets Validated + +The dry run command checks: + +- Configuration file syntax +- Provider credentials +- Resource availability +- Network configuration +- Component compatibility +- Dependencies resolution + +## Sample Output + +```text +Dryrun environment my-environment 🔍 +✔ Checking if instance type g4dn.xlarge is supported in region us-west-2 +✔ Checking if image ami-0fe8bec493a81c7da is supported in region us-west-2 +✔ Resolving dependencies 📦 +Dryrun succeeded 🎉 +``` + +## Common Errors & Logs + +- `failed to read config file : ...` — The environment YAML file is + missing or invalid. +- `unknown provider ...` — The provider specified in the config is not + supported. +- `failed to connect to ` — SSH connection failed (for SSH provider). +- `Dryrun succeeded 🎉` — All validations passed. + +## Related Commands + +- [create](create.md) - Create an environment +- [status](status.md) - Check environment status +- [list](list.md) - List all environments diff --git a/docs/commands/list.md b/docs/commands/list.md new file mode 100644 index 000000000..417f79014 --- /dev/null +++ b/docs/commands/list.md @@ -0,0 +1,54 @@ +# List Command + +The `list` command displays all Holodeck environments and their current status. + +## Usage + +```bash +holodeck list [flags] +``` + +## Flags + +- `-c, --cachepath ` Path to the cache directory (optional) +- `-q, --quiet` Only display instance IDs (optional) + +## Examples + +### List All Environments + +```bash +holodeck list +``` + +### List Only Instance IDs + +```bash +holodeck list --quiet +``` + +### Specify Cache Directory + +```bash +holodeck list --cachepath=/tmp/holodeck-cache +``` + +## Sample Output + +```text +INSTANCE ID NAME PROVIDER STATUS PROVISIONED CREATED AGE +123e4567-... 
my-env aws running true 2024-06-01 12:00:00 2h +... +``` + +## Common Errors & Logs + +- `No instances found` — No environments are currently managed by Holodeck. +- `failed to list instances: ...` — There was an error reading the cache or + instance data. + +## Related Commands + +- [create](create.md) - Create an environment +- [status](status.md) - Check environment status +- [delete](delete.md) - Delete an environment diff --git a/docs/commands/status.md b/docs/commands/status.md new file mode 100644 index 000000000..d8f9c6e33 --- /dev/null +++ b/docs/commands/status.md @@ -0,0 +1,52 @@ +# Status Command + +The `status` command provides detailed information about a specific Holodeck +environment. + +## Usage + +```bash +holodeck status [flags] +``` + +## Flags + +- `-c, --cachepath ` Path to the cache directory (optional) + +## Examples + +### Basic Status Check + +```bash +holodeck status 123e4567-e89b-12d3-a456-426614174000 +``` + +### Specify Cache Directory + +```bash +holodeck status 123e4567-e89b-12d3-a456-426614174000 --cachepath=/tmp/holodeck-cache +``` + +## Sample Output + +```text +Instance ID: 123e4567-e89b-12d3-a456-426614174000 +Name: my-env +Provider: aws +Status: running +Created: 2024-06-01 12:00:00 (2h0m0s ago) +Cache File: /home/user/.cache/holodeck/123e4567-e89b-12d3-a456-426614174000.yaml +``` + +## Common Errors & Logs + +- `instance ID is required` — You must provide an instance ID. +- `invalid instance ID` — The provided instance ID is not valid. +- `failed to get instance: ...` — The specified instance could not be found or + loaded. 
+ +## Related Commands + +- [create](create.md) - Create an environment +- [list](list.md) - List all environments +- [delete](delete.md) - Delete an environment diff --git a/docs/contributing/README.md b/docs/contributing/README.md new file mode 100644 index 000000000..e6155b33a --- /dev/null +++ b/docs/contributing/README.md @@ -0,0 +1,137 @@ +# Contributing to Holodeck + +Thank you for your interest in contributing to Holodeck! This guide will help +you get started. + +## Development Setup + +1. Fork the repository +1. Clone your fork: + + ```bash + git clone https://github.com/your-username/holodeck.git + cd holodeck + ``` + +1. Add the upstream repository: + + ```bash + git remote add upstream https://github.com/nvidia/holodeck.git + ``` + +### Environment Requirements + +- Linux or macOS (Windows is not supported) +- Go 1.20 or later +- Make +- Git + +## Makefile Targets + +The project uses a Makefile to manage common development tasks: + +```bash +# Build the binary +make build + +# Run tests +make test + +# Run linters +make lint + +# Clean build artifacts +make clean + +# Run all checks (lint, test, build) +make check +``` + +## Running the CLI Locally + +After building, you can run the CLI directly: + +```bash +./bin/holodeck --help +``` + +Or install it system-wide: + +```bash +sudo mv ./bin/holodeck /usr/local/bin/holodeck +``` + +## Development Workflow + +1. Create a new branch for your feature/fix: + + ```bash + git checkout -b feature/your-feature-name + ``` + +1. Make your changes and commit them: + + ```bash + git commit -s -m "feat: your feature description" + ``` + +1. Push to your fork: + + ```bash + git push origin feature/your-feature-name + ``` + +1. 
Create a Pull Request against the main repository + +## Commit Message Conventions + +- Use [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/): + - `feat: ...` for new features + - `fix: ...` for bug fixes + - `docs: ...` for documentation changes + - `refactor: ...` for code refactoring + - `test: ...` for adding or updating tests + - `chore: ...` for maintenance +- Use the `-s` flag to sign off your commits + +## Code Style + +- Follow the Go code style guidelines +- Run `make lint` before submitting PRs +- Ensure all tests pass with `make test` + +## Testing + +- Write unit tests for new features +- Update existing tests when modifying features +- Run the full test suite with `make test` + +## Documentation + +- Update relevant documentation when adding features +- Follow the existing documentation style + +## Pull Request Process + +1. Ensure your PR description clearly describes the problem and solution +1. Include relevant issue numbers +1. Add tests for new functionality +1. Update documentation +1. Ensure CI passes + +## Release Process + +1. Version bump +1. Update changelog +1. Create release tag +1. Build and publish release artifacts + +## Getting Help + +- Open an issue for bugs or feature requests +- Join the community discussions +- Check existing documentation + +## Code of Conduct + +Please read and follow our [Code of Conduct](../CODE_OF_CONDUCT.md). diff --git a/docs/examples/README.md b/docs/examples/README.md new file mode 100644 index 000000000..5bfa30d39 --- /dev/null +++ b/docs/examples/README.md @@ -0,0 +1,70 @@ +# Holodeck Example Configurations + +This directory provides example environment configuration files for Holodeck. +Use these as starting points for your own environments. + +## Available Examples + +### 1. Basic AWS Environment (Kubeadm) + +**File:** [`examples/aws_kubeadm.yaml`](../../examples/aws_kubeadm.yaml) + +A minimal AWS environment using the kubeadm installer for Kubernetes. 
+ +```bash +holodeck create -f examples/aws_kubeadm.yaml +``` + +### 2. Basic AWS Environment (Kind) + +**File:** [`examples/aws_kind.yaml`](../../examples/aws_kind.yaml) + +A minimal AWS environment using the kind installer for Kubernetes. + +```bash +holodeck create -f examples/aws_kind.yaml +``` + +### 3. Generic v1alpha1 Environment + +**File:** [`examples/v1alpha1_environment.yaml`](../../examples/v1alpha1_environment.yaml) + +A generic example showing the full v1alpha1 environment spec, including +provider, instance, and Kubernetes options. + +```bash +holodeck create -f examples/v1alpha1_environment.yaml +``` + +### 4. Custom Kubeadm Config + +**File:** [`examples/kubeadm-config.yaml`](../../examples/kubeadm-config.yaml) + +A sample kubeadm configuration file for advanced Kubernetes cluster setup. +Use with the `kubeadm` installer. + +### 5. Kind Cluster Config + +**File:** [`examples/kind.yaml`](../../examples/kind.yaml) + +A sample kind cluster configuration for use with the kind installer. + +--- + +## How to Use These Examples + +1. Copy the desired YAML file to your working directory (optional). +1. Edit the file as needed (e.g., update region, instance type, image ID). +1. Create the environment: + + ```bash + holodeck create -f YOUR_CONFIG.yaml + ``` + +1. Use `holodeck list`, `holodeck status INSTANCE_ID`, + and `holodeck delete INSTANCE_ID` to manage your environment. + +--- + +For more details on configuration options, see the +[Command Reference](../commands/) and [Quick Start Guide](../quick-start.md). diff --git a/docs/mdl-style.rb b/docs/mdl-style.rb new file mode 100644 index 000000000..6b9e43987 --- /dev/null +++ b/docs/mdl-style.rb @@ -0,0 +1,13 @@ +all +# Exclude MD022 - Headers should be surrounded by blank lines. The kramdown +# "class magic" (like {: .no_toc}) needs to be directly below the heading line.
+exclude_rule 'MD022' +# Exclude MD041 - First line in file should be a top level header +exclude_rule 'MD041' +rule 'MD013', :tables => false +rule 'MD007', :indent => 2 +rule 'MD013', :ignore_code_blocks => true +rule 'MD024', :allow_different_nesting => true +# MD056 - Inconsistent number of columns in table +# docs/deployment/helm.md:98 +exclude_rule 'MD056' \ No newline at end of file diff --git a/docs/prerequisites.md b/docs/prerequisites.md new file mode 100644 index 000000000..ded28ae56 --- /dev/null +++ b/docs/prerequisites.md @@ -0,0 +1,89 @@ +# Prerequisites + +This document outlines the requirements and setup needed to use Holodeck +effectively. + +## System Requirements + +- Linux or macOS operating system (Windows is not supported) +- Go 1.20 or later +- Make +- Git + +## Provider-Specific Requirements + +### AWS Provider + +To use the AWS provider, you need: + +1. AWS CLI installed and configured +1. Valid AWS credentials in one of these locations: + - `~/.aws/credentials` + - Environment variables: + - `AWS_ACCESS_KEY_ID` + - `AWS_SECRET_ACCESS_KEY` + - `AWS_SESSION_TOKEN` (if using temporary credentials) + +1. Appropriate IAM permissions for: + - EC2 instance management + - VPC configuration + - Security group management + - IAM role management + +### SSH Provider + +To use the SSH provider, you need: + +1. SSH key pair +1. Access to a reachable host +1. Proper network connectivity to the target host +1. 
Sufficient permissions on the target host + +## Environment Configuration + +### AWS Configuration Example + +```yaml +apiVersion: holodeck.nvidia.com/v1alpha1 +kind: Environment +metadata: + name: aws-env +spec: + provider: aws + instance: + type: g4dn.xlarge + region: us-west-2 +``` + +### SSH Configuration Example + +```yaml +apiVersion: holodeck.nvidia.com/v1alpha1 +kind: Environment +metadata: + name: ssh-env +spec: + provider: ssh + auth: + keyName: user + privateKey: "/path/to/private/key" + instance: + hostUrl: "host.example.com" +``` + +## Network Requirements + +- Outbound internet access for package downloads +- Appropriate security group rules for your use case +- VPC configuration if using AWS provider + +## GPU & Driver Requirements + +- Compatible NVIDIA GPU (for GPU workloads) +- Supported NVIDIA driver version + (see [Create Command documentation](commands/create.md#supported-nvidia-driver-versions) + for the list) +- CUDA toolkit (optional, only if your workloads require it) + +For more information, see the [Quick Start Guide](quick-start.md) +or the [Command Reference](commands/). diff --git a/docs/quick-start.md b/docs/quick-start.md new file mode 100644 index 000000000..f590c2801 --- /dev/null +++ b/docs/quick-start.md @@ -0,0 +1,75 @@ +# Quick Start Guide + +This guide will help you get started with Holodeck quickly. + +## Installation + +```bash +# Build the binary +make build + +# Install to your system (requires sudo) +sudo mv ./bin/holodeck /usr/local/bin/holodeck +``` + +## Prerequisites + +- Go 1.20+ +- (For AWS) Valid AWS credentials in your environment +- (For SSH) Reachable host and valid SSH key + +See [Prerequisites](prerequisites.md) for full details. + +## Your First Environment + +1.
Create a basic environment configuration file (e.g., `environment.yaml`): + +```yaml +apiVersion: holodeck.nvidia.com/v1alpha1 +kind: Environment +metadata: + name: my-first-env + description: "My first Holodeck environment" +spec: + provider: aws + instance: + type: g4dn.xlarge + region: us-west-2 + image: + architecture: amd64 + imageId: ami-0fe8bec493a81c7da # Ubuntu 22.04 image + kubernetes: + install: true + version: v1.28.5 +``` + +1. Create the environment: + +```bash +holodeck create -f environment.yaml +``` + +1. List environments and find your instance ID: + +```bash +holodeck list +``` + +1. Check the status of your environment: + +```bash +holodeck status INSTANCE_ID +``` + +1. When done, delete the environment: + +```bash +holodeck delete INSTANCE_ID +``` + +## Next Steps + +- Check out the [Prerequisites](prerequisites.md) for detailed setup + requirements +- Explore the [Command Reference](commands/) for all available commands +- See [Examples](../examples/) for more complex configurations diff --git a/scripts/mdlint.sh b/scripts/mdlint.sh new file mode 100755 index 000000000..54401017a --- /dev/null +++ b/scripts/mdlint.sh @@ -0,0 +1,6 @@ +#!/bin/bash -e + +# Install mdl +gem install mdl -v 0.13.0 +# Run verify steps +find docs/ -path docs/vendor -prune -false -o -name '*.md' | xargs mdl -s docs/mdl-style.rb From 99ec561a3a60f29d396651c5ebc53edd20cee563 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 29 May 2025 15:45:43 +0200 Subject: [PATCH 4/4] update unit tests on templates Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/nv-driver.go | 2 +- pkg/provisioner/templates/nv-driver_test.go | 68 +++++++++++++++++---- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/pkg/provisioner/templates/nv-driver.go b/pkg/provisioner/templates/nv-driver.go index be011c2cc..c9daa1a2d 100644 --- a/pkg/provisioner/templates/nv-driver.go +++ b/pkg/provisioner/templates/nv-driver.go @@ -26,7 +26,7 @@ import ( // From
https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#ubuntu-lts const NvDriverTemplate = ` - +# Install Dependencies with_retry 3 10s sudo apt-get update install_packages_with_retry linux-headers-$(uname -r) gcc make install_packages_with_retry apt-utils build-essential \ diff --git a/pkg/provisioner/templates/nv-driver_test.go b/pkg/provisioner/templates/nv-driver_test.go index 0dec801e7..b253f7635 100644 --- a/pkg/provisioner/templates/nv-driver_test.go +++ b/pkg/provisioner/templates/nv-driver_test.go @@ -41,13 +41,29 @@ func TestNVDriverTemplate(t *testing.T) { }, expectedOutput: ` -sudo apt-get update -install_packages_with_retry linux-headers-$(uname -r) +# Install Dependencies +with_retry 3 10s sudo apt-get update +install_packages_with_retry linux-headers-$(uname -r) gcc make +install_packages_with_retry apt-utils build-essential \ + ca-certificates \ + curl \ + kmod \ + file \ + libelf-dev \ + libglvnd-dev \ + pkg-config + +install_packages_with_retry gcc-12 g++-12 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 12 + +# Install the new cuda-keyring package distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb - with_retry 3 10s sudo apt-get update + +# Install the NVIDIA driver install_packages_with_retry cuda-drivers=123.4.5 # Check if NVIDIA module is loaded, if not load it @@ -68,14 +84,29 @@ nvidia-smi Branch: "550", }, expectedOutput: ` - -sudo apt-get update -install_packages_with_retry linux-headers-$(uname -r) +# Install Dependencies +with_retry 3 10s sudo apt-get update +install_packages_with_retry linux-headers-$(uname -r) gcc make +install_packages_with_retry apt-utils build-essential \ + ca-certificates \ + curl \ + kmod \ + file \ + libelf-dev \ + libglvnd-dev \ + pkg-config + +install_packages_with_retry gcc-12 g++-12 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 12 + +# Install the new cuda-keyring package distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb - with_retry 3 10s sudo apt-get update + +# Install the NVIDIA driver install_packages_with_retry cuda-drivers-550 # Check if NVIDIA module is loaded, if not load it @@ -97,14 +128,29 @@ nvidia-smi Version: "123.4.5", }, expectedOutput: ` - -sudo apt-get update -install_packages_with_retry linux-headers-$(uname -r) +# Install Dependencies +with_retry 3 10s sudo apt-get update +install_packages_with_retry linux-headers-$(uname -r) gcc make +install_packages_with_retry apt-utils build-essential \ + ca-certificates \ + curl \ + kmod \ + file \ + libelf-dev \ + libglvnd-dev \ + pkg-config + +install_packages_with_retry gcc-12 g++-12 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 12 + +# Install the new cuda-keyring package distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb - with_retry 3 10s sudo apt-get update + +# Install the NVIDIA driver install_packages_with_retry cuda-drivers=123.4.5 # Check if NVIDIA module is loaded, if not load it