Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7ca2841
Expand retry time out on Kubernetes templates
ArangoGutierrez May 30, 2025
c1efcbf
Merge pull request #379 from ArangoGutierrez/devel/v0212
ArangoGutierrez May 30, 2025
3267f73
Add --ignore-preflight-errors= to kubernetes template during kubeadm …
ArangoGutierrez May 30, 2025
395f0c4
Merge pull request #380 from ArangoGutierrez/devel/v0212
ArangoGutierrez May 30, 2025
23169c9
Bump containerd minimum defaults
ArangoGutierrez May 30, 2025
61fddf2
Merge pull request #381 from ArangoGutierrez/template_deps
ArangoGutierrez May 30, 2025
cd0167e
Bump github.com/aws/aws-sdk-go-v2/service/ec2 from 1.222.0 to 1.224.0
dependabot[bot] May 30, 2025
2b63184
Merge pull request #374 from NVIDIA/dependabot/go_modules/main/github…
ArangoGutierrez May 30, 2025
1d7ff3e
Enhance documentation
ArangoGutierrez May 31, 2025
7de06bb
Add Coveralls
ArangoGutierrez May 31, 2025
a6c03dc
Use makefile from e2e matrix
ArangoGutierrez May 31, 2025
7b26557
move node readiness check to last in kubernetes template
ArangoGutierrez May 31, 2025
b286058
Merge pull request #382 from ArangoGutierrez/docs
ArangoGutierrez May 31, 2025
aa57fc1
Only generate ginkgo logs on gpu test
ArangoGutierrez May 31, 2025
8bbf21e
Merge pull request #383 from ArangoGutierrez/v0212
ArangoGutierrez May 31, 2025
f9103a7
Normalize retry/timeouts for kubernetes installation
ArangoGutierrez Jun 2, 2025
4190ef6
fix kubernetes version setting on kubernetes template
ArangoGutierrez Jun 2, 2025
13fa2be
Move microK8s version normalization to its template
ArangoGutierrez Jun 2, 2025
c1c5e8c
Merge pull request #385 from ArangoGutierrez/v0212
ArangoGutierrez Jun 2, 2025
cf2d3dc
use version instead of kubernetesVersion in legacy test
ArangoGutierrez Jun 2, 2025
4f2ecb0
use version instead of kubernetesVersion in dra test
ArangoGutierrez Jun 2, 2025
a2a1d65
Merge pull request #386 from ArangoGutierrez/fix_test_data_legacy
ArangoGutierrez Jun 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ jobs:
e2e-test:
runs-on: linux-amd64-cpu4
if: ${{ github.event.workflow_run.conclusion == 'success' }} && ${{ github.event.workflow_run.event == 'push' }}
strategy:
matrix:
label: [default, legacy, dra, kernel]
name: E2E Test (${{ matrix.label }})

steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -42,9 +47,8 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install -y make
make ginkgo

- name: Run e2e tests
- name: Run e2e test for ${{ matrix.label }}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
Expand All @@ -55,13 +59,16 @@ jobs:
echo "${{ secrets.AWS_SSH_KEY }}" > "$e2e_ssh_key"
chmod 600 "$e2e_ssh_key"
export E2E_SSH_KEY="$e2e_ssh_key"

make -f tests/Makefile test
if [ "${{ matrix.label }}" = "default" ]; then \
make -f tests/Makefile test GINKGO_ARGS="--label-filter='${{ matrix.label }}' --json-report ginkgo.json"; \
else \
make -f tests/Makefile test GINKGO_ARGS="--label-filter='${{ matrix.label }}'"; \
fi

- name: Archive Ginkgo logs
uses: actions/upload-artifact@v4
with:
name: ginkgo-logs
name: ginkgo-logs-${{ matrix.label }}
path: ginkgo.json
retention-days: 15

Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/golang.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,14 @@ jobs:
with:
go-version: ${{ needs.variables.outputs.GOLANG_VERSION }}

- run: make coverage
- name: Run unit tests and generate coverage report
run: make coverage

- name: Upload to Coveralls
uses: coverallsapp/github-action@v2
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
file: coverage.out

build:
name: Build
Expand Down
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,7 @@ test:
go test -coverprofile=$(COVERAGE_FILE) ./pkg/...

coverage: test
cat $(COVERAGE_FILE) | grep -v "_mock.go" > $(COVERAGE_FILE).no-mocks
go tool cover -func=$(COVERAGE_FILE).no-mocks
go tool cover -func=$(COVERAGE_FILE)

mdlint:
${CONTAINER_RUN_CMD} \
Expand Down
13 changes: 3 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Holodeck

> * Tech preview, under heavy development *
[![Latest Release](https://img.shields.io/github/v/release/NVIDIA/holodeck?label=latest%20release)](https://github.com/NVIDIA/holodeck/releases/latest)

A tool for creating and managing GPU-ready Cloud test environments.

Expand All @@ -13,6 +13,7 @@ A tool for creating and managing GPU-ready Cloud test environments.
- [Commands Reference](docs/commands/)
- [Contributing Guide](docs/contributing/)
- [Examples](docs/examples/)
- [Latest Release](https://github.com/NVIDIA/holodeck/releases/latest)

---

Expand Down Expand Up @@ -89,19 +90,11 @@ holodeck status <instance-id>
holodeck dryrun -f ./examples/v1alpha1_environment.yaml
```

---

## 📦 Supported Cuda-Drivers

See [docs/prerequisites.md](docs/prerequisites.md#supported-cuda-drivers) for the full list and usage.

---

## 📂 More

- [Examples](docs/examples/)
- [Guides](docs/guides/)

---

For more information, see the [docs/](docs/) directory.
For more information, see the [documentation](docs/README.md).
24 changes: 24 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Holodeck Documentation

[![Latest Release](https://img.shields.io/github/v/release/NVIDIA/holodeck?label=latest%20release)](https://github.com/NVIDIA/holodeck/releases/latest)

Welcome to the Holodeck documentation! Here you'll find everything you need to
get started, use, and contribute to Holodeck.

## 📚 Sections

- [Quick Start](quick-start.md): Get up and running with Holodeck in minutes.
- [Prerequisites](prerequisites.md): What you need before you begin.
- [Commands Reference](commands/README.md): Detailed documentation for every
Holodeck CLI command.
- [Contributing Guide](contributing/README.md): How to contribute to Holodeck,
including coding standards and PR process.
- [Examples](examples/README.md): Example configuration files and usage scenarios.
- [Guides](guides/README.md): In-depth guides and tutorials for advanced usage.
- [Latest Release](https://github.com/NVIDIA/holodeck/releases/latest)

---

For general information, see the [main README](../README.md).

If you have questions or want to contribute, check out the [Contributing Guide](contributing/README.md)!
2 changes: 1 addition & 1 deletion docs/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,4 @@ A sample kind cluster configuration for use with the kind installer.
---

For more details on configuration options, see the
[Command Reference](../commands/) and [Quick Start Guide](../quick-start.md).
[Command Reference](../commands/README.md) and [Quick Start Guide](../quick-start.md).
10 changes: 10 additions & 0 deletions docs/guides/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Guides

This section is for in-depth guides and tutorials related to Holodeck.

- If you are looking for step-by-step instructions or advanced usage, guides
will be listed here as they are added.
- To contribute a guide, simply add a new Markdown file to this folder and
update this README with a link.

*No guides are available yet. Stay tuned!*
2 changes: 1 addition & 1 deletion docs/quick-start.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,5 +71,5 @@ holodeck delete <instance-id>

- Check out the [Prerequisites](prerequisites.md) for detailed setup
requirements
- Explore the [Command Reference](commands/) for all available commands
- Explore the [Command Reference](commands/README.md) for all available commands
- See [Examples](../examples/) for more complex configurations
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require (
github.com/aws/aws-sdk-go v1.55.7
github.com/aws/aws-sdk-go-v2 v1.36.3
github.com/aws/aws-sdk-go-v2/config v1.29.14
github.com/aws/aws-sdk-go-v2/service/ec2 v1.222.0
github.com/aws/aws-sdk-go-v2/service/ec2 v1.224.0
github.com/aws/aws-sdk-go-v2/service/ssm v1.59.0
github.com/mattn/go-isatty v0.0.20
github.com/onsi/ginkgo/v2 v2.23.4
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 h1:SZwFm17ZUNNg5Np0io
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34/go.mod h1:dFZsC0BLo346mvKQLWmoJxT+Sjp+qcVR1tRVHQGOH9Q=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.222.0 h1:qPVuEWzRvc/Z8UA0CKG4QczxORbgYTbWwlviUAmVmgs=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.222.0/go.mod h1:ouvGEfHbLaIlWwpDpOVWPWR+YwO0HDv3vm5tYLq8ImY=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.224.0 h1:i7FB/N5pSvEzNOGHm7n6KQiBx2/X8UkrE/Ppb5Bh3QQ=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.224.0/go.mod h1:ouvGEfHbLaIlWwpDpOVWPWR+YwO0HDv3vm5tYLq8ImY=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 h1:dM9/92u2F1JbDaGooxTq18wmmFzbJRfXfVfy96/1CXM=
Expand Down
4 changes: 2 additions & 2 deletions pkg/provisioner/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ func (p *Provisioner) waitForNodeReboot() error {
}

// Wait for the node to come back up
maxRetries := 30
retryInterval := 10 * time.Second
maxRetries := 10
retryInterval := 30 * time.Second

for i := 0; i < maxRetries; i++ {
p.log.Info("Waiting for node to come back online...")
Expand Down
6 changes: 3 additions & 3 deletions pkg/provisioner/templates/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ RUNC_VERSION=$(curl -fsSL https://api.github.com/repos/opencontainers/runc/relea

if [ -z "$RUNC_VERSION" ]; then
echo "Failed to fetch latest RUNC version. Using default version."
RUNC_VERSION="1.2.6"
RUNC_VERSION="1.3.0"
fi

RUNC_URL="https://github.com/opencontainers/runc/releases/download/v${RUNC_VERSION}/runc.${ARCH}"
Expand All @@ -184,7 +184,7 @@ sudo install -m 755 runc.${ARCH} /usr/local/sbin/runc
echo "Runc ${RUNC_VERSION} installed successfully."

# Install CNI plugins
CNI_VERSION="1.1.1"
CNI_VERSION="1.3.0"
CNI_TAR="cni-plugins-linux-${ARCH}-v${CNI_VERSION}.tgz"
CNI_URL="https://github.com/containernetworking/plugins/releases/download/v${CNI_VERSION}/${CNI_TAR}"

Expand Down Expand Up @@ -322,7 +322,7 @@ func NewContainerd(env v1alpha1.Environment) *Containerd {
var version string

if env.Spec.ContainerRuntime.Version == "" {
version = "1.6.27"
version = "1.7.26"
} else {
// remove the 'v' prefix from the version if it exists
version = strings.TrimPrefix(env.Spec.ContainerRuntime.Version, "v")
Expand Down
12 changes: 6 additions & 6 deletions pkg/provisioner/templates/containerd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ import (
func TestNewContainerd_Defaults(t *testing.T) {
env := v1alpha1.Environment{}
c := NewContainerd(env)
if c.Version != "1.6.27" {
t.Errorf("expected default Version to be '1.6.27', got '%s'", c.Version)
if c.Version != "1.7.26" {
t.Errorf("expected default Version to be '1.7.26', got '%s'", c.Version)
}
}

Expand All @@ -39,8 +39,8 @@ func TestNewContainerd_EmptyVersion(t *testing.T) {
},
}
c := NewContainerd(env)
if c.Version != "1.6.27" {
t.Errorf("expected default Version to be '1.6.27' when empty, got '%s'", c.Version)
if c.Version != "1.7.26" {
t.Errorf("expected default Version to be '1.7.26' when empty, got '%s'", c.Version)
}
}

Expand All @@ -62,7 +62,7 @@ func TestContainerd_Execute_Version1(t *testing.T) {
env := v1alpha1.Environment{
Spec: v1alpha1.EnvironmentSpec{
ContainerRuntime: v1alpha1.ContainerRuntime{
Version: "1.6.27",
Version: "1.7.26",
},
},
}
Expand Down Expand Up @@ -138,7 +138,7 @@ func TestContainerd_Execute_SystemChecks(t *testing.T) {
env := v1alpha1.Environment{
Spec: v1alpha1.EnvironmentSpec{
ContainerRuntime: v1alpha1.ContainerRuntime{
Version: "1.6.27",
Version: "1.7.26",
},
},
}
Expand Down
54 changes: 31 additions & 23 deletions pkg/provisioner/templates/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,36 +93,39 @@ with_retry 3 10s sudo kubeadm init \
--ignore-preflight-errors=all
{{- else }}
# Using kubeadm config file for newer Kubernetes versions
with_retry 3 10s sudo kubeadm init --config /etc/kubernetes/kubeadm-config.yaml
with_retry 3 10s sudo kubeadm init --config /etc/kubernetes/kubeadm-config.yaml --ignore-preflight-errors=all
{{- end }}

mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
export KUBECONFIG="${HOME}/.kube/config"

# Wait explicitly for kube-apiserver availability
with_retry 10 30s kubectl --kubeconfig $KUBECONFIG version

# Install Calico
# based on https://docs.tigera.io/calico/latest/getting-started/kubernetes/quickstart
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/tigera-operator.yaml
with_retry 10 30s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/tigera-operator.yaml

# Wait for Tigera operator to be ready
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=available --timeout=300s deployment/tigera-operator -n tigera-operator
with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=available --timeout=300s deployment/tigera-operator -n tigera-operator

# Wait for all necessary CRDs to be established
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/installations.operator.tigera.io
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/apiservers.operator.tigera.io
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/tigerastatuses.operator.tigera.io
with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/installations.operator.tigera.io
with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/apiservers.operator.tigera.io
with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/tigerastatuses.operator.tigera.io

# Apply custom resources with increased retry attempts
with_retry 10 15s kubectl --kubeconfig $KUBECONFIG apply -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml

# Wait for cluster to be ready
with_retry 10 20s kubectl --kubeconfig $KUBECONFIG wait --for=condition=ready --timeout=300s nodes --all
with_retry 10 30s kubectl --kubeconfig $KUBECONFIG apply -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml

# Make single-node cluster schedulable
kubectl taint nodes --all node-role.kubernetes.io/control-plane:NoSchedule-
kubectl label node --all node-role.kubernetes.io/worker=
kubectl label node --all nvidia.com/holodeck.managed=true
with_retry 10 30s kubectl taint nodes --all node-role.kubernetes.io/control-plane:NoSchedule-
with_retry 10 30s kubectl label node --all node-role.kubernetes.io/worker=
with_retry 10 30s kubectl label node --all nvidia.com/holodeck.managed=true

# Wait for cluster to be ready
with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=ready --timeout=300s nodes --all
`

const KindTemplate = `
Expand Down Expand Up @@ -168,11 +171,14 @@ echo "ssh -i <your-private-key> ubuntu@${INSTANCE_ENDPOINT_HOST}"

const microk8sTemplate = `
: ${INSTANCE_ENDPOINT_HOST:={{.K8sEndpointHost}}}
: ${K8S_VERSION:={{.Version}}}

# Remove leading 'v' from version if present for microk8s snap channel
MICROK8S_VERSION="${K8S_VERSION#v}"

# Install microk8s
sudo apt-get update

sudo snap install microk8s --classic --channel={{.Version}}
sudo snap install microk8s --classic --channel=${MICROK8S_VERSION}
sudo microk8s enable gpu dashboard dns registry
sudo usermod -a -G microk8s ubuntu
mkdir -p ~/.kube
Expand All @@ -181,7 +187,7 @@ sudo microk8s config > ~/.kube/config
sudo chown -f -R ubuntu ~/.kube
sudo snap alias microk8s.kubectl kubectl

echo "Microk8s {{.Version}} installed successfully"
echo "Microk8s ${MICROK8S_VERSION} installed successfully"
echo "you can now access the cluster with:"
echo "ssh -i <your-private-key> ubuntu@${INSTANCE_ENDPOINT_HOST}"
`
Expand Down Expand Up @@ -269,14 +275,16 @@ type KubeadmConfig struct {
}

func NewKubernetes(env v1alpha1.Environment) (*Kubernetes, error) {
kubernetes := &Kubernetes{
Version: env.Spec.Kubernetes.KubernetesVersion,
}
// check if env.Spec.Kubernetes.KubernetesVersion is in the format of vX.Y.Z
// if not, set the default version
if !strings.HasPrefix(env.Spec.Kubernetes.KubernetesVersion, "v") && env.Spec.Kubernetes.KubernetesInstaller != "microk8s" {
fmt.Printf("Kubernetes version %s is not in the format of vX.Y.Z, setting default version v1.32.1\n", env.Spec.Kubernetes.KubernetesVersion)
kubernetes := &Kubernetes{}

// Normalize Kubernetes version: always ensure it starts with 'v'
switch {
case env.Spec.Kubernetes.KubernetesVersion == "":
kubernetes.Version = defaultKubernetesVersion
case !strings.HasPrefix(env.Spec.Kubernetes.KubernetesVersion, "v"):
kubernetes.Version = "v" + env.Spec.Kubernetes.KubernetesVersion
default:
kubernetes.Version = env.Spec.Kubernetes.KubernetesVersion
}
if env.Spec.Kubernetes.KubeletReleaseVersion != "" {
kubernetes.KubeletReleaseVersion = env.Spec.Kubernetes.KubeletReleaseVersion
Expand Down
2 changes: 1 addition & 1 deletion tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ test: $(GINKGO_BIN)
CI=$(CI) \
ENV_FILE=$(ENV_FILE) \
GINKGO_FOCUS=$(GINKGO_FOCUS) \
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/...
$(GINKGO_BIN) $(GINKGO_ARGS) -v ./tests/...

$(GINKGO_BIN):
mkdir -p $(CURDIR)/bin
Expand Down
Loading
Loading