Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
LOG_ARTIFACT_DIR: e2e_logs
run: make -f tests/Makefile test
run: |
  # Materialize the SSH key from the step's AWS_SSH_KEY environment variable
  # instead of expanding the secret into the script text: the value is then
  # never parsed by the shell, so quotes, '$' or backticks in it cannot break
  # (or inject into) the script.
  e2e_ssh_key=$(mktemp)
  # Remove the private key from the runner when the step exits.
  trap 'rm -f "$e2e_ssh_key"' EXIT
  printf '%s\n' "$AWS_SSH_KEY" > "$e2e_ssh_key"
  chmod 600 "$e2e_ssh_key"
  export E2E_SSH_KEY="$e2e_ssh_key"

  make -f tests/Makefile test

- name: Archive Ginkgo logs
uses: actions/upload-artifact@v4
Expand Down
18 changes: 18 additions & 0 deletions cmd/cli/create/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/NVIDIA/holodeck/pkg/utils"

cli "github.com/urfave/cli/v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type options struct {
Expand Down Expand Up @@ -200,6 +201,23 @@ func runProvision(log *logger.FunLogger, opts *options) error {
opts.cfg.Status = opts.cache.Status

if err = p.Run(opts.cfg); err != nil {
	// Provisioning failed: record a Degraded condition on the environment so
	// the cached state reflects the failure and its cause.
	opts.cfg.Status.Conditions = []metav1.Condition{
		{
			Type:               v1alpha1.ConditionDegraded,
			Status:             metav1.ConditionTrue,
			LastTransitionTime: metav1.Now(),
			Reason:             "ProvisioningFailed",
			Message:            fmt.Sprintf("Failed to provision environment: %v", err),
		},
	}

	// Persist the updated status to the cache file. The marshal/write errors
	// get their own names: re-declaring `err` with := here would shadow the
	// provisioning error and the final return would report "<nil>".
	data, marshalErr := jyaml.MarshalYAML(opts.cfg)
	if marshalErr != nil {
		return fmt.Errorf("failed to run provisioner: %v; failed to marshal environment: %v", err, marshalErr)
	}
	if writeErr := os.WriteFile(opts.cachePath, data, 0600); writeErr != nil {
		return fmt.Errorf("failed to run provisioner: %v; failed to update cache file with provisioning status: %v", err, writeErr)
	}

	// Always surface the original provisioning error to the caller.
	return fmt.Errorf("failed to run provisioner: %v", err)
}

Expand Down
40 changes: 0 additions & 40 deletions cmd/cli/delete/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,15 @@ import (
"os"
"path/filepath"

"github.com/NVIDIA/holodeck/api/holodeck/v1alpha1"
"github.com/NVIDIA/holodeck/internal/instances"
"github.com/NVIDIA/holodeck/internal/logger"
"github.com/NVIDIA/holodeck/pkg/jyaml"

cli "github.com/urfave/cli/v2"
)

type command struct {
log *logger.FunLogger
cachePath string
envFile string
cfg v1alpha1.Environment
}

// NewCommand constructs the delete command with the specified logger
Expand All @@ -57,50 +53,14 @@ func (m command) build() *cli.Command {
Destination: &m.cachePath,
Value: filepath.Join(os.Getenv("HOME"), ".cache", "holodeck"),
},
&cli.StringFlag{
Name: "envFile",
Aliases: []string{"f"},
Usage: "Path to the Environment file",
Destination: &m.envFile,
},
&cli.StringFlag{
Name: "instance-id",
Aliases: []string{"i"},
Usage: "Instance ID to delete",
},
},
Before: func(c *cli.Context) error {
// Check that either envFile or instance-id is provided, but not both
hasEnvFile := c.IsSet("envFile")
hasInstanceID := c.IsSet("instance-id")

if hasEnvFile && hasInstanceID {
return fmt.Errorf("cannot specify both --envFile and --instance-id")
}
if !hasEnvFile && !hasInstanceID {
return fmt.Errorf("must specify either --envFile or --instance-id")
}

// Read the config file if provided
if hasEnvFile {
var err error
m.cfg, err = jyaml.UnmarshalFromFile[v1alpha1.Environment](m.envFile)
if err != nil {
return fmt.Errorf("error reading config file: %s", err)
}
}
return nil
},
Action: func(c *cli.Context) error {
if c.IsSet("envFile") {
// Delete using environment file
instanceID := m.cfg.Labels[instances.InstanceLabelKey]
if instanceID == "" {
return fmt.Errorf("environment file does not contain an instance ID")
}
return m.run(c, instanceID)
}

// Delete using instance ID
instanceID := c.String("instance-id")
return m.run(c, instanceID)
Expand Down
1 change: 0 additions & 1 deletion cmd/cli/list/list.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ func (m command) run(c *cli.Context) error {
for _, instance := range instances {
// Skip instances without an ID (old cache files)
if instance.ID == "" {
m.log.Warning("Found old cache file without instance ID, skipping: %s", instance.CacheFile)
continue
}

Expand Down
7 changes: 5 additions & 2 deletions internal/instances/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,11 @@ func (m *Manager) getProviderStatus(env v1alpha1.Environment, cacheFile string)
}
case v1alpha1.ConditionDegraded:
if condition.Status == metav1.ConditionTrue {
status = "degraded"
if condition.Reason != "" {
status = fmt.Sprintf("degraded (%s)", condition.Reason)
} else {
status = "degraded"
}
statusFound = true
}
case v1alpha1.ConditionProgressing:
Expand Down Expand Up @@ -173,7 +177,6 @@ func (m *Manager) ListInstances() ([]Instance, error) {
env.Labels[InstanceLabelKey] = instanceID
} else {
// Skip files that don't have an instance ID and aren't UUIDs
m.log.Warning("Found old cache file without instance ID, skipping: %s", cacheFile)
continue
}
}
Expand Down
6 changes: 0 additions & 6 deletions pkg/provisioner/templates/kernel.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ echo "Current kernel version: $CURRENT_KERNEL"
KERNEL_VERSION="{{ .Spec.Kernel.Version }}"

if [ "${CURRENT_KERNEL}" != "${KERNEL_VERSION}" ]; then
echo "--------------Upgrading kernel to ${KERNEL_VERSION}--------------"

# Update package lists
sudo apt-get update -y || true

Expand All @@ -61,10 +59,6 @@ if [ "${CURRENT_KERNEL}" != "${KERNEL_VERSION}" ]; then
echo "Rebooting..."
# Run the reboot command with nohup to avoid abrupt SSH closure issues
nohup sudo reboot &

echo "--------------Kernel upgrade completed--------------"
else
echo "--------------Kernel upgrade not required, current kernel version ${KERNEL_VERSION}--------------"
fi
{{- end }}
`
Expand Down
12 changes: 5 additions & 7 deletions pkg/provisioner/templates/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,8 @@ export KUBECONFIG="${HOME}/.kube/config"

# Install Calico
# based on https://docs.tigera.io/calico/latest/getting-started/kubernetes/quickstart
with_retry 3 10s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/tigera-operator.yaml
# Calico CRDs created. Now we sleep for 10s to ensure they are fully registered in the K8s etcd
sleep 10s
with_retry 3 10s kubectl --kubeconfig $KUBECONFIG apply -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/tigera-operator.yaml
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml
# Make single-node cluster schedulable
kubectl taint nodes --all node-role.kubernetes.io/control-plane:NoSchedule-
kubectl label node --all node-role.kubernetes.io/worker=
Expand Down Expand Up @@ -445,7 +443,7 @@ func GetCRISocket(runtime string) (string, error) {
}
}

// isLegacyKubernetesVersion checks if the Kubernetes version is older than v1.30.0
// isLegacyKubernetesVersion checks if the Kubernetes version is older than v1.32.0
// which requires using legacy kubeadm init flags instead of config file
func isLegacyKubernetesVersion(version string) bool {
// Remove 'v' prefix if present
Expand All @@ -461,6 +459,6 @@ func isLegacyKubernetesVersion(version string) bool {
major, _ := strconv.Atoi(parts[0])
minor, _ := strconv.Atoi(parts[1])

// Return true if version is older than v1.30.0
return major < 1 || (major == 1 && minor < 30)
// Return true if version is older than v1.32.0
return major < 1 || (major == 1 && minor < 32)
}
32 changes: 16 additions & 16 deletions pkg/provisioner/templates/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,23 @@ func TestIsLegacyKubernetesVersion(t *testing.T) {
want bool
}{
{
name: "legacy version v1.29.0",
version: "v1.29.0",
name: "legacy version v1.31.0",
version: "v1.31.0",
want: true,
},
{
name: "legacy version v1.28.0",
version: "v1.28.0",
name: "legacy version v1.30.0",
version: "v1.30.0",
want: true,
},
{
name: "supported version v1.30.0",
version: "v1.30.0",
name: "supported version v1.32.0",
version: "v1.32.0",
want: false,
},
{
name: "supported version v1.31.0",
version: "v1.31.0",
name: "supported version v1.32.1",
version: "v1.32.1",
want: false,
},
{
Expand Down Expand Up @@ -81,7 +81,7 @@ func TestNewKubernetes(t *testing.T) {
CniPluginsVersion: defaultCNIPluginsVersion,
CalicoVersion: defaultCalicoVersion,
CrictlVersion: defaultCRIVersion,
UseLegacyInit: false,
UseLegacyInit: true,
CriSocket: "unix:///run/containerd/containerd.sock",
},
wantErr: false,
Expand All @@ -91,15 +91,15 @@ func TestNewKubernetes(t *testing.T) {
env: v1alpha1.Environment{
Spec: v1alpha1.EnvironmentSpec{
Kubernetes: v1alpha1.Kubernetes{
KubernetesVersion: "v1.29.0",
KubernetesVersion: "v1.31.0",
},
ContainerRuntime: v1alpha1.ContainerRuntime{
Name: "containerd",
},
},
},
want: &Kubernetes{
Version: "v1.29.0",
Version: "v1.31.0",
KubeletReleaseVersion: defaultKubeletReleaseVersion,
Arch: defaultArch,
CniPluginsVersion: defaultCNIPluginsVersion,
Expand Down Expand Up @@ -136,7 +136,7 @@ func TestNewKubernetes(t *testing.T) {
CalicoVersion: "v3.30.0",
CrictlVersion: "v1.32.0",
K8sFeatureGates: "Feature1=true,Feature2=false",
UseLegacyInit: false,
UseLegacyInit: true,
CriSocket: "unix:///run/cri-dockerd.sock",
},
wantErr: false,
Expand Down Expand Up @@ -199,7 +199,7 @@ func TestKubernetes_Execute(t *testing.T) {
env: v1alpha1.Environment{
Spec: v1alpha1.EnvironmentSpec{
Kubernetes: v1alpha1.Kubernetes{
KubernetesVersion: "v1.29.0",
KubernetesVersion: "v1.31.0",
KubernetesInstaller: "kubeadm",
K8sEndpointHost: "test-host",
},
Expand All @@ -217,7 +217,7 @@ func TestKubernetes_Execute(t *testing.T) {
env: v1alpha1.Environment{
Spec: v1alpha1.EnvironmentSpec{
Kubernetes: v1alpha1.Kubernetes{
KubernetesVersion: "v1.30.0",
KubernetesVersion: "v1.32.1",
KubernetesInstaller: "kind",
},
ContainerRuntime: v1alpha1.ContainerRuntime{
Expand All @@ -232,7 +232,7 @@ func TestKubernetes_Execute(t *testing.T) {
env: v1alpha1.Environment{
Spec: v1alpha1.EnvironmentSpec{
Kubernetes: v1alpha1.Kubernetes{
KubernetesVersion: "v1.30.0",
KubernetesVersion: "v1.32.1",
KubernetesInstaller: "microk8s",
},
ContainerRuntime: v1alpha1.ContainerRuntime{
Expand All @@ -247,7 +247,7 @@ func TestKubernetes_Execute(t *testing.T) {
env: v1alpha1.Environment{
Spec: v1alpha1.EnvironmentSpec{
Kubernetes: v1alpha1.Kubernetes{
KubernetesVersion: "v1.30.0",
KubernetesVersion: "v1.32.1",
KubernetesInstaller: "invalid",
},
ContainerRuntime: v1alpha1.ContainerRuntime{
Expand Down
11 changes: 10 additions & 1 deletion pkg/provisioner/templates/nv-driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,17 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_
sudo dpkg -i cuda-keyring_1.1-1_all.deb

with_retry 3 10s sudo apt-get update
install_packages_with_retry cuda-drivers{{if .Version}}={{.Version}}{{else if .Branch}}-{{.Branch}}{{end}}
install_packages_with_retry nvidia-driver{{if .Version}}={{.Version}}{{else if .Branch}}-{{.Branch}}{{end}}

# Check if NVIDIA module is loaded, if not load it
if ! lsmod | grep -q "^nvidia "; then
sudo modprobe nvidia
fi

# Start nvidia-persistenced
sudo nvidia-persistenced --persistence-mode

# Quick check to see if the driver is installed
nvidia-smi
`

Expand Down
33 changes: 30 additions & 3 deletions pkg/provisioner/templates/nv-driver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,17 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_
sudo dpkg -i cuda-keyring_1.1-1_all.deb

with_retry 3 10s sudo apt-get update
install_packages_with_retry cuda-drivers=123.4.5
install_packages_with_retry nvidia-driver=123.4.5

# Check if NVIDIA module is loaded, if not load it
if ! lsmod | grep -q "^nvidia "; then
sudo modprobe nvidia
fi

# Start nvidia-persistenced
sudo nvidia-persistenced --persistence-mode

# Quick check to see if the driver is installed
nvidia-smi
`,
},
Expand All @@ -66,8 +75,17 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_
sudo dpkg -i cuda-keyring_1.1-1_all.deb

with_retry 3 10s sudo apt-get update
install_packages_with_retry cuda-drivers-550
install_packages_with_retry nvidia-driver-550

# Check if NVIDIA module is loaded, if not load it
if ! lsmod | grep -q "^nvidia "; then
sudo modprobe nvidia
fi

# Start nvidia-persistenced
sudo nvidia-persistenced --persistence-mode

# Quick check to see if the driver is installed
nvidia-smi
`,
},
Expand All @@ -86,8 +104,17 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_
sudo dpkg -i cuda-keyring_1.1-1_all.deb

with_retry 3 10s sudo apt-get update
install_packages_with_retry cuda-drivers=123.4.5
install_packages_with_retry nvidia-driver=123.4.5

# Check if NVIDIA module is loaded, if not load it
if ! lsmod | grep -q "^nvidia "; then
sudo modprobe nvidia
fi

# Start nvidia-persistenced
sudo nvidia-persistenced --persistence-mode

# Quick check to see if the driver is installed
nvidia-smi
`,
},
Expand Down
Loading