From c2b4652a21c5c9322585a65d929a7f425d39fa59 Mon Sep 17 00:00:00 2001
From: Antonio Ojea
Date: Tue, 27 May 2025 08:07:17 +0000
Subject: [PATCH 1/8] update quickstart

Change-Id: Id98538c545fafb566ae38a892ae597ee235b1845
---
 site/content/docs/quick-start.md | 251 ++++++++-----------------------
 1 file changed, 62 insertions(+), 189 deletions(-)

diff --git a/site/content/docs/quick-start.md b/site/content/docs/quick-start.md
index 3cb81d79..fbe68908 100644
--- a/site/content/docs/quick-start.md
+++ b/site/content/docs/quick-start.md
@@ -4,7 +4,7 @@ date: 2024-12-17T14:47:05Z
 weight: 1
 ---

-DRANET depends on the Kubernetes feature [Dynamic Resource Allocation (DRA)](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/), that is beta (disabled by default in v1.32).
+`DraNet` depends on the Kubernetes feature [Dynamic Resource Allocation (DRA)](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/), which is beta (disabled by default in Kubernetes v1.32).

 In order to enable DRA you need to enable both the [feature gates and the API groups](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#enabling-dynamic-resource-allocation).

@@ -19,21 +19,17 @@ Create a cluster using the following configuration.
 ```yaml
 kind: Cluster
 apiVersion: kind.x-k8s.io/v1alpha4
-containerdConfigPatches:
-  # Enable NRI plugins
-- |-
-  [plugins."io.containerd.nri.v1.nri"]
-    disable = false
 nodes:
 - role: control-plane
-  image: kindest/node:v1.32.0
+  image: kindest/node:v1.33.1
 - role: worker
-  image: kindest/node:v1.32.0
+  image: kindest/node:v1.33.1
 - role: worker
-  image: kindest/node:v1.32.0
+  image: kindest/node:v1.33.1
 featureGates:
   # Enable the corresponding DRA feature gates
   DynamicResourceAllocation: true
+  DRAResourceClaimDeviceStatus: true
 runtimeConfig:
   api/beta : true
 ```

 kind create cluster --config kind.yaml --name dra

 ### Google Cloud

-You can [enable the DRA beta APIs in GKE](https://cloud.google.com/kubernetes-engine/docs/how-to/use-beta-apis) and it automatically turns on the feature gates.
-
-You need to check that a v1.32 version exist in your zone:
-
-```sh
-$ gcloud container get-server-config | grep 1.32
-Fetching server config for us-central1-c
- - 1.32.0-gke.1358000
- minorVersion: '1.32'
-- 1.32.0-gke.1358000
-- 1.32.0-gke.1358000
-```
-
-And using the version obtained you can create a cluster
-
-```sh
-export PROJECT=dra-proj
-export REGION=us-central1
-export ZONE=us-central1-c
-export CLUSTER=dra-cluster
-export VERSION=1.32.0-gke.1358000
-
-gcloud beta container clusters create ${CLUSTER} \
- --cluster-version=${VERSION} \
- --enable-multi-networking \
- --enable-dataplane-v2 \
- --enable-kubernetes-unstable-apis=resource.k8s.io/v1beta1/deviceclasses,resource.k8s.io/v1beta1/resourceclaims,resource.k8s.io/v1beta1/resourceclaimtemplates,resource.k8s.io/v1beta1/resourceslices \
- --no-enable-autorepair \
- --no-enable-autoupgrade \
- --zone=${ZONE}
-
-To inspect the contents of your cluster, go to: https://console.cloud.google.com/kubernetes/workload_/gcloud/us-central1-c/aojea-dra?project=aojea-gke-dev
-kubeconfig entry generated for aojea-dra.
-NAME LOCATION MASTER_VERSION MASTER_IP MACHINE_TYPE NODE_VERSION NUM_NODES STATUS -aojea-dra us-central1-c 1.32.0-gke.1358000 X.X.X.X e2-medium 1.32.0-gke.1358000 3 RUNNING -``` +For instructions on setting up DRA on GKE, refer to the official documentation: +[Set up Dynamic Resource Allocation](https://cloud.google.com/kubernetes-engine/docs/how-to/set-up-dra) A quick and easy way to find if DRA is enabled is by checking the metrics in the kube-apiserver @@ -89,9 +51,9 @@ kubectl get --raw /metrics | grep kubernetes_feature_enabled | grep DynamicResou kubernetes_feature_enabled{name="DynamicResourceAllocation",stage="BETA"} 1 ``` -### Installation +## Installation -You can install the latest stable version using the provided manifest: +You can install the latest stable version of `DraNet` using the provided manifest: ``` kubectl apply -f https://raw.githubusercontent.com/google/dranet/refs/heads/main/install.yaml @@ -99,7 +61,7 @@ kubectl apply -f https://raw.githubusercontent.com/google/dranet/refs/heads/main ### How to use it -Once the Kubernetes Network Driver is running you can see the list of Network Interfaces and its attributes published by the drivers: +Once the Kubernetes Network Driver is running you can see the list of Network Interfaces and its attributes published by the drivers using `kubectl get resourceslices -o yaml`: ``` apiVersion: resource.k8s.io/v1beta1 @@ -119,133 +81,49 @@ metadata: uid: 535724d7-a573-49e1-8f3b-4e644405375a spec: devices: - - basic: - attributes: - alias: - string: "" - cloud_network: - string: projects/961828715260/networks/aojea-dra-net-1 - encapsulation: - string: ether - ip: - string: 192.168.1.2 - kind: - string: network - mac: - string: 42:01:c0:a8:01:02 - mtu: - int: 8244 - name: - string: eth1 - numa_node: - int: -1 - pci_address_bus: - string: "00" - pci_address_device: - string: "05" - pci_address_domain: - string: "0000" - pci_address_function: - string: "0" - pci_vendor: - string: Google, Inc. - rdma: - bool: false - sriov: - bool: false - state: - string: up - type: - string: device - virtual: - bool: false - name: eth1 - - basic: - attributes: - alias: - string: "" - cloud_network: - string: projects/961828715260/networks/aojea-dra-net-2 - encapsulation: - string: ether - ip: - string: 192.168.2.2 - kind: - string: network - mac: - string: 42:01:c0:a8:02:02 - mtu: - int: 8244 - name: - string: eth2 - numa_node: - int: -1 - pci_address_bus: - string: "00" - pci_address_device: - string: "06" - pci_address_domain: - string: "0000" - pci_address_function: - string: "0" - pci_vendor: - string: Google, Inc. - rdma: - bool: false - sriov: - bool: false - state: - string: up - type: - string: device - virtual: - bool: false - name: eth2 - - basic: - attributes: - alias: - string: "" - cloud_network: - string: projects/961828715260/networks/aojea-dra-net-3 - encapsulation: - string: ether - ip: - string: 192.168.3.2 - kind: - string: network - mac: - string: 42:01:c0:a8:03:02 - mtu: - int: 8244 - name: - string: eth3 - numa_node: - int: -1 - pci_address_bus: - string: "00" - pci_address_device: - string: "07" - pci_address_domain: - string: "0000" - pci_address_function: - string: "0" - pci_vendor: - string: Google, Inc. 
- rdma: - bool: false - sriov: - bool: false - state: - string: up - type: - string: device - virtual: - bool: false - name: eth3 + - basic: + attributes: + dra.net/alias: + string: "" + dra.net/cloudNetwork: + string: dra-1-vpc + dra.net/encapsulation: + string: ether + dra.net/ifName: + string: gpu7rdma0 + dra.net/ipv4: + string: 10.0.8.8 + dra.net/mac: + string: 9a:41:2e:4f:86:16 + dra.net/mtu: + int: 8896 + dra.net/numaNode: + int: 1 + dra.net/pciAddressBus: + string: c8 + dra.net/pciAddressDevice: + string: "00" + dra.net/pciAddressDomain: + string: "0000" + dra.net/pciAddressFunction: + string: "0" + dra.net/pciVendor: + string: Mellanox Technologies + dra.net/rdma: + bool: true + dra.net/sriov: + bool: false + dra.net/state: + string: up + dra.net/type: + string: device + dra.net/virtual: + bool: false + name: gpu7rdma0 ... ``` -Once the resources are available, users can create DeviceClasses, ResourceClaims and/or ResourceClaimTemplates to schedule pods, see some [examples](https://github.com/google/dranet/tree/main/examples). +Once the resources are available, users can create `DeviceClasses`, `ResourceClaims` and/or `ResourceClaimTemplates` to schedule pods. Define a `DeviceClass` that selects all the network interfaces that are connected to a `GCP Network` @@ -259,43 +137,38 @@ spec: - cel: expression: device.driver == "dra.net" - cel: - expression: has(device.attributes["dra.net"].cloud_network) - config: - - opaque: - driver: dra.net - parameters: - nccl: "true" + expression: has(device.attributes["dra.net"].cloudNetwork) ``` -Now you can create a `ResourceClaim` that connects to a specific network, in this case `projects/961828715260/networks/aojea-dra-net-3` and reference that claim in a `Pod`: +Now you can create a `ResourceClaim` that connects to a specific network, in this case `dra-1-vpc` and reference that claim in a `Pod`: ```yaml apiVersion: resource.k8s.io/v1beta1 kind: ResourceClaim metadata: - name: cloud-network-dra-net-3 + name: cloud-network-dra-net-1 spec: devices: requests: - - name: req-cloud-net-3 + - name: req-cloud-net-1 deviceClassName: dranet-cloud selectors: - cel: - expression: device.attributes["dra.net"].cloud_network == "projects/961828715260/networks/aojea-dra-net-3" + expression: device.attributes["dra.net"].cloudNetwork == "dra-1-vpc" --- apiVersion: v1 kind: Pod metadata: - name: pod-dra-net3 + name: pod-dra-net1 labels: - app: pod-dra-net3 + app: pod-dra-net1 spec: containers: - name: ctr1 image: registry.k8s.io/e2e-test-images/agnhost:2.39 resourceClaims: - - name: net-3 - resourceClaimName: cloud-network-dra-net-3 + - name: net-1 + resourceClaimName: cloud-network-dra-net-1 ``` Kubernetes schedules the `Pod` to the corresponding `Node` and attach the network interface to the `Pod`: @@ -303,12 +176,12 @@ Kubernetes schedules the `Pod` to the corresponding `Node` and attach the networ ```sh kubectl get pods -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -pod-dra-net3 1/1 Running 0 5s 10.52.3.108 gke-dra-multi-nic-985b8c20-jg5l +pod-dra-net1 1/1 Running 0 5s 10.52.3.108 gke-dra-multi-nic-985b8c20-jg5l ``` If we execute inside the `Pod` we can see the network interface now is attached: ```sh -kubectl exec -it pod-dra-net3 ip a +kubectl exec -it pod-dra-net1 ip a kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead. 
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 @@ -318,7 +191,7 @@ kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future versi link/ether 86:dc:58:24:55:1a brd ff:ff:ff:ff:ff:ff link-netnsid 0 inet 10.52.3.108/24 brd 10.52.3.255 scope global eth0 valid_lft forever preferred_lft forever -5: eth3: mtu 8244 qdisc fq state UP group default qlen 1000 +5: gpu7rdma0: mtu 8244 qdisc fq state UP group default qlen 1000 link/ether 42:01:c0:a8:03:02 brd ff:ff:ff:ff:ff:ff ``` From bcbca1f174841c31f6d9264d56f2b43c3bd9806a Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 27 May 2025 11:09:34 +0000 Subject: [PATCH 2/8] remove tcpdirect example Change-Id: I48587b6872b4e4be7dd9bab9f4998b26a6374ccd --- examples/demo_gke_tcpdirect/README.md | 28 --------------------------- 1 file changed, 28 deletions(-) delete mode 100644 examples/demo_gke_tcpdirect/README.md diff --git a/examples/demo_gke_tcpdirect/README.md b/examples/demo_gke_tcpdirect/README.md deleted file mode 100644 index 1eb1d0b7..00000000 --- a/examples/demo_gke_tcpdirect/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# GKE TCP Direct - - -1. Create a cluster - -DRA is beta in 1.32, so it requires to explicitly enable the feature. - -```sh -PROJECT="test-project" -CLUSTER="test-cluster" -ZONE="us-central1-c" -VERSION="1.32" - -gcloud container clusters create "${CLUSTER}" \ - --cluster-version="${VERSION}" \ - --enable-multi-networking \ - --enable-dataplane-v2 \ - --enable-kubernetes-unstable-apis=resource.k8s.io/v1beta1/deviceclasses,resource.k8s.io/v1beta1/resourceclaims,resource.k8s.io/v1beta1/resourceclaimtemplates,resource.k8s.io/v1beta1/resourceslices \ - --no-enable-autorepair \ - --no-enable-autoupgrade \ - --zone="${ZONE}" \ - --project="${PROJECT}" # Explicitly set the project -``` - -2. Once the cluster has been created we need to create a Node Pool with A3 machines, `dranetctl` is an opinionanted tool that will set the necessary -values for an optimal performance. - -```sh \ No newline at end of file From 5b5a806a1e6a6efbd1918fb5f5b530f31cb0278d Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 27 May 2025 14:28:44 +0000 Subject: [PATCH 3/8] add nccl tests to the testing image Change-Id: Ib4896dcee5513177716ba27f1a591106e52fadfa --- .github/workflows/test-images.yaml | 2 +- Dockerfile.perftest | 38 +++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test-images.yaml b/.github/workflows/test-images.yaml index 2d85d9d2..cbfb3374 100644 --- a/.github/workflows/test-images.yaml +++ b/.github/workflows/test-images.yaml @@ -57,7 +57,7 @@ jobs: with: context: . 
file: Dockerfile.perftest - platforms: linux/amd64,linux/arm64 + platforms: linux/amd64 push: true tags: | ${{ steps.meta.outputs.tags }} diff --git a/Dockerfile.perftest b/Dockerfile.perftest index abeb0991..429ae362 100644 --- a/Dockerfile.perftest +++ b/Dockerfile.perftest @@ -8,6 +8,7 @@ RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ build-essential \ + wget \ autoconf \ automake \ libtool \ @@ -35,6 +36,20 @@ RUN ./autogen.sh && \ make -j$(nproc) && \ make install +# --- Build openmpi --- +WORKDIR /usr/src +RUN wget -O- https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz | tar xzf - +WORKDIR /usr/src/openmpi-5.0.7 +RUN ./configure --with-cuda=/usr/local/cuda && \ + make -j$(nproc) && \ + make install + +# --- Build nvidia/ncc-tests --- +WORKDIR /usr/src +RUN git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git +WORKDIR /usr/src/nccl-tests +RUN make -j$(nproc) MPI=1 MPI_HOME=/usr/src/openmpi-5.0.7 + # Stage 2: Runtime FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04 AS runtime @@ -51,18 +66,29 @@ RUN apt-get update && \ libnl-3-200 \ libnl-route-3-200 \ libpci3 \ + libmnl0 \ + libelf1 \ + pciutils \ + openssh-server \ libnuma1 && \ rm -rf /var/lib/apt/lists/* -# Set environment variables for CUDA libraries in the runtime stage. -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH:-}" -ENV LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:-}" +# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need +# to disable UserKnownHostsFile to avoid write permissions. +# Disabling StrictModes avoids directory and files read permission checks. +RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config &&\ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config +RUN mkdir /run/sshd -COPY --from=builder /usr/local/bin/ib_* /usr/local/bin/ -COPY --from=builder /usr/local/bin/raw_ethernet_* /usr/local/bin/ +COPY --from=builder /usr/local/bin/ /usr/local/bin/ +COPY --from=builder /usr/local/lib/ /usr/local/lib/ +COPY --from=builder /usr/src/nccl-tests/build/*_perf /usr/local/bin/ +# Set environment variables for CUDA libraries in the runtime stage. +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/nvidia/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" +ENV LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:-}" # Add the installation directory to the PATH for easy execution. -ENV PATH="/usr/local/bin:${PATH}" +ENV PATH="/usr/local/bin:/usr/local/nvidia/bin:${PATH}" # Set the default command to run when the container starts. 
CMD ["/bin/bash"] \ No newline at end of file From 928f7cf1fbc6ca6f1ac188ae3bb0166f715d9196 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 27 May 2025 19:34:48 +0000 Subject: [PATCH 4/8] MPI operator WIP Change-Id: I7b4f43247a11d7283382b26ab4f4455557ebd6b1 --- Dockerfile.perftest | 10 +- examples/mpi_operator/dra_resources.yaml | 31 +++++ examples/mpi_operator/nccl-test-job.yaml | 48 ++++++++ site/content/docs/user/mpi-operator.md | 146 +++++++++++++++++++++++ 4 files changed, 230 insertions(+), 5 deletions(-) create mode 100644 examples/mpi_operator/dra_resources.yaml create mode 100644 examples/mpi_operator/nccl-test-job.yaml create mode 100644 site/content/docs/user/mpi-operator.md diff --git a/Dockerfile.perftest b/Dockerfile.perftest index 429ae362..5c4f1ff8 100644 --- a/Dockerfile.perftest +++ b/Dockerfile.perftest @@ -40,7 +40,7 @@ RUN ./autogen.sh && \ WORKDIR /usr/src RUN wget -O- https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz | tar xzf - WORKDIR /usr/src/openmpi-5.0.7 -RUN ./configure --with-cuda=/usr/local/cuda && \ +RUN ./configure --prefix=/opt/openmpi --with-cuda=/usr/local/cuda && \ make -j$(nproc) && \ make install @@ -48,7 +48,7 @@ RUN ./configure --with-cuda=/usr/local/cuda && \ WORKDIR /usr/src RUN git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git WORKDIR /usr/src/nccl-tests -RUN make -j$(nproc) MPI=1 MPI_HOME=/usr/src/openmpi-5.0.7 +RUN make -j$(nproc) MPI=1 MPI_HOME=/opt/openmpi # Stage 2: Runtime FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04 AS runtime @@ -81,14 +81,14 @@ RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && echo " RUN mkdir /run/sshd COPY --from=builder /usr/local/bin/ /usr/local/bin/ -COPY --from=builder /usr/local/lib/ /usr/local/lib/ +COPY --from=builder /opt/openmpi/ /opt/openmpi/ COPY --from=builder /usr/src/nccl-tests/build/*_perf /usr/local/bin/ # Set environment variables for CUDA libraries in the runtime stage. -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/nvidia/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" +ENV LD_LIBRARY_PATH="/opt/openmpi/lib/:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}" ENV LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:-}" # Add the installation directory to the PATH for easy execution. -ENV PATH="/usr/local/bin:/usr/local/nvidia/bin:${PATH}" +ENV PATH="/usr/local/bin:/usr/local/nvidia/bin:/opt/openmpi/bin:${PATH}" # Set the default command to run when the container starts. 
CMD ["/bin/bash"] \ No newline at end of file diff --git a/examples/mpi_operator/dra_resources.yaml b/examples/mpi_operator/dra_resources.yaml new file mode 100644 index 00000000..80fa3e6f --- /dev/null +++ b/examples/mpi_operator/dra_resources.yaml @@ -0,0 +1,31 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: dranet-rdma-for-mpi +spec: + selectors: + - cel: + expression: device.driver == "dra.net" + - cel: + expression: device.attributes["dra.net"].rdma == true +--- +apiVersion: resource.k8s.io/v1beta1 +kind: ResourceClaimTemplate +metadata: + name: mpi-worker-rdma-nic-template +spec: + spec: + devices: + requests: + - name: rdma-nic-for-mpi + deviceClassName: dranet-rdma-for-mpi + selectors: + - cel: + expression: device.attributes["dra.net"].ifName == "gpu2rdma0" + config: + - opaque: + driver: dra.net + parameters: + interface: + name: "dranet0" # NCCL will use this interface + diff --git a/examples/mpi_operator/nccl-test-job.yaml b/examples/mpi_operator/nccl-test-job.yaml new file mode 100644 index 00000000..90ad6247 --- /dev/null +++ b/examples/mpi_operator/nccl-test-job.yaml @@ -0,0 +1,48 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: nccl-test-dranet-1gpu-1nic +spec: + slotsPerWorker: 1 # 1 MPI rank per worker Pod + mpiImplementation: OpenMPI # Or your preferred MPI + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: docker.io/aojea/dranet-perftest:latest@sha256:bfaa20bba4178f20b3c50c2ddd57d3e0488bc94a4d704ea091043c389571fd6e + name: test-launcher + env: + - name: NCCL_DEBUG + value: "INFO" + - name: LD_LIBRARY_PATH + value: "/opt/openmpi/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64" + command: ["/bin/bash", "-c"] + args: + - | + echo "--- Environment for mpirun ---" + echo "Effective LD_LIBRARY_PATH: $LD_LIBRARY_PATH" + PATH=/usr/local/bin:/usr/local/nvidia/bin:/opt/openmpi/bin:$PATH + + mpirun --allow-run-as-root -x LD_LIBRARY_PATH -x PATH \ + bash -c 'hostname' + securityContext: + capabilities: + add: ["IPC_LOCK"] + Worker: + replicas: 2 + template: + spec: + resourceClaims: + - name: worker-rdma-nic + resourceClaimTemplateName: mpi-worker-rdma-nic-template + containers: + - image: docker.io/aojea/dranet-perftest:latest@sha256:bfaa20bba4178f20b3c50c2ddd57d3e0488bc94a4d704ea091043c389571fd6e + name: mpi-worker + securityContext: + capabilities: + add: ["IPC_LOCK"] + resources: + limits: + nvidia.com/gpu: 1 # Each worker gets 1 GPU diff --git a/site/content/docs/user/mpi-operator.md b/site/content/docs/user/mpi-operator.md new file mode 100644 index 00000000..6bf8d95d --- /dev/null +++ b/site/content/docs/user/mpi-operator.md @@ -0,0 +1,146 @@ +--- +title: "MPI Operator" +date: 2025-05-27T11:30:40Z +--- + +Running distributed applications, such as those using the Message Passing Interface (MPI) or NVIDIA's Collective Communications Library (NCCL) for GPU communication, often requires each participating process (or Pod, in Kubernetes terms) to have access to high-speed, low-latency interconnects. Simply sharing a generic network interface among many high-performance jobs can lead to contention, unpredictable performance, and underutilization of expensive hardware. + +The goal is resource compartmentalization: ensuring that each part of your distributed job gets dedicated access to the specific resources it needs – for instance, one GPU and one dedicated RDMA-capable NIC per worker. 
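Before defining any claims, it helps to confirm what DraNet has actually discovered on your nodes. A minimal check (a sketch; it assumes the `dra.net/rdma` attribute published by the driver, as shown in the quick-start ResourceSlice example earlier in this series) is to dump the published ResourceSlices and look at the RDMA flag:

```sh
# Show the dra.net/rdma attribute for every device DraNet publishes
kubectl get resourceslices -o yaml | grep -A 1 "dra.net/rdma"
```

Only devices reporting `bool: true` here will match the RDMA `DeviceClass` defined below.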
+
+## DraNet + MPI Operator: A Powerful Combination
+
+- DraNet: Provides the mechanism to discover RDMA-capable NICs on your Kubernetes nodes and make them available for Pods to claim. Through DRA, Pods can request a specific NIC, and DraNet, via NRI hooks, will configure it within the Pod's namespace, [even naming it predictably (e.g., dranet0)](google/dranet/dranet-dcd98f563b1a24f4800cf3d2d502ec5b2f488ddc/site/content/docs/user/interface-configuration.md).
+
+- [Kubeflow MPI Operator](https://github.com/kubeflow/mpi-operator): Simplifies the deployment and management of MPI-based applications on Kubernetes. It handles the setup of MPI ranks, hostfiles, and the execution of mpirun.
+
+By using them together, we can create MPIJob definitions where each worker Pod explicitly claims a dedicated RDMA NIC managed by DraNet, alongside its GPU.
+
+### Example: Running NCCL Tests for Distributed Workload Validation
+
+A common and reliable way to validate that our distributed setup is performing optimally is by running an [NVIDIA Collective Communications Library (NCCL) All-Reduce test](https://github.com/NVIDIA/nccl-tests). This benchmark is designed to exercise the high-speed interconnects between nodes, helping you confirm that the RDMA fabric (like InfiniBand or RoCE) is operating correctly and ready to support your distributed workloads with expected efficiency.
+
+Let's see how we can run this with DraNet and the MPI Operator, focusing on a 1 GPU and 1 NIC per worker configuration.
+
+1. Defining Resources for DraNet:
+
+First, we tell DraNet what kind of NICs we're interested in and how Pods can claim them.
+
+**DeviceClass (dranet-rdma-for-mpi):** This selects RDMA-capable NICs managed by DraNet.
+
+```yaml
+apiVersion: resource.k8s.io/v1beta1
+kind: DeviceClass
+metadata:
+  name: dranet-rdma-for-mpi
+spec:
+  selectors:
+    - cel:
+        expression: device.driver == "dra.net"
+    - cel:
+        expression: device.attributes["dra.net"].rdma == true
+```
+
+**ResourceClaimTemplate (mpi-worker-rdma-nic-template):** MPI worker Pods will use this to request one RDMA NIC. DraNet will be instructed to name this interface dranet0 inside the Pod.
+
+```yaml
+apiVersion: resource.k8s.io/v1beta1
+kind: ResourceClaimTemplate
+metadata:
+  name: mpi-worker-rdma-nic-template
+spec:
+  spec:
+    devices:
+      requests:
+      - name: rdma-nic-for-mpi
+        deviceClassName: dranet-rdma-for-mpi
+        selectors:
+          - cel:
+              expression: device.attributes["dra.net"].ifName == "gpu2rdma0"
+        config:
+        - opaque:
+            driver: dra.net
+            parameters:
+              interface:
+                name: "dranet0" # NCCL will use this interface
+```
+
+1. Crafting the MPIJob:
+
+The MPIJob specification is where we tie everything together. We'll define a job with two workers, each getting one GPU and one DraNet-managed RDMA NIC.
+ +```yaml +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: nccl-test-dranet-1gpu-1nic +spec: + slotsPerWorker: 1 # 1 MPI rank per worker Pod + mpiImplementation: OpenMPI # Or your preferred MPI + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 + env: + - name: NCCL_DEBUG + value: "INFO" + - name: OMPI_MCA_pml + value: "ucx" + command: + - mpirun + - /third_party/nccl-tests/build/all_reduce_perf + - -b 8K -e 128M -g 1 # Benchmark params: 1 GPU per process + securityContext: + capabilities: + add: ["IPC_LOCK"] + Worker: + replicas: 2 + template: + spec: + resourceClaims: + - name: worker-rdma-nic + resourceClaimTemplateName: mpi-worker-rdma-nic-template + containers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 + name: mpi-worker + securityContext: + capabilities: + add: ["IPC_LOCK"] + resources: + limits: + nvidia.com/gpu: 1 # Each worker gets 1 GPU +``` + +### Key Aspects of this Configuration: + +- **slotsPerWorker: 1:** Each worker Pod hosts a single MPI rank. + +- **Worker.replicas: 2:** We run a 2-rank MPI job. + +- **Worker.template.spec.resourceClaims:** Each worker Pod claims its own RDMA NIC via the template, which DraNet will configure as dranet0. + +- **Worker.template.spec.containers[0].resources.limits["nvidia.com/gpu"]: 1:** Each worker gets one GPU. + +- **Launcher.template.spec.containers[0].env.NCCL_SOCKET_IFNAME: "dranet0":** This environment variable explicitly tells NCCL to use the dranet0 interface for its network operations. + +- **MPI MCA Parameters (e.g., UCX_NET_DEVICES="dranet0"):** These guide the MPI library itself to use the specified RDMA interface. + +3. Running and Observing: + +Once deployed, the MPI Operator will launch the job. The launcher Pod will execute mpirun, which starts the all_reduce_perf test across the two worker Pods. Each worker Pod will use its dedicated GPU and its dedicated dranet0 (RDMA NIC) for NCCL communications. + +You can monitor the launcher's logs to see the NCCL benchmark results, including the achieved bus bandwidth. The NCCL_DEBUG=INFO logs will also confirm that NCCL is indeed using the dranet0 interface. + +## The Power of Compartmentalization with DraNet + +This setup beautifully illustrates the benefits of resource compartmentalization: + +- Dedicated Performance: Each MPI worker in this job has exclusive use of one GPU and one high-speed RDMA NIC. This ensures that its communication performance is not impacted by other workloads on the same node. + +- Efficient Resource Utilization: If your nodes are powerful (e.g., 8 GPUs and 8 RDMA NICs), running this 2-worker job (consuming 1 GPU/1 NIC on two separate nodes) leaves the remaining resources on those nodes (and other nodes) fully available. + +- Concurrent High-Performance Jobs: You can run multiple independent MPI jobs or other DraNet-aware distributed workloads simultaneously. Each job can claim its own subset of GPUs and RDMA NICs, and DraNet ensures that their network traffic is isolated at the NIC level, preventing contention and guaranteeing predictable performance. + +By leveraging DraNet with tools like the MPI Operator, teams can confidently deploy network-intensive distributed applications on Kubernetes, achieving performance comparable to bare-metal HPC clusters while benefiting from Kubernetes' orchestration capabilities. 
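Once the workers are up, a quick final sanity check (a sketch; the Pod name assumes the MPIJob above, and the tools assume the perftest image built earlier in this series, which ships `iproute2` and `ibverbs-utils`) is to confirm from inside a worker that the claimed NIC arrived under the requested name and exposes an RDMA device:

```sh
# The claimed NIC should show up in the worker's network namespace as dranet0
kubectl exec nccl-test-dranet-1gpu-1nic-worker-0 -- ip addr show dranet0
# The backing RDMA device should be visible to the verbs utilities
kubectl exec nccl-test-dranet-1gpu-1nic-worker-0 -- ibv_devices
```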
\ No newline at end of file From 030b9997cf0835594f799ed30f461a7208818e56 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Wed, 28 May 2025 07:45:23 +0000 Subject: [PATCH 5/8] worker mpi images Change-Id: I6c51015d25c333b79109fc58ae5782d9c55dc4b5 --- Dockerfile.perftest | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/Dockerfile.perftest b/Dockerfile.perftest index 5c4f1ff8..4de80631 100644 --- a/Dockerfile.perftest +++ b/Dockerfile.perftest @@ -38,8 +38,8 @@ RUN ./autogen.sh && \ # --- Build openmpi --- WORKDIR /usr/src -RUN wget -O- https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz | tar xzf - -WORKDIR /usr/src/openmpi-5.0.7 +RUN wget -O- https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.8.tar.gz | tar xzf - +WORKDIR /usr/src/openmpi-4.1.8 RUN ./configure --prefix=/opt/openmpi --with-cuda=/usr/local/cuda && \ make -j$(nproc) && \ make install @@ -70,15 +70,42 @@ RUN apt-get update && \ libelf1 \ pciutils \ openssh-server \ + openssh-client \ + libcap2-bin \ libnuma1 && \ rm -rf /var/lib/apt/lists/* + +# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need +# https://github.com/kubeflow/mpi-operator/issues/580 +ARG port=2222 +# Add priviledge separation directoy to run sshd as root. +RUN mkdir -p /var/run/sshd +# Add capability to run sshd as non-root. +RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd + +# Allow OpenSSH to talk to containers without asking for confirmation +# by disabling StrictHostKeyChecking. # mpi-operator mounts the .ssh folder from a Secret. For that to work, we need # to disable UserKnownHostsFile to avoid write permissions. # Disabling StrictModes avoids directory and files read permission checks. -RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config &&\ - sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config -RUN mkdir /run/sshd +RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \ + && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ + && sed -i "s/[ #]\(.*Port \).*/ \1$port/g" /etc/ssh/ssh_config \ + && sed -i "s/#\(StrictModes \).*/\1no/g" /etc/ssh/sshd_config \ + && sed -i "s/#\(Port \).*/\1$port/g" /etc/ssh/sshd_config + +RUN useradd -m mpiuser +WORKDIR /home/mpiuser +# Configurations for running sshd as non-root. 
+RUN mkdir -p /home/mpiuser/.ssh && \ + cat < /home/mpiuser/sshd_config_custom +PidFile /home/mpiuser/sshd.pid +HostKey /home/mpiuser/.ssh/id_rsa +StrictModes no +EOF + +RUN echo "Port $port" >> /home/mpiuser/.sshd_config COPY --from=builder /usr/local/bin/ /usr/local/bin/ COPY --from=builder /opt/openmpi/ /opt/openmpi/ From 01bbc155e16a3f3cbddbf265f264572281132cb4 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Wed, 28 May 2025 11:16:59 +0000 Subject: [PATCH 6/8] update demo gke to reflect custom drivers Change-Id: I45e2e4242253d12ccaf77a7f0cf9d2106addd349 --- examples/demo_gke_rdma/README.md | 107 +++++++++++----------- examples/demo_gke_rdma/nccl-gib-test.yaml | 15 +-- examples/demo_gke_rdma/rdma-perftest.yaml | 17 +++- 3 files changed, 72 insertions(+), 67 deletions(-) diff --git a/examples/demo_gke_rdma/README.md b/examples/demo_gke_rdma/README.md index 924a14a0..0180aab9 100644 --- a/examples/demo_gke_rdma/README.md +++ b/examples/demo_gke_rdma/README.md @@ -60,6 +60,22 @@ You can validate this by using `kubectl get resourceslices -o yaml` and checking ``` +## GKE RDMA and NCCL + +Based on https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute-custom but using only 1 NIC and 1 GPU per Pod to demonstrate how to split workloads to allocate individual resources. + + +### Install the RDMA binary and configure NCCL + +This Daemonset does the following: + +* Installs RDMA binaries and the NCCL library on the node. +* Stores the library and the binary in the /home/kubernetes/bin/nvidia/lib64 and the /home/kubernetes/bin/gib directory on the VM. + +```sh +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/refs/heads/master/gpudirect-rdma/nccl-rdma-installer.yaml +``` + ## Deploy perf-tests RDMA Pods Use the following manifest to install two Pods in the same RDMA network, @@ -130,7 +146,7 @@ Run `rping -s` in one of the Pods and connect from the other to validate the con ``` kubectl exec -it rdma-perftest-1 -- bash -root@rdma-perftest-1:/# LD_LIBRARY_PATH="" rping -c -a 10.0.4.7 -C 3 -v -V +root@rdma-perftest-1:/# rping -c -a 10.0.4.7 -C 3 -v -V ping data: rdma-ping-0: ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqr ping data: rdma-ping-1: BCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrs ping data: rdma-ping-2: CDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrst @@ -295,23 +311,8 @@ deallocating GPU buffer 000078e9f8800000 destroying current CUDA Ctx ``` -## GKE NCCL - -Based on https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute-custom but using only 1 NIC and 1 GPU per Pod to demonstrate how to split workloads to allocate individual resources. - -### Install the RDMA binary and configure NCCL - -This Daemonset does the following: - -* Installs RDMA binaries and the NCCL library on the node. -* Stores the library and the binary in the /home/kubernetes/bin/nvidia/lib64 and the /home/kubernetes/bin/gib directory on the VM. - -```sh -kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/refs/heads/master/gpudirect-rdma/nccl-rdma-installer.yaml -``` - -### Deploy the test workload +### Deploy the test NCCL workload The manifest deploys two test pods, each of which runs in a A3 Ultra node. @@ -332,63 +333,61 @@ nccl-gib-test-0 1/1 Running 0 3s nccl-gib-test-1 1/1 Running 0 1s ``` - - ### Run the tests It is important to pass the right parameters, in this specific example we need to indicate to only use one GPU per node `[-g ]`. 
```sh - kubectl exec nccl-gib-test-0 -it -- /usr/local/gib/scripts/run_nccl_tests.sh -t all_gather -b 1K -g 1 -e 8G nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test +kubectl exec nccl-gib-test-0 -it -- /usr/local/gib/scripts/run_nccl_tests.sh -t all_gather -b 1K -g 1 -e 8G nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test ``` It should return something like: ```sh - +kubectl exec nccl-gib-test-0 -it -- /usr/local/gib/scripts/run_nccl_tests.sh -t all_gather -b 1K -g 1 -e 8G nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test Initializing SSH... +Warning: Permanently added '[nccl-gib-test-0.nccl-gib-test]:222' (ED25519) to the list of known hosts. Hello from nccl-gib-test-0.nccl-gib-test +Warning: Permanently added '[nccl-gib-test-1.nccl-gib-test]:222' (ED25519) to the list of known hosts. Hello from nccl-gib-test-1.nccl-gib-test -+ /usr/local/gib/scripts/gen_hostfiles.sh -p 222 nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test -Generating hostfiles for 2 hosts: +Generating hostfiles for 2 hosts: nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test -+ mpirun --allow-run-as-root --mca btl tcp,self --mca btl_tcp_if_include eth0 --bind-to none -np 2 --hostfile /tmp/hostfiles/hostfile1 -x PATH -x LD_LIBRARY_PATH=/usr/local/gib/lib64:/usr/local/nvidia/lib64 -x NCCL_DEBUG=WARN -x NCCL_DEBUG_SUBSYS=INIT,NET -x NCCL_TESTS_SPLIT_MASK=0x0 bash -c 'source /usr/local/gib/scripts/set_nccl_env.sh; /third_party/nccl-tests/build/all_gather_perf -b 1K -e 8G -f 2 -w 50 -n 100;' # nThread 1 nGpus 1 minBytes 1024 maxBytes 8589934592 step: 2(factor) warmup iters: 50 iters: 100 agg iters: 1 validation: 1 graph: 0 # # Using devices -# Rank 0 Group 0 Pid 235 on nccl-gib-test-0 device 0 [0000:90:00] NVIDIA H200 -# Rank 1 Group 0 Pid 161 on nccl-gib-test-1 device 0 [0000:90:00] NVIDIA H200 +# Rank 0 Group 0 Pid 85 on nccl-gib-test-0 device 0 [0000:cc:00] NVIDIA H200 +# Rank 1 Group 0 Pid 54 on nccl-gib-test-1 device 0 [0000:c4:00] NVIDIA H200 NCCL version 2.25.1+cuda12.8 # -# out-of-place in-place +# out-of-place in-place # size count type redop root time algbw busbw #wrong time algbw busbw #wrong -# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) - 1024 128 float none -1 20.89 0.05 0.02 0 20.40 0.05 0.03 0 - 2048 256 float none -1 20.55 0.10 0.05 0 20.48 0.10 0.05 0 - 4096 512 float none -1 20.71 0.20 0.10 0 20.75 0.20 0.10 0 - 8192 1024 float none -1 21.49 0.38 0.19 0 21.62 0.38 0.19 0 - 16384 2048 float none -1 24.56 0.67 0.33 0 24.55 0.67 0.33 0 - 32768 4096 float none -1 25.04 1.31 0.65 0 24.59 1.33 0.67 0 - 65536 8192 float none -1 28.59 2.29 1.15 0 28.04 2.34 1.17 0 - 131072 16384 float none -1 33.46 3.92 1.96 0 36.83 3.56 1.78 0 - 262144 32768 float none -1 47.72 5.49 2.75 0 45.11 5.81 2.91 0 - 524288 65536 float none -1 79.13 6.63 3.31 0 76.17 6.88 3.44 0 - 1048576 131072 float none -1 71.48 14.67 7.33 0 70.06 14.97 7.48 0 - 2097152 262144 float none -1 76.40 27.45 13.72 0 76.41 27.44 13.72 0 - 4194304 524288 float none -1 117.9 35.58 17.79 0 117.3 35.77 17.88 0 - 8388608 1048576 float none -1 203.4 41.24 20.62 0 204.7 40.98 20.49 0 - 16777216 2097152 float none -1 375.1 44.73 22.37 0 371.6 45.14 22.57 0 - 33554432 4194304 float none -1 729.7 45.98 22.99 0 728.9 46.04 23.02 0 - 67108864 8388608 float none -1 1447.6 46.36 23.18 0 1443.5 46.49 23.25 0 - 134217728 16777216 float none -1 2871.7 46.74 23.37 0 2854.1 47.03 23.51 0 - 268435456 33554432 float none -1 5699.0 47.10 23.55 0 5666.1 47.38 23.69 0 - 536870912 67108864 float none -1 11382 47.17 23.58 0 
11026 48.69 24.35 0 - 1073741824 134217728 float none -1 22474 47.78 23.89 0 21049 51.01 25.51 0 - 2147483648 268435456 float none -1 44241 48.54 24.27 0 39256 54.70 27.35 0 - 4294967296 536870912 float none -1 86470 49.67 24.83 0 75081 57.20 28.60 0 - 8589934592 1073741824 float none -1 166030 51.74 25.87 0 141444 60.73 30.37 0 +# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 1024 128 float none -1 21.75 0.05 0.02 0 20.94 0.05 0.02 0 + 2048 256 float none -1 21.48 0.10 0.05 0 21.40 0.10 0.05 0 + 4096 512 float none -1 21.81 0.19 0.09 0 21.91 0.19 0.09 0 + 8192 1024 float none -1 22.45 0.36 0.18 0 22.65 0.36 0.18 0 + 16384 2048 float none -1 26.28 0.62 0.31 0 25.44 0.64 0.32 0 + 32768 4096 float none -1 26.05 1.26 0.63 0 25.64 1.28 0.64 0 + 65536 8192 float none -1 29.97 2.19 1.09 0 29.61 2.21 1.11 0 + 131072 16384 float none -1 33.05 3.97 1.98 0 32.99 3.97 1.99 0 + 262144 32768 float none -1 40.97 6.40 3.20 0 37.59 6.97 3.49 0 + 524288 65536 float none -1 50.18 10.45 5.22 0 46.03 11.39 5.70 0 + 1048576 131072 float none -1 61.30 17.11 8.55 0 57.32 18.29 9.15 0 + 2097152 262144 float none -1 77.15 27.18 13.59 0 77.63 27.01 13.51 0 + 4194304 524288 float none -1 119.5 35.09 17.55 0 121.7 34.48 17.24 0 + 8388608 1048576 float none -1 206.9 40.55 20.28 0 207.7 40.39 20.20 0 + 16777216 2097152 float none -1 371.4 45.17 22.58 0 372.5 45.04 22.52 0 + 33554432 4194304 float none -1 695.5 48.25 24.12 0 698.2 48.06 24.03 0 + 67108864 8388608 float none -1 1282.5 52.33 26.16 0 1280.4 52.41 26.21 0 + 134217728 16777216 float none -1 2395.4 56.03 28.02 0 2548.9 52.66 26.33 0 + 268435456 33554432 float none -1 4526.0 59.31 29.65 0 4506.3 59.57 29.78 0 + 536870912 67108864 float none -1 8827.7 60.82 30.41 0 8873.5 60.50 30.25 0 + 1073741824 134217728 float none -1 17261 62.21 31.10 0 17056 62.95 31.48 0 + 2147483648 268435456 float none -1 33952 63.25 31.62 0 33156 64.77 32.38 0 + 4294967296 536870912 float none -1 67018 64.09 32.04 0 65577 65.50 32.75 0 + 8589934592 1073741824 float none -1 133370 64.41 32.20 0 128890 66.65 33.32 0 # Out of bounds values : 0 OK -# Avg bus bandwidth : 13.132 - +# Avg bus bandwidth : 15.0709 +# ``` \ No newline at end of file diff --git a/examples/demo_gke_rdma/nccl-gib-test.yaml b/examples/demo_gke_rdma/nccl-gib-test.yaml index 5cdecefe..806cd887 100644 --- a/examples/demo_gke_rdma/nccl-gib-test.yaml +++ b/examples/demo_gke_rdma/nccl-gib-test.yaml @@ -22,7 +22,7 @@ spec: deviceClassName: rdma selectors: - cel: - expression: device.attributes["dra.net"].ifName == "gpu2rdma0" + expression: device.attributes["dra.net"].ifName == "gpu1rdma0" --- apiVersion: v1 kind: Service @@ -54,21 +54,16 @@ spec: - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 name: test resources: - requests: - cpu: 150m + limits: + nvidia.com/gpu: 1 volumeMounts: - name: library-dir-host mountPath: /usr/local/nvidia - name: gib mountPath: /usr/local/gib - - name: shared-memory - mountPath: /dev/shm env: - name: LD_LIBRARY_PATH value: /usr/local/nvidia/lib64 - resources: - limits: - nvidia.com/gpu: 1 command: ["/bin/bash", "-c"] args: - | @@ -85,10 +80,6 @@ spec: - name: gib hostPath: path: /home/kubernetes/bin/gib - - name: shared-memory - emptyDir: - medium: "Memory" - sizeLimit: 250Gi resourceClaims: - name: rdma-net-interface resourceClaimTemplateName: rdma-net-template-gib diff --git a/examples/demo_gke_rdma/rdma-perftest.yaml b/examples/demo_gke_rdma/rdma-perftest.yaml index 7e8a0fc5..083372a1 100644 --- a/examples/demo_gke_rdma/rdma-perftest.yaml 
+++ b/examples/demo_gke_rdma/rdma-perftest.yaml @@ -22,7 +22,7 @@ spec: deviceClassName: rdma selectors: - cel: - expression: device.attributes["dra.net"].ifName == "gpu3rdma0" + expression: device.attributes["dra.net"].ifName == "gpu4rdma0" --- apiVersion: v1 kind: Service @@ -60,6 +60,21 @@ spec: limits: nvidia.com/gpu: 1 command: ["sleep", "infinity"] + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib resourceClaims: - name: rdma-net-interface resourceClaimTemplateName: rdma-net-template From 97408a2a4a6fe3c8401d0a54ece70106321c05af Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Wed, 28 May 2025 11:17:26 +0000 Subject: [PATCH 7/8] add rdma core to the perf test image Change-Id: I90741d69f7955579afa5bb5513afdec4170eb9c2 --- Dockerfile.perftest | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.perftest b/Dockerfile.perftest index 4de80631..429fec87 100644 --- a/Dockerfile.perftest +++ b/Dockerfile.perftest @@ -58,6 +58,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ apt-get install -y --no-install-recommends \ rdmacm-utils \ + rdma-core \ iproute2 \ inetutils-ping \ ibverbs-utils \ From 0e17d598cc9609a52cd475b33efd378964b2faf4 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Wed, 28 May 2025 12:03:35 +0000 Subject: [PATCH 8/8] mpi operator Change-Id: Id8c72deafd8f2b3cd465ca7604b2719636617195 --- .../resourceclaimtemplate.yaml | 57 +++++++ examples/mpi_operator/nccl-test-job.yaml | 44 +++-- site/content/docs/user/mpi-operator.md | 158 ++++++++++++------ 3 files changed, 189 insertions(+), 70 deletions(-) create mode 100644 examples/demo_gke_multinetwork/resourceclaimtemplate.yaml diff --git a/examples/demo_gke_multinetwork/resourceclaimtemplate.yaml b/examples/demo_gke_multinetwork/resourceclaimtemplate.yaml new file mode 100644 index 00000000..9f6e4c73 --- /dev/null +++ b/examples/demo_gke_multinetwork/resourceclaimtemplate.yaml @@ -0,0 +1,57 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +apiVersion: resource.k8s.io/v1beta1 +kind: ResourceClaimTemplate +metadata: + name: phy-interfaces-template +spec: + spec: + devices: + requests: + - name: phy-interfaces-template + deviceClassName: multinic + selectors: + - cel: + expression: device.attributes["dra.net"].name == "eth1" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: server-deployment + labels: + app: MyApp +spec: + replicas: 2 + selector: + matchLabels: + app: MyApp + template: + metadata: + labels: + app: MyApp + spec: + resourceClaims: + - name: phy-interfaces + resourceClaimTemplateName: phy-interfaces-template + containers: + - name: agnhost + image: registry.k8s.io/e2e-test-images/agnhost:2.39 + args: + - netexec + - --http-port=80 + ports: + - containerPort: 80 + \ No newline at end of file diff --git a/examples/mpi_operator/nccl-test-job.yaml b/examples/mpi_operator/nccl-test-job.yaml index 90ad6247..a97ae060 100644 --- a/examples/mpi_operator/nccl-test-job.yaml +++ b/examples/mpi_operator/nccl-test-job.yaml @@ -4,29 +4,31 @@ metadata: name: nccl-test-dranet-1gpu-1nic spec: slotsPerWorker: 1 # 1 MPI rank per worker Pod - mpiImplementation: OpenMPI # Or your preferred MPI mpiReplicaSpecs: Launcher: replicas: 1 template: spec: containers: - - image: docker.io/aojea/dranet-perftest:latest@sha256:bfaa20bba4178f20b3c50c2ddd57d3e0488bc94a4d704ea091043c389571fd6e - name: test-launcher - env: - - name: NCCL_DEBUG - value: "INFO" - - name: LD_LIBRARY_PATH - value: "/opt/openmpi/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64" + - image: mpioperator/openmpi:v0.6.0 + name: mpi-launcher command: ["/bin/bash", "-c"] args: - | - echo "--- Environment for mpirun ---" - echo "Effective LD_LIBRARY_PATH: $LD_LIBRARY_PATH" - PATH=/usr/local/bin:/usr/local/nvidia/bin:/opt/openmpi/bin:$PATH - - mpirun --allow-run-as-root -x LD_LIBRARY_PATH -x PATH \ - bash -c 'hostname' + set -ex + mpirun \ + --allow-run-as-root \ + --prefix /opt/openmpi \ + -np 2 \ + -bind-to none \ + -map-by slot \ + -mca routed direct \ + -x LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \ + bash -c \ + "source /usr/local/gib/scripts/set_nccl_env.sh; \ + /usr/local/bin/all_reduce_perf \ + -g 1 -b 1K -e 8G -f 2 \ + -w 5 -n 20;" securityContext: capabilities: add: ["IPC_LOCK"] @@ -38,7 +40,7 @@ spec: - name: worker-rdma-nic resourceClaimTemplateName: mpi-worker-rdma-nic-template containers: - - image: docker.io/aojea/dranet-perftest:latest@sha256:bfaa20bba4178f20b3c50c2ddd57d3e0488bc94a4d704ea091043c389571fd6e + - image: docker.io/aojea/dranet-perftest:latest@sha256:f9729ddd665ea6c5cb1d99754666465d57a84034d808d23a2126a41a7683938c name: mpi-worker securityContext: capabilities: @@ -46,3 +48,15 @@ spec: resources: limits: nvidia.com/gpu: 1 # Each worker gets 1 GPU + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib diff --git a/site/content/docs/user/mpi-operator.md b/site/content/docs/user/mpi-operator.md index 6bf8d95d..3332d215 100644 --- a/site/content/docs/user/mpi-operator.md +++ b/site/content/docs/user/mpi-operator.md @@ -1,5 +1,5 @@ --- -title: "MPI Operator" +title: "MPI Operator on GKE" date: 2025-05-27T11:30:40Z --- @@ -64,74 +64,122 @@ spec: name: "dranet0" # NCCL will use this interface ``` -1. Crafting the MPIJob: +1. Install the GKE optimized RDMA dependencies -The MPIJob specification is where we tie everything together. 
We'll define a job with two workers, each getting one GPU and one DraNet-managed RDMA NIC. +GKE automatically install on the VM some optimized RDMA and NCCL libraries for Google Cloud infrastructure, that can be installed following the instructions on: + +https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute-custom#install-rdma-configure-nccl + +In order to use them you need to mount the following volumes ```yaml -apiVersion: kubeflow.org/v2beta1 -kind: MPIJob -metadata: - name: nccl-test-dranet-1gpu-1nic spec: - slotsPerWorker: 1 # 1 MPI rank per worker Pod - mpiImplementation: OpenMPI # Or your preferred MPI - mpiReplicaSpecs: - Launcher: - replicas: 1 - template: - spec: - containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 - env: - - name: NCCL_DEBUG - value: "INFO" - - name: OMPI_MCA_pml - value: "ucx" - command: - - mpirun - - /third_party/nccl-tests/build/all_reduce_perf - - -b 8K -e 128M -g 1 # Benchmark params: 1 GPU per process - securityContext: - capabilities: - add: ["IPC_LOCK"] - Worker: - replicas: 2 - template: - spec: - resourceClaims: - - name: worker-rdma-nic - resourceClaimTemplateName: mpi-worker-rdma-nic-template - containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 - name: mpi-worker - securityContext: - capabilities: - add: ["IPC_LOCK"] - resources: - limits: - nvidia.com/gpu: 1 # Each worker gets 1 GPU + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib ``` -### Key Aspects of this Configuration: - -- **slotsPerWorker: 1:** Each worker Pod hosts a single MPI rank. +in your workloads: -- **Worker.replicas: 2:** We run a 2-rank MPI job. - -- **Worker.template.spec.resourceClaims:** Each worker Pod claims its own RDMA NIC via the template, which DraNet will configure as dranet0. +```yaml +containers: + - name: my-container + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 +``` -- **Worker.template.spec.containers[0].resources.limits["nvidia.com/gpu"]: 1:** Each worker gets one GPU. +1. Crafting the MPIJob: -- **Launcher.template.spec.containers[0].env.NCCL_SOCKET_IFNAME: "dranet0":** This environment variable explicitly tells NCCL to use the dranet0 interface for its network operations. +The MPIJob specification is where we tie everything together. We'll define a job with two workers, each getting one GPU and one DraNet-managed RDMA NIC. -- **MPI MCA Parameters (e.g., UCX_NET_DEVICES="dranet0"):** These guide the MPI library itself to use the specified RDMA interface. +{{}} +{{ readFile "examples/mpi_operator/nccl-test-job.yaml" | safeHTML }} +{{}} -3. Running and Observing: +1. Running and Observing: Once deployed, the MPI Operator will launch the job. The launcher Pod will execute mpirun, which starts the all_reduce_perf test across the two worker Pods. Each worker Pod will use its dedicated GPU and its dedicated dranet0 (RDMA NIC) for NCCL communications. -You can monitor the launcher's logs to see the NCCL benchmark results, including the achieved bus bandwidth. The NCCL_DEBUG=INFO logs will also confirm that NCCL is indeed using the dranet0 interface. +You can monitor the launcher's logs to see the NCCL benchmark results, including the achieved bus bandwidth. 
+ +```sh +kubectl logs $(kubectl get pods | grep launcher | awk '{ print $1}') -f ++ mpirun --allow-run-as-root --prefix /opt/openmpi -np 2 -bind-to none -map-by slot -mca routed direct -x LD_LIBRARY_PATH=/usr/local/nvidia/lib64 bash -c 'source /usr/local/gib/scripts/set_nccl_env.sh; /usr/local/bin/all_reduce_perf -g 1 -b 1K -e 8G -f 2 -w 5 -n 20;' +Warning: Permanently added '[nccl-test-dranet-1gpu-1nic-worker-1.nccl-test-dranet-1gpu-1nic.default.svc]:2222' (ED25519) to the list of known hosts. +Warning: Permanently added '[nccl-test-dranet-1gpu-1nic-worker-0.nccl-test-dranet-1gpu-1nic.default.svc]:2222' (ED25519) to the list of known hosts. +-------------------------------------------------------------------------- +WARNING: No preset parameters were found for the device that Open MPI +detected: + + Local host: nccl-test-dranet-1gpu-1nic-worker-0 + Device name: mlx5_2 + Device vendor ID: 0x02c9 + Device vendor part ID: 4126 + +Default device parameters will be used, which may result in lower +performance. You can edit any of the files specified by the +btl_openib_device_param_files MCA parameter to set values for your +device. + +NOTE: You can turn off this warning by setting the MCA parameter + btl_openib_warn_no_device_params_found to 0. +-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +No OpenFabrics connection schemes reported that they were able to be +used on a specific port. As such, the openib BTL (OpenFabrics +support) will be disabled for this port. + + Local host: nccl-test-dranet-1gpu-1nic-worker-0 + Local device: mlx5_2 + Local port: 1 + CPCs attempted: rdmacm, udcm +-------------------------------------------------------------------------- +# nThread 1 nGpus 1 minBytes 1024 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0 +# +# Using devices +# Rank 0 Group 0 Pid 23 on nccl-test-dranet-1gpu-1nic-worker-0 device 0 [0000:cc:00] NVIDIA H200 +# Rank 1 Group 0 Pid 21 on nccl-test-dranet-1gpu-1nic-worker-1 device 0 [0000:97:00] NVIDIA H200 +# +# out-of-place in-place +# size count type redop root time algbw busbw #wrong time algbw busbw #wrong +# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 1024 256 float sum -1 35.59 0.03 0.03 0 30.61 0.03 0.03 0 + 2048 512 float sum -1 31.80 0.06 0.06 0 31.90 0.06 0.06 0 + 4096 1024 float sum -1 33.56 0.12 0.12 0 33.33 0.12 0.12 0 + 8192 2048 float sum -1 39.33 0.21 0.21 0 39.24 0.21 0.21 0 + 16384 4096 float sum -1 41.89 0.39 0.39 0 40.31 0.41 0.41 0 + 32768 8192 float sum -1 45.47 0.72 0.72 0 42.92 0.76 0.76 0 + 65536 16384 float sum -1 54.03 1.21 1.21 0 51.81 1.26 1.26 0 + 131072 32768 float sum -1 51.86 2.53 2.53 0 52.60 2.49 2.49 0 + 262144 65536 float sum -1 79.10 3.31 3.31 0 68.36 3.83 3.83 0 + 524288 131072 float sum -1 76.88 6.82 6.82 0 76.38 6.86 6.86 0 + 1048576 262144 float sum -1 98.57 10.64 10.64 0 93.72 11.19 11.19 0 + 2097152 524288 float sum -1 131.9 15.90 15.90 0 131.8 15.91 15.91 0 + 4194304 1048576 float sum -1 227.5 18.44 18.44 0 227.4 18.45 18.45 0 + 8388608 2097152 float sum -1 415.7 20.18 20.18 0 416.7 20.13 20.13 0 + 16777216 4194304 float sum -1 811.3 20.68 20.68 0 808.5 20.75 20.75 0 + 33554432 8388608 float sum -1 1609.7 20.84 20.84 0 1607.6 20.87 20.87 0 + 67108864 16777216 float sum -1 2250.8 29.82 29.82 0 2253.3 29.78 29.78 0 + 134217728 33554432 float sum -1 4440.0 30.23 30.23 0 4444.3 30.20 30.20 0 + 268435456 67108864 float sum -1 8635.4 31.09 31.09 0 
8653.9 31.02 31.02 0 + 536870912 134217728 float sum -1 17077 31.44 31.44 0 17081 31.43 31.43 0 + 1073741824 268435456 float sum -1 33860 31.71 31.71 0 33896 31.68 31.68 0 + 2147483648 536870912 float sum -1 67521 31.80 31.80 0 67503 31.81 31.81 0 + 4294967296 1073741824 float sum -1 134734 31.88 31.88 0 135069 31.80 31.80 0 + 8589934592 2147483648 float sum -1 269368 31.89 31.89 0 269407 31.88 31.88 0 +# Out of bounds values : 0 OK +# Avg bus bandwidth : 15.5188 +``` ## The Power of Compartmentalization with DraNet