From c2b4652a21c5c9322585a65d929a7f425d39fa59 Mon Sep 17 00:00:00 2001
From: Antonio Ojea
Date: Tue, 27 May 2025 08:07:17 +0000
Subject: [PATCH 1/8] update quickstart

Change-Id: Id98538c545fafb566ae38a892ae597ee235b1845
---
 site/content/docs/quick-start.md | 251 ++++++++-----------------------
 1 file changed, 62 insertions(+), 189 deletions(-)

diff --git a/site/content/docs/quick-start.md b/site/content/docs/quick-start.md
index 3cb81d79..fbe68908 100644
--- a/site/content/docs/quick-start.md
+++ b/site/content/docs/quick-start.md
@@ -4,7 +4,7 @@ date: 2024-12-17T14:47:05Z
 weight: 1
 ---

-DRANET depends on the Kubernetes feature [Dynamic Resource Allocation (DRA)](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/), that is beta (disabled by default in v1.32).
+`DraNet` depends on the Kubernetes feature [Dynamic Resource Allocation (DRA)](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/), which is beta (disabled by default in Kubernetes v1.32).

 In order to enable DRA you need to enable both the [feature gates and the API groups](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#enabling-dynamic-resource-allocation).

@@ -19,21 +19,17 @@ Create a cluster using the following configuration.
 ```yaml
 kind: Cluster
 apiVersion: kind.x-k8s.io/v1alpha4
-containerdConfigPatches:
-  # Enable NRI plugins
-- |-
-  [plugins."io.containerd.nri.v1.nri"]
-    disable = false
 nodes:
 - role: control-plane
-  image: kindest/node:v1.32.0
+  image: kindest/node:v1.33.1
 - role: worker
-  image: kindest/node:v1.32.0
+  image: kindest/node:v1.33.1
 - role: worker
-  image: kindest/node:v1.32.0
+  image: kindest/node:v1.33.1
 featureGates:
   # Enable the corresponding DRA feature gates
   DynamicResourceAllocation: true
+  DRAResourceClaimDeviceStatus: true
 runtimeConfig:
   api/beta : true
 ```

 kind create cluster --config kind.yaml --name dra

 ### Google Cloud

-You can [enable the DRA beta APIs in GKE](https://cloud.google.com/kubernetes-engine/docs/how-to/use-beta-apis) and it automatically turns on the feature gates.
-
-You need to check that a v1.32 version exist in your zone:
-
-```sh
-$ gcloud container get-server-config | grep 1.32
-Fetching server config for us-central1-c
- - 1.32.0-gke.1358000
- minorVersion: '1.32'
-- 1.32.0-gke.1358000
-- 1.32.0-gke.1358000
-```
-
-And using the version obtained you can create a cluster
-
-```sh
-export PROJECT=dra-proj
-export REGION=us-central1
-export ZONE=us-central1-c
-export CLUSTER=dra-cluster
-export VERSION=1.32.0-gke.1358000
-
-gcloud beta container clusters create ${CLUSTER} \
- --cluster-version=${VERSION} \
- --enable-multi-networking \
- --enable-dataplane-v2 \
- --enable-kubernetes-unstable-apis=resource.k8s.io/v1beta1/deviceclasses,resource.k8s.io/v1beta1/resourceclaims,resource.k8s.io/v1beta1/resourceclaimtemplates,resource.k8s.io/v1beta1/resourceslices \
- --no-enable-autorepair \
- --no-enable-autoupgrade \
- --zone=${ZONE}
-
-To inspect the contents of your cluster, go to: https://console.cloud.google.com/kubernetes/workload_/gcloud/us-central1-c/aojea-dra?project=aojea-gke-dev
-kubeconfig entry generated for aojea-dra.
-NAME LOCATION MASTER_VERSION MASTER_IP MACHINE_TYPE NODE_VERSION NUM_NODES STATUS -aojea-dra us-central1-c 1.32.0-gke.1358000 X.X.X.X e2-medium 1.32.0-gke.1358000 3 RUNNING -``` +For instructions on setting up DRA on GKE, refer to the official documentation: +[Set up Dynamic Resource Allocation](https://cloud.google.com/kubernetes-engine/docs/how-to/set-up-dra) A quick and easy way to find if DRA is enabled is by checking the metrics in the kube-apiserver @@ -89,9 +51,9 @@ kubectl get --raw /metrics | grep kubernetes_feature_enabled | grep DynamicResou kubernetes_feature_enabled{name="DynamicResourceAllocation",stage="BETA"} 1 ``` -### Installation +## Installation -You can install the latest stable version using the provided manifest: +You can install the latest stable version of `DraNet` using the provided manifest: ``` kubectl apply -f https://raw.githubusercontent.com/google/dranet/refs/heads/main/install.yaml @@ -99,7 +61,7 @@ kubectl apply -f https://raw.githubusercontent.com/google/dranet/refs/heads/main ### How to use it -Once the Kubernetes Network Driver is running you can see the list of Network Interfaces and its attributes published by the drivers: +Once the Kubernetes Network Driver is running you can see the list of Network Interfaces and its attributes published by the drivers using `kubectl get resourceslices -o yaml`: ``` apiVersion: resource.k8s.io/v1beta1 @@ -119,133 +81,49 @@ metadata: uid: 535724d7-a573-49e1-8f3b-4e644405375a spec: devices: - - basic: - attributes: - alias: - string: "" - cloud_network: - string: projects/961828715260/networks/aojea-dra-net-1 - encapsulation: - string: ether - ip: - string: 192.168.1.2 - kind: - string: network - mac: - string: 42:01:c0:a8:01:02 - mtu: - int: 8244 - name: - string: eth1 - numa_node: - int: -1 - pci_address_bus: - string: "00" - pci_address_device: - string: "05" - pci_address_domain: - string: "0000" - pci_address_function: - string: "0" - pci_vendor: - string: Google, Inc. - rdma: - bool: false - sriov: - bool: false - state: - string: up - type: - string: device - virtual: - bool: false - name: eth1 - - basic: - attributes: - alias: - string: "" - cloud_network: - string: projects/961828715260/networks/aojea-dra-net-2 - encapsulation: - string: ether - ip: - string: 192.168.2.2 - kind: - string: network - mac: - string: 42:01:c0:a8:02:02 - mtu: - int: 8244 - name: - string: eth2 - numa_node: - int: -1 - pci_address_bus: - string: "00" - pci_address_device: - string: "06" - pci_address_domain: - string: "0000" - pci_address_function: - string: "0" - pci_vendor: - string: Google, Inc. - rdma: - bool: false - sriov: - bool: false - state: - string: up - type: - string: device - virtual: - bool: false - name: eth2 - - basic: - attributes: - alias: - string: "" - cloud_network: - string: projects/961828715260/networks/aojea-dra-net-3 - encapsulation: - string: ether - ip: - string: 192.168.3.2 - kind: - string: network - mac: - string: 42:01:c0:a8:03:02 - mtu: - int: 8244 - name: - string: eth3 - numa_node: - int: -1 - pci_address_bus: - string: "00" - pci_address_device: - string: "07" - pci_address_domain: - string: "0000" - pci_address_function: - string: "0" - pci_vendor: - string: Google, Inc. 
- rdma: - bool: false - sriov: - bool: false - state: - string: up - type: - string: device - virtual: - bool: false - name: eth3 + - basic: + attributes: + dra.net/alias: + string: "" + dra.net/cloudNetwork: + string: dra-1-vpc + dra.net/encapsulation: + string: ether + dra.net/ifName: + string: gpu7rdma0 + dra.net/ipv4: + string: 10.0.8.8 + dra.net/mac: + string: 9a:41:2e:4f:86:16 + dra.net/mtu: + int: 8896 + dra.net/numaNode: + int: 1 + dra.net/pciAddressBus: + string: c8 + dra.net/pciAddressDevice: + string: "00" + dra.net/pciAddressDomain: + string: "0000" + dra.net/pciAddressFunction: + string: "0" + dra.net/pciVendor: + string: Mellanox Technologies + dra.net/rdma: + bool: true + dra.net/sriov: + bool: false + dra.net/state: + string: up + dra.net/type: + string: device + dra.net/virtual: + bool: false + name: gpu7rdma0 ... ``` -Once the resources are available, users can create DeviceClasses, ResourceClaims and/or ResourceClaimTemplates to schedule pods, see some [examples](https://github.com/google/dranet/tree/main/examples). +Once the resources are available, users can create `DeviceClasses`, `ResourceClaims` and/or `ResourceClaimTemplates` to schedule pods. Define a `DeviceClass` that selects all the network interfaces that are connected to a `GCP Network` @@ -259,43 +137,38 @@ spec: - cel: expression: device.driver == "dra.net" - cel: - expression: has(device.attributes["dra.net"].cloud_network) - config: - - opaque: - driver: dra.net - parameters: - nccl: "true" + expression: has(device.attributes["dra.net"].cloudNetwork) ``` -Now you can create a `ResourceClaim` that connects to a specific network, in this case `projects/961828715260/networks/aojea-dra-net-3` and reference that claim in a `Pod`: +Now you can create a `ResourceClaim` that connects to a specific network, in this case `dra-1-vpc` and reference that claim in a `Pod`: ```yaml apiVersion: resource.k8s.io/v1beta1 kind: ResourceClaim metadata: - name: cloud-network-dra-net-3 + name: cloud-network-dra-net-1 spec: devices: requests: - - name: req-cloud-net-3 + - name: req-cloud-net-1 deviceClassName: dranet-cloud selectors: - cel: - expression: device.attributes["dra.net"].cloud_network == "projects/961828715260/networks/aojea-dra-net-3" + expression: device.attributes["dra.net"].cloudNetwork == "dra-1-vpc" --- apiVersion: v1 kind: Pod metadata: - name: pod-dra-net3 + name: pod-dra-net1 labels: - app: pod-dra-net3 + app: pod-dra-net1 spec: containers: - name: ctr1 image: registry.k8s.io/e2e-test-images/agnhost:2.39 resourceClaims: - - name: net-3 - resourceClaimName: cloud-network-dra-net-3 + - name: net-1 + resourceClaimName: cloud-network-dra-net-1 ``` Kubernetes schedules the `Pod` to the corresponding `Node` and attach the network interface to the `Pod`: @@ -303,12 +176,12 @@ Kubernetes schedules the `Pod` to the corresponding `Node` and attach the networ ```sh kubectl get pods -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -pod-dra-net3 1/1 Running 0 5s 10.52.3.108 gke-dra-multi-nic-985b8c20-jg5l +pod-dra-net1 1/1 Running 0 5s 10.52.3.108 gke-dra-multi-nic-985b8c20-jg5l ``` If we execute inside the `Pod` we can see the network interface now is attached: ```sh -kubectl exec -it pod-dra-net3 ip a +kubectl exec -it pod-dra-net1 ip a kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead. 
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 @@ -318,7 +191,7 @@ kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future versi link/ether 86:dc:58:24:55:1a brd ff:ff:ff:ff:ff:ff link-netnsid 0 inet 10.52.3.108/24 brd 10.52.3.255 scope global eth0 valid_lft forever preferred_lft forever -5: eth3: mtu 8244 qdisc fq state UP group default qlen 1000 +5: gpu7rdma0: mtu 8244 qdisc fq state UP group default qlen 1000 link/ether 42:01:c0:a8:03:02 brd ff:ff:ff:ff:ff:ff ``` From bcbca1f174841c31f6d9264d56f2b43c3bd9806a Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 27 May 2025 11:09:34 +0000 Subject: [PATCH 2/8] remove tcpdirect example Change-Id: I48587b6872b4e4be7dd9bab9f4998b26a6374ccd --- examples/demo_gke_tcpdirect/README.md | 28 --------------------------- 1 file changed, 28 deletions(-) delete mode 100644 examples/demo_gke_tcpdirect/README.md diff --git a/examples/demo_gke_tcpdirect/README.md b/examples/demo_gke_tcpdirect/README.md deleted file mode 100644 index 1eb1d0b7..00000000 --- a/examples/demo_gke_tcpdirect/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# GKE TCP Direct - - -1. Create a cluster - -DRA is beta in 1.32, so it requires to explicitly enable the feature. - -```sh -PROJECT="test-project" -CLUSTER="test-cluster" -ZONE="us-central1-c" -VERSION="1.32" - -gcloud container clusters create "${CLUSTER}" \ - --cluster-version="${VERSION}" \ - --enable-multi-networking \ - --enable-dataplane-v2 \ - --enable-kubernetes-unstable-apis=resource.k8s.io/v1beta1/deviceclasses,resource.k8s.io/v1beta1/resourceclaims,resource.k8s.io/v1beta1/resourceclaimtemplates,resource.k8s.io/v1beta1/resourceslices \ - --no-enable-autorepair \ - --no-enable-autoupgrade \ - --zone="${ZONE}" \ - --project="${PROJECT}" # Explicitly set the project -``` - -2. Once the cluster has been created we need to create a Node Pool with A3 machines, `dranetctl` is an opinionanted tool that will set the necessary -values for an optimal performance. - -```sh \ No newline at end of file From 5b5a806a1e6a6efbd1918fb5f5b530f31cb0278d Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 27 May 2025 14:28:44 +0000 Subject: [PATCH 3/8] add nccl tests to the testing image Change-Id: Ib4896dcee5513177716ba27f1a591106e52fadfa --- .github/workflows/test-images.yaml | 2 +- Dockerfile.perftest | 38 +++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test-images.yaml b/.github/workflows/test-images.yaml index 2d85d9d2..cbfb3374 100644 --- a/.github/workflows/test-images.yaml +++ b/.github/workflows/test-images.yaml @@ -57,7 +57,7 @@ jobs: with: context: . 
file: Dockerfile.perftest - platforms: linux/amd64,linux/arm64 + platforms: linux/amd64 push: true tags: | ${{ steps.meta.outputs.tags }} diff --git a/Dockerfile.perftest b/Dockerfile.perftest index abeb0991..429ae362 100644 --- a/Dockerfile.perftest +++ b/Dockerfile.perftest @@ -8,6 +8,7 @@ RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ build-essential \ + wget \ autoconf \ automake \ libtool \ @@ -35,6 +36,20 @@ RUN ./autogen.sh && \ make -j$(nproc) && \ make install +# --- Build openmpi --- +WORKDIR /usr/src +RUN wget -O- https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz | tar xzf - +WORKDIR /usr/src/openmpi-5.0.7 +RUN ./configure --with-cuda=/usr/local/cuda && \ + make -j$(nproc) && \ + make install + +# --- Build nvidia/ncc-tests --- +WORKDIR /usr/src +RUN git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git +WORKDIR /usr/src/nccl-tests +RUN make -j$(nproc) MPI=1 MPI_HOME=/usr/src/openmpi-5.0.7 + # Stage 2: Runtime FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04 AS runtime @@ -51,18 +66,29 @@ RUN apt-get update && \ libnl-3-200 \ libnl-route-3-200 \ libpci3 \ + libmnl0 \ + libelf1 \ + pciutils \ + openssh-server \ libnuma1 && \ rm -rf /var/lib/apt/lists/* -# Set environment variables for CUDA libraries in the runtime stage. -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH:-}" -ENV LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:-}" +# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need +# to disable UserKnownHostsFile to avoid write permissions. +# Disabling StrictModes avoids directory and files read permission checks. +RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config &&\ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config +RUN mkdir /run/sshd -COPY --from=builder /usr/local/bin/ib_* /usr/local/bin/ -COPY --from=builder /usr/local/bin/raw_ethernet_* /usr/local/bin/ +COPY --from=builder /usr/local/bin/ /usr/local/bin/ +COPY --from=builder /usr/local/lib/ /usr/local/lib/ +COPY --from=builder /usr/src/nccl-tests/build/*_perf /usr/local/bin/ +# Set environment variables for CUDA libraries in the runtime stage. +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/nvidia/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" +ENV LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:-}" # Add the installation directory to the PATH for easy execution. -ENV PATH="/usr/local/bin:${PATH}" +ENV PATH="/usr/local/bin:/usr/local/nvidia/bin:${PATH}" # Set the default command to run when the container starts. 
CMD ["/bin/bash"] \ No newline at end of file From 928f7cf1fbc6ca6f1ac188ae3bb0166f715d9196 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 27 May 2025 19:34:48 +0000 Subject: [PATCH 4/8] MPI operator WIP Change-Id: I7b4f43247a11d7283382b26ab4f4455557ebd6b1 --- Dockerfile.perftest | 10 +- examples/mpi_operator/dra_resources.yaml | 31 +++++ examples/mpi_operator/nccl-test-job.yaml | 48 ++++++++ site/content/docs/user/mpi-operator.md | 146 +++++++++++++++++++++++ 4 files changed, 230 insertions(+), 5 deletions(-) create mode 100644 examples/mpi_operator/dra_resources.yaml create mode 100644 examples/mpi_operator/nccl-test-job.yaml create mode 100644 site/content/docs/user/mpi-operator.md diff --git a/Dockerfile.perftest b/Dockerfile.perftest index 429ae362..5c4f1ff8 100644 --- a/Dockerfile.perftest +++ b/Dockerfile.perftest @@ -40,7 +40,7 @@ RUN ./autogen.sh && \ WORKDIR /usr/src RUN wget -O- https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz | tar xzf - WORKDIR /usr/src/openmpi-5.0.7 -RUN ./configure --with-cuda=/usr/local/cuda && \ +RUN ./configure --prefix=/opt/openmpi --with-cuda=/usr/local/cuda && \ make -j$(nproc) && \ make install @@ -48,7 +48,7 @@ RUN ./configure --with-cuda=/usr/local/cuda && \ WORKDIR /usr/src RUN git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git WORKDIR /usr/src/nccl-tests -RUN make -j$(nproc) MPI=1 MPI_HOME=/usr/src/openmpi-5.0.7 +RUN make -j$(nproc) MPI=1 MPI_HOME=/opt/openmpi # Stage 2: Runtime FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04 AS runtime @@ -81,14 +81,14 @@ RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && echo " RUN mkdir /run/sshd COPY --from=builder /usr/local/bin/ /usr/local/bin/ -COPY --from=builder /usr/local/lib/ /usr/local/lib/ +COPY --from=builder /opt/openmpi/ /opt/openmpi/ COPY --from=builder /usr/src/nccl-tests/build/*_perf /usr/local/bin/ # Set environment variables for CUDA libraries in the runtime stage. -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/nvidia/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" +ENV LD_LIBRARY_PATH="/opt/openmpi/lib/:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}" ENV LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:-}" # Add the installation directory to the PATH for easy execution. -ENV PATH="/usr/local/bin:/usr/local/nvidia/bin:${PATH}" +ENV PATH="/usr/local/bin:/usr/local/nvidia/bin:/opt/openmpi/bin:${PATH}" # Set the default command to run when the container starts. 
CMD ["/bin/bash"] \ No newline at end of file diff --git a/examples/mpi_operator/dra_resources.yaml b/examples/mpi_operator/dra_resources.yaml new file mode 100644 index 00000000..80fa3e6f --- /dev/null +++ b/examples/mpi_operator/dra_resources.yaml @@ -0,0 +1,31 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: dranet-rdma-for-mpi +spec: + selectors: + - cel: + expression: device.driver == "dra.net" + - cel: + expression: device.attributes["dra.net"].rdma == true +--- +apiVersion: resource.k8s.io/v1beta1 +kind: ResourceClaimTemplate +metadata: + name: mpi-worker-rdma-nic-template +spec: + spec: + devices: + requests: + - name: rdma-nic-for-mpi + deviceClassName: dranet-rdma-for-mpi + selectors: + - cel: + expression: device.attributes["dra.net"].ifName == "gpu2rdma0" + config: + - opaque: + driver: dra.net + parameters: + interface: + name: "dranet0" # NCCL will use this interface + diff --git a/examples/mpi_operator/nccl-test-job.yaml b/examples/mpi_operator/nccl-test-job.yaml new file mode 100644 index 00000000..90ad6247 --- /dev/null +++ b/examples/mpi_operator/nccl-test-job.yaml @@ -0,0 +1,48 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: nccl-test-dranet-1gpu-1nic +spec: + slotsPerWorker: 1 # 1 MPI rank per worker Pod + mpiImplementation: OpenMPI # Or your preferred MPI + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: docker.io/aojea/dranet-perftest:latest@sha256:bfaa20bba4178f20b3c50c2ddd57d3e0488bc94a4d704ea091043c389571fd6e + name: test-launcher + env: + - name: NCCL_DEBUG + value: "INFO" + - name: LD_LIBRARY_PATH + value: "/opt/openmpi/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64" + command: ["/bin/bash", "-c"] + args: + - | + echo "--- Environment for mpirun ---" + echo "Effective LD_LIBRARY_PATH: $LD_LIBRARY_PATH" + PATH=/usr/local/bin:/usr/local/nvidia/bin:/opt/openmpi/bin:$PATH + + mpirun --allow-run-as-root -x LD_LIBRARY_PATH -x PATH \ + bash -c 'hostname' + securityContext: + capabilities: + add: ["IPC_LOCK"] + Worker: + replicas: 2 + template: + spec: + resourceClaims: + - name: worker-rdma-nic + resourceClaimTemplateName: mpi-worker-rdma-nic-template + containers: + - image: docker.io/aojea/dranet-perftest:latest@sha256:bfaa20bba4178f20b3c50c2ddd57d3e0488bc94a4d704ea091043c389571fd6e + name: mpi-worker + securityContext: + capabilities: + add: ["IPC_LOCK"] + resources: + limits: + nvidia.com/gpu: 1 # Each worker gets 1 GPU diff --git a/site/content/docs/user/mpi-operator.md b/site/content/docs/user/mpi-operator.md new file mode 100644 index 00000000..6bf8d95d --- /dev/null +++ b/site/content/docs/user/mpi-operator.md @@ -0,0 +1,146 @@ +--- +title: "MPI Operator" +date: 2025-05-27T11:30:40Z +--- + +Running distributed applications, such as those using the Message Passing Interface (MPI) or NVIDIA's Collective Communications Library (NCCL) for GPU communication, often requires each participating process (or Pod, in Kubernetes terms) to have access to high-speed, low-latency interconnects. Simply sharing a generic network interface among many high-performance jobs can lead to contention, unpredictable performance, and underutilization of expensive hardware. + +The goal is resource compartmentalization: ensuring that each part of your distributed job gets dedicated access to the specific resources it needs – for instance, one GPU and one dedicated RDMA-capable NIC per worker. 
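Before defining any claims, it helps to confirm what DraNet has actually discovered on your nodes. A minimal check (a sketch; it assumes the `dra.net/rdma` attribute published by the driver, as shown in the quick-start ResourceSlice example earlier in this series) is to dump the published ResourceSlices and look at the RDMA flag:

```sh
# Show the dra.net/rdma attribute for every device DraNet publishes
kubectl get resourceslices -o yaml | grep -A 1 "dra.net/rdma"
```

Only devices reporting `bool: true` here will match the RDMA `DeviceClass` defined below.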
+
+## DraNet + MPI Operator: A Powerful Combination
+
+- DraNet: Provides the mechanism to discover RDMA-capable NICs on your Kubernetes nodes and make them available for Pods to claim. Through DRA, Pods can request a specific NIC, and DraNet, via NRI hooks, will configure it within the Pod's namespace, [even naming it predictably (e.g., dranet0)](google/dranet/dranet-dcd98f563b1a24f4800cf3d2d502ec5b2f488ddc/site/content/docs/user/interface-configuration.md).
+
+- [Kubeflow MPI Operator](https://github.com/kubeflow/mpi-operator): Simplifies the deployment and management of MPI-based applications on Kubernetes. It handles the setup of MPI ranks, hostfiles, and the execution of mpirun.
+
+By using them together, we can create MPIJob definitions where each worker Pod explicitly claims a dedicated RDMA NIC managed by DraNet, alongside its GPU.
+
+### Example: Running NCCL Tests for Distributed Workload Validation
+
+A common and reliable way to validate that our distributed setup is performing optimally is by running an [NVIDIA Collective Communications Library (NCCL) All-Reduce test](https://github.com/NVIDIA/nccl-tests). This benchmark is designed to exercise the high-speed interconnects between nodes, helping you confirm that the RDMA fabric (like InfiniBand or RoCE) is operating correctly and ready to support your distributed workloads with expected efficiency.
+
+Let's see how we can run this with DraNet and the MPI Operator, focusing on a 1 GPU and 1 NIC per worker configuration.
+
+1. Defining Resources for DraNet:
+
+First, we tell DraNet what kind of NICs we're interested in and how Pods can claim them.
+
+**DeviceClass (dranet-rdma-for-mpi):** This selects RDMA-capable NICs managed by DraNet.
+
+```yaml
+apiVersion: resource.k8s.io/v1beta1
+kind: DeviceClass
+metadata:
+  name: dranet-rdma-for-mpi
+spec:
+  selectors:
+    - cel:
+        expression: device.driver == "dra.net"
+    - cel:
+        expression: device.attributes["dra.net"].rdma == true
+```
+
+**ResourceClaimTemplate (mpi-worker-rdma-nic-template):** MPI worker Pods will use this to request one RDMA NIC. DraNet will be instructed to name this interface dranet0 inside the Pod.
+
+```yaml
+apiVersion: resource.k8s.io/v1beta1
+kind: ResourceClaimTemplate
+metadata:
+  name: mpi-worker-rdma-nic-template
+spec:
+  spec:
+    devices:
+      requests:
+      - name: rdma-nic-for-mpi
+        deviceClassName: dranet-rdma-for-mpi
+        selectors:
+          - cel:
+              expression: device.attributes["dra.net"].ifName == "gpu2rdma0"
+        config:
+        - opaque:
+            driver: dra.net
+            parameters:
+              interface:
+                name: "dranet0" # NCCL will use this interface
+```
+
+1. Crafting the MPIJob:
+
+The MPIJob specification is where we tie everything together. We'll define a job with two workers, each getting one GPU and one DraNet-managed RDMA NIC.
+ +```yaml +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: nccl-test-dranet-1gpu-1nic +spec: + slotsPerWorker: 1 # 1 MPI rank per worker Pod + mpiImplementation: OpenMPI # Or your preferred MPI + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 + env: + - name: NCCL_DEBUG + value: "INFO" + - name: OMPI_MCA_pml + value: "ucx" + command: + - mpirun + - /third_party/nccl-tests/build/all_reduce_perf + - -b 8K -e 128M -g 1 # Benchmark params: 1 GPU per process + securityContext: + capabilities: + add: ["IPC_LOCK"] + Worker: + replicas: 2 + template: + spec: + resourceClaims: + - name: worker-rdma-nic + resourceClaimTemplateName: mpi-worker-rdma-nic-template + containers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 + name: mpi-worker + securityContext: + capabilities: + add: ["IPC_LOCK"] + resources: + limits: + nvidia.com/gpu: 1 # Each worker gets 1 GPU +``` + +### Key Aspects of this Configuration: + +- **slotsPerWorker: 1:** Each worker Pod hosts a single MPI rank. + +- **Worker.replicas: 2:** We run a 2-rank MPI job. + +- **Worker.template.spec.resourceClaims:** Each worker Pod claims its own RDMA NIC via the template, which DraNet will configure as dranet0. + +- **Worker.template.spec.containers[0].resources.limits["nvidia.com/gpu"]: 1:** Each worker gets one GPU. + +- **Launcher.template.spec.containers[0].env.NCCL_SOCKET_IFNAME: "dranet0":** This environment variable explicitly tells NCCL to use the dranet0 interface for its network operations. + +- **MPI MCA Parameters (e.g., UCX_NET_DEVICES="dranet0"):** These guide the MPI library itself to use the specified RDMA interface. + +3. Running and Observing: + +Once deployed, the MPI Operator will launch the job. The launcher Pod will execute mpirun, which starts the all_reduce_perf test across the two worker Pods. Each worker Pod will use its dedicated GPU and its dedicated dranet0 (RDMA NIC) for NCCL communications. + +You can monitor the launcher's logs to see the NCCL benchmark results, including the achieved bus bandwidth. The NCCL_DEBUG=INFO logs will also confirm that NCCL is indeed using the dranet0 interface. + +## The Power of Compartmentalization with DraNet + +This setup beautifully illustrates the benefits of resource compartmentalization: + +- Dedicated Performance: Each MPI worker in this job has exclusive use of one GPU and one high-speed RDMA NIC. This ensures that its communication performance is not impacted by other workloads on the same node. + +- Efficient Resource Utilization: If your nodes are powerful (e.g., 8 GPUs and 8 RDMA NICs), running this 2-worker job (consuming 1 GPU/1 NIC on two separate nodes) leaves the remaining resources on those nodes (and other nodes) fully available. + +- Concurrent High-Performance Jobs: You can run multiple independent MPI jobs or other DraNet-aware distributed workloads simultaneously. Each job can claim its own subset of GPUs and RDMA NICs, and DraNet ensures that their network traffic is isolated at the NIC level, preventing contention and guaranteeing predictable performance. + +By leveraging DraNet with tools like the MPI Operator, teams can confidently deploy network-intensive distributed applications on Kubernetes, achieving performance comparable to bare-metal HPC clusters while benefiting from Kubernetes' orchestration capabilities. 
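Once the workers are up, a quick final sanity check (a sketch; the Pod name assumes the MPIJob above, and the tools assume the perftest image built earlier in this series, which ships `iproute2` and `ibverbs-utils`) is to confirm from inside a worker that the claimed NIC arrived under the requested name and exposes an RDMA device:

```sh
# The claimed NIC should show up in the worker's network namespace as dranet0
kubectl exec nccl-test-dranet-1gpu-1nic-worker-0 -- ip addr show dranet0
# The backing RDMA device should be visible to the verbs utilities
kubectl exec nccl-test-dranet-1gpu-1nic-worker-0 -- ibv_devices
```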
\ No newline at end of file From 030b9997cf0835594f799ed30f461a7208818e56 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Wed, 28 May 2025 07:45:23 +0000 Subject: [PATCH 5/8] worker mpi images Change-Id: I6c51015d25c333b79109fc58ae5782d9c55dc4b5 --- Dockerfile.perftest | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/Dockerfile.perftest b/Dockerfile.perftest index 5c4f1ff8..4de80631 100644 --- a/Dockerfile.perftest +++ b/Dockerfile.perftest @@ -38,8 +38,8 @@ RUN ./autogen.sh && \ # --- Build openmpi --- WORKDIR /usr/src -RUN wget -O- https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz | tar xzf - -WORKDIR /usr/src/openmpi-5.0.7 +RUN wget -O- https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.8.tar.gz | tar xzf - +WORKDIR /usr/src/openmpi-4.1.8 RUN ./configure --prefix=/opt/openmpi --with-cuda=/usr/local/cuda && \ make -j$(nproc) && \ make install @@ -70,15 +70,42 @@ RUN apt-get update && \ libelf1 \ pciutils \ openssh-server \ + openssh-client \ + libcap2-bin \ libnuma1 && \ rm -rf /var/lib/apt/lists/* + +# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need +# https://github.com/kubeflow/mpi-operator/issues/580 +ARG port=2222 +# Add priviledge separation directoy to run sshd as root. +RUN mkdir -p /var/run/sshd +# Add capability to run sshd as non-root. +RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd + +# Allow OpenSSH to talk to containers without asking for confirmation +# by disabling StrictHostKeyChecking. # mpi-operator mounts the .ssh folder from a Secret. For that to work, we need # to disable UserKnownHostsFile to avoid write permissions. # Disabling StrictModes avoids directory and files read permission checks. -RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config &&\ - sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config -RUN mkdir /run/sshd +RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \ + && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ + && sed -i "s/[ #]\(.*Port \).*/ \1$port/g" /etc/ssh/ssh_config \ + && sed -i "s/#\(StrictModes \).*/\1no/g" /etc/ssh/sshd_config \ + && sed -i "s/#\(Port \).*/\1$port/g" /etc/ssh/sshd_config + +RUN useradd -m mpiuser +WORKDIR /home/mpiuser +# Configurations for running sshd as non-root. 
+RUN mkdir -p /home/mpiuser/.ssh && \ + cat < /home/mpiuser/sshd_config_custom +PidFile /home/mpiuser/sshd.pid +HostKey /home/mpiuser/.ssh/id_rsa +StrictModes no +EOF + +RUN echo "Port $port" >> /home/mpiuser/.sshd_config COPY --from=builder /usr/local/bin/ /usr/local/bin/ COPY --from=builder /opt/openmpi/ /opt/openmpi/ From 01bbc155e16a3f3cbddbf265f264572281132cb4 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Wed, 28 May 2025 11:16:59 +0000 Subject: [PATCH 6/8] update demo gke to reflect custom drivers Change-Id: I45e2e4242253d12ccaf77a7f0cf9d2106addd349 --- examples/demo_gke_rdma/README.md | 107 +++++++++++----------- examples/demo_gke_rdma/nccl-gib-test.yaml | 15 +-- examples/demo_gke_rdma/rdma-perftest.yaml | 17 +++- 3 files changed, 72 insertions(+), 67 deletions(-) diff --git a/examples/demo_gke_rdma/README.md b/examples/demo_gke_rdma/README.md index 924a14a0..0180aab9 100644 --- a/examples/demo_gke_rdma/README.md +++ b/examples/demo_gke_rdma/README.md @@ -60,6 +60,22 @@ You can validate this by using `kubectl get resourceslices -o yaml` and checking ``` +## GKE RDMA and NCCL + +Based on https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute-custom but using only 1 NIC and 1 GPU per Pod to demonstrate how to split workloads to allocate individual resources. + + +### Install the RDMA binary and configure NCCL + +This Daemonset does the following: + +* Installs RDMA binaries and the NCCL library on the node. +* Stores the library and the binary in the /home/kubernetes/bin/nvidia/lib64 and the /home/kubernetes/bin/gib directory on the VM. + +```sh +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/refs/heads/master/gpudirect-rdma/nccl-rdma-installer.yaml +``` + ## Deploy perf-tests RDMA Pods Use the following manifest to install two Pods in the same RDMA network, @@ -130,7 +146,7 @@ Run `rping -s` in one of the Pods and connect from the other to validate the con ``` kubectl exec -it rdma-perftest-1 -- bash -root@rdma-perftest-1:/# LD_LIBRARY_PATH="" rping -c -a 10.0.4.7 -C 3 -v -V +root@rdma-perftest-1:/# rping -c -a 10.0.4.7 -C 3 -v -V ping data: rdma-ping-0: ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqr ping data: rdma-ping-1: BCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrs ping data: rdma-ping-2: CDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrst @@ -295,23 +311,8 @@ deallocating GPU buffer 000078e9f8800000 destroying current CUDA Ctx ``` -## GKE NCCL - -Based on https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute-custom but using only 1 NIC and 1 GPU per Pod to demonstrate how to split workloads to allocate individual resources. - -### Install the RDMA binary and configure NCCL - -This Daemonset does the following: - -* Installs RDMA binaries and the NCCL library on the node. -* Stores the library and the binary in the /home/kubernetes/bin/nvidia/lib64 and the /home/kubernetes/bin/gib directory on the VM. - -```sh -kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/refs/heads/master/gpudirect-rdma/nccl-rdma-installer.yaml -``` - -### Deploy the test workload +### Deploy the test NCCL workload The manifest deploys two test pods, each of which runs in a A3 Ultra node. @@ -332,63 +333,61 @@ nccl-gib-test-0 1/1 Running 0 3s nccl-gib-test-1 1/1 Running 0 1s ``` - - ### Run the tests It is important to pass the right parameters, in this specific example we need to indicate to only use one GPU per node `[-g ]`. 
```sh - kubectl exec nccl-gib-test-0 -it -- /usr/local/gib/scripts/run_nccl_tests.sh -t all_gather -b 1K -g 1 -e 8G nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test +kubectl exec nccl-gib-test-0 -it -- /usr/local/gib/scripts/run_nccl_tests.sh -t all_gather -b 1K -g 1 -e 8G nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test ``` It should return something like: ```sh - +kubectl exec nccl-gib-test-0 -it -- /usr/local/gib/scripts/run_nccl_tests.sh -t all_gather -b 1K -g 1 -e 8G nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test Initializing SSH... +Warning: Permanently added '[nccl-gib-test-0.nccl-gib-test]:222' (ED25519) to the list of known hosts. Hello from nccl-gib-test-0.nccl-gib-test +Warning: Permanently added '[nccl-gib-test-1.nccl-gib-test]:222' (ED25519) to the list of known hosts. Hello from nccl-gib-test-1.nccl-gib-test -+ /usr/local/gib/scripts/gen_hostfiles.sh -p 222 nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test -Generating hostfiles for 2 hosts: +Generating hostfiles for 2 hosts: nccl-gib-test-0.nccl-gib-test nccl-gib-test-1.nccl-gib-test -+ mpirun --allow-run-as-root --mca btl tcp,self --mca btl_tcp_if_include eth0 --bind-to none -np 2 --hostfile /tmp/hostfiles/hostfile1 -x PATH -x LD_LIBRARY_PATH=/usr/local/gib/lib64:/usr/local/nvidia/lib64 -x NCCL_DEBUG=WARN -x NCCL_DEBUG_SUBSYS=INIT,NET -x NCCL_TESTS_SPLIT_MASK=0x0 bash -c 'source /usr/local/gib/scripts/set_nccl_env.sh; /third_party/nccl-tests/build/all_gather_perf -b 1K -e 8G -f 2 -w 50 -n 100;' # nThread 1 nGpus 1 minBytes 1024 maxBytes 8589934592 step: 2(factor) warmup iters: 50 iters: 100 agg iters: 1 validation: 1 graph: 0 # # Using devices -# Rank 0 Group 0 Pid 235 on nccl-gib-test-0 device 0 [0000:90:00] NVIDIA H200 -# Rank 1 Group 0 Pid 161 on nccl-gib-test-1 device 0 [0000:90:00] NVIDIA H200 +# Rank 0 Group 0 Pid 85 on nccl-gib-test-0 device 0 [0000:cc:00] NVIDIA H200 +# Rank 1 Group 0 Pid 54 on nccl-gib-test-1 device 0 [0000:c4:00] NVIDIA H200 NCCL version 2.25.1+cuda12.8 # -# out-of-place in-place +# out-of-place in-place # size count type redop root time algbw busbw #wrong time algbw busbw #wrong -# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) - 1024 128 float none -1 20.89 0.05 0.02 0 20.40 0.05 0.03 0 - 2048 256 float none -1 20.55 0.10 0.05 0 20.48 0.10 0.05 0 - 4096 512 float none -1 20.71 0.20 0.10 0 20.75 0.20 0.10 0 - 8192 1024 float none -1 21.49 0.38 0.19 0 21.62 0.38 0.19 0 - 16384 2048 float none -1 24.56 0.67 0.33 0 24.55 0.67 0.33 0 - 32768 4096 float none -1 25.04 1.31 0.65 0 24.59 1.33 0.67 0 - 65536 8192 float none -1 28.59 2.29 1.15 0 28.04 2.34 1.17 0 - 131072 16384 float none -1 33.46 3.92 1.96 0 36.83 3.56 1.78 0 - 262144 32768 float none -1 47.72 5.49 2.75 0 45.11 5.81 2.91 0 - 524288 65536 float none -1 79.13 6.63 3.31 0 76.17 6.88 3.44 0 - 1048576 131072 float none -1 71.48 14.67 7.33 0 70.06 14.97 7.48 0 - 2097152 262144 float none -1 76.40 27.45 13.72 0 76.41 27.44 13.72 0 - 4194304 524288 float none -1 117.9 35.58 17.79 0 117.3 35.77 17.88 0 - 8388608 1048576 float none -1 203.4 41.24 20.62 0 204.7 40.98 20.49 0 - 16777216 2097152 float none -1 375.1 44.73 22.37 0 371.6 45.14 22.57 0 - 33554432 4194304 float none -1 729.7 45.98 22.99 0 728.9 46.04 23.02 0 - 67108864 8388608 float none -1 1447.6 46.36 23.18 0 1443.5 46.49 23.25 0 - 134217728 16777216 float none -1 2871.7 46.74 23.37 0 2854.1 47.03 23.51 0 - 268435456 33554432 float none -1 5699.0 47.10 23.55 0 5666.1 47.38 23.69 0 - 536870912 67108864 float none -1 11382 47.17 23.58 0 
11026 48.69 24.35 0 - 1073741824 134217728 float none -1 22474 47.78 23.89 0 21049 51.01 25.51 0 - 2147483648 268435456 float none -1 44241 48.54 24.27 0 39256 54.70 27.35 0 - 4294967296 536870912 float none -1 86470 49.67 24.83 0 75081 57.20 28.60 0 - 8589934592 1073741824 float none -1 166030 51.74 25.87 0 141444 60.73 30.37 0 +# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 1024 128 float none -1 21.75 0.05 0.02 0 20.94 0.05 0.02 0 + 2048 256 float none -1 21.48 0.10 0.05 0 21.40 0.10 0.05 0 + 4096 512 float none -1 21.81 0.19 0.09 0 21.91 0.19 0.09 0 + 8192 1024 float none -1 22.45 0.36 0.18 0 22.65 0.36 0.18 0 + 16384 2048 float none -1 26.28 0.62 0.31 0 25.44 0.64 0.32 0 + 32768 4096 float none -1 26.05 1.26 0.63 0 25.64 1.28 0.64 0 + 65536 8192 float none -1 29.97 2.19 1.09 0 29.61 2.21 1.11 0 + 131072 16384 float none -1 33.05 3.97 1.98 0 32.99 3.97 1.99 0 + 262144 32768 float none -1 40.97 6.40 3.20 0 37.59 6.97 3.49 0 + 524288 65536 float none -1 50.18 10.45 5.22 0 46.03 11.39 5.70 0 + 1048576 131072 float none -1 61.30 17.11 8.55 0 57.32 18.29 9.15 0 + 2097152 262144 float none -1 77.15 27.18 13.59 0 77.63 27.01 13.51 0 + 4194304 524288 float none -1 119.5 35.09 17.55 0 121.7 34.48 17.24 0 + 8388608 1048576 float none -1 206.9 40.55 20.28 0 207.7 40.39 20.20 0 + 16777216 2097152 float none -1 371.4 45.17 22.58 0 372.5 45.04 22.52 0 + 33554432 4194304 float none -1 695.5 48.25 24.12 0 698.2 48.06 24.03 0 + 67108864 8388608 float none -1 1282.5 52.33 26.16 0 1280.4 52.41 26.21 0 + 134217728 16777216 float none -1 2395.4 56.03 28.02 0 2548.9 52.66 26.33 0 + 268435456 33554432 float none -1 4526.0 59.31 29.65 0 4506.3 59.57 29.78 0 + 536870912 67108864 float none -1 8827.7 60.82 30.41 0 8873.5 60.50 30.25 0 + 1073741824 134217728 float none -1 17261 62.21 31.10 0 17056 62.95 31.48 0 + 2147483648 268435456 float none -1 33952 63.25 31.62 0 33156 64.77 32.38 0 + 4294967296 536870912 float none -1 67018 64.09 32.04 0 65577 65.50 32.75 0 + 8589934592 1073741824 float none -1 133370 64.41 32.20 0 128890 66.65 33.32 0 # Out of bounds values : 0 OK -# Avg bus bandwidth : 13.132 - +# Avg bus bandwidth : 15.0709 +# ``` \ No newline at end of file diff --git a/examples/demo_gke_rdma/nccl-gib-test.yaml b/examples/demo_gke_rdma/nccl-gib-test.yaml index 5cdecefe..806cd887 100644 --- a/examples/demo_gke_rdma/nccl-gib-test.yaml +++ b/examples/demo_gke_rdma/nccl-gib-test.yaml @@ -22,7 +22,7 @@ spec: deviceClassName: rdma selectors: - cel: - expression: device.attributes["dra.net"].ifName == "gpu2rdma0" + expression: device.attributes["dra.net"].ifName == "gpu1rdma0" --- apiVersion: v1 kind: Service @@ -54,21 +54,16 @@ spec: - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 name: test resources: - requests: - cpu: 150m + limits: + nvidia.com/gpu: 1 volumeMounts: - name: library-dir-host mountPath: /usr/local/nvidia - name: gib mountPath: /usr/local/gib - - name: shared-memory - mountPath: /dev/shm env: - name: LD_LIBRARY_PATH value: /usr/local/nvidia/lib64 - resources: - limits: - nvidia.com/gpu: 1 command: ["/bin/bash", "-c"] args: - | @@ -85,10 +80,6 @@ spec: - name: gib hostPath: path: /home/kubernetes/bin/gib - - name: shared-memory - emptyDir: - medium: "Memory" - sizeLimit: 250Gi resourceClaims: - name: rdma-net-interface resourceClaimTemplateName: rdma-net-template-gib diff --git a/examples/demo_gke_rdma/rdma-perftest.yaml b/examples/demo_gke_rdma/rdma-perftest.yaml index 7e8a0fc5..083372a1 100644 --- a/examples/demo_gke_rdma/rdma-perftest.yaml 
+++ b/examples/demo_gke_rdma/rdma-perftest.yaml @@ -22,7 +22,7 @@ spec: deviceClassName: rdma selectors: - cel: - expression: device.attributes["dra.net"].ifName == "gpu3rdma0" + expression: device.attributes["dra.net"].ifName == "gpu4rdma0" --- apiVersion: v1 kind: Service @@ -60,6 +60,21 @@ spec: limits: nvidia.com/gpu: 1 command: ["sleep", "infinity"] + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib resourceClaims: - name: rdma-net-interface resourceClaimTemplateName: rdma-net-template From 97408a2a4a6fe3c8401d0a54ece70106321c05af Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Wed, 28 May 2025 11:17:26 +0000 Subject: [PATCH 7/8] add rdma core to the perf test image Change-Id: I90741d69f7955579afa5bb5513afdec4170eb9c2 --- Dockerfile.perftest | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.perftest b/Dockerfile.perftest index 4de80631..429fec87 100644 --- a/Dockerfile.perftest +++ b/Dockerfile.perftest @@ -58,6 +58,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ apt-get install -y --no-install-recommends \ rdmacm-utils \ + rdma-core \ iproute2 \ inetutils-ping \ ibverbs-utils \ From 0e17d598cc9609a52cd475b33efd378964b2faf4 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Wed, 28 May 2025 12:03:35 +0000 Subject: [PATCH 8/8] mpi operator Change-Id: Id8c72deafd8f2b3cd465ca7604b2719636617195 --- .../resourceclaimtemplate.yaml | 57 +++++++ examples/mpi_operator/nccl-test-job.yaml | 44 +++-- site/content/docs/user/mpi-operator.md | 158 ++++++++++++------ 3 files changed, 189 insertions(+), 70 deletions(-) create mode 100644 examples/demo_gke_multinetwork/resourceclaimtemplate.yaml diff --git a/examples/demo_gke_multinetwork/resourceclaimtemplate.yaml b/examples/demo_gke_multinetwork/resourceclaimtemplate.yaml new file mode 100644 index 00000000..9f6e4c73 --- /dev/null +++ b/examples/demo_gke_multinetwork/resourceclaimtemplate.yaml @@ -0,0 +1,57 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +apiVersion: resource.k8s.io/v1beta1 +kind: ResourceClaimTemplate +metadata: + name: phy-interfaces-template +spec: + spec: + devices: + requests: + - name: phy-interfaces-template + deviceClassName: multinic + selectors: + - cel: + expression: device.attributes["dra.net"].name == "eth1" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: server-deployment + labels: + app: MyApp +spec: + replicas: 2 + selector: + matchLabels: + app: MyApp + template: + metadata: + labels: + app: MyApp + spec: + resourceClaims: + - name: phy-interfaces + resourceClaimTemplateName: phy-interfaces-template + containers: + - name: agnhost + image: registry.k8s.io/e2e-test-images/agnhost:2.39 + args: + - netexec + - --http-port=80 + ports: + - containerPort: 80 + \ No newline at end of file diff --git a/examples/mpi_operator/nccl-test-job.yaml b/examples/mpi_operator/nccl-test-job.yaml index 90ad6247..a97ae060 100644 --- a/examples/mpi_operator/nccl-test-job.yaml +++ b/examples/mpi_operator/nccl-test-job.yaml @@ -4,29 +4,31 @@ metadata: name: nccl-test-dranet-1gpu-1nic spec: slotsPerWorker: 1 # 1 MPI rank per worker Pod - mpiImplementation: OpenMPI # Or your preferred MPI mpiReplicaSpecs: Launcher: replicas: 1 template: spec: containers: - - image: docker.io/aojea/dranet-perftest:latest@sha256:bfaa20bba4178f20b3c50c2ddd57d3e0488bc94a4d704ea091043c389571fd6e - name: test-launcher - env: - - name: NCCL_DEBUG - value: "INFO" - - name: LD_LIBRARY_PATH - value: "/opt/openmpi/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64" + - image: mpioperator/openmpi:v0.6.0 + name: mpi-launcher command: ["/bin/bash", "-c"] args: - | - echo "--- Environment for mpirun ---" - echo "Effective LD_LIBRARY_PATH: $LD_LIBRARY_PATH" - PATH=/usr/local/bin:/usr/local/nvidia/bin:/opt/openmpi/bin:$PATH - - mpirun --allow-run-as-root -x LD_LIBRARY_PATH -x PATH \ - bash -c 'hostname' + set -ex + mpirun \ + --allow-run-as-root \ + --prefix /opt/openmpi \ + -np 2 \ + -bind-to none \ + -map-by slot \ + -mca routed direct \ + -x LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \ + bash -c \ + "source /usr/local/gib/scripts/set_nccl_env.sh; \ + /usr/local/bin/all_reduce_perf \ + -g 1 -b 1K -e 8G -f 2 \ + -w 5 -n 20;" securityContext: capabilities: add: ["IPC_LOCK"] @@ -38,7 +40,7 @@ spec: - name: worker-rdma-nic resourceClaimTemplateName: mpi-worker-rdma-nic-template containers: - - image: docker.io/aojea/dranet-perftest:latest@sha256:bfaa20bba4178f20b3c50c2ddd57d3e0488bc94a4d704ea091043c389571fd6e + - image: docker.io/aojea/dranet-perftest:latest@sha256:f9729ddd665ea6c5cb1d99754666465d57a84034d808d23a2126a41a7683938c name: mpi-worker securityContext: capabilities: @@ -46,3 +48,15 @@ spec: resources: limits: nvidia.com/gpu: 1 # Each worker gets 1 GPU + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib diff --git a/site/content/docs/user/mpi-operator.md b/site/content/docs/user/mpi-operator.md index 6bf8d95d..3332d215 100644 --- a/site/content/docs/user/mpi-operator.md +++ b/site/content/docs/user/mpi-operator.md @@ -1,5 +1,5 @@ --- -title: "MPI Operator" +title: "MPI Operator on GKE" date: 2025-05-27T11:30:40Z --- @@ -64,74 +64,122 @@ spec: name: "dranet0" # NCCL will use this interface ``` -1. Crafting the MPIJob: +1. Install the GKE optimized RDMA dependencies -The MPIJob specification is where we tie everything together. 
We'll define a job with two workers, each getting one GPU and one DraNet-managed RDMA NIC. +GKE automatically install on the VM some optimized RDMA and NCCL libraries for Google Cloud infrastructure, that can be installed following the instructions on: + +https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute-custom#install-rdma-configure-nccl + +In order to use them you need to mount the following volumes ```yaml -apiVersion: kubeflow.org/v2beta1 -kind: MPIJob -metadata: - name: nccl-test-dranet-1gpu-1nic spec: - slotsPerWorker: 1 # 1 MPI rank per worker Pod - mpiImplementation: OpenMPI # Or your preferred MPI - mpiReplicaSpecs: - Launcher: - replicas: 1 - template: - spec: - containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 - env: - - name: NCCL_DEBUG - value: "INFO" - - name: OMPI_MCA_pml - value: "ucx" - command: - - mpirun - - /third_party/nccl-tests/build/all_reduce_perf - - -b 8K -e 128M -g 1 # Benchmark params: 1 GPU per process - securityContext: - capabilities: - add: ["IPC_LOCK"] - Worker: - replicas: 2 - template: - spec: - resourceClaims: - - name: worker-rdma-nic - resourceClaimTemplateName: mpi-worker-rdma-nic-template - containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.5 - name: mpi-worker - securityContext: - capabilities: - add: ["IPC_LOCK"] - resources: - limits: - nvidia.com/gpu: 1 # Each worker gets 1 GPU + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib ``` -### Key Aspects of this Configuration: - -- **slotsPerWorker: 1:** Each worker Pod hosts a single MPI rank. +in your workloads: -- **Worker.replicas: 2:** We run a 2-rank MPI job. - -- **Worker.template.spec.resourceClaims:** Each worker Pod claims its own RDMA NIC via the template, which DraNet will configure as dranet0. +```yaml +containers: + - name: my-container + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 +``` -- **Worker.template.spec.containers[0].resources.limits["nvidia.com/gpu"]: 1:** Each worker gets one GPU. +1. Crafting the MPIJob: -- **Launcher.template.spec.containers[0].env.NCCL_SOCKET_IFNAME: "dranet0":** This environment variable explicitly tells NCCL to use the dranet0 interface for its network operations. +The MPIJob specification is where we tie everything together. We'll define a job with two workers, each getting one GPU and one DraNet-managed RDMA NIC. -- **MPI MCA Parameters (e.g., UCX_NET_DEVICES="dranet0"):** These guide the MPI library itself to use the specified RDMA interface. +{{}} +{{ readFile "examples/mpi_operator/nccl-test-job.yaml" | safeHTML }} +{{}} -3. Running and Observing: +1. Running and Observing: Once deployed, the MPI Operator will launch the job. The launcher Pod will execute mpirun, which starts the all_reduce_perf test across the two worker Pods. Each worker Pod will use its dedicated GPU and its dedicated dranet0 (RDMA NIC) for NCCL communications. -You can monitor the launcher's logs to see the NCCL benchmark results, including the achieved bus bandwidth. The NCCL_DEBUG=INFO logs will also confirm that NCCL is indeed using the dranet0 interface. +You can monitor the launcher's logs to see the NCCL benchmark results, including the achieved bus bandwidth. 
+ +```sh +kubectl logs $(kubectl get pods | grep launcher | awk '{ print $1}') -f ++ mpirun --allow-run-as-root --prefix /opt/openmpi -np 2 -bind-to none -map-by slot -mca routed direct -x LD_LIBRARY_PATH=/usr/local/nvidia/lib64 bash -c 'source /usr/local/gib/scripts/set_nccl_env.sh; /usr/local/bin/all_reduce_perf -g 1 -b 1K -e 8G -f 2 -w 5 -n 20;' +Warning: Permanently added '[nccl-test-dranet-1gpu-1nic-worker-1.nccl-test-dranet-1gpu-1nic.default.svc]:2222' (ED25519) to the list of known hosts. +Warning: Permanently added '[nccl-test-dranet-1gpu-1nic-worker-0.nccl-test-dranet-1gpu-1nic.default.svc]:2222' (ED25519) to the list of known hosts. +-------------------------------------------------------------------------- +WARNING: No preset parameters were found for the device that Open MPI +detected: + + Local host: nccl-test-dranet-1gpu-1nic-worker-0 + Device name: mlx5_2 + Device vendor ID: 0x02c9 + Device vendor part ID: 4126 + +Default device parameters will be used, which may result in lower +performance. You can edit any of the files specified by the +btl_openib_device_param_files MCA parameter to set values for your +device. + +NOTE: You can turn off this warning by setting the MCA parameter + btl_openib_warn_no_device_params_found to 0. +-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +No OpenFabrics connection schemes reported that they were able to be +used on a specific port. As such, the openib BTL (OpenFabrics +support) will be disabled for this port. + + Local host: nccl-test-dranet-1gpu-1nic-worker-0 + Local device: mlx5_2 + Local port: 1 + CPCs attempted: rdmacm, udcm +-------------------------------------------------------------------------- +# nThread 1 nGpus 1 minBytes 1024 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0 +# +# Using devices +# Rank 0 Group 0 Pid 23 on nccl-test-dranet-1gpu-1nic-worker-0 device 0 [0000:cc:00] NVIDIA H200 +# Rank 1 Group 0 Pid 21 on nccl-test-dranet-1gpu-1nic-worker-1 device 0 [0000:97:00] NVIDIA H200 +# +# out-of-place in-place +# size count type redop root time algbw busbw #wrong time algbw busbw #wrong +# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 1024 256 float sum -1 35.59 0.03 0.03 0 30.61 0.03 0.03 0 + 2048 512 float sum -1 31.80 0.06 0.06 0 31.90 0.06 0.06 0 + 4096 1024 float sum -1 33.56 0.12 0.12 0 33.33 0.12 0.12 0 + 8192 2048 float sum -1 39.33 0.21 0.21 0 39.24 0.21 0.21 0 + 16384 4096 float sum -1 41.89 0.39 0.39 0 40.31 0.41 0.41 0 + 32768 8192 float sum -1 45.47 0.72 0.72 0 42.92 0.76 0.76 0 + 65536 16384 float sum -1 54.03 1.21 1.21 0 51.81 1.26 1.26 0 + 131072 32768 float sum -1 51.86 2.53 2.53 0 52.60 2.49 2.49 0 + 262144 65536 float sum -1 79.10 3.31 3.31 0 68.36 3.83 3.83 0 + 524288 131072 float sum -1 76.88 6.82 6.82 0 76.38 6.86 6.86 0 + 1048576 262144 float sum -1 98.57 10.64 10.64 0 93.72 11.19 11.19 0 + 2097152 524288 float sum -1 131.9 15.90 15.90 0 131.8 15.91 15.91 0 + 4194304 1048576 float sum -1 227.5 18.44 18.44 0 227.4 18.45 18.45 0 + 8388608 2097152 float sum -1 415.7 20.18 20.18 0 416.7 20.13 20.13 0 + 16777216 4194304 float sum -1 811.3 20.68 20.68 0 808.5 20.75 20.75 0 + 33554432 8388608 float sum -1 1609.7 20.84 20.84 0 1607.6 20.87 20.87 0 + 67108864 16777216 float sum -1 2250.8 29.82 29.82 0 2253.3 29.78 29.78 0 + 134217728 33554432 float sum -1 4440.0 30.23 30.23 0 4444.3 30.20 30.20 0 + 268435456 67108864 float sum -1 8635.4 31.09 31.09 0 
8653.9 31.02 31.02 0 + 536870912 134217728 float sum -1 17077 31.44 31.44 0 17081 31.43 31.43 0 + 1073741824 268435456 float sum -1 33860 31.71 31.71 0 33896 31.68 31.68 0 + 2147483648 536870912 float sum -1 67521 31.80 31.80 0 67503 31.81 31.81 0 + 4294967296 1073741824 float sum -1 134734 31.88 31.88 0 135069 31.80 31.80 0 + 8589934592 2147483648 float sum -1 269368 31.89 31.89 0 269407 31.88 31.88 0 +# Out of bounds values : 0 OK +# Avg bus bandwidth : 15.5188 +``` ## The Power of Compartmentalization with DraNet