This repository was archived by the owner on May 6, 2026. It is now read-only.
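Description: It seems only one device gets updated. In the output below, the first two claims each allocate eight devices (`status.allocation.devices.results`), but `status.devices` contains a single entry per claim (`gpu7rdma0` for the first claim, `gpu6rdma0` for the second).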
kubectl get resourceclaims -o yaml
apiVersion: v1
items:
- apiVersion: resource.k8s.io/v1beta1
kind: ResourceClaim
metadata:
annotations:
resource.kubernetes.io/pod-claim-name: rdma-net-interface
creationTimestamp: "2025-06-17T22:59:48Z"
finalizers:
- resource.kubernetes.io/delete-protection
generateName: nccl-gib-test-0-rdma-net-interface-
name: nccl-gib-test-0-rdma-net-interface-8dj8b
namespace: default
ownerReferences:
- apiVersion: v1
blockOwnerDeletion: true
controller: true
kind: Pod
name: nccl-gib-test-0
uid: 0c344e45-9866-4c3f-bc3f-44e8b664bbbd
resourceVersion: "1750201254435023024"
uid: 18317a3d-300d-41e3-ba99-4fe78f425d21
spec:
devices:
requests:
- allocationMode: ExactCount
count: 8
deviceClassName: dranet
name: rdma-net-interface
selectors:
- cel:
expression: device.attributes["dra.net"].rdma == true
status:
allocation:
devices:
results:
- adminAccess: null
device: gpu0rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
request: rdma-net-interface
- adminAccess: null
device: gpu1rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
request: rdma-net-interface
- adminAccess: null
device: gpu2rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
request: rdma-net-interface
- adminAccess: null
device: gpu3rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
request: rdma-net-interface
- adminAccess: null
device: gpu4rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
request: rdma-net-interface
- adminAccess: null
device: gpu5rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
request: rdma-net-interface
- adminAccess: null
device: gpu6rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
request: rdma-net-interface
- adminAccess: null
device: gpu7rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
request: rdma-net-interface
nodeSelector:
nodeSelectorTerms:
- matchFields:
- key: metadata.name
operator: In
values:
- gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
devices:
- conditions:
- lastTransitionTime: "2025-06-17T23:00:54Z"
message: ""
reason: NetworkDeviceReady
status: "True"
type: Ready
- lastTransitionTime: "2025-06-17T23:00:54Z"
message: ""
reason: NetworkReady
status: "True"
type: NetworkReady
device: gpu7rdma0
driver: dra.net
networkData:
hardwareAddress: 62:55:7a:a9:36:16
interfaceName: gpu7rdma0
ips:
- 192.168.8.4/32
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-lp53
reservedFor:
- name: nccl-gib-test-0
resource: pods
uid: 0c344e45-9866-4c3f-bc3f-44e8b664bbbd
- apiVersion: resource.k8s.io/v1beta1
kind: ResourceClaim
metadata:
annotations:
resource.kubernetes.io/pod-claim-name: rdma-net-interface
creationTimestamp: "2025-06-17T23:00:55Z"
finalizers:
- resource.kubernetes.io/delete-protection
generateName: nccl-gib-test-1-rdma-net-interface-
name: nccl-gib-test-1-rdma-net-interface-j5gzp
namespace: default
ownerReferences:
- apiVersion: v1
blockOwnerDeletion: true
controller: true
kind: Pod
name: nccl-gib-test-1
uid: 09bdeedc-9981-4b04-9dc6-0fb687b07a2f
resourceVersion: "1750201286238063000"
uid: 5eb63421-14fe-4ec8-8903-c74fb2c00143
spec:
devices:
requests:
- allocationMode: ExactCount
count: 8
deviceClassName: dranet
name: rdma-net-interface
selectors:
- cel:
expression: device.attributes["dra.net"].rdma == true
status:
allocation:
devices:
results:
- adminAccess: null
device: gpu0rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
request: rdma-net-interface
- adminAccess: null
device: gpu1rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
request: rdma-net-interface
- adminAccess: null
device: gpu2rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
request: rdma-net-interface
- adminAccess: null
device: gpu3rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
request: rdma-net-interface
- adminAccess: null
device: gpu4rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
request: rdma-net-interface
- adminAccess: null
device: gpu5rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
request: rdma-net-interface
- adminAccess: null
device: gpu6rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
request: rdma-net-interface
- adminAccess: null
device: gpu7rdma0
driver: dra.net
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
request: rdma-net-interface
nodeSelector:
nodeSelectorTerms:
- matchFields:
- key: metadata.name
operator: In
values:
- gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
devices:
- conditions:
- lastTransitionTime: "2025-06-17T23:01:26Z"
message: ""
reason: NetworkDeviceReady
status: "True"
type: Ready
- lastTransitionTime: "2025-06-17T23:01:26Z"
message: ""
reason: NetworkReady
status: "True"
type: NetworkReady
device: gpu6rdma0
driver: dra.net
networkData:
hardwareAddress: 92:77:04:06:a2:13
interfaceName: gpu6rdma0
ips:
- 192.168.7.5/32
pool: gke-dranet-maspinwal-dranet-maspinwal-d3003787-zwtv
reservedFor:
- name: nccl-gib-test-1
resource: pods
uid: 09bdeedc-9981-4b04-9dc6-0fb687b07a2f
- apiVersion: resource.k8s.io/v1beta1
kind: ResourceClaim
metadata:
annotations:
resource.kubernetes.io/pod-claim-name: rdma-net-interface
creationTimestamp: "2025-06-17T23:15:16Z"
generateName: nccl-gib-test-4w-0-rdma-net-interface-
name: nccl-gib-test-4w-0-rdma-net-interface-7f9gq
namespace: default
ownerReferences:
- apiVersion: v1
blockOwnerDeletion: true
controller: true
kind: Pod
name: nccl-gib-test-4w-0
uid: e93f4e36-ae83-4701-85f6-dfeb25bdfe38
resourceVersion: "1750202116244175024"
uid: c7b82de3-bb74-484d-b769-eb3d1a3edcb2
spec:
devices:
requests:
- allocationMode: ExactCount
count: 4
deviceClassName: dranet
name: rdma-net-interface
selectors:
- cel:
expression: |
device.attributes["dra.net"].rdma == true &&
(
(device.attributes["dra.net"].ifName.startsWith("gpu") &&
device.attributes["dra.net"].ifName.endsWith("rdma0") &&
int(device.attributes["dra.net"].ifName.substring(3, 4)) < 4)
||
(device.attributes["dra.net"].ifName.startsWith("gpu") &&
device.attributes["dra.net"].ifName.endsWith("rdma0") &&
int(device.attributes["dra.net"].ifName.substring(3, 4)) >= 4)
)
status: {}
kind: List
metadata:
Reactions are currently unavailable
It seems only one device gets updated