From e31e494cae7efa0b421c94184abcf0437473bd00 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 13:49:23 -0700
Subject: [PATCH 01/13] clean up deps

---
 go.mod | 4 ----
 go.sum | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/go.mod b/go.mod
index 62595c07..7fe6a39a 100644
--- a/go.mod
+++ b/go.mod
@@ -63,7 +63,6 @@ require (
 	github.com/NYTimes/gziphandler v1.1.1 // indirect
 	github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
 	github.com/antonmedv/expr v1.15.3 // indirect
-	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/blang/semver/v4 v4.0.0 // indirect
 	github.com/blendle/zapdriver v1.3.1 // indirect
@@ -101,7 +100,6 @@ require (
 	github.com/google/cel-go v0.23.2 // indirect
 	github.com/google/gnostic-models v0.6.9 // indirect
 	github.com/google/go-containerregistry v0.16.1 // indirect
-	github.com/google/gofuzz v1.2.0 // indirect
 	github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect
 	github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
 	github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
@@ -109,12 +107,10 @@ require (
 	github.com/hashicorp/errwrap v1.1.0 // indirect
 	github.com/hashicorp/golang-lru v1.0.2 // indirect
 	github.com/hashicorp/hcl v1.0.0 // indirect
-	github.com/imdario/mergo v0.3.16 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/kelseyhightower/envconfig v1.4.0 // indirect
-	github.com/klauspost/compress v1.18.0 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.7 // indirect
 	github.com/kylelemons/godebug v1.1.0 // indirect
 	github.com/leodido/go-urn v1.4.0 // indirect
diff --git a/go.sum b/go.sum
index 6906429c..cab24b93 100644
--- a/go.sum
+++ b/go.sum
@@ -58,8 +58,6 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8
 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
 github.com/antonmedv/expr v1.15.3 h1:q3hOJZNvLvhqE8OHBs1cFRdbXFNKuA+bHmRaI+AmRmI=
 github.com/antonmedv/expr v1.15.3/go.mod h1:0E/6TxnOlRNp81GMzX9QfDPAmHo2Phg00y4JUv1ihsE=
-github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
-github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
 github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -287,8 +285,6 @@ github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
 github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
 github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
 github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
-github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
-github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
 github.com/jarcoal/httpmock v1.2.0 h1:gSvTxxFR/MEMfsGrvRbdfpRUMBStovlSRLw0Ep1bwwc=

From f1d13d51992aa0d3ed2da9d17447676ec39db5f1 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 14:17:57 -0700
Subject: [PATCH 02/13] update router image

---
 config/runtimes/srt/deepseek-rdma-rt.yaml                       | 2 +-
 config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml         | 2 +-
 config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml              | 2 +-
 config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml       | 2 +-
 config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml               | 2 +-
 config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml               | 2 +-
 config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml   | 2 +-
 config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml              | 2 +-
 .../runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml | 2 +-
 config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml      | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml
index 99b21eb3..33e4eb3f 100644
--- a/config/runtimes/srt/deepseek-rdma-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-rt.yaml
@@ -35,7 +35,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
index 14d6b5a3..d661af78 100644
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
@@ -108,7 +108,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
index e3d7616e..42c6a0cb 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
@@ -110,7 +110,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
index a484dc2e..448e1980 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
@@ -108,7 +108,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
index ce55124c..71a957b7 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
@@ -110,7 +110,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
index 6d1d6842..3bc1b8b8 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
@@ -110,7 +110,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
index 4a1d0e7b..8686bac5 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
@@ -108,7 +108,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
index d7096f4f..ebcad544 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
@@ -110,7 +110,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
index 70e1d0f2..bc66768a 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
@@ -109,7 +109,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
index 07be4bad..d9a86c63 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
@@ -110,7 +110,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"

From 3ea784c75bf29a9f67031cc295f9fa7018d897da Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 14:33:05 -0700
Subject: [PATCH 03/13] add router config to sglang runtimes

---
 .../runtimes/srt/mistral-7b-instruct-rt.yaml  | 33 ++++++++++++++++++-
 .../srt/mixtral-8x7b-instruct-rt.yaml         | 33 ++++++++++++++++++-
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
index 72262542..53ff611b 100644
--- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
@@ -106,4 +106,35 @@ spec:
         successThreshold: 1
         periodSeconds: 6
         initialDelaySeconds: 60
-        timeoutSeconds: 30
\ No newline at end of file
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
index 0a18d556..441de166 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
@@ -106,4 +106,35 @@ spec:
         successThreshold: 1
         periodSeconds: 6
         initialDelaySeconds: 60
-        timeoutSeconds: 30
\ No newline at end of file
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file

From d8e123a932712493f6c9f702031a2ad6f9ee76de Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 15:05:10 -0700
Subject: [PATCH 04/13] add pd runtimes

---
 .../llama-3-1-405b-instruct-fp8-pd-rt.yaml    | 141 +++++++++++++++++
 .../srt/llama-3-1-70b-instruct-pd-rt.yaml     | 143 ++++++++++++++++++
 .../llama-3-2-11b-vision-instruct-pd-rt.yaml  | 141 +++++++++++++++++
 .../srt/llama-3-2-1b-instruct-pd-rt.yaml      | 143 ++++++++++++++++++
 .../srt/llama-3-2-3b-instruct-pd-rt.yaml      | 143 ++++++++++++++++++
 ...ama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 141 +++++++++++++++++
 .../srt/llama-3-3-70b-instruct-pd-rt.yaml     | 143 ++++++++++++++++++
 ...-maverick-17b-128e-instruct-fp8-pd-rt.yaml |   2 +-
 .../llama-4-scout-17b-16e-instruct-pd-rt.yaml | 143 ++++++++++++++++++
 .../srt/mistral-7b-instruct-pd-rt.yaml        | 143 ++++++++++++++++++
 .../srt/mixtral-8x7b-instruct-pd-rt.yaml      | 143 ++++++++++++++++++
 11 files changed, 1425 insertions(+), 1 deletion(-)
 create mode 100644 config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
 create mode 100644 config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
 create mode 100644 config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
 create mode 100644 config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
 create mode 100644 config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
 create mode 100644 config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
 create mode 100644 config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
 create mode 100644 config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
 create mode 100644 config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
 create mode 100644 config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml

diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
new file mode 100644
index 00000000..6517ba8f
--- /dev/null
+++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
@@ -0,0 +1,141 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-1-405b-instruct-fp8
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.43.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 400B
+    max: 410B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 8 \
+          --mem-frac=0.9
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 128
+          memory: 216Gi
+          nvidia.com/gpu: 8
+        limits:
+          cpu: 128
+          memory: 216Gi
+          nvidia.com/gpu: 8
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..dbf58ea5
--- /dev/null
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
@@ -0,0 +1,143 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-1-70b-instruct
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.42.3"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 60B
+    max: 75B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size=4 \
+          --mem-frac=0.9
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
new file mode 100644
index 00000000..14a8e52e
--- /dev/null
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
@@ -0,0 +1,141 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-2-11b-vision-instruct
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.45.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: MllamaForConditionalGeneration
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 10B
+    max: 12B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 1 \
+          --mem-frac=0.9 \
+          --chat-template llama_3_vision
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..05432454
--- /dev/null
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -0,0 +1,143 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-2-1b-instruct
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.45.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 500M
+    max: 2B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 1 \
+          --mem-frac=0.9
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..e7903afd
--- /dev/null
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -0,0 +1,143 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-2-3b-instruct
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.45.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 2B
+    max: 4B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 1 \
+          --mem-frac=0.9
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
new file mode 100644
index 00000000..fde24e3e
--- /dev/null
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
@@ -0,0 +1,141 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-2-90b-vision-instruct-fp8
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.46.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: MllamaForConditionalGeneration
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 85B
+    max: 95B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 4 \
+          --mem-frac=0.9 \
+          --chat-template llama_3_vision
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 30
+          memory: 100Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 30
+          memory: 100Gi
+          nvidia.com/gpu: 4
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..ce194b7a
--- /dev/null
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
@@ -0,0 +1,143 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-3-70b-instruct
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.47.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 60B
+    max: 75B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size=4 \
+          --mem-frac=0.9
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
index b62ae96c..e57b1323 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
@@ -210,7 +210,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev13
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
new file mode 100644
index 00000000..757336a2
--- /dev/null
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -0,0 +1,143 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-4-scout-17b-16e-instruct
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.51.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: Llama4ForConditionalGeneration
+      autoSelect: true
+      priority: 2
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 100B
+    max: 109B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --model-path="$MODEL_PATH" \
+          --tp 4 \
+          --mem-frac=0.95 \
+          --context-length=128000 \
+          --chat-template llama-4 \
+          --attention-backend fa3 \
+          --log-requests
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..69cab117
--- /dev/null
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -0,0 +1,143 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-mistral-7b-instruct
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.36.2"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: MistralForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 5B
+    max: 9B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model="$MODEL_PATH" \
+          --tp 2 \
+          --mem-frac=0.9
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..6dcec754
--- /dev/null
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -0,0 +1,143 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-mixtral-8x7b-instruct
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.36.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: MixtralForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 45B
+    max: 50B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model="$MODEL_PATH" \
+          --tp 2 \
+          --mem-frac=0.9
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file

From fe28d78a6bd4797ae28a3de3b14cd53b9f359fd8 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 15:52:22 -0700
Subject: [PATCH 05/13] add decoderconfig to pd runtimes

---
 .../llama-3-1-405b-instruct-fp8-pd-rt.yaml    | 102 ++++++++++++++++-
 .../srt/llama-3-1-70b-instruct-pd-rt.yaml     | 104 +++++++++++++++++-
 .../llama-3-2-11b-vision-instruct-pd-rt.yaml  | 102 ++++++++++++++++-
 .../srt/llama-3-2-1b-instruct-pd-rt.yaml      | 104 +++++++++++++++++-
 .../srt/llama-3-2-3b-instruct-pd-rt.yaml      | 104 +++++++++++++++++-
 ...ama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 102 ++++++++++++++++-
 .../srt/llama-3-3-70b-instruct-pd-rt.yaml     | 104 +++++++++++++++++-
 ...-maverick-17b-128e-instruct-fp8-pd-rt.yaml |   6 -
 ...a-4-maverick-17b-128e-instruct-fp8-rt.yaml |   3 -
 .../llama-4-scout-17b-16e-instruct-pd-rt.yaml | 104 +++++++++++++++++-
 .../srt/mistral-7b-instruct-pd-rt.yaml        | 104 +++++++++++++++++-
 .../srt/mixtral-8x7b-instruct-pd-rt.yaml      | 104 +++++++++++++++++-
 12 files changed, 993 insertions(+), 50 deletions(-)

diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
index 6517ba8f..f131b54c 100644
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 410B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -43,6 +46,8 @@ spec:
                   operator: In
                   values:
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -63,7 +68,98 @@ spec:
           --log-requests \
           --model-path="$MODEL_PATH" \
           --tp 8 \
-          --mem-frac=0.9
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 128
+          memory: 216Gi
+          nvidia.com/gpu: 8
+        limits:
+          cpu: 128
+          memory: 216Gi
+          nvidia.com/gpu: 8
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 8 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -76,7 +172,6 @@ spec:
           cpu: 128
           memory: 216Gi
           nvidia.com/gpu: 8
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -85,7 +180,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -94,7 +188,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
@@ -104,7 +197,6 @@ spec:
         periodSeconds: 6
         initialDelaySeconds: 60
         timeoutSeconds: 30
-
   routerConfig:
     runner:
       name: router
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
index dbf58ea5..2906b2f2 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 75B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -46,6 +49,8 @@ spec:
                     - BM.GPU4.8
                     - BM.GPU.A100-v2.8
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -66,7 +71,101 @@ spec:
           --log-requests \
           --model-path="$MODEL_PATH" \
           --tp-size=4 \
-          --mem-frac=0.9
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size=4 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -79,7 +178,6 @@ spec:
           cpu: 10
           memory: 160Gi
           nvidia.com/gpu: 4
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -88,7 +186,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -97,7 +194,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
index 14a8e52e..5c7ac9da 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 12B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -43,6 +46,8 @@ spec:
                   operator: In
                   values:
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -64,7 +69,99 @@ spec:
           --model-path="$MODEL_PATH" \
           --tp 1 \
           --mem-frac=0.9 \
-          --chat-template llama_3_vision
+          --chat-template llama_3_vision \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 1 \
+          --mem-frac=0.9 \
+          --chat-template llama_3_vision \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -77,7 +174,6 @@ spec:
           cpu: 10
           memory: 30Gi
           nvidia.com/gpu: 1
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -86,7 +182,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -95,7 +190,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
index 05432454..055d7a2a 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 2B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -46,6 +49,8 @@ spec:
                     - BM.GPU4.8
                     - BM.GPU.A100-v2.8
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -66,7 +71,101 @@ spec:
           --log-requests \
           --model-path="$MODEL_PATH" \
           --tp 1 \
-          --mem-frac=0.9
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 1 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -79,7 +178,6 @@ spec:
           cpu: 10
           memory: 30Gi
           nvidia.com/gpu: 1
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -88,7 +186,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -97,7 +194,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
index e7903afd..b019ec2e 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 4B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -46,6 +49,8 @@ spec:
                     - BM.GPU4.8
                     - BM.GPU.A100-v2.8
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -66,7 +71,101 @@ spec:
           --log-requests \
           --model-path="$MODEL_PATH" \
           --tp 1 \
-          --mem-frac=0.9
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 1 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -79,7 +178,6 @@ spec:
           cpu: 10
           memory: 30Gi
           nvidia.com/gpu: 1
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -88,7 +186,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -97,7 +194,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
index fde24e3e..49b54544 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 95B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -43,6 +46,8 @@ spec:
                   operator: In
                   values:
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -64,7 +69,99 @@ spec:
           --model-path="$MODEL_PATH" \
           --tp 4 \
           --mem-frac=0.9 \
-          --chat-template llama_3_vision
+          --chat-template llama_3_vision \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 30
+          memory: 100Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 30
+          memory: 100Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp 4 \
+          --mem-frac=0.9 \
+          --chat-template llama_3_vision \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -77,7 +174,6 @@ spec:
           cpu: 30
           memory: 100Gi
           nvidia.com/gpu: 4
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -86,7 +182,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -95,7 +190,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
index ce194b7a..191181d2 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 75B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -46,6 +49,8 @@ spec:
                     - BM.GPU4.8
                     - BM.GPU.A100-v2.8
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -66,7 +71,101 @@ spec:
           --log-requests \
           --model-path="$MODEL_PATH" \
           --tp-size=4 \
-          --mem-frac=0.9
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size=4 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -79,7 +178,6 @@ spec:
           cpu: 10
           memory: 160Gi
           nvidia.com/gpu: 4
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -88,7 +186,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -97,7 +194,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
index e57b1323..976ec907 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
@@ -85,7 +85,6 @@ spec:
           cpu: 128
           memory: 512Gi
           nvidia.com/gpu: 8
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -94,7 +93,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -103,7 +101,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
@@ -179,7 +176,6 @@ spec:
           cpu: 128
           memory: 512Gi
           nvidia.com/gpu: 8
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -188,7 +184,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -197,7 +192,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
index bc66768a..8c750599 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
@@ -78,7 +78,6 @@ spec:
           cpu: 128
           memory: 512Gi
           nvidia.com/gpu: 8
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -87,7 +86,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -96,7 +94,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
index 757336a2..19510eea 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 109B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -43,6 +46,8 @@ spec:
                   operator: In
                   values:
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -66,7 +71,101 @@ spec:
           --context-length=128000 \
           --chat-template llama-4 \
           --attention-backend fa3 \
-          --log-requests
+          --log-requests \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --model-path="$MODEL_PATH" \
+          --tp 4 \
+          --mem-frac=0.95 \
+          --context-length=128000 \
+          --chat-template llama-4 \
+          --attention-backend fa3 \
+          --log-requests \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -79,7 +178,6 @@ spec:
           cpu: 64
           memory: 256Gi
           nvidia.com/gpu: 4
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -88,7 +186,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -97,7 +194,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
index 69cab117..c594751b 100644
--- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 9B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -46,6 +49,8 @@ spec:
                     - BM.GPU4.8
                     - BM.GPU.A100-v2.8
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -66,7 +71,101 @@ spec:
           --log-requests \
           --model="$MODEL_PATH"\
           --tp 2 \
-          --mem-frac=0.9
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model="$MODEL_PATH"\
+          --tp 2 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -79,7 +178,6 @@ spec:
           cpu: 10
           memory: 30Gi
           nvidia.com/gpu: 2
-
       readinessProbe:
         httpGet:
           path: /health
@@ -88,7 +186,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -97,7 +194,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
index 6dcec754..9b8da644 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -21,6 +21,9 @@ spec:
     max: 50B
   engineConfig:
     annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
       prometheus.io/scrape: "true"
       prometheus.io/port: "8080"
       prometheus.io/path: "/metrics"
@@ -46,6 +49,8 @@ spec:
                     - BM.GPU4.8
                     - BM.GPU.A100-v2.8
                     - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
     runner:
       name: ome-container
       image: ghcr.io/moirai-internal/sgl:dev2
@@ -66,7 +71,101 @@ spec:
           --log-requests \
           --model="$MODEL_PATH"\
           --tp 2 \
-          --mem-frac=0.9
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.B4.8
+                    - BM.GPU4.8
+                    - BM.GPU.A100-v2.8
+                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: ghcr.io/moirai-internal/sgl:dev2
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model="$MODEL_PATH"\
+          --tp 2 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -79,7 +178,6 @@ spec:
           cpu: 10
           memory: 30Gi
           nvidia.com/gpu: 2
-
       readinessProbe:
         httpGet:
           path: /health
@@ -88,7 +186,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -97,7 +194,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health

From 407aada8cae9ba5571b5123c1341de2e6756d49d Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 16:16:19 -0700
Subject: [PATCH 06/13] add -pd as suffix to runtime names

---
 config/runtimes/srt/deepseek-rdma-pd-rt.yaml                    | 2 +-
 config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml      | 2 +-
 config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml           | 2 +-
 config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml    | 2 +-
 config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml            | 2 +-
 config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml            | 2 +-
 .../runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml   | 2 +-
 config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml           | 2 +-
 config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml   | 2 +-
 config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml              | 2 +-
 config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml            | 2 +-
 config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml               | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
index 2d05d410..5a54d7c8 100644
--- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-deepseek-pd-rdma
+  name: srt-deepseek-rdma-pd
 spec:
   disabled: false
   modelSizeRange:
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
index f131b54c..52b5022d 100644
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-1-405b-instruct-fp8
+  name: srt-llama-3-1-405b-instruct-fp8-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
index 2906b2f2..2753bdad 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-1-70b-instruct
+  name: srt-llama-3-1-70b-instruct-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
index 5c7ac9da..d7e8766e 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-2-11b-vision-instruct
+  name: srt-llama-3-2-11b-vision-instruct-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
index 055d7a2a..61e82e48 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-2-1b-instruct
+  name: srt-llama-3-2-1b-instruct-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
index b019ec2e..72b38ebc 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-2-3b-instruct
+  name: srt-llama-3-2-3b-instruct-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
index 49b54544..fc34160f 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-2-90b-vision-instruct-fp8
+  name: srt-llama-3-2-90b-vision-instruct-fp8-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
index 191181d2..1bcb0328 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-3-70b-instruct
+  name: srt-llama-3-3-70b-instruct-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
index 19510eea..6b8dc2b2 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-4-scout-17b-16e-instruct
+  name: srt-llama-4-scout-17b-16e-instruct-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
index c594751b..b2bc39e7 100644
--- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-mistral-7b-instruct
+  name: srt-mistral-7b-instruct-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
index 9b8da644..c4aea6ef 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-mmixtral-8x7b-instruct
+  name: srt-mixtral-8x7b-instruct-pd
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
index 441de166..62251b91 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-mmixtral-8x7b-instruct
+  name: srt-mixtral-8x7b-instruct
 spec:
   disabled: false
   supportedModelFormats:

From e479760d9d381e42e6c2e78932e4de796b531c83 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 16:42:21 -0700
Subject: [PATCH 07/13] use --model-path for consistency

---
 config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml   | 4 ++--
 config/runtimes/srt/mistral-7b-instruct-rt.yaml      | 2 +-
 config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml | 4 ++--
 config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml    | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
index b2bc39e7..89ebc20a 100644
--- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -69,7 +69,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --log-requests \
-          --model="$MODEL_PATH"\
+          --model-path="$MODEL_PATH"\
           --tp 2 \
           --mem-frac=0.9 \
           --disaggregation-mode prefill \
@@ -161,7 +161,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --log-requests \
-          --model="$MODEL_PATH"\
+          --model-path="$MODEL_PATH"\
           --tp 2 \
           --mem-frac=0.9 \
           --disaggregation-mode decode \
diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
index 53ff611b..231c1842 100644
--- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
@@ -64,7 +64,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --log-requests \
-          --model="$MODEL_PATH"\
+          --model-path="$MODEL_PATH"\
           --tp 2 \
           --mem-frac=0.9
       volumeMounts:
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
index c4aea6ef..ad2a4908 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -69,7 +69,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --log-requests \
-          --model="$MODEL_PATH"\
+          --model-path="$MODEL_PATH"\
           --tp 2 \
           --mem-frac=0.9 \
           --disaggregation-mode prefill \
@@ -161,7 +161,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --log-requests \
-          --model="$MODEL_PATH"\
+          --model-path="$MODEL_PATH"\
           --tp 2 \
           --mem-frac=0.9 \
           --disaggregation-mode decode \
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
index 62251b91..c3e26913 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
@@ -64,7 +64,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --log-requests \
-          --model="$MODEL_PATH"\
+          --model-path="$MODEL_PATH"\
           --tp 2 \
           --mem-frac=0.9
       volumeMounts:

From db94f2bfb6f34db24dfa3f6ab56770d554c87e97 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 16:49:53 -0700
Subject: [PATCH 08/13] replace --tp with --tp-size for consistency

---
 config/runtimes/srt/deepseek-rdma-rt.yaml                     | 4 ++--
 config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml            | 2 +-
 config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml    | 4 ++--
 config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml       | 2 +-
 config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml  | 4 ++--
 config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml     | 2 +-
 config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml          | 4 ++--
 config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml             | 2 +-
 config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml          | 4 ++--
 config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml             | 2 +-
 .../runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 4 ++--
 config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml | 2 +-
 .../srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml     | 4 ++--
 .../srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml        | 2 +-
 config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml | 4 ++--
 config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml    | 2 +-
 config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml            | 4 ++--
 config/runtimes/srt/mistral-7b-instruct-rt.yaml               | 2 +-
 config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml          | 4 ++--
 config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml             | 2 +-
 20 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml
index 33e4eb3f..2c3bd513 100644
--- a/config/runtimes/srt/deepseek-rdma-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-rt.yaml
@@ -98,7 +98,7 @@ spec:
             python3 -m sglang.launch_server 
             --host 0.0.0.0 --port 8080 
             --model-path ${MODEL_PATH} 
-            --tp 16 
+            --tp-size 16 
             --nccl-init $(LWS_LEADER_ADDRESS):5000 
             --nnodes ${LWS_GROUP_SIZE} 
             --node-rank ${LWS_WORKER_INDEX} 
@@ -169,7 +169,7 @@ spec:
             --host 0.0.0.0 
             --port 8080 
             --model-path ${MODEL_PATH} 
-            --tp 16 
+            --tp-size 16 
             --nccl-init $(LWS_LEADER_ADDRESS):5000 
             --nnodes ${LWS_GROUP_SIZE} 
             --node-rank ${LWS_WORKER_INDEX} 
diff --git a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
index 2f9104ff..d139ddd9 100644
--- a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
@@ -38,7 +38,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --is-embedding
       volumeMounts:
         - mountPath: /dev/shm
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
index 52b5022d..1d005591 100644
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
@@ -67,7 +67,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 8 \
+          --tp-size 8 \
           --mem-frac=0.9 \
           --disaggregation-mode prefill \
           --disaggregation-ib-device mlx5_0
@@ -156,7 +156,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 8 \
+          --tp-size 8 \
           --mem-frac=0.9 \
           --disaggregation-mode decode \
           --disaggregation-ib-device mlx5_0
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
index d661af78..c152183b 100644
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
@@ -62,7 +62,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 8 \
+          --tp-size 8 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
index d7e8766e..b12b0c96 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
@@ -67,7 +67,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9 \
           --chat-template llama_3_vision \
           --disaggregation-mode prefill \
@@ -157,7 +157,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9 \
           --chat-template llama_3_vision \
           --disaggregation-mode decode \
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
index 448e1980..d141e791 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
@@ -62,7 +62,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9 \
           --chat-template llama_3_vision
       volumeMounts:
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
index 61e82e48..58c9b505 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -70,7 +70,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9 \
           --disaggregation-mode prefill \
           --disaggregation-ib-device mlx5_0
@@ -162,7 +162,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9 \
           --disaggregation-mode decode \
           --disaggregation-ib-device mlx5_0
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
index 71a957b7..38fe8459 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
@@ -65,7 +65,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
index 72b38ebc..c45678e5 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -70,7 +70,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9 \
           --disaggregation-mode prefill \
           --disaggregation-ib-device mlx5_0
@@ -162,7 +162,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9 \
           --disaggregation-mode decode \
           --disaggregation-ib-device mlx5_0
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
index 3bc1b8b8..3c78795e 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
@@ -65,7 +65,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
index fc34160f..0e3b2ca1 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
@@ -67,7 +67,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 4 \
+          --tp-size 4 \
           --mem-frac=0.9 \
           --chat-template llama_3_vision \
           --disaggregation-mode prefill \
@@ -157,7 +157,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 4 \
+          --tp-size 4 \
           --mem-frac=0.9 \
           --chat-template llama_3_vision \
           --disaggregation-mode decode \
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
index 8686bac5..3b38f67a 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
@@ -62,7 +62,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 4 \
+          --tp-size 4 \
           --mem-frac=0.9 \
           --chat-template llama_3_vision
       volumeMounts:
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
index 976ec907..d8f2c123 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
@@ -68,7 +68,7 @@ spec:
           --model-path="$MODEL_PATH" \
           --disaggregation-mode prefill \
           --disaggregation-ib-device mlx5_0 \
-          --tp 8 \
+          --tp-size 8 \
           --context-length=430000 \
           --chat-template llama-4 \
           --attention-backend fa3 \
@@ -159,7 +159,7 @@ spec:
           --model-path="$MODEL_PATH" \
           --disaggregation-mode decode \
           --disaggregation-ib-device mlx5_0 \
-          --tp 8 \
+          --tp-size 8 \
           --context-length=430000 \
           --chat-template llama-4 \
           --attention-backend fa3 \
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
index 8c750599..12fd0ae9 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
@@ -61,7 +61,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --model-path="$MODEL_PATH" \
-          --tp 8 \
+          --tp-size 8 \
           --context-length=430000 \
           --chat-template llama-4 \
           --attention-backend fa3 \
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
index 6b8dc2b2..de4ca76b 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -66,7 +66,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --model-path="$MODEL_PATH" \
-          --tp 4 \
+          --tp-size 4 \
           --mem-frac=0.95 \
           --context-length=128000 \
           --chat-template llama-4 \
@@ -158,7 +158,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --model-path="$MODEL_PATH" \
-          --tp 4 \
+          --tp-size 4 \
           --mem-frac=0.95 \
           --context-length=128000 \
           --chat-template llama-4 \
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
index d9a86c63..06f028e1 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
@@ -61,7 +61,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --model-path="$MODEL_PATH" \
-          --tp 4 \
+          --tp-size 4 \
           --mem-frac=0.95 \
           --context-length=128000 \
           --chat-template llama-4 \
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
index 89ebc20a..f8d9b501 100644
--- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -70,7 +70,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH"\
-          --tp 2 \
+          --tp-size 2 \
           --mem-frac=0.9 \
           --disaggregation-mode prefill \
           --disaggregation-ib-device mlx5_0
@@ -162,7 +162,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH"\
-          --tp 2 \
+          --tp-size 2 \
           --mem-frac=0.9 \
           --disaggregation-mode decode \
           --disaggregation-ib-device mlx5_0
diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
index 231c1842..12812e7e 100644
--- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
@@ -65,7 +65,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH"\
-          --tp 2 \
+          --tp-size 2 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
index ad2a4908..9d15dcce 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -70,7 +70,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH"\
-          --tp 2 \
+          --tp-size 2 \
           --mem-frac=0.9 \
           --disaggregation-mode prefill \
           --disaggregation-ib-device mlx5_0
@@ -162,7 +162,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH"\
-          --tp 2 \
+          --tp-size 2 \
           --mem-frac=0.9 \
           --disaggregation-mode decode \
           --disaggregation-ib-device mlx5_0
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
index c3e26913..973045c4 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
@@ -65,7 +65,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH"\
-          --tp 2 \
+          --tp-size 2 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm

From 2393a8bff413f0f9f874777a5338e863db0b5d50 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 17:19:26 -0700
Subject: [PATCH 09/13] use latest sgl image in runtimes

---
 config/runtimes/srt/deepseek-rdma-pd-rt.yaml              | 8 ++++----
 config/runtimes/srt/deepseek-rdma-rt.yaml                 | 4 ++--
 config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml        | 2 +-
 .../runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml   | 4 ++--
 config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml   | 2 +-
 config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml     | 4 ++--
 config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml        | 2 +-
 .../runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml | 4 ++--
 config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml | 2 +-
 config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml      | 4 ++--
 config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml         | 2 +-
 config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml      | 4 ++--
 config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml         | 2 +-
 .../srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml      | 4 ++--
 .../srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml         | 2 +-
 config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml     | 4 ++--
 config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml        | 2 +-
 .../srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml | 4 ++--
 .../srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml    | 2 +-
 .../srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml         | 4 ++--
 .../runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml   | 2 +-
 config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml        | 4 ++--
 config/runtimes/srt/mistral-7b-instruct-rt.yaml           | 2 +-
 config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml      | 4 ++--
 config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml         | 2 +-
 25 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
index 5a54d7c8..4226b0d9 100644
--- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
@@ -59,7 +59,7 @@ spec:
                       - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: lmsysorg/sglang:v0.4.8.post1-cu126
         env:
           - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
             value: "1"
@@ -158,7 +158,7 @@ spec:
                       - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -240,7 +240,7 @@ spec:
                       - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: lmsysorg/sglang:v0.4.8.post1-cu126
         env:
           - name: SGLANG_MOONCAKE_TRANS_THREAD
             value: "16"
@@ -330,7 +330,7 @@ spec:
                       - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml
index 2c3bd513..e65af7fa 100644
--- a/config/runtimes/srt/deepseek-rdma-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-rt.yaml
@@ -90,7 +90,7 @@ spec:
                       - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -160,7 +160,7 @@ spec:
                       - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
diff --git a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
index d139ddd9..b74a82e4 100644
--- a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
@@ -22,7 +22,7 @@ spec:
   engineConfig:
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
index 1d005591..c72f3a62 100644
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
@@ -50,7 +50,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -139,7 +139,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
index c152183b..3855879a 100644
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
@@ -45,7 +45,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
index 2753bdad..e94c62bc 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
@@ -53,7 +53,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -145,7 +145,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
index 42c6a0cb..b93995ae 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
@@ -48,7 +48,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
index b12b0c96..6742148d 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
@@ -50,7 +50,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -140,7 +140,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
index d141e791..009e84f9 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
@@ -45,7 +45,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
index 58c9b505..9728c48e 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -53,7 +53,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -145,7 +145,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
index 38fe8459..5dab9f3b 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
@@ -48,7 +48,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
index c45678e5..8720109d 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -53,7 +53,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -145,7 +145,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
index 3c78795e..1e73f347 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
@@ -48,7 +48,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
index 0e3b2ca1..e5882b60 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
@@ -50,7 +50,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -140,7 +140,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
index 3b38f67a..a3d334f6 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
@@ -45,7 +45,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
index 1bcb0328..43a18ca9 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
@@ -53,7 +53,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -145,7 +145,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
index ebcad544..977e5c88 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
@@ -48,7 +48,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
index d8f2c123..416a82f0 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
@@ -50,7 +50,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -141,7 +141,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
index 12fd0ae9..a8cd4997 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
@@ -45,7 +45,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
index de4ca76b..9d1f61e1 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -50,7 +50,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -142,7 +142,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
index 06f028e1..8d50f476 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
@@ -45,7 +45,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
index f8d9b501..961f0a4c 100644
--- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -53,7 +53,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -145,7 +145,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
index 12812e7e..d8a25498 100644
--- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
@@ -48,7 +48,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
index 9d15dcce..961278d9 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -53,7 +53,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -145,7 +145,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
index 973045c4..1897089d 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
@@ -48,7 +48,7 @@ spec:
                     - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1

From d63c7c2df335e5845fc9a89936e842c0e4dd968b Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 17:36:20 -0700
Subject: [PATCH 10/13] fix image repo

---
 Makefile                         | 2 +-
 charts/ome-resources/values.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index eea9a0ed..fb0d4f39 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 CHARTS_DIR := ./charts
 
 # Define the registry and image tagging
-REGISTRY     ?= ghcr.io/moirai-internal/ome
+REGISTRY     ?= ghcr.io/moirai-internal
 TAG          ?= $(GIT_TAG)
 ARCH         ?= linux/amd64
 MANAGER_IMG  ?= $(REGISTRY)/manager:$(TAG)
diff --git a/charts/ome-resources/values.yaml b/charts/ome-resources/values.yaml
index 193b1b91..aa5c8729 100644
--- a/charts/ome-resources/values.yaml
+++ b/charts/ome-resources/values.yaml
@@ -56,7 +56,7 @@ ome:
         cpu: 2
         memory: 4Gi
   omeAgent:
-    image: ghcr.io/moirai-internal/genai-ome-agent
+    image: ghcr.io/moirai-internal/ome-agent
     tag: *defaultVersion
     authType: InstancePrincipal
     compartmentId: ocid1.compartment.oc1..dummy-compartment

From b0c7d9eabf44f9f112be65e0eae57db026b243aa Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 17:47:13 -0700
Subject: [PATCH 11/13] fix ome manager img name

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index fb0d4f39..10124723 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ CHARTS_DIR := ./charts
 REGISTRY     ?= ghcr.io/moirai-internal
 TAG          ?= $(GIT_TAG)
 ARCH         ?= linux/amd64
-MANAGER_IMG  ?= $(REGISTRY)/manager:$(TAG)
+MANAGER_IMG  ?= $(REGISTRY)/ome-manager:$(TAG)
 
 # Git version and commit information for build
 version_pkg = github.com/sgl-project/ome/pkg/version

From 838f12edef98749fbae54f5aa0b47cf9ab20bed1 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 17:48:17 -0700
Subject: [PATCH 12/13] cleanup dep

---
 hack/internal/tools/go.sum | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/hack/internal/tools/go.sum b/hack/internal/tools/go.sum
index 8f3b5ba9..4410b7e0 100644
--- a/hack/internal/tools/go.sum
+++ b/hack/internal/tools/go.sum
@@ -737,8 +737,7 @@ github.com/frankban/quicktest v1.14.4/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7z
 github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
 github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
 github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
-github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
-github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
+github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
 github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
 github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
 github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
@@ -797,8 +796,7 @@ github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5x
 github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
 github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
-github.com/gohugoio/hugo v0.142.0 h1:gOVP52kHxr5dByyKgo/74s35tLIcHiHVwojQ4fmd3A4=
-github.com/gohugoio/hugo v0.142.0/go.mod h1:G0uwM5aRUXN4cbnqrDQx9Dlgmf/ukUpPADajL8FbL9M=
+github.com/gohugoio/hugo v0.147.7 h1:7qQKI8wsPgF1ipYBcXgM8wFmqTyFpkmzqLEf3hpzpT8=
 github.com/gohugoio/hugo v0.147.7/go.mod h1:gBn9Oi4LomFk1XS9raAPHdxaPrhPoF8ZfRrEcZZFGpo=
 github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
@@ -1093,8 +1091,7 @@ github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU
 github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
 github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
 github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
-github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M=
-github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc=
+github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
 github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
 github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI=
 github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU=
@@ -1164,13 +1161,11 @@ github.com/spf13/afero v1.3.3/go.mod h1:5KUK8ByomD5Ti5Artl0RtHeI5pTF7MIDuXL3yY52
 github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I=
 github.com/spf13/afero v1.9.2/go.mod h1:iUV7ddyEEZPO5gA3zD4fJt6iStLlL+Lg4m2cihcDf8Y=
 github.com/spf13/afero v1.9.5/go.mod h1:UBogFpq8E9Hx+xc5CNTTEpTnuHVmXDwZcZcE1eb/UhQ=
-github.com/spf13/afero v1.12.0 h1:UcOPyRBYczmFn6yvphxkn9ZEOY65cpwGKb5mL36mrqs=
-github.com/spf13/afero v1.12.0/go.mod h1:ZTlWwG4/ahT8W7T0WQ5uYmjI9duaLQGy3Q2OAl4sk/4=
+github.com/spf13/afero v1.14.0 h1:9tH6MapGnn/j0eb0yIXiLjERO8RB6xIVZRDCX7PtqWA=
 github.com/spf13/afero v1.14.0/go.mod h1:acJQ8t0ohCGuMN3O+Pv0V0hgMxNYDlvdk+VTfyZmbYo=
 github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
 github.com/spf13/cast v1.5.1/go.mod h1:b9PdjNptOpzXr7Rq1q9gJML/2cdGQAo69NKzQ10KN48=
-github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y=
-github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
+github.com/spf13/cast v1.8.0 h1:gEN9K4b8Xws4EX0+a0reLmhq8moKn7ntRlQYgjPeCDk=
 github.com/spf13/cast v1.8.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
 github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
 github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo=
@@ -1447,8 +1442,7 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20220819030929-7fc1605a5dde/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
-golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ=
 golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
 golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -1578,8 +1572,7 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
-golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
-golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
+golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4=
 golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA=
 golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
@@ -1587,8 +1580,7 @@ golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxb
 golang.org/x/time v0.0.0-20220922220347-f3bd1da661af/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.1.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
-golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
-golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
+golang.org/x/time v0.10.0 h1:3usCWA8tQn0L8+hFJQNgzpWbd89begxN66o1Ojdn5L4=
 golang.org/x/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

From 6fdd662d96a0a88f2fe6d840c29f53da33f9e2d7 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Thu, 26 Jun 2025 17:51:49 -0700
Subject: [PATCH 13/13] add pd to runtime kustomize

---
 config/runtimes/kustomization.yaml            |   3 +
 config/runtimes/srt/deepseek-rdma-pd-rt.yaml  |  66 +----
 config/runtimes/srt/deepseek-rdma-rt.yaml     |  30 +--
 .../srt/e5-mistral-7b-instruct-rt.yaml        |  16 +-
 .../llama-3-1-405b-instruct-fp8-pd-rt.yaml    | 233 ------------------
 .../srt/llama-3-1-405b-instruct-fp8-rt.yaml   | 138 -----------
 .../srt/llama-3-1-70b-instruct-pd-rt.yaml     |  28 +--
 .../srt/llama-3-1-70b-instruct-rt.yaml        |  14 +-
 .../llama-3-2-11b-vision-instruct-pd-rt.yaml  |  17 +-
 .../srt/llama-3-2-11b-vision-instruct-rt.yaml |  11 +-
 .../srt/llama-3-2-1b-instruct-pd-rt.yaml      |  28 +--
 .../srt/llama-3-2-1b-instruct-rt.yaml         |  14 +-
 .../srt/llama-3-2-3b-instruct-pd-rt.yaml      |  20 +-
 .../srt/llama-3-2-3b-instruct-rt.yaml         |  14 +-
 ...ama-3-2-90b-vision-instruct-fp8-pd-rt.yaml |  22 +-
 .../llama-3-2-90b-vision-instruct-fp8-rt.yaml |  11 +-
 .../srt/llama-3-3-70b-instruct-pd-rt.yaml     |  28 +--
 .../srt/llama-3-3-70b-instruct-rt.yaml        |  14 +-
 ...-maverick-17b-128e-instruct-fp8-pd-rt.yaml |  22 +-
 ...a-4-maverick-17b-128e-instruct-fp8-rt.yaml |  11 +-
 .../llama-4-scout-17b-16e-instruct-pd-rt.yaml |  22 +-
 .../llama-4-scout-17b-16e-instruct-rt.yaml    |  11 +-
 .../srt/mistral-7b-instruct-pd-rt.yaml        |  28 +--
 .../runtimes/srt/mistral-7b-instruct-rt.yaml  |  14 +-
 .../srt/mixtral-8x7b-instruct-pd-rt.yaml      |  28 +--
 .../srt/mixtral-8x7b-instruct-rt.yaml         |  14 +-
 config/runtimes/vllm/deepseek-v3-rdma-rt.yaml |  13 -
 .../vllm/e5-mistral-7b-instruct-rt.yaml       |  12 -
 .../vllm/llama-3-1-405b-instruct-fp8-rt.yaml  |  12 -
 .../vllm/llama-3-1-70b-instruct-rt.yaml       |  12 -
 .../llama-3-1-nemotron-nano-8b-v1-rt.yaml     |  12 -
 .../llama-3-1-nemotron-ultra-253b-v1-rt.yaml  |   9 -
 .../llama-3-2-11b-vision-instruct-rt.yaml     |   9 -
 .../vllm/llama-3-2-1b-instruct-rt.yaml        |  12 -
 .../vllm/llama-3-2-3b-instruct-rt.yaml        |  12 -
 .../llama-3-2-90b-vision-instruct-fp8-rt.yaml |   9 -
 ...llama-3-3-70b-instruct-fp8-dynamic-rt.yaml |  10 -
 .../vllm/llama-3-3-70b-instruct-rt.yaml       |  12 -
 .../llama-3-3-nemotron-super-49b-v1-rt.yaml   |  12 -
 .../vllm/llama-3-70b-instruct-rt.yaml         |  12 -
 ...a-4-maverick-17b-128e-instruct-fp8-rt.yaml |   9 -
 .../llama-4-scout-17b-16e-instruct-rt.yaml    |   9 -
 .../runtimes/vllm/mistral-7b-instruct-rt.yaml |  12 -
 .../vllm/mixtral-8x7b-instruct-rt.yaml        |  12 -
 .../vllm/phi-3-vision-128k-instruct-rt.yaml   |  12 -
 45 files changed, 48 insertions(+), 1021 deletions(-)
 delete mode 100644 config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
 delete mode 100644 config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml

diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml
index 9da0e862..111d1161 100644
--- a/config/runtimes/kustomization.yaml
+++ b/config/runtimes/kustomization.yaml
@@ -5,6 +5,9 @@ resources:
 - srt/deepseek-rdma-pd-rt.yaml
 - srt/deepseek-rdma-rt.yaml
 - srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+- srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
 - srt/llama-4-scout-17b-16e-instruct-rt.yaml
+- srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
 - srt/e5-mistral-7b-instruct-rt.yaml
 - srt/llama-3-3-70b-instruct-rt.yaml
+- srt/llama-3-3-70b-instruct-pd-rt.yaml
diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
index 4226b0d9..a5ec1663 100644
--- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
@@ -44,22 +44,9 @@ spec:
           effect: "NoSchedule"
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: lmsysorg/sglang:v0.4.8.post1-cu126
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         env:
           - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
             value: "1"
@@ -137,28 +124,15 @@ spec:
           timeoutSeconds: 30
     worker:
       size: 1
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists
           effect: NoSchedule
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: lmsysorg/sglang:v0.4.8.post1-cu126
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -205,7 +179,7 @@ spec:
             value: "0"
           - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
             value: "1"
-          - name: SGLANG_MOONCAKE_TRANS_THREAD
+          - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE
             value: "8"
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
@@ -225,24 +199,11 @@ spec:
           effect: "NoSchedule"
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: lmsysorg/sglang:v0.4.8.post1-cu126
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         env:
-          - name: SGLANG_MOONCAKE_TRANS_THREAD
+          - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE
             value: "16"
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
@@ -309,28 +270,15 @@ spec:
           timeoutSeconds: 30
     worker:
       size: 1
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists
           effect: NoSchedule
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: lmsysorg/sglang:v0.4.8.post1-cu126
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -372,7 +320,7 @@ spec:
             value: "0"
           - name: NVSHMEM_IB_TRAFFIC_CLASS
             value: "16"
-          - name: SGLANG_MOONCAKE_TRANS_THREAD
+          - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE
             value: "16"
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml
index e65af7fa..a791a7b1 100644
--- a/config/runtimes/srt/deepseek-rdma-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-rt.yaml
@@ -75,22 +75,9 @@ spec:
           effect: "NoSchedule"
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: lmsysorg/sglang:v0.4.8.post1-cu126
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -139,28 +126,15 @@ spec:
           timeoutSeconds: 30
     worker:
       size: 1
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists
           effect: NoSchedule
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: lmsysorg/sglang:v0.4.8.post1-cu126
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
diff --git a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
index b74a82e4..0b22c744 100644
--- a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
@@ -22,7 +22,7 @@ spec:
   engineConfig:
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -93,16 +93,4 @@ spec:
     volumes:
       - name: dshm
         emptyDir:
-          medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
\ No newline at end of file
+          medium: Memory
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
deleted file mode 100644
index c72f3a62..00000000
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml
+++ /dev/null
@@ -1,233 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterServingRuntime
-metadata:
-  name: srt-llama-3-1-405b-instruct-fp8-pd
-spec:
-  disabled: false
-  supportedModelFormats:
-    - modelFramework:
-        name: transformers
-        version: "4.43.0.dev0"
-      modelFormat:
-        name: safetensors
-        version: "1"
-      modelArchitecture: LlamaForCausalLM
-      autoSelect: false
-      priority: 1
-  protocolVersions:
-    - openAI
-  modelSizeRange:
-    min: 400B
-    max: 410B
-  engineConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 8 \
-          --mem-frac=0.9 \
-          --disaggregation-mode prefill \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 128
-          memory: 216Gi
-          nvidia.com/gpu: 8
-        limits:
-          cpu: 128
-          memory: 216Gi
-          nvidia.com/gpu: 8
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  decoderConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 8 \
-          --mem-frac=0.9 \
-          --disaggregation-mode decode \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 128
-          memory: 216Gi
-          nvidia.com/gpu: 8
-        limits:
-          cpu: 128
-          memory: 216Gi
-          nvidia.com/gpu: 8
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  routerConfig:
-    runner:
-      name: router
-      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
-      resources:
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      ports:
-        - containerPort: 8080
-          name: http
-      command:
-        - sh
-        - -c
-        - >
-          python3 -m sglang_router.launch_router
-          --host "0.0.0.0"
-          --port "8080"
-          --pd-disaggregation
-          --policy power_of_two
-          --service-discovery
-          --service-discovery-namespace "${NAMESPACE}"
-          --service-discovery-port 8080
-          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-      env:
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        - name: INFERENCESERVICE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
deleted file mode 100644
index 3855879a..00000000
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
+++ /dev/null
@@ -1,138 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterServingRuntime
-metadata:
-  name: srt-llama-3-1-405b-instruct-fp8
-spec:
-  disabled: false
-  supportedModelFormats:
-    - modelFramework:
-        name: transformers
-        version: "4.43.0.dev0"
-      modelFormat:
-        name: safetensors
-        version: "1.0.0"
-      modelArchitecture: LlamaForCausalLM
-      autoSelect: false
-      priority: 1
-  protocolVersions:
-    - openAI
-  modelSizeRange:
-    min: 400B
-    max: 410B
-  engineConfig:
-    annotations:
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
-    runner:
-      name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 8 \
-          --mem-frac=0.9
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 128
-          memory: 216Gi
-          nvidia.com/gpu: 8
-        limits:
-          cpu: 128
-          memory: 216Gi
-          nvidia.com/gpu: 8
-
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-
-  routerConfig:
-    runner:
-      name: router
-      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
-      resources:
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      ports:
-        - containerPort: 8080
-          name: http
-      command:
-        - sh
-        - -c
-        - >
-          python3 -m sglang_router.launch_router
-          --host "0.0.0.0"
-          --port "8080"
-          --service-discovery
-          --service-discovery-namespace "${NAMESPACE}"
-          --service-discovery-port 8080
-          --selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-      env:
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        - name: INFERENCESERVICE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
index e94c62bc..55f20a7e 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
@@ -37,23 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -129,23 +117,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
index b93995ae..43636215 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
index 6742148d..ec8764a6 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
@@ -37,20 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -127,8 +118,8 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
+#    affinity:
+#      nodeAffinity:
         requiredDuringSchedulingIgnoredDuringExecution:
           nodeSelectorTerms:
             - matchExpressions:
@@ -140,7 +131,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
index 009e84f9..a27e52fb 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
@@ -34,18 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
index 9728c48e..21797d18 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -37,23 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -129,23 +117,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
index 5dab9f3b..412206c3 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
index 8720109d..0231121e 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -37,23 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -129,8 +117,8 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
+#    affinity:
+#      nodeAffinity:
         requiredDuringSchedulingIgnoredDuringExecution:
           nodeSelectorTerms:
             - matchExpressions:
@@ -145,7 +133,7 @@ spec:
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
index 1e73f347..b7e7d36d 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
index e5882b60..9052b5b0 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
@@ -37,20 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -127,20 +118,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
index a3d334f6..2ef9d4d4 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
@@ -34,18 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
index 43a18ca9..3f02b061 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
@@ -37,23 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -129,23 +117,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
index 977e5c88..3337e988 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
index 416a82f0..40cbe8d8 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
@@ -37,20 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -128,20 +119,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
index a8cd4997..8e78c01c 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
@@ -34,18 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
index 9d1f61e1..8c64685d 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -37,20 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -129,20 +120,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
index 8d50f476..2609ed15 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
@@ -34,18 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
index 961f0a4c..5869bd53 100644
--- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -37,23 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -129,23 +117,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
index d8a25498..7b193fa5 100644
--- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
index 961278d9..8f75eed8 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -37,23 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -129,23 +117,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
index 1897089d..0382daa3 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: lmsysorg/sglang:v0.4.8.post1-cu126
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
diff --git a/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml b/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml
index 8b29a3b0..dc1fc2b2 100644
--- a/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml
+++ b/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml
@@ -33,19 +33,6 @@ spec:
         effect: "NoSchedule"
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
-    nodeSelector:
-      oci.oraclecloud.com/rdma.authenticated: "16"
-      oci.oraclecloud.com/rdma.mlx_issues: "0"
-      oke.oraclecloud.com/pool.mode: cluster-network
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml
index 2c92af6f..92f869d5 100644
--- a/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml
index 88f44125..296b386e 100644
--- a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml
+++ b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml
@@ -31,18 +31,6 @@ spec:
       - key: "nvidia.com/gpu"
         operator: "Exists"
         effect: "NoSchedule"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml
index 125a6c17..10d42567 100644
--- a/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml b/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml
index b76ece6d..e0f122f8 100644
--- a/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml
+++ b/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml b/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml
index 0628b078..96b1ad4e 100644
--- a/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml
+++ b/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml
@@ -31,15 +31,6 @@ spec:
       - key: "nvidia.com/gpu"
         operator: "Exists"
         effect: "NoSchedule"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml
index 82d8d1c1..09a00a44 100644
--- a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml
@@ -45,15 +45,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml
index 1d4d074e..bc873896 100644
--- a/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml
index f72a2170..e7030b78 100644
--- a/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml
index a5dac905..88c7c02a 100644
--- a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml
@@ -35,15 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml
index cb676165..9f97cd3e 100644
--- a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml
+++ b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml
@@ -36,16 +36,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml
index c4679b18..e43fbb09 100644
--- a/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml
@@ -39,18 +39,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml b/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml
index 90a9a004..9de1e31b 100644
--- a/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml
+++ b/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml
index e1f74933..2823e997 100644
--- a/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
index 5a6a851b..e05a7d10 100644
--- a/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+++ b/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
@@ -33,15 +33,6 @@ spec:
       - key: "nvidia.com/gpu"
         operator: "Exists"
         effect: "NoSchedule"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml
index 6fd01f5c..1d8d0f55 100644
--- a/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml
@@ -32,15 +32,6 @@ spec:
       - key: "nvidia.com/gpu"
         operator: "Exists"
         effect: "NoSchedule"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/mistral-7b-instruct-rt.yaml b/config/runtimes/vllm/mistral-7b-instruct-rt.yaml
index 18404026..39d1c175 100644
--- a/config/runtimes/vllm/mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/vllm/mistral-7b-instruct-rt.yaml
@@ -34,18 +34,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml
index 73c46c41..5279859f 100644
--- a/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml
@@ -34,18 +34,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml b/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml
index 2b4c159c..c555ac27 100644
--- a/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml
+++ b/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml
@@ -34,18 +34,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1