diff --git a/Makefile b/Makefile
index eea9a0ed..10124723 100644
--- a/Makefile
+++ b/Makefile
@@ -2,10 +2,10 @@
 CHARTS_DIR := ./charts
 
 # Define the registry and image tagging
-REGISTRY     ?= ghcr.io/moirai-internal/ome
+REGISTRY     ?= ghcr.io/moirai-internal
 TAG          ?= $(GIT_TAG)
 ARCH         ?= linux/amd64
-MANAGER_IMG  ?= $(REGISTRY)/manager:$(TAG)
+MANAGER_IMG  ?= $(REGISTRY)/ome-manager:$(TAG)
 
 # Git version and commit information for build
 version_pkg = github.com/sgl-project/ome/pkg/version
diff --git a/charts/ome-resources/values.yaml b/charts/ome-resources/values.yaml
index 193b1b91..aa5c8729 100644
--- a/charts/ome-resources/values.yaml
+++ b/charts/ome-resources/values.yaml
@@ -56,7 +56,7 @@ ome:
         cpu: 2
         memory: 4Gi
   omeAgent:
-    image: ghcr.io/moirai-internal/genai-ome-agent
+    image: ghcr.io/moirai-internal/ome-agent
     tag: *defaultVersion
     authType: InstancePrincipal
     compartmentId: ocid1.compartment.oc1..dummy-compartment
diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml
index 9da0e862..111d1161 100644
--- a/config/runtimes/kustomization.yaml
+++ b/config/runtimes/kustomization.yaml
@@ -5,6 +5,9 @@ resources:
 - srt/deepseek-rdma-pd-rt.yaml
 - srt/deepseek-rdma-rt.yaml
 - srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+- srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
 - srt/llama-4-scout-17b-16e-instruct-rt.yaml
+- srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
 - srt/e5-mistral-7b-instruct-rt.yaml
 - srt/llama-3-3-70b-instruct-rt.yaml
+- srt/llama-3-3-70b-instruct-pd-rt.yaml
diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
index 2d05d410..a5ec1663 100644
--- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-deepseek-pd-rdma
+  name: srt-deepseek-rdma-pd
 spec:
   disabled: false
   modelSizeRange:
@@ -44,22 +44,9 @@ spec:
           effect: "NoSchedule"
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         env:
           - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
             value: "1"
@@ -137,28 +124,15 @@ spec:
           timeoutSeconds: 30
     worker:
       size: 1
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists
           effect: NoSchedule
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -205,7 +179,7 @@ spec:
             value: "0"
           - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
             value: "1"
-          - name: SGLANG_MOONCAKE_TRANS_THREAD
+          - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE
             value: "8"
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
@@ -225,24 +199,11 @@ spec:
           effect: "NoSchedule"
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         env:
-          - name: SGLANG_MOONCAKE_TRANS_THREAD
+          - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE
             value: "16"
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
@@ -309,28 +270,15 @@ spec:
           timeoutSeconds: 30
     worker:
       size: 1
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists
           effect: NoSchedule
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -372,7 +320,7 @@ spec:
             value: "0"
           - name: NVSHMEM_IB_TRAFFIC_CLASS
             value: "16"
-          - name: SGLANG_MOONCAKE_TRANS_THREAD
+          - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE
             value: "16"
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml
index 99b21eb3..a791a7b1 100644
--- a/config/runtimes/srt/deepseek-rdma-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-rt.yaml
@@ -35,7 +35,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
@@ -75,22 +75,9 @@ spec:
           effect: "NoSchedule"
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -98,7 +85,7 @@ spec:
             python3 -m sglang.launch_server 
             --host 0.0.0.0 --port 8080 
             --model-path ${MODEL_PATH} 
-            --tp 16 
+            --tp-size 16 
             --nccl-init $(LWS_LEADER_ADDRESS):5000 
             --nnodes ${LWS_GROUP_SIZE} 
             --node-rank ${LWS_WORKER_INDEX} 
@@ -139,28 +126,15 @@ spec:
           timeoutSeconds: 30
     worker:
       size: 1
-      nodeSelector:
-        oci.oraclecloud.com/rdma.authenticated: "16"
-        oci.oraclecloud.com/rdma.mlx_issues: "0"
-        oke.oraclecloud.com/pool.mode: cluster-network
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists
           effect: NoSchedule
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node.kubernetes.io/instance-type
-                    operator: In
-                    values:
-                      - BM.GPU.H100.8
       runner:
         name: ome-container
-        image: ghcr.io/moirai-internal/sgl:dev2
+        image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
         command:
           - sh
           - -c
@@ -169,7 +143,7 @@ spec:
             --host 0.0.0.0 
             --port 8080 
             --model-path ${MODEL_PATH} 
-            --tp 16 
+            --tp-size 16 
             --nccl-init $(LWS_LEADER_ADDRESS):5000 
             --nnodes ${LWS_GROUP_SIZE} 
             --node-rank ${LWS_WORKER_INDEX} 
diff --git a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
index 2f9104ff..0b22c744 100644
--- a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml
@@ -22,7 +22,7 @@ spec:
   engineConfig:
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -38,7 +38,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --is-embedding
       volumeMounts:
         - mountPath: /dev/shm
@@ -93,16 +93,4 @@ spec:
     volumes:
       - name: dshm
         emptyDir:
-          medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
\ No newline at end of file
+          medium: Memory
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
deleted file mode 100644
index 14d6b5a3..00000000
--- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml
+++ /dev/null
@@ -1,138 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterServingRuntime
-metadata:
-  name: srt-llama-3-1-405b-instruct-fp8
-spec:
-  disabled: false
-  supportedModelFormats:
-    - modelFramework:
-        name: transformers
-        version: "4.43.0.dev0"
-      modelFormat:
-        name: safetensors
-        version: "1.0.0"
-      modelArchitecture: LlamaForCausalLM
-      autoSelect: false
-      priority: 1
-  protocolVersions:
-    - openAI
-  modelSizeRange:
-    min: 400B
-    max: 410B
-  engineConfig:
-    annotations:
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
-    runner:
-      name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp 8 \
-          --mem-frac=0.9
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 128
-          memory: 216Gi
-          nvidia.com/gpu: 8
-        limits:
-          cpu: 128
-          memory: 216Gi
-          nvidia.com/gpu: 8
-
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-
-  routerConfig:
-    runner:
-      name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
-      resources:
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      ports:
-        - containerPort: 8080
-          name: http
-      command:
-        - sh
-        - -c
-        - >
-          python3 -m sglang_router.launch_router
-          --host "0.0.0.0"
-          --port "8080"
-          --service-discovery
-          --service-discovery-namespace "${NAMESPACE}"
-          --service-discovery-port 8080
-          --selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-      env:
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        - name: INFERENCESERVICE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..55f20a7e
--- /dev/null
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
@@ -0,0 +1,215 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-1-70b-instruct-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.42.3"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 60B
+    max: 75B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size=4 \
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size=4 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
index e3d7616e..43636215 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -110,7 +98,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
new file mode 100644
index 00000000..ec8764a6
--- /dev/null
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
@@ -0,0 +1,226 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-2-11b-vision-instruct-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.45.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: MllamaForConditionalGeneration
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 10B
+    max: 12B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 1 \
+          --mem-frac=0.9 \
+          --chat-template llama_3_vision \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+#    affinity:  (disabled - every child line below must stay commented out as well)
+#      nodeAffinity:
+#        requiredDuringSchedulingIgnoredDuringExecution:
+#          nodeSelectorTerms:
+#            - matchExpressions:
+#                - key: node.kubernetes.io/instance-type
+#                  operator: In
+#                  values:
+#                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 1 \
+          --mem-frac=0.9 \
+          --chat-template llama_3_vision \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
index a484dc2e..a27e52fb 100644
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml
@@ -34,18 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -62,7 +53,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9 \
           --chat-template llama_3_vision
       volumeMounts:
@@ -108,7 +99,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..21797d18
--- /dev/null
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -0,0 +1,215 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-2-1b-instruct-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.45.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 500M
+    max: 2B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 1 \
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 1 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
index ce55124c..412206c3 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -65,7 +53,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm
@@ -110,7 +98,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..0231121e
--- /dev/null
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -0,0 +1,227 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-2-3b-instruct-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.45.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 2B
+    max: 4B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 1 \
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+#    affinity:  # disabled here (engineConfig above has no affinity either); keep the WHOLE stanza commented
+#      nodeAffinity:
+#        requiredDuringSchedulingIgnoredDuringExecution:
+#          nodeSelectorTerms:
+#            - matchExpressions:
+#                - key: node.kubernetes.io/instance-type
+#                  operator: In
+#                  values:
+#                    - BM.GPU.B4.8
+#                    - BM.GPU4.8
+#                    - BM.GPU.A100-v2.8
+#                    - BM.GPU.H100.8
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 1 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
index 6d1d6842..b7e7d36d 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -65,7 +53,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 1 \
+          --tp-size 1 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm
@@ -110,7 +98,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
new file mode 100644
index 00000000..9052b5b0
--- /dev/null
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
@@ -0,0 +1,217 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-2-90b-vision-instruct-fp8-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.46.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: MllamaForConditionalGeneration
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 85B
+    max: 95B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 4 \
+          --mem-frac=0.9 \
+          --chat-template llama_3_vision \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 30
+          memory: 100Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 30
+          memory: 100Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 4 \
+          --mem-frac=0.9 \
+          --chat-template llama_3_vision \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 30
+          memory: 100Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 30
+          memory: 100Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
index 4a1d0e7b..2ef9d4d4 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
@@ -34,18 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -62,7 +53,7 @@ spec:
           --enable-metrics \
           --log-requests \
           --model-path="$MODEL_PATH" \
-          --tp 4 \
+          --tp-size 4 \
           --mem-frac=0.9 \
           --chat-template llama_3_vision
       volumeMounts:
@@ -108,7 +99,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..3f02b061
--- /dev/null
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
@@ -0,0 +1,215 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-3-3-70b-instruct-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.47.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: LlamaForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 60B
+    max: 75B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size=4 \
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size=4 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 10
+          memory: 160Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
index d7096f4f..3337e988 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -110,7 +98,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
index b62ae96c..40cbe8d8 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
@@ -37,20 +37,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -68,7 +59,7 @@ spec:
           --model-path="$MODEL_PATH" \
           --disaggregation-mode prefill \
           --disaggregation-ib-device mlx5_0 \
-          --tp 8 \
+          --tp-size 8 \
           --context-length=430000 \
           --chat-template llama-4 \
           --attention-backend fa3 \
@@ -85,7 +76,6 @@ spec:
           cpu: 128
           memory: 512Gi
           nvidia.com/gpu: 8
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -94,7 +84,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -103,7 +92,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
@@ -131,20 +119,11 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -162,7 +141,7 @@ spec:
           --model-path="$MODEL_PATH" \
           --disaggregation-mode decode \
           --disaggregation-ib-device mlx5_0 \
-          --tp 8 \
+          --tp-size 8 \
           --context-length=430000 \
           --chat-template llama-4 \
           --attention-backend fa3 \
@@ -179,7 +158,6 @@ spec:
           cpu: 128
           memory: 512Gi
           nvidia.com/gpu: 8
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -188,7 +166,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -197,7 +174,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
@@ -210,7 +186,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev13
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
index 70e1d0f2..8e78c01c 100644
--- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
@@ -34,18 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -61,7 +52,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --model-path="$MODEL_PATH" \
-          --tp 8 \
+          --tp-size 8 \
           --context-length=430000 \
           --chat-template llama-4 \
           --attention-backend fa3 \
@@ -78,7 +69,6 @@ spec:
           cpu: 128
           memory: 512Gi
           nvidia.com/gpu: 8
-
       readinessProbe:
         httpGet:
           path: /health_generate
@@ -87,7 +77,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 200
-
       livenessProbe:
         httpGet:
           path: /health
@@ -96,7 +85,6 @@ spec:
         successThreshold: 1
         periodSeconds: 60
         timeoutSeconds: 60
-
       startupProbe:
         httpGet:
           path: /health_generate
@@ -109,7 +97,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
new file mode 100644
index 00000000..8c64685d
--- /dev/null
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -0,0 +1,221 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-llama-4-scout-17b-16e-instruct-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.51.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: Llama4ForConditionalGeneration
+      autoSelect: true
+      priority: 2
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 100B
+    max: 109B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --model-path="$MODEL_PATH" \
+          --tp-size 4 \
+          --mem-frac=0.95 \
+          --context-length=128000 \
+          --chat-template llama-4 \
+          --attention-backend fa3 \
+          --log-requests \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --model-path="$MODEL_PATH" \
+          --tp-size 4 \
+          --mem-frac=0.95 \
+          --context-length=128000 \
+          --chat-template llama-4 \
+          --attention-backend fa3 \
+          --log-requests \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
index 07be4bad..2609ed15 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml
@@ -34,18 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -61,7 +52,7 @@ spec:
           --port=8080 \
           --enable-metrics \
           --model-path="$MODEL_PATH" \
-          --tp 4 \
+          --tp-size 4 \
           --mem-frac=0.95 \
           --context-length=128000 \
           --chat-template llama-4 \
@@ -110,7 +101,7 @@ spec:
   routerConfig:
     runner:
       name: router
-      image: ghcr.io/moirai-internal/sgl-router:dev2
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
       resources:
         limits:
           cpu: "1"
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..5869bd53
--- /dev/null
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -0,0 +1,215 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-mistral-7b-instruct-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.36.2"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: MistralForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 5B
+    max: 9B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 2 \
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 2 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
index 72262542..7b193fa5 100644
--- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -64,8 +52,8 @@ spec:
           --port=8080 \
           --enable-metrics \
           --log-requests \
-          --model="$MODEL_PATH"\
-          --tp 2 \
+          --model-path="$MODEL_PATH" \
+          --tp-size 2 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm
@@ -106,4 +94,35 @@ spec:
         successThreshold: 1
         periodSeconds: 6
         initialDelaySeconds: 60
-        timeoutSeconds: 30
\ No newline at end of file
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
new file mode 100644
index 00000000..8f75eed8
--- /dev/null
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -0,0 +1,215 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-mixtral-8x7b-instruct-pd
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.36.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1"
+      modelArchitecture: MixtralForCausalLM
+      autoSelect: false
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 45B
+    max: 50B
+  engineConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 2 \
+          --mem-frac=0.9 \
+          --disaggregation-mode prefill \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  decoderConfig:
+    annotations:
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    dnsPolicy: ClusterFirstWithHostNet
+    hostNetwork: true
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - /bin/bash
+        - '-lc'
+        - --
+      args:
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --model-path="$MODEL_PATH" \
+          --tp-size 2 \
+          --mem-frac=0.9 \
+          --disaggregation-mode decode \
+          --disaggregation-ib-device mlx5_0
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 10
+          memory: 30Gi
+          nvidia.com/gpu: 2
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
index 0a18d556..0382daa3 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-mmixtral-8x7b-instruct
+  name: srt-mixtral-8x7b-instruct
 spec:
   disabled: false
   supportedModelFormats:
@@ -34,21 +34,9 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
-      image: ghcr.io/moirai-internal/sgl:dev2
+      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
       ports:
         - containerPort: 8080
           name: http1
@@ -64,8 +52,8 @@ spec:
           --port=8080 \
           --enable-metrics \
           --log-requests \
-          --model="$MODEL_PATH"\
-          --tp 2 \
+          --model-path="$MODEL_PATH" \
+          --tp-size 2 \
           --mem-frac=0.9
       volumeMounts:
         - mountPath: /dev/shm
@@ -106,4 +94,35 @@ spec:
         successThreshold: 1
         periodSeconds: 6
         initialDelaySeconds: 60
-        timeoutSeconds: 30
\ No newline at end of file
+        timeoutSeconds: 30
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host "0.0.0.0"
+          --port "8080"
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml b/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml
index 8b29a3b0..dc1fc2b2 100644
--- a/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml
+++ b/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml
@@ -33,19 +33,6 @@ spec:
         effect: "NoSchedule"
     dnsPolicy: ClusterFirstWithHostNet
     hostNetwork: true
-    nodeSelector:
-      oci.oraclecloud.com/rdma.authenticated: "16"
-      oci.oraclecloud.com/rdma.mlx_issues: "0"
-      oke.oraclecloud.com/pool.mode: cluster-network
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml
index 2c92af6f..92f869d5 100644
--- a/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml
index 88f44125..296b386e 100644
--- a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml
+++ b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml
@@ -31,18 +31,6 @@ spec:
       - key: "nvidia.com/gpu"
         operator: "Exists"
         effect: "NoSchedule"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml
index 125a6c17..10d42567 100644
--- a/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml b/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml
index b76ece6d..e0f122f8 100644
--- a/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml
+++ b/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml b/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml
index 0628b078..96b1ad4e 100644
--- a/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml
+++ b/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml
@@ -31,15 +31,6 @@ spec:
       - key: "nvidia.com/gpu"
         operator: "Exists"
         effect: "NoSchedule"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml
index 82d8d1c1..09a00a44 100644
--- a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml
@@ -45,15 +45,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml
index 1d4d074e..bc873896 100644
--- a/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml
index f72a2170..e7030b78 100644
--- a/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml
index a5dac905..88c7c02a 100644
--- a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml
@@ -35,15 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml
index cb676165..9f97cd3e 100644
--- a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml
+++ b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml
@@ -36,16 +36,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml
index c4679b18..e43fbb09 100644
--- a/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml
@@ -39,18 +39,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml b/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml
index 90a9a004..9de1e31b 100644
--- a/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml
+++ b/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml
index e1f74933..2823e997 100644
--- a/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml
@@ -35,18 +35,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
index 5a6a851b..e05a7d10 100644
--- a/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
+++ b/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
@@ -33,15 +33,6 @@ spec:
       - key: "nvidia.com/gpu"
         operator: "Exists"
         effect: "NoSchedule"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml
index 6fd01f5c..1d8d0f55 100644
--- a/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml
+++ b/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml
@@ -32,15 +32,6 @@ spec:
       - key: "nvidia.com/gpu"
         operator: "Exists"
         effect: "NoSchedule"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
     volumes:
       - name: dshm
         emptyDir:
diff --git a/config/runtimes/vllm/mistral-7b-instruct-rt.yaml b/config/runtimes/vllm/mistral-7b-instruct-rt.yaml
index 18404026..39d1c175 100644
--- a/config/runtimes/vllm/mistral-7b-instruct-rt.yaml
+++ b/config/runtimes/vllm/mistral-7b-instruct-rt.yaml
@@ -34,18 +34,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml
index 73c46c41..5279859f 100644
--- a/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml
+++ b/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml
@@ -34,18 +34,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml b/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml
index 2b4c159c..c555ac27 100644
--- a/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml
+++ b/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml
@@ -34,18 +34,6 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.B4.8
-                    - BM.GPU4.8
-                    - BM.GPU.A100-v2.8
-                    - BM.GPU.H100.8
     runner:
       name: ome-container
       image: docker.io/vllm/vllm-openai:v0.9.0.1
diff --git a/go.mod b/go.mod
index 62595c07..7fe6a39a 100644
--- a/go.mod
+++ b/go.mod
@@ -63,7 +63,6 @@ require (
 	github.com/NYTimes/gziphandler v1.1.1 // indirect
 	github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
 	github.com/antonmedv/expr v1.15.3 // indirect
-	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/blang/semver/v4 v4.0.0 // indirect
 	github.com/blendle/zapdriver v1.3.1 // indirect
@@ -101,7 +100,6 @@ require (
 	github.com/google/cel-go v0.23.2 // indirect
 	github.com/google/gnostic-models v0.6.9 // indirect
 	github.com/google/go-containerregistry v0.16.1 // indirect
-	github.com/google/gofuzz v1.2.0 // indirect
 	github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect
 	github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
 	github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
@@ -109,12 +107,10 @@ require (
 	github.com/hashicorp/errwrap v1.1.0 // indirect
 	github.com/hashicorp/golang-lru v1.0.2 // indirect
 	github.com/hashicorp/hcl v1.0.0 // indirect
-	github.com/imdario/mergo v0.3.16 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/kelseyhightower/envconfig v1.4.0 // indirect
-	github.com/klauspost/compress v1.18.0 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.7 // indirect
 	github.com/kylelemons/godebug v1.1.0 // indirect
 	github.com/leodido/go-urn v1.4.0 // indirect
diff --git a/go.sum b/go.sum
index 6906429c..cab24b93 100644
--- a/go.sum
+++ b/go.sum
@@ -58,8 +58,6 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8
 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
 github.com/antonmedv/expr v1.15.3 h1:q3hOJZNvLvhqE8OHBs1cFRdbXFNKuA+bHmRaI+AmRmI=
 github.com/antonmedv/expr v1.15.3/go.mod h1:0E/6TxnOlRNp81GMzX9QfDPAmHo2Phg00y4JUv1ihsE=
-github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
-github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
 github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -287,8 +285,6 @@ github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
 github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
 github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
 github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
-github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
-github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
 github.com/jarcoal/httpmock v1.2.0 h1:gSvTxxFR/MEMfsGrvRbdfpRUMBStovlSRLw0Ep1bwwc=
diff --git a/hack/internal/tools/go.sum b/hack/internal/tools/go.sum
index 8f3b5ba9..4410b7e0 100644
--- a/hack/internal/tools/go.sum
+++ b/hack/internal/tools/go.sum
@@ -737,8 +737,7 @@ github.com/frankban/quicktest v1.14.4/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7z
 github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
 github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
 github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
-github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
-github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
+github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
 github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
 github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
 github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
@@ -797,8 +796,7 @@ github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5x
 github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
 github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
-github.com/gohugoio/hugo v0.142.0 h1:gOVP52kHxr5dByyKgo/74s35tLIcHiHVwojQ4fmd3A4=
-github.com/gohugoio/hugo v0.142.0/go.mod h1:G0uwM5aRUXN4cbnqrDQx9Dlgmf/ukUpPADajL8FbL9M=
+github.com/gohugoio/hugo v0.147.7 h1:7qQKI8wsPgF1ipYBcXgM8wFmqTyFpkmzqLEf3hpzpT8=
 github.com/gohugoio/hugo v0.147.7/go.mod h1:gBn9Oi4LomFk1XS9raAPHdxaPrhPoF8ZfRrEcZZFGpo=
 github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
@@ -1093,8 +1091,7 @@ github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU
 github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
 github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
 github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
-github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M=
-github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc=
+github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
 github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
 github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI=
 github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU=
@@ -1164,13 +1161,11 @@ github.com/spf13/afero v1.3.3/go.mod h1:5KUK8ByomD5Ti5Artl0RtHeI5pTF7MIDuXL3yY52
 github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I=
 github.com/spf13/afero v1.9.2/go.mod h1:iUV7ddyEEZPO5gA3zD4fJt6iStLlL+Lg4m2cihcDf8Y=
 github.com/spf13/afero v1.9.5/go.mod h1:UBogFpq8E9Hx+xc5CNTTEpTnuHVmXDwZcZcE1eb/UhQ=
-github.com/spf13/afero v1.12.0 h1:UcOPyRBYczmFn6yvphxkn9ZEOY65cpwGKb5mL36mrqs=
-github.com/spf13/afero v1.12.0/go.mod h1:ZTlWwG4/ahT8W7T0WQ5uYmjI9duaLQGy3Q2OAl4sk/4=
+github.com/spf13/afero v1.14.0 h1:9tH6MapGnn/j0eb0yIXiLjERO8RB6xIVZRDCX7PtqWA=
 github.com/spf13/afero v1.14.0/go.mod h1:acJQ8t0ohCGuMN3O+Pv0V0hgMxNYDlvdk+VTfyZmbYo=
 github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
 github.com/spf13/cast v1.5.1/go.mod h1:b9PdjNptOpzXr7Rq1q9gJML/2cdGQAo69NKzQ10KN48=
-github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y=
-github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
+github.com/spf13/cast v1.8.0 h1:gEN9K4b8Xws4EX0+a0reLmhq8moKn7ntRlQYgjPeCDk=
 github.com/spf13/cast v1.8.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
 github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
 github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo=
@@ -1447,8 +1442,7 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20220819030929-7fc1605a5dde/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
-golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ=
 golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
 golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -1578,8 +1572,7 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
-golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
-golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
+golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4=
 golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA=
 golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
@@ -1587,8 +1580,7 @@ golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxb
 golang.org/x/time v0.0.0-20220922220347-f3bd1da661af/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.1.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
-golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
-golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
+golang.org/x/time v0.10.0 h1:3usCWA8tQn0L8+hFJQNgzpWbd89begxN66o1Ojdn5L4=
 golang.org/x/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=