diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml
index f2eb26a4..92afd37e 100644
--- a/config/models/kustomization.yaml
+++ b/config/models/kustomization.yaml
@@ -3,7 +3,6 @@ kind: Kustomization
 
 resources:
   - meta/Llama-3.3-70B-instruct.yaml
-  - meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml
   - meta/llama-4-maverick-17b-128e-instruct-fp8.yaml
   - meta/llama-4-scout-17b-16e-instruct.yaml
   - intfloat/e5-mistral-7b-instruct.yaml
diff --git a/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml b/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml
deleted file mode 100644
index 4db9e60b..00000000
--- a/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterBaseModel
-metadata:
-  name: llama-3-3-70b-instruct-fp8-dynamic
-spec:
-  disabled: false
-  displayName: meta.llama-3.3-70b-instruct-fp8-dynamic
-  storage:
-    storageUri: hf://meta-llama/Llama-3.3-70B-Instruct
-    path: /raid/models/meta/llama-3-3-70b-instruct-fp8-dynamic
-  vendor: meta
-  version: "1.0.0"
\ No newline at end of file
diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
index a5ec1663..8925c6af 100644
--- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
@@ -63,7 +63,7 @@ spec:
             MC_TE_METRIC=true;
             SGLANG_TBO_DEBUG=1;
             python3 -m sglang.launch_server
-            --port 30000
+            --port 8080
             --host 0.0.0.0
             --model-path ${MODEL_PATH}
             --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_3,mlx5_4
@@ -158,7 +158,7 @@ spec:
             --dist-init-addr $(LWS_LEADER_ADDRESS):5000
             --nnodes ${LWS_GROUP_SIZE}
             --node-rank ${LWS_WORKER_INDEX}
-            --port 30000
+            --port 8080
             --trust-remote-code
             --ep-num-redundant-experts 32
             --moe-dense-tp-size 1
@@ -214,7 +214,7 @@ spec:
           - -c
           - >
             python3 -m sglang.launch_server
-            --port 30000
+            --port 8080
             --host 0.0.0.0
             --chunked-prefill-size 262144
             --page-size 64
@@ -302,7 +302,7 @@ spec:
             --dist-init-addr $(LWS_LEADER_ADDRESS):5000
             --nnodes ${LWS_GROUP_SIZE}
             --node-rank ${LWS_WORKER_INDEX}
-            --port 30000
+            --port 8080
             --decode-log-interval 1
             --host 0.0.0.0
             --trust-remote-code
@@ -325,4 +325,43 @@ spec:
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
           - name: GLOO_SOCKET_IFNAME
-            value: eth0
\ No newline at end of file
+            value: eth0
+  # Router for prefill/decode (PD) disaggregation. It discovers workers by pod
+  # label on port 8080: component=engine pods are selected for prefill and
+  # component=decoder pods for decode, scoped to this InferenceService.
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host 0.0.0.0
+          --port 8080
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      # Downward API values expanded by the shell in the command above:
+      # the pod namespace and the pod's ome.io/inferenceservice label.
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
index 55f20a7e..a628eca1 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.42.3"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: LlamaForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
deleted file mode 100644
index ec8764a6..00000000
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
+++ /dev/null
@@ -1,226 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterServingRuntime
-metadata:
-  name: srt-llama-3-2-11b-vision-instruct-pd
-spec:
-  disabled: false
-  supportedModelFormats:
-    - modelFramework:
-        name: transformers
-        version: "4.45.0.dev0"
-      modelFormat:
-        name: safetensors
-        version: "1"
-      modelArchitecture: MllamaForConditionalGeneration
-      autoSelect: false
-      priority: 1
-  protocolVersions:
-    - openAI
-  modelSizeRange:
-    min: 10B
-    max: 12B
-  engineConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 1 \
-          --mem-frac=0.9 \
-          --chat-template llama_3_vision \
-          --disaggregation-mode prefill \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-        limits:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  decoderConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-#    affinity:
-#      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 1 \
-          --mem-frac=0.9 \
-          --chat-template llama_3_vision \
-          --disaggregation-mode decode \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-        limits:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  routerConfig:
-    runner:
-      name: router
-      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
-      resources:
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      ports:
-        - containerPort: 8080
-          name: http
-      command:
-        - sh
-        - -c
-        - >
-          python3 -m sglang_router.launch_router
-          --host "0.0.0.0"
-          --port "8080"
-          --pd-disaggregation
-          --policy power_of_two
-          --service-discovery
-          --service-discovery-namespace "${NAMESPACE}"
-          --service-discovery-port 8080
-          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-      env:
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        - name: INFERENCESERVICE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
index 21797d18..b2d20a9f 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.45.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: LlamaForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
index 0231121e..6fa60107 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.45.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: LlamaForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
deleted file mode 100644
index 9052b5b0..00000000
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
+++ /dev/null
@@ -1,217 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterServingRuntime
-metadata:
-  name: srt-llama-3-2-90b-vision-instruct-fp8-pd
-spec:
-  disabled: false
-  supportedModelFormats:
-    - modelFramework:
-        name: transformers
-        version: "4.46.0.dev0"
-      modelFormat:
-        name: safetensors
-        version: "1"
-      modelArchitecture: MllamaForConditionalGeneration
-      autoSelect: false
-      priority: 1
-  protocolVersions:
-    - openAI
-  modelSizeRange:
-    min: 85B
-    max: 95B
-  engineConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 4 \
-          --mem-frac=0.9 \
-          --chat-template llama_3_vision \
-          --disaggregation-mode prefill \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 30
-          memory: 100Gi
-          nvidia.com/gpu: 4
-        limits:
-          cpu: 30
-          memory: 100Gi
-          nvidia.com/gpu: 4
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  decoderConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 4 \
-          --mem-frac=0.9 \
-          --chat-template llama_3_vision \
-          --disaggregation-mode decode \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 30
-          memory: 100Gi
-          nvidia.com/gpu: 4
-        limits:
-          cpu: 30
-          memory: 100Gi
-          nvidia.com/gpu: 4
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  routerConfig:
-    runner:
-      name: router
-      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
-      resources:
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      ports:
-        - containerPort: 8080
-          name: http
-      command:
-        - sh
-        - -c
-        - >
-          python3 -m sglang_router.launch_router
-          --host "0.0.0.0"
-          --port "8080"
-          --pd-disaggregation
-          --policy power_of_two
-          --service-discovery
-          --service-discovery-namespace "${NAMESPACE}"
-          --service-discovery-port 8080
-          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-      env:
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        - name: INFERENCESERVICE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml
similarity index 98%
rename from config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
rename to config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml
index 2ef9d4d4..5adecd0d 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-2-90b-vision-instruct-fp8
+  name: srt-llama-3-2-90b-vision-instruct
 spec:
   disabled: false
   supportedModelFormats:
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
index 3f02b061..2e78f6cf 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.47.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: LlamaForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
index 8c64685d..9def6d83 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.51.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: Llama4ForConditionalGeneration
       autoSelect: true
       priority: 2
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
index 5869bd53..6685b99e 100644
--- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.36.2"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: MistralForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
index 8f75eed8..cb4a2110 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.36.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: MixtralForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml b/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml
index e8bfa4bf..0adeb33f 100644
--- a/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml
+++ b/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml
@@ -2,22 +2,26 @@
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: deepseek-v3-pd
+  name: deepseek-v3
 ---
 
 apiVersion: ome.io/v1beta1
 kind: InferenceService
 metadata:
   name: deepseek-v3
-  namespace: deepseek-v3-pd
+  namespace: deepseek-v3
 spec:
   model:
     name: deepseek-v3
   runtime:
-    name: srt-deepseek-pd-rdma
+    name: srt-deepseek-rdma-pd
   engine:
     minReplicas: 1
     maxReplicas: 1
   decoder:
-    minReplicas: 2
-    maxReplicas: 2
\ No newline at end of file
+    minReplicas: 1
+    maxReplicas: 1
+  # One router replica alongside the single engine and decoder replicas.
+  router:
+    minReplicas: 1
+    maxReplicas: 1
diff --git a/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml b/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml
index 9e45bec6..b221a8fa 100644
--- a/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml
+++ b/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml
@@ -16,5 +16,12 @@ spec:
   runtime:
     name: srt-llama-4-maverick-17b-128e-instruct-fp8-pd
   engine:
-    minReplicas: 3
-    maxReplicas: 3
\ No newline at end of file
+    minReplicas: 1
+    maxReplicas: 1
+  # One replica each for the decoder and router components.
+  decoder:
+    minReplicas: 1
+    maxReplicas: 1
+  router:
+    minReplicas: 1
+    maxReplicas: 1
diff --git a/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml b/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml
deleted file mode 100644
index c82d7da4..00000000
--- a/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
----
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: llama-3-3-70b-instruct-fp8-dynamic
----
-
-apiVersion: ome.io/v1beta1
-kind: InferenceService
-metadata:
-  name: llama-3-3-70b-instruct-fp8-dynamic
-  namespace: llama-3-3-70b-instruct-fp8-dynamic
-spec:
-  model:
-    name: llama-3-3-70b-instruct-fp8-dynamic
-  engine:
-    minReplicas: 1
-    maxReplicas: 1
\ No newline at end of file
diff --git a/config/samples/isvc/meta/llama3-3-70b-instruct.yaml b/config/samples/isvc/meta/llama3-3-70b-instruct.yaml
new file mode 100644
index 00000000..4734ea75
--- /dev/null
+++ b/config/samples/isvc/meta/llama3-3-70b-instruct.yaml
@@ -0,0 +1,30 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: llama-3-3-70b-instruct
+---
+
+# Sample InferenceService: serves the llama-3-3-70b-instruct model through the
+# srt-llama-3-3-70b-instruct-pd runtime with one engine and one decoder replica.
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: llama-3-3-70b-instruct
+  namespace: llama-3-3-70b-instruct
+spec:
+  model:
+    name: llama-3-3-70b-instruct
+  runtime:
+    name: srt-llama-3-3-70b-instruct-pd
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
+  decoder:
+    minReplicas: 1
+    maxReplicas: 1
+  # NOTE(review): router is intentionally left disabled in this sample, while
+  # the other PD samples enable one — confirm whether this runtime needs it.
+  # router:
+  #   minReplicas: 1
+  #   maxReplicas: 1