From a71c6adb7dce7de653bd94435f7c992df50e3ae3 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Fri, 27 Jun 2025 23:47:07 -0700
Subject: [PATCH 01/10] remove llama 3.3 70B FP8 from models

---
 config/models/kustomization.yaml                     |  1 -
 .../meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml     | 12 ------------
 2 files changed, 13 deletions(-)
 delete mode 100644 config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml

diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml
index f2eb26a4..92afd37e 100644
--- a/config/models/kustomization.yaml
+++ b/config/models/kustomization.yaml
@@ -3,7 +3,6 @@ kind: Kustomization
 
 resources:
   - meta/Llama-3.3-70B-instruct.yaml
-  - meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml
   - meta/llama-4-maverick-17b-128e-instruct-fp8.yaml
   - meta/llama-4-scout-17b-16e-instruct.yaml
   - intfloat/e5-mistral-7b-instruct.yaml
diff --git a/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml b/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml
deleted file mode 100644
index 4db9e60b..00000000
--- a/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterBaseModel
-metadata:
-  name: llama-3-3-70b-instruct-fp8-dynamic
-spec:
-  disabled: false
-  displayName: meta.llama-3.3-70b-instruct-fp8-dynamic
-  storage:
-    storageUri: hf://meta-llama/Llama-3.3-70B-Instruct
-    path: /raid/models/meta/llama-3-3-70b-instruct-fp8-dynamic
-  vendor: meta
-  version: "1.0.0"
\ No newline at end of file

From 7d035fd523ce50ea35aee3286f58473193a8f41a Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Fri, 27 Jun 2025 23:47:52 -0700
Subject: [PATCH 02/10] change llama 3.2 90B rt to bf16 version

---
 ...struct-fp8-rt.yaml => llama-3-2-90b-vision-instruct-rt.yaml} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename config/runtimes/srt/{llama-3-2-90b-vision-instruct-fp8-rt.yaml => llama-3-2-90b-vision-instruct-rt.yaml} (98%)

diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml
similarity index 98%
rename from config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
rename to config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml
index 2ef9d4d4..5adecd0d 100644
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-llama-3-2-90b-vision-instruct-fp8
+  name: srt-llama-3-2-90b-vision-instruct
 spec:
   disabled: false
   supportedModelFormats:

From c883198f5c81cd629ef02d1fc1d72c8ce39f6cf5 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Fri, 27 Jun 2025 23:48:07 -0700
Subject: [PATCH 03/10] remove 11b 90b pd rt

---
 .../llama-3-2-11b-vision-instruct-pd-rt.yaml  | 226 ------------------
 ...ama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 217 -----------------
 2 files changed, 443 deletions(-)
 delete mode 100644 config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
 delete mode 100644 config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml

diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
deleted file mode 100644
index ec8764a6..00000000
--- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml
+++ /dev/null
@@ -1,226 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterServingRuntime
-metadata:
-  name: srt-llama-3-2-11b-vision-instruct-pd
-spec:
-  disabled: false
-  supportedModelFormats:
-    - modelFramework:
-        name: transformers
-        version: "4.45.0.dev0"
-      modelFormat:
-        name: safetensors
-        version: "1"
-      modelArchitecture: MllamaForConditionalGeneration
-      autoSelect: false
-      priority: 1
-  protocolVersions:
-    - openAI
-  modelSizeRange:
-    min: 10B
-    max: 12B
-  engineConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 1 \
-          --mem-frac=0.9 \
-          --chat-template llama_3_vision \
-          --disaggregation-mode prefill \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-        limits:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  decoderConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-#    affinity:
-#      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: node.kubernetes.io/instance-type
-                  operator: In
-                  values:
-                    - BM.GPU.H100.8
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 1 \
-          --mem-frac=0.9 \
-          --chat-template llama_3_vision \
-          --disaggregation-mode decode \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-        limits:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  routerConfig:
-    runner:
-      name: router
-      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
-      resources:
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      ports:
-        - containerPort: 8080
-          name: http
-      command:
-        - sh
-        - -c
-        - >
-          python3 -m sglang_router.launch_router
-          --host "0.0.0.0"
-          --port "8080"
-          --pd-disaggregation
-          --policy power_of_two
-          --service-discovery
-          --service-discovery-namespace "${NAMESPACE}"
-          --service-discovery-port 8080
-          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-      env:
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        - name: INFERENCESERVICE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
deleted file mode 100644
index 9052b5b0..00000000
--- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml
+++ /dev/null
@@ -1,217 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterServingRuntime
-metadata:
-  name: srt-llama-3-2-90b-vision-instruct-fp8-pd
-spec:
-  disabled: false
-  supportedModelFormats:
-    - modelFramework:
-        name: transformers
-        version: "4.46.0.dev0"
-      modelFormat:
-        name: safetensors
-        version: "1"
-      modelArchitecture: MllamaForConditionalGeneration
-      autoSelect: false
-      priority: 1
-  protocolVersions:
-    - openAI
-  modelSizeRange:
-    min: 85B
-    max: 95B
-  engineConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 4 \
-          --mem-frac=0.9 \
-          --chat-template llama_3_vision \
-          --disaggregation-mode prefill \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 30
-          memory: 100Gi
-          nvidia.com/gpu: 4
-        limits:
-          cpu: 30
-          memory: 100Gi
-          nvidia.com/gpu: 4
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  decoderConfig:
-    annotations:
-      rdma.ome.io/auto-inject: "true"
-      rdma.ome.io/profile: "oci-roce"
-      rdma.ome.io/container-name: "ome-container"
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    dnsPolicy: ClusterFirstWithHostNet
-    hostNetwork: true
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - /bin/bash
-        - '-lc'
-        - --
-      args:
-        - |
-          python3 -m sglang.launch_server \
-          --host=0.0.0.0 \
-          --port=8080 \
-          --enable-metrics \
-          --log-requests \
-          --model-path="$MODEL_PATH" \
-          --tp-size 4 \
-          --mem-frac=0.9 \
-          --chat-template llama_3_vision \
-          --disaggregation-mode decode \
-          --disaggregation-ib-device mlx5_0
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 30
-          memory: 100Gi
-          nvidia.com/gpu: 4
-        limits:
-          cpu: 30
-          memory: 100Gi
-          nvidia.com/gpu: 4
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  routerConfig:
-    runner:
-      name: router
-      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
-      resources:
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      ports:
-        - containerPort: 8080
-          name: http
-      command:
-        - sh
-        - -c
-        - >
-          python3 -m sglang_router.launch_router
-          --host "0.0.0.0"
-          --port "8080"
-          --pd-disaggregation
-          --policy power_of_two
-          --service-discovery
-          --service-discovery-namespace "${NAMESPACE}"
-          --service-discovery-port 8080
-          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
-      env:
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        - name: INFERENCESERVICE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file

From 7c91d5c09a05748a617246e514f69f65752cdca6 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Fri, 27 Jun 2025 23:48:42 -0700
Subject: [PATCH 04/10] add llama 3.3 pd isvc sample

---
 .../llama3-3-70b-instruct-fp8-dynamic.yaml    | 18 -------------
 .../isvc/meta/llama3-3-70b-instruct.yaml      | 27 +++++++++++++++++++
 2 files changed, 27 insertions(+), 18 deletions(-)
 delete mode 100644 config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml
 create mode 100644 config/samples/isvc/meta/llama3-3-70b-instruct.yaml

diff --git a/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml b/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml
deleted file mode 100644
index c82d7da4..00000000
--- a/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
----
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: llama-3-3-70b-instruct-fp8-dynamic
----
-
-apiVersion: ome.io/v1beta1
-kind: InferenceService
-metadata:
-  name: llama-3-3-70b-instruct-fp8-dynamic
-  namespace: llama-3-3-70b-instruct-fp8-dynamic
-spec:
-  model:
-    name: llama-3-3-70b-instruct-fp8-dynamic
-  engine:
-    minReplicas: 1
-    maxReplicas: 1
\ No newline at end of file
diff --git a/config/samples/isvc/meta/llama3-3-70b-instruct.yaml b/config/samples/isvc/meta/llama3-3-70b-instruct.yaml
new file mode 100644
index 00000000..4734ea75
--- /dev/null
+++ b/config/samples/isvc/meta/llama3-3-70b-instruct.yaml
@@ -0,0 +1,27 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: llama-3-3-70b-instruct
+---
+
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: llama-3-3-70b-instruct
+  namespace: llama-3-3-70b-instruct
+spec:
+  model:
+    name: llama-3-3-70b-instruct
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
+  runtime:
+    name: srt-llama-3-3-70b-instruct-pd
+  decoder:
+    minReplicas: 1
+    maxReplicas: 1
+  # router:
+  #   minReplicas: 1
+  #   maxReplicas: 1
+    
\ No newline at end of file

From bbff36b828561e22a323cf3ecaebd4c07fca06dc Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Fri, 27 Jun 2025 23:49:08 -0700
Subject: [PATCH 05/10] sgl runtime fixes

---
 config/runtimes/srt/deepseek-rdma-pd-rt.yaml  | 44 ++++++++++++++++---
 .../srt/llama-3-1-70b-instruct-pd-rt.yaml     |  2 +-
 .../srt/llama-3-2-1b-instruct-pd-rt.yaml      |  2 +-
 .../srt/llama-3-2-3b-instruct-pd-rt.yaml      |  2 +-
 .../srt/llama-3-3-70b-instruct-pd-rt.yaml     |  2 +-
 .../llama-4-scout-17b-16e-instruct-pd-rt.yaml |  2 +-
 .../srt/mistral-7b-instruct-pd-rt.yaml        |  2 +-
 .../srt/mixtral-8x7b-instruct-pd-rt.yaml      |  2 +-
 8 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
index a5ec1663..8925c6af 100644
--- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
+++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml
@@ -63,7 +63,7 @@ spec:
             MC_TE_METRIC=true;
             SGLANG_TBO_DEBUG=1;
             python3 -m sglang.launch_server
-            --port 30000
+            --port 8080
             --host 0.0.0.0
             --model-path ${MODEL_PATH}
             --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_3,mlx5_4
@@ -158,7 +158,7 @@ spec:
             --dist-init-addr $(LWS_LEADER_ADDRESS):5000
             --nnodes ${LWS_GROUP_SIZE}
             --node-rank ${LWS_WORKER_INDEX}
-            --port 30000
+            --port 8080
             --trust-remote-code
             --ep-num-redundant-experts 32
             --moe-dense-tp-size 1
@@ -214,7 +214,7 @@ spec:
           - -c
           - >
             python3 -m sglang.launch_server
-            --port 30000
+            --port 8080
             --host 0.0.0.0
             --chunked-prefill-size 262144
             --page-size 64
@@ -302,7 +302,7 @@ spec:
             --dist-init-addr $(LWS_LEADER_ADDRESS):5000
             --nnodes ${LWS_GROUP_SIZE}
             --node-rank ${LWS_WORKER_INDEX}
-            --port 30000
+            --port 8080
             --decode-log-interval 1
             --host 0.0.0.0
             --trust-remote-code
@@ -325,4 +325,38 @@ spec:
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
           - name: GLOO_SOCKET_IFNAME
-            value: eth0
\ No newline at end of file
+            value: eth0
+  routerConfig:
+    runner:
+      name: router
+      image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
+      resources:
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      ports:
+        - containerPort: 8080
+          name: http
+      command:
+        - sh
+        - -c
+        - >
+          python3 -m sglang_router.launch_router
+          --host 0.0.0.0
+          --port 8080
+          --pd-disaggregation
+          --policy power_of_two
+          --service-discovery
+          --service-discovery-namespace "${NAMESPACE}"
+          --service-discovery-port 8080
+          --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+          --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME}
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
\ No newline at end of file
diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
index 55f20a7e..a628eca1 100644
--- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.42.3"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: LlamaForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
index 21797d18..b2d20a9f 100644
--- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.45.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: LlamaForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
index 0231121e..6fa60107 100644
--- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.45.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: LlamaForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
index 3f02b061..2e78f6cf 100644
--- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.47.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: LlamaForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
index 8c64685d..9def6d83 100644
--- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.51.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: Llama4ForConditionalGeneration
       autoSelect: true
       priority: 2
diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
index 5869bd53..6685b99e 100644
--- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.36.2"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: MistralForCausalLM
       autoSelect: false
       priority: 1
diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
index 8f75eed8..cb4a2110 100644
--- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
+++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml
@@ -10,7 +10,7 @@ spec:
         version: "4.36.0.dev0"
       modelFormat:
         name: safetensors
-        version: "1"
+        version: "1.0.0"
       modelArchitecture: MixtralForCausalLM
       autoSelect: false
       priority: 1

From 627a3fb875c916a0bf2cff6bbe84669d049523a6 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Fri, 27 Jun 2025 23:49:30 -0700
Subject: [PATCH 06/10] update sample isvc

---
 .../samples/isvc/deepseek-ai/deepseek-v3-pd.yaml   | 14 +++++++++-----
 .../llama-4-maverick-17b-128e-instruct-fp8.yaml    | 10 ++++++++--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml b/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml
index e8bfa4bf..0adeb33f 100644
--- a/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml
+++ b/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml
@@ -2,22 +2,26 @@
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: deepseek-v3-pd
+  name: deepseek-v3
 ---
 
 apiVersion: ome.io/v1beta1
 kind: InferenceService
 metadata:
   name: deepseek-v3
-  namespace: deepseek-v3-pd
+  namespace: deepseek-v3
 spec:
   model:
     name: deepseek-v3
   runtime:
-    name: srt-deepseek-pd-rdma
+    name: srt-deepseek-rdma-pd
   engine:
     minReplicas: 1
     maxReplicas: 1
   decoder:
-    minReplicas: 2
-    maxReplicas: 2
\ No newline at end of file
+    minReplicas: 1
+    maxReplicas: 1
+  router:
+    minReplicas: 1
+    maxReplicas: 1
+    
\ No newline at end of file
diff --git a/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml b/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml
index 9e45bec6..b221a8fa 100644
--- a/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml
+++ b/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml
@@ -16,5 +16,11 @@ spec:
   runtime:
     name: srt-llama-4-maverick-17b-128e-instruct-fp8-pd
   engine:
-    minReplicas: 3
-    maxReplicas: 3
\ No newline at end of file
+    minReplicas: 1
+    maxReplicas: 1
+  decoder:
+    minReplicas: 1
+    maxReplicas: 1
+  router:
+    minReplicas: 1
+    maxReplicas: 1
\ No newline at end of file

From 46fe3098555c7cbd0387887e38ba4c4f8e6c0426 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Fri, 27 Jun 2025 23:50:54 -0700
Subject: [PATCH 07/10] fix inference service status propagation

---
 .../v1beta1/inferenceservice/controller.go    | 21 ++++++++-
 .../status/status_reconciler.go               | 45 +++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go
index cd35b373..4f3dbbb6 100644
--- a/pkg/controller/v1beta1/inferenceservice/controller.go
+++ b/pkg/controller/v1beta1/inferenceservice/controller.go
@@ -431,12 +431,31 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 		}
 	}
 
+	// Propagate status for all components
+	var componentList []v1beta2.ComponentType
 	if deploymentMode == constants.Serverless {
-		componentList := []v1beta2.ComponentType{v1beta2.EngineComponent}
+		// In Serverless mode, we only care about the engine component which is a Knative service.
+		componentList = []v1beta2.ComponentType{v1beta2.EngineComponent}
+
+		// For serverless, we only have one component, and we need to propagate its route and deployment readiness.
+		// For other modes, these are handled by the component-specific reconcilers.
 		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.RoutesReady)
 		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.LatestDeploymentReady)
+	} else {
+		// For other modes (RawDeployment, etc.), we check all defined components.
+		if mergedEngine != nil {
+			componentList = append(componentList, v1beta2.EngineComponent)
+		}
+		if mergedDecoder != nil {
+			componentList = append(componentList, v1beta2.DecoderComponent)
+		}
+		if mergedRouter != nil {
+			componentList = append(componentList, v1beta2.RouterComponent)
+		}
 	}
 
+	r.StatusManager.AggregateComponentReadyCondition(&isvc.Status, componentList)
+
 	if err = r.updateStatus(isvc, deploymentMode); err != nil {
 		r.Recorder.Event(isvc, v1.EventTypeWarning, "InternalError", err.Error())
 		return reconcile.Result{}, err
diff --git a/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go b/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go
index 3290f77e..c4227ea1 100644
--- a/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go
+++ b/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go
@@ -241,6 +241,51 @@ func (sr *StatusReconciler) SetModelFailureInfo(status *v1beta1.InferenceService
 }
 
 // PropagateCrossComponentStatus aggregates conditions across components
+// AggregateComponentReadyCondition creates the top-level Ready condition
+// based on the readiness of all specified components.
+func (sr *StatusReconciler) AggregateComponentReadyCondition(
+	status *v1beta1.InferenceServiceStatus,
+	componentList []v1beta1.ComponentType) {
+
+	// If there are no components, the service is not ready.
+	if len(componentList) == 0 {
+		status.SetCondition(apis.ConditionReady, &apis.Condition{
+			Type:    apis.ConditionReady,
+			Status:  v1.ConditionFalse,
+			Reason:  "NoComponents",
+			Message: "No components are defined for this InferenceService.",
+		})
+		return
+	}
+
+	readyCondition := &apis.Condition{
+		Type:    apis.ConditionReady,
+		Status:  v1.ConditionTrue,
+		Reason:  "AllComponentsReady",
+		Message: "All components are ready",
+	}
+
+	readyConditionsMap := sr.getReadyConditionsMap()
+
+	for _, component := range componentList {
+		componentReadyCondition := readyConditionsMap[component]
+		if !status.IsConditionReady(componentReadyCondition) {
+			readyCondition.Status = v1.ConditionFalse
+			readyCondition.Reason = string(component) + "NotReady"
+			// Get the actual condition to propagate the message
+			compCond := status.GetCondition(componentReadyCondition)
+			if compCond != nil {
+				readyCondition.Message = compCond.Message
+			} else {
+				readyCondition.Message = "Component " + string(component) + " is not ready"
+			}
+			break // one not ready is enough
+		}
+	}
+
+	status.SetCondition(apis.ConditionReady, readyCondition)
+}
+
 func (sr *StatusReconciler) PropagateCrossComponentStatus(
 	status *v1beta1.InferenceServiceStatus,
 	componentList []v1beta1.ComponentType,

From 326f4b6e0ca8e8aaf84db8ba955edc50bebdcca7 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Sat, 28 Jun 2025 00:02:32 -0700
Subject: [PATCH 08/10] fix API version

---
 .../v1beta1/inferenceservice/controller.go    | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go
index 4f3dbbb6..d1d9e627 100644
--- a/pkg/controller/v1beta1/inferenceservice/controller.go
+++ b/pkg/controller/v1beta1/inferenceservice/controller.go
@@ -10,7 +10,7 @@ import (
 	duckv1 "knative.dev/pkg/apis/duck/v1"
 	"knative.dev/pkg/network"
 
-	v1beta2 "github.com/sgl-project/ome/pkg/apis/ome/v1beta1"
+	v1beta1 "github.com/sgl-project/ome/pkg/apis/ome/v1beta1"
 	autoscalingv2 "k8s.io/api/autoscaling/v2"
 	lws "sigs.k8s.io/lws/api/leaderworkerset/v1"
 
@@ -110,7 +110,7 @@ type InferenceServiceReconciler struct {
 
 func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	// Fetch the InferenceService instance
-	isvc := &v1beta2.InferenceService{}
+	isvc := &v1beta1.InferenceService{}
 	if err := r.Get(ctx, req.NamespacedName, isvc); err != nil {
 		if apierrors.IsNotFound(err) {
 			// Object not found, return.  Created objects are automatically garbage collected.
@@ -175,7 +175,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 
 	// Initialize status if not already initialized
 	if isvc.Status.Components == nil {
-		isvc.Status.Components = make(map[v1beta2.ComponentType]v1beta2.ComponentStatusSpec)
+		isvc.Status.Components = make(map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec)
 	}
 
 	// Setup reconcilers
@@ -432,25 +432,25 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	}
 
 	// Propagate status for all components
-	var componentList []v1beta2.ComponentType
+	var componentList []v1beta1.ComponentType
 	if deploymentMode == constants.Serverless {
 		// In Serverless mode, we only care about the engine component which is a Knative service.
-		componentList = []v1beta2.ComponentType{v1beta2.EngineComponent}
+		componentList = []v1beta1.ComponentType{v1beta1.EngineComponent}
 
 		// For serverless, we only have one component, and we need to propagate its route and deployment readiness.
 		// For other modes, these are handled by the component-specific reconcilers.
-		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.RoutesReady)
-		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.LatestDeploymentReady)
+		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta1.RoutesReady)
+		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta1.LatestDeploymentReady)
 	} else {
 		// For other modes (RawDeployment, etc.), we check all defined components.
 		if mergedEngine != nil {
-			componentList = append(componentList, v1beta2.EngineComponent)
+			componentList = append(componentList, v1beta1.EngineComponent)
 		}
 		if mergedDecoder != nil {
-			componentList = append(componentList, v1beta2.DecoderComponent)
+			componentList = append(componentList, v1beta1.DecoderComponent)
 		}
 		if mergedRouter != nil {
-			componentList = append(componentList, v1beta2.RouterComponent)
+			componentList = append(componentList, v1beta1.RouterComponent)
 		}
 	}
 
@@ -464,7 +464,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	return ctrl.Result{}, nil
 }
 
-func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta2.InferenceService) (ctrl.Result, error) {
+func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta1.InferenceService) (ctrl.Result, error) {
 	// We directly set URL and inference service status to Ready in VirtualDeployment mode
 
 	// Set URL across all Status components
@@ -478,8 +478,8 @@ func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta2.Infer
 	}
 	isvc.Status.URL = openAIURL
 	isvc.Status.Address = addressURL
-	isvc.Status.Components = map[v1beta2.ComponentType]v1beta2.ComponentStatusSpec{
-		v1beta2.PredictorComponent: {
+	isvc.Status.Components = map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec{
+		v1beta1.PredictorComponent: {
 			URL: openAIURL,
 		},
 	}
@@ -500,7 +500,7 @@ func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta2.Infer
 	return ctrl.Result{}, nil
 }
 
-func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta2.InferenceService) (ctrl.Result, error) {
+func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1.InferenceService) (ctrl.Result, error) {
 	// Abort early if the resolved deployment mode is Serverless, but Knative Services are not available
 	ksvcAvailable, err := utils.IsCrdAvailable(r.ClientConfig, knservingv1.SchemeGroupVersion.String(), constants.KnativeServiceKind)
 	if err != nil {
@@ -517,8 +517,8 @@ func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta2
 	return ctrl.Result{}, nil
 }
 
-func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta2.InferenceService, deploymentMode constants.DeploymentModeType) error {
-	existingService := &v1beta2.InferenceService{}
+func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.InferenceService, deploymentMode constants.DeploymentModeType) error {
+	existingService := &v1beta1.InferenceService{}
 	namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace}
 	if err := r.Get(context.TODO(), namespacedName, existingService); err != nil {
 		return err
@@ -548,13 +548,13 @@ func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta2.Infere
 	return nil
 }
 
-func inferenceServiceReadiness(status v1beta2.InferenceServiceStatus) bool {
+func inferenceServiceReadiness(status v1beta1.InferenceServiceStatus) bool {
 	return status.Conditions != nil &&
 		status.GetCondition(knapis.ConditionReady) != nil &&
 		status.GetCondition(knapis.ConditionReady).Status == v1.ConditionTrue
 }
 
-func inferenceServiceStatusEqual(s1, s2 v1beta2.InferenceServiceStatus) bool {
+func inferenceServiceStatusEqual(s1, s2 v1beta1.InferenceServiceStatus) bool {
 	return equality.Semantic.DeepEqual(s1, s2)
 }
 
@@ -590,7 +590,7 @@ func (r *InferenceServiceReconciler) SetupWithManager(mgr ctrl.Manager, deployCo
 	}
 
 	ctrlBuilder := ctrl.NewControllerManagedBy(mgr).
-		For(&v1beta2.InferenceService{}).
+		For(&v1beta1.InferenceService{}).
 		Owns(&appsv1.Deployment{}).
 		Owns(&v1.Service{}).
 		Owns(&v1.ConfigMap{}).
@@ -631,7 +631,7 @@ func (r *InferenceServiceReconciler) SetupWithManager(mgr ctrl.Manager, deployCo
 	return ctrlBuilder.Complete(r)
 }
 
-func (r *InferenceServiceReconciler) setExternalServiceURL(ctx context.Context, isvc *v1beta2.InferenceService, ingressConfig *controllerconfig.IngressConfig) error {
+func (r *InferenceServiceReconciler) setExternalServiceURL(ctx context.Context, isvc *v1beta1.InferenceService, ingressConfig *controllerconfig.IngressConfig) error {
 	// Get the external service
 	externalService := &v1.Service{}
 	if err := r.Get(ctx, types.NamespacedName{Name: isvc.Name, Namespace: isvc.Namespace}, externalService); err != nil {
@@ -659,23 +659,23 @@ type existingComponents struct {
 	Router  bool
 }
 
-func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context, isvc *v1beta2.InferenceService) (existingComponents, error) {
+func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context, isvc *v1beta1.InferenceService) (existingComponents, error) {
 	existing := existingComponents{}
 
 	// Check status for existing components - this is more reliable than querying deployments
 	if isvc.Status.Components != nil {
 		// Check if engine component exists in status
-		if _, hasEngine := isvc.Status.Components[v1beta2.EngineComponent]; hasEngine {
+		if _, hasEngine := isvc.Status.Components[v1beta1.EngineComponent]; hasEngine {
 			existing.Engine = true
 		}
 
 		// Check if decoder component exists in status
-		if _, hasDecoder := isvc.Status.Components[v1beta2.DecoderComponent]; hasDecoder {
+		if _, hasDecoder := isvc.Status.Components[v1beta1.DecoderComponent]; hasDecoder {
 			existing.Decoder = true
 		}
 
 		// Check if router component exists in status
-		if _, hasRouter := isvc.Status.Components[v1beta2.RouterComponent]; hasRouter {
+		if _, hasRouter := isvc.Status.Components[v1beta1.RouterComponent]; hasRouter {
 			existing.Router = true
 		}
 	}
@@ -684,6 +684,6 @@ func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context
 }
 
 // migratePredictorToNewArchitecture delegates to the migration utility
-func (r *InferenceServiceReconciler) migratePredictorToNewArchitecture(isvc *v1beta2.InferenceService) error {
+func (r *InferenceServiceReconciler) migratePredictorToNewArchitecture(isvc *v1beta1.InferenceService) error {
 	return isvcutils.MigratePredictorToNewArchitecture(context.Background(), r.Client, r.Log, isvc)
 }

From b4b80513c1232cd90b1a323f6c8a168b2f14f1c5 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Mon, 30 Jun 2025 15:24:02 -0700
Subject: [PATCH 09/10] Revert "fix API version"

This reverts commit 326f4b6e0ca8e8aaf84db8ba955edc50bebdcca7.
---
 .../v1beta1/inferenceservice/controller.go    | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go
index d1d9e627..4f3dbbb6 100644
--- a/pkg/controller/v1beta1/inferenceservice/controller.go
+++ b/pkg/controller/v1beta1/inferenceservice/controller.go
@@ -10,7 +10,7 @@ import (
 	duckv1 "knative.dev/pkg/apis/duck/v1"
 	"knative.dev/pkg/network"
 
-	v1beta1 "github.com/sgl-project/ome/pkg/apis/ome/v1beta1"
+	v1beta2 "github.com/sgl-project/ome/pkg/apis/ome/v1beta1"
 	autoscalingv2 "k8s.io/api/autoscaling/v2"
 	lws "sigs.k8s.io/lws/api/leaderworkerset/v1"
 
@@ -110,7 +110,7 @@ type InferenceServiceReconciler struct {
 
 func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	// Fetch the InferenceService instance
-	isvc := &v1beta1.InferenceService{}
+	isvc := &v1beta2.InferenceService{}
 	if err := r.Get(ctx, req.NamespacedName, isvc); err != nil {
 		if apierrors.IsNotFound(err) {
 			// Object not found, return.  Created objects are automatically garbage collected.
@@ -175,7 +175,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 
 	// Initialize status if not already initialized
 	if isvc.Status.Components == nil {
-		isvc.Status.Components = make(map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec)
+		isvc.Status.Components = make(map[v1beta2.ComponentType]v1beta2.ComponentStatusSpec)
 	}
 
 	// Setup reconcilers
@@ -432,25 +432,25 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	}
 
 	// Propagate status for all components
-	var componentList []v1beta1.ComponentType
+	var componentList []v1beta2.ComponentType
 	if deploymentMode == constants.Serverless {
 		// In Serverless mode, we only care about the engine component which is a Knative service.
-		componentList = []v1beta1.ComponentType{v1beta1.EngineComponent}
+		componentList = []v1beta2.ComponentType{v1beta2.EngineComponent}
 
 		// For serverless, we only have one component, and we need to propagate its route and deployment readiness.
 		// For other modes, these are handled by the component-specific reconcilers.
-		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta1.RoutesReady)
-		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta1.LatestDeploymentReady)
+		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.RoutesReady)
+		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.LatestDeploymentReady)
 	} else {
 		// For other modes (RawDeployment, etc.), we check all defined components.
 		if mergedEngine != nil {
-			componentList = append(componentList, v1beta1.EngineComponent)
+			componentList = append(componentList, v1beta2.EngineComponent)
 		}
 		if mergedDecoder != nil {
-			componentList = append(componentList, v1beta1.DecoderComponent)
+			componentList = append(componentList, v1beta2.DecoderComponent)
 		}
 		if mergedRouter != nil {
-			componentList = append(componentList, v1beta1.RouterComponent)
+			componentList = append(componentList, v1beta2.RouterComponent)
 		}
 	}
 
@@ -464,7 +464,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	return ctrl.Result{}, nil
 }
 
-func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta1.InferenceService) (ctrl.Result, error) {
+func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta2.InferenceService) (ctrl.Result, error) {
 	// We directly set URL and inference service status to Ready in VirtualDeployment mode
 
 	// Set URL across all Status components
@@ -478,8 +478,8 @@ func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta1.Infer
 	}
 	isvc.Status.URL = openAIURL
 	isvc.Status.Address = addressURL
-	isvc.Status.Components = map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec{
-		v1beta1.PredictorComponent: {
+	isvc.Status.Components = map[v1beta2.ComponentType]v1beta2.ComponentStatusSpec{
+		v1beta2.PredictorComponent: {
 			URL: openAIURL,
 		},
 	}
@@ -500,7 +500,7 @@ func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta1.Infer
 	return ctrl.Result{}, nil
 }
 
-func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1.InferenceService) (ctrl.Result, error) {
+func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta2.InferenceService) (ctrl.Result, error) {
 	// Abort early if the resolved deployment mode is Serverless, but Knative Services are not available
 	ksvcAvailable, err := utils.IsCrdAvailable(r.ClientConfig, knservingv1.SchemeGroupVersion.String(), constants.KnativeServiceKind)
 	if err != nil {
@@ -517,8 +517,8 @@ func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1
 	return ctrl.Result{}, nil
 }
 
-func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.InferenceService, deploymentMode constants.DeploymentModeType) error {
-	existingService := &v1beta1.InferenceService{}
+func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta2.InferenceService, deploymentMode constants.DeploymentModeType) error {
+	existingService := &v1beta2.InferenceService{}
 	namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace}
 	if err := r.Get(context.TODO(), namespacedName, existingService); err != nil {
 		return err
@@ -548,13 +548,13 @@ func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.Infere
 	return nil
 }
 
-func inferenceServiceReadiness(status v1beta1.InferenceServiceStatus) bool {
+func inferenceServiceReadiness(status v1beta2.InferenceServiceStatus) bool {
 	return status.Conditions != nil &&
 		status.GetCondition(knapis.ConditionReady) != nil &&
 		status.GetCondition(knapis.ConditionReady).Status == v1.ConditionTrue
 }
 
-func inferenceServiceStatusEqual(s1, s2 v1beta1.InferenceServiceStatus) bool {
+func inferenceServiceStatusEqual(s1, s2 v1beta2.InferenceServiceStatus) bool {
 	return equality.Semantic.DeepEqual(s1, s2)
 }
 
@@ -590,7 +590,7 @@ func (r *InferenceServiceReconciler) SetupWithManager(mgr ctrl.Manager, deployCo
 	}
 
 	ctrlBuilder := ctrl.NewControllerManagedBy(mgr).
-		For(&v1beta1.InferenceService{}).
+		For(&v1beta2.InferenceService{}).
 		Owns(&appsv1.Deployment{}).
 		Owns(&v1.Service{}).
 		Owns(&v1.ConfigMap{}).
@@ -631,7 +631,7 @@ func (r *InferenceServiceReconciler) SetupWithManager(mgr ctrl.Manager, deployCo
 	return ctrlBuilder.Complete(r)
 }
 
-func (r *InferenceServiceReconciler) setExternalServiceURL(ctx context.Context, isvc *v1beta1.InferenceService, ingressConfig *controllerconfig.IngressConfig) error {
+func (r *InferenceServiceReconciler) setExternalServiceURL(ctx context.Context, isvc *v1beta2.InferenceService, ingressConfig *controllerconfig.IngressConfig) error {
 	// Get the external service
 	externalService := &v1.Service{}
 	if err := r.Get(ctx, types.NamespacedName{Name: isvc.Name, Namespace: isvc.Namespace}, externalService); err != nil {
@@ -659,23 +659,23 @@ type existingComponents struct {
 	Router  bool
 }
 
-func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context, isvc *v1beta1.InferenceService) (existingComponents, error) {
+func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context, isvc *v1beta2.InferenceService) (existingComponents, error) {
 	existing := existingComponents{}
 
 	// Check status for existing components - this is more reliable than querying deployments
 	if isvc.Status.Components != nil {
 		// Check if engine component exists in status
-		if _, hasEngine := isvc.Status.Components[v1beta1.EngineComponent]; hasEngine {
+		if _, hasEngine := isvc.Status.Components[v1beta2.EngineComponent]; hasEngine {
 			existing.Engine = true
 		}
 
 		// Check if decoder component exists in status
-		if _, hasDecoder := isvc.Status.Components[v1beta1.DecoderComponent]; hasDecoder {
+		if _, hasDecoder := isvc.Status.Components[v1beta2.DecoderComponent]; hasDecoder {
 			existing.Decoder = true
 		}
 
 		// Check if router component exists in status
-		if _, hasRouter := isvc.Status.Components[v1beta1.RouterComponent]; hasRouter {
+		if _, hasRouter := isvc.Status.Components[v1beta2.RouterComponent]; hasRouter {
 			existing.Router = true
 		}
 	}
@@ -684,6 +684,6 @@ func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context
 }
 
 // migratePredictorToNewArchitecture delegates to the migration utility
-func (r *InferenceServiceReconciler) migratePredictorToNewArchitecture(isvc *v1beta1.InferenceService) error {
+func (r *InferenceServiceReconciler) migratePredictorToNewArchitecture(isvc *v1beta2.InferenceService) error {
 	return isvcutils.MigratePredictorToNewArchitecture(context.Background(), r.Client, r.Log, isvc)
 }

From 34dcf58c4ed69efca74bd6f2a0f7c0fcb09223d7 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Mon, 30 Jun 2025 15:25:02 -0700
Subject: [PATCH 10/10] Revert "fix inference service status propagation"

This reverts commit 46fe3098555c7cbd0387887e38ba4c4f8e6c0426.
---
 .../v1beta1/inferenceservice/controller.go    | 21 +--------
 .../status/status_reconciler.go               | 45 -------------------
 2 files changed, 1 insertion(+), 65 deletions(-)

diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go
index 4f3dbbb6..cd35b373 100644
--- a/pkg/controller/v1beta1/inferenceservice/controller.go
+++ b/pkg/controller/v1beta1/inferenceservice/controller.go
@@ -431,31 +431,12 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 		}
 	}
 
-	// Propagate status for all components
-	var componentList []v1beta2.ComponentType
 	if deploymentMode == constants.Serverless {
-		// In Serverless mode, we only care about the engine component which is a Knative service.
-		componentList = []v1beta2.ComponentType{v1beta2.EngineComponent}
-
-		// For serverless, we only have one component, and we need to propagate its route and deployment readiness.
-		// For other modes, these are handled by the component-specific reconcilers.
+		componentList := []v1beta2.ComponentType{v1beta2.EngineComponent}
 		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.RoutesReady)
 		r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.LatestDeploymentReady)
-	} else {
-		// For other modes (RawDeployment, etc.), we check all defined components.
-		if mergedEngine != nil {
-			componentList = append(componentList, v1beta2.EngineComponent)
-		}
-		if mergedDecoder != nil {
-			componentList = append(componentList, v1beta2.DecoderComponent)
-		}
-		if mergedRouter != nil {
-			componentList = append(componentList, v1beta2.RouterComponent)
-		}
 	}
 
-	r.StatusManager.AggregateComponentReadyCondition(&isvc.Status, componentList)
-
 	if err = r.updateStatus(isvc, deploymentMode); err != nil {
 		r.Recorder.Event(isvc, v1.EventTypeWarning, "InternalError", err.Error())
 		return reconcile.Result{}, err
diff --git a/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go b/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go
index c4227ea1..3290f77e 100644
--- a/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go
+++ b/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go
@@ -241,51 +241,6 @@ func (sr *StatusReconciler) SetModelFailureInfo(status *v1beta1.InferenceService
 }
 
 // PropagateCrossComponentStatus aggregates conditions across components
-// AggregateComponentReadyCondition creates the top-level Ready condition
-// based on the readiness of all specified components.
-func (sr *StatusReconciler) AggregateComponentReadyCondition(
-	status *v1beta1.InferenceServiceStatus,
-	componentList []v1beta1.ComponentType) {
-
-	// If there are no components, the service is not ready.
-	if len(componentList) == 0 {
-		status.SetCondition(apis.ConditionReady, &apis.Condition{
-			Type:    apis.ConditionReady,
-			Status:  v1.ConditionFalse,
-			Reason:  "NoComponents",
-			Message: "No components are defined for this InferenceService.",
-		})
-		return
-	}
-
-	readyCondition := &apis.Condition{
-		Type:    apis.ConditionReady,
-		Status:  v1.ConditionTrue,
-		Reason:  "AllComponentsReady",
-		Message: "All components are ready",
-	}
-
-	readyConditionsMap := sr.getReadyConditionsMap()
-
-	for _, component := range componentList {
-		componentReadyCondition := readyConditionsMap[component]
-		if !status.IsConditionReady(componentReadyCondition) {
-			readyCondition.Status = v1.ConditionFalse
-			readyCondition.Reason = string(component) + "NotReady"
-			// Get the actual condition to propagate the message
-			compCond := status.GetCondition(componentReadyCondition)
-			if compCond != nil {
-				readyCondition.Message = compCond.Message
-			} else {
-				readyCondition.Message = "Component " + string(component) + " is not ready"
-			}
-			break // one not ready is enough
-		}
-	}
-
-	status.SetCondition(apis.ConditionReady, readyCondition)
-}
-
 func (sr *StatusReconciler) PropagateCrossComponentStatus(
 	status *v1beta1.InferenceServiceStatus,
 	componentList []v1beta1.ComponentType,