From a71c6adb7dce7de653bd94435f7c992df50e3ae3 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Fri, 27 Jun 2025 23:47:07 -0700 Subject: [PATCH 01/10] remove llama 3.3 70B FP8 from models --- config/models/kustomization.yaml | 1 - .../meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml | 12 ------------ 2 files changed, 13 deletions(-) delete mode 100644 config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml index f2eb26a4..92afd37e 100644 --- a/config/models/kustomization.yaml +++ b/config/models/kustomization.yaml @@ -3,7 +3,6 @@ kind: Kustomization resources: - meta/Llama-3.3-70B-instruct.yaml - - meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml - meta/llama-4-maverick-17b-128e-instruct-fp8.yaml - meta/llama-4-scout-17b-16e-instruct.yaml - intfloat/e5-mistral-7b-instruct.yaml diff --git a/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml b/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml deleted file mode 100644 index 4db9e60b..00000000 --- a/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterBaseModel -metadata: - name: llama-3-3-70b-instruct-fp8-dynamic -spec: - disabled: false - displayName: meta.llama-3.3-70b-instruct-fp8-dynamic - storage: - storageUri: hf://meta-llama/Llama-3.3-70B-Instruct - path: /raid/models/meta/llama-3-3-70b-instruct-fp8-dynamic - vendor: meta - version: "1.0.0" \ No newline at end of file From 7d035fd523ce50ea35aee3286f58473193a8f41a Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Fri, 27 Jun 2025 23:47:52 -0700 Subject: [PATCH 02/10] change llama 3.2 90B rt to bf16 version --- ...struct-fp8-rt.yaml => llama-3-2-90b-vision-instruct-rt.yaml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename config/runtimes/srt/{llama-3-2-90b-vision-instruct-fp8-rt.yaml => llama-3-2-90b-vision-instruct-rt.yaml} (98%) diff --git 
a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml similarity index 98% rename from config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml rename to config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml index 2ef9d4d4..5adecd0d 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-2-90b-vision-instruct-fp8 + name: srt-llama-3-2-90b-vision-instruct spec: disabled: false supportedModelFormats: From c883198f5c81cd629ef02d1fc1d72c8ce39f6cf5 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Fri, 27 Jun 2025 23:48:07 -0700 Subject: [PATCH 03/10] remove 11b 90b pd rt --- .../llama-3-2-11b-vision-instruct-pd-rt.yaml | 226 ------------------ ...ama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 217 ----------------- 2 files changed, 443 deletions(-) delete mode 100644 config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml delete mode 100644 config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml deleted file mode 100644 index ec8764a6..00000000 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml +++ /dev/null @@ -1,226 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterServingRuntime -metadata: - name: srt-llama-3-2-11b-vision-instruct-pd -spec: - disabled: false - supportedModelFormats: - - modelFramework: - name: transformers - version: "4.45.0.dev0" - modelFormat: - name: safetensors - version: "1" - modelArchitecture: MllamaForConditionalGeneration - autoSelect: false - priority: 1 - protocolVersions: - - openAI - modelSizeRange: - min: 10B - max: 12B - engineConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" 
- rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 1 \ - --mem-frac=0.9 \ - --chat-template llama_3_vision \ - --disaggregation-mode prefill \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 10 - memory: 30Gi - nvidia.com/gpu: 1 - limits: - cpu: 10 - memory: 30Gi - nvidia.com/gpu: 1 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - decoderConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory -# affinity: -# nodeAffinity: - 
requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 1 \ - --mem-frac=0.9 \ - --chat-template llama_3_vision \ - --disaggregation-mode decode \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 10 - memory: 30Gi - nvidia.com/gpu: 1 - limits: - cpu: 10 - memory: 30Gi - nvidia.com/gpu: 1 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - routerConfig: - runner: - name: router - image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 - resources: - limits: - cpu: "1" - memory: "2Gi" - ports: - - containerPort: 8080 - name: http - command: - - sh - - -c - - > - python3 -m sglang_router.launch_router - --host "0.0.0.0" - --port "8080" - --pd-disaggregation - --policy power_of_two - --service-discovery - --service-discovery-namespace "${NAMESPACE}" - --service-discovery-port 8080 - --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} - --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} - env: 
- - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: INFERENCESERVICE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml deleted file mode 100644 index 9052b5b0..00000000 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml +++ /dev/null @@ -1,217 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterServingRuntime -metadata: - name: srt-llama-3-2-90b-vision-instruct-fp8-pd -spec: - disabled: false - supportedModelFormats: - - modelFramework: - name: transformers - version: "4.46.0.dev0" - modelFormat: - name: safetensors - version: "1" - modelArchitecture: MllamaForConditionalGeneration - autoSelect: false - priority: 1 - protocolVersions: - - openAI - modelSizeRange: - min: 85B - max: 95B - engineConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 4 \ - --mem-frac=0.9 \ - --chat-template llama_3_vision \ - --disaggregation-mode prefill \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 
30 - memory: 100Gi - nvidia.com/gpu: 4 - limits: - cpu: 30 - memory: 100Gi - nvidia.com/gpu: 4 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - decoderConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 4 \ - --mem-frac=0.9 \ - --chat-template llama_3_vision \ - --disaggregation-mode decode \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 30 - memory: 100Gi - nvidia.com/gpu: 4 - limits: - cpu: 30 - memory: 100Gi - nvidia.com/gpu: 4 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 
60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - routerConfig: - runner: - name: router - image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 - resources: - limits: - cpu: "1" - memory: "2Gi" - ports: - - containerPort: 8080 - name: http - command: - - sh - - -c - - > - python3 -m sglang_router.launch_router - --host "0.0.0.0" - --port "8080" - --pd-disaggregation - --policy power_of_two - --service-discovery - --service-discovery-namespace "${NAMESPACE}" - --service-discovery-port 8080 - --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} - --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: INFERENCESERVICE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file From 7c91d5c09a05748a617246e514f69f65752cdca6 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Fri, 27 Jun 2025 23:48:42 -0700 Subject: [PATCH 04/10] add llama 3.3 pd isvc sample --- .../llama3-3-70b-instruct-fp8-dynamic.yaml | 18 ------------- .../isvc/meta/llama3-3-70b-instruct.yaml | 27 +++++++++++++++++++ 2 files changed, 27 insertions(+), 18 deletions(-) delete mode 100644 config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml create mode 100644 config/samples/isvc/meta/llama3-3-70b-instruct.yaml diff --git a/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml b/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml deleted file mode 100644 index c82d7da4..00000000 --- a/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: v1 -kind: Namespace -metadata: - name: llama-3-3-70b-instruct-fp8-dynamic ---- - -apiVersion: ome.io/v1beta1 -kind: InferenceService 
-metadata: - name: llama-3-3-70b-instruct-fp8-dynamic - namespace: llama-3-3-70b-instruct-fp8-dynamic -spec: - model: - name: llama-3-3-70b-instruct-fp8-dynamic - engine: - minReplicas: 1 - maxReplicas: 1 \ No newline at end of file diff --git a/config/samples/isvc/meta/llama3-3-70b-instruct.yaml b/config/samples/isvc/meta/llama3-3-70b-instruct.yaml new file mode 100644 index 00000000..4734ea75 --- /dev/null +++ b/config/samples/isvc/meta/llama3-3-70b-instruct.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: llama-3-3-70b-instruct +--- + +apiVersion: ome.io/v1beta1 +kind: InferenceService +metadata: + name: llama-3-3-70b-instruct + namespace: llama-3-3-70b-instruct +spec: + model: + name: llama-3-3-70b-instruct + engine: + minReplicas: 1 + maxReplicas: 1 + runtime: + name: srt-llama-3-3-70b-instruct-pd + decoder: + minReplicas: 1 + maxReplicas: 1 + # router: + # minReplicas: 1 + # maxReplicas: 1 + \ No newline at end of file From bbff36b828561e22a323cf3ecaebd4c07fca06dc Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Fri, 27 Jun 2025 23:49:08 -0700 Subject: [PATCH 05/10] sgl runtime fixes --- config/runtimes/srt/deepseek-rdma-pd-rt.yaml | 44 ++++++++++++++++--- .../srt/llama-3-1-70b-instruct-pd-rt.yaml | 2 +- .../srt/llama-3-2-1b-instruct-pd-rt.yaml | 2 +- .../srt/llama-3-2-3b-instruct-pd-rt.yaml | 2 +- .../srt/llama-3-3-70b-instruct-pd-rt.yaml | 2 +- .../llama-4-scout-17b-16e-instruct-pd-rt.yaml | 2 +- .../srt/mistral-7b-instruct-pd-rt.yaml | 2 +- .../srt/mixtral-8x7b-instruct-pd-rt.yaml | 2 +- 8 files changed, 46 insertions(+), 12 deletions(-) diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml index a5ec1663..8925c6af 100644 --- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml @@ -63,7 +63,7 @@ spec: MC_TE_METRIC=true; SGLANG_TBO_DEBUG=1; python3 -m sglang.launch_server - --port 30000 + --port 8080 --host 0.0.0.0 --model-path 
${MODEL_PATH} --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_3,mlx5_4 @@ -158,7 +158,7 @@ spec: --dist-init-addr $(LWS_LEADER_ADDRESS):5000 --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} - --port 30000 + --port 8080 --trust-remote-code --ep-num-redundant-experts 32 --moe-dense-tp-size 1 @@ -214,7 +214,7 @@ spec: - -c - > python3 -m sglang.launch_server - --port 30000 + --port 8080 --host 0.0.0.0 --chunked-prefill-size 262144 --page-size 64 @@ -302,7 +302,7 @@ spec: --dist-init-addr $(LWS_LEADER_ADDRESS):5000 --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} - --port 30000 + --port 8080 --decode-log-interval 1 --host 0.0.0.0 --trust-remote-code @@ -325,4 +325,38 @@ spec: - name: SGL_ENABLE_JIT_DEEPGEMM value: "1" - name: GLOO_SOCKET_IFNAME - value: eth0 \ No newline at end of file + value: eth0 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host 0.0.0.0 + --port 8080 + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml index 55f20a7e..a628eca1 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.42.3" modelFormat: 
name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: LlamaForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml index 21797d18..b2d20a9f 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.45.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: LlamaForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml index 0231121e..6fa60107 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.45.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: LlamaForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml index 3f02b061..2e78f6cf 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.47.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: LlamaForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml index 8c64685d..9def6d83 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.51.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: Llama4ForConditionalGeneration autoSelect: true priority: 2 diff --git 
a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml index 5869bd53..6685b99e 100644 --- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.36.2" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: MistralForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml index 8f75eed8..cb4a2110 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.36.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: MixtralForCausalLM autoSelect: false priority: 1 From 627a3fb875c916a0bf2cff6bbe84669d049523a6 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Fri, 27 Jun 2025 23:49:30 -0700 Subject: [PATCH 06/10] update sample isvc --- .../samples/isvc/deepseek-ai/deepseek-v3-pd.yaml | 14 +++++++++----- .../llama-4-maverick-17b-128e-instruct-fp8.yaml | 10 ++++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml b/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml index e8bfa4bf..0adeb33f 100644 --- a/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml +++ b/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml @@ -2,22 +2,26 @@ apiVersion: v1 kind: Namespace metadata: - name: deepseek-v3-pd + name: deepseek-v3 --- apiVersion: ome.io/v1beta1 kind: InferenceService metadata: name: deepseek-v3 - namespace: deepseek-v3-pd + namespace: deepseek-v3 spec: model: name: deepseek-v3 runtime: - name: srt-deepseek-pd-rdma + name: srt-deepseek-rdma-pd engine: minReplicas: 1 maxReplicas: 1 decoder: - minReplicas: 2 - maxReplicas: 2 \ No newline at end of file + minReplicas: 1 + maxReplicas: 1 + router: + 
minReplicas: 1 + maxReplicas: 1 + \ No newline at end of file diff --git a/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml b/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml index 9e45bec6..b221a8fa 100644 --- a/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml +++ b/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml @@ -16,5 +16,11 @@ spec: runtime: name: srt-llama-4-maverick-17b-128e-instruct-fp8-pd engine: - minReplicas: 3 - maxReplicas: 3 \ No newline at end of file + minReplicas: 1 + maxReplicas: 1 + decoder: + minReplicas: 1 + maxReplicas: 1 + router: + minReplicas: 1 + maxReplicas: 1 \ No newline at end of file From 46fe3098555c7cbd0387887e38ba4c4f8e6c0426 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Fri, 27 Jun 2025 23:50:54 -0700 Subject: [PATCH 07/10] fix inference service status propagation --- .../v1beta1/inferenceservice/controller.go | 21 ++++++++- .../status/status_reconciler.go | 45 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go index cd35b373..4f3dbbb6 100644 --- a/pkg/controller/v1beta1/inferenceservice/controller.go +++ b/pkg/controller/v1beta1/inferenceservice/controller.go @@ -431,12 +431,31 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req } } + // Propagate status for all components + var componentList []v1beta2.ComponentType if deploymentMode == constants.Serverless { - componentList := []v1beta2.ComponentType{v1beta2.EngineComponent} + // In Serverless mode, we only care about the engine component which is a Knative service. + componentList = []v1beta2.ComponentType{v1beta2.EngineComponent} + + // For serverless, we only have one component, and we need to propagate its route and deployment readiness. + // For other modes, these are handled by the component-specific reconcilers. 
r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.RoutesReady) r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.LatestDeploymentReady) + } else { + // For other modes (RawDeployment, etc.), we check all defined components. + if mergedEngine != nil { + componentList = append(componentList, v1beta2.EngineComponent) + } + if mergedDecoder != nil { + componentList = append(componentList, v1beta2.DecoderComponent) + } + if mergedRouter != nil { + componentList = append(componentList, v1beta2.RouterComponent) + } } + r.StatusManager.AggregateComponentReadyCondition(&isvc.Status, componentList) + if err = r.updateStatus(isvc, deploymentMode); err != nil { r.Recorder.Event(isvc, v1.EventTypeWarning, "InternalError", err.Error()) return reconcile.Result{}, err diff --git a/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go b/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go index 3290f77e..c4227ea1 100644 --- a/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go @@ -241,6 +241,51 @@ func (sr *StatusReconciler) SetModelFailureInfo(status *v1beta1.InferenceService } +// AggregateComponentReadyCondition creates the top-level Ready condition +// based on the readiness of all specified components. +func (sr *StatusReconciler) AggregateComponentReadyCondition( + status *v1beta1.InferenceServiceStatus, + componentList []v1beta1.ComponentType) { + + // If there are no components, the service is not ready. 
+ if len(componentList) == 0 { + status.SetCondition(apis.ConditionReady, &apis.Condition{ + Type: apis.ConditionReady, + Status: v1.ConditionFalse, + Reason: "NoComponents", + Message: "No components are defined for this InferenceService.", + }) + return + } + + readyCondition := &apis.Condition{ + Type: apis.ConditionReady, + Status: v1.ConditionTrue, + Reason: "AllComponentsReady", + Message: "All components are ready", + } + + readyConditionsMap := sr.getReadyConditionsMap() + + for _, component := range componentList { + componentReadyCondition := readyConditionsMap[component] + if !status.IsConditionReady(componentReadyCondition) { + readyCondition.Status = v1.ConditionFalse + readyCondition.Reason = string(component) + "NotReady" + // Get the actual condition to propagate the message + compCond := status.GetCondition(componentReadyCondition) + if compCond != nil { + readyCondition.Message = compCond.Message + } else { + readyCondition.Message = "Component " + string(component) + " is not ready" + } + break // one not ready is enough + } + } + + status.SetCondition(apis.ConditionReady, readyCondition) +} + // PropagateCrossComponentStatus aggregates conditions across components func (sr *StatusReconciler) PropagateCrossComponentStatus( status *v1beta1.InferenceServiceStatus, componentList []v1beta1.ComponentType, From 326f4b6e0ca8e8aaf84db8ba955edc50bebdcca7 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Sat, 28 Jun 2025 00:02:32 -0700 Subject: [PATCH 08/10] fix API version --- .../v1beta1/inferenceservice/controller.go | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go index 4f3dbbb6..d1d9e627 100644 --- a/pkg/controller/v1beta1/inferenceservice/controller.go +++ b/pkg/controller/v1beta1/inferenceservice/controller.go @@ -10,7 +10,7 @@ import ( duckv1 "knative.dev/pkg/apis/duck/v1" "knative.dev/pkg/network" - v1beta2 
"github.com/sgl-project/ome/pkg/apis/ome/v1beta1" + v1beta1 
"github.com/sgl-project/ome/pkg/apis/ome/v1beta1" autoscalingv2 "k8s.io/api/autoscaling/v2" lws "sigs.k8s.io/lws/api/leaderworkerset/v1" @@ -110,7 +110,7 @@ type InferenceServiceReconciler struct { func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // Fetch the InferenceService instance - isvc := &v1beta2.InferenceService{} + isvc := &v1beta1.InferenceService{} if err := r.Get(ctx, req.NamespacedName, isvc); err != nil { if apierrors.IsNotFound(err) { // Object not found, return. Created objects are automatically garbage collected. @@ -175,7 +175,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req // Initialize status if not already initialized if isvc.Status.Components == nil { - isvc.Status.Components = make(map[v1beta2.ComponentType]v1beta2.ComponentStatusSpec) + isvc.Status.Components = make(map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec) } // Setup reconcilers @@ -432,25 +432,25 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req } // Propagate status for all components - var componentList []v1beta2.ComponentType + var componentList []v1beta1.ComponentType if deploymentMode == constants.Serverless { // In Serverless mode, we only care about the engine component which is a Knative service. - componentList = []v1beta2.ComponentType{v1beta2.EngineComponent} + componentList = []v1beta1.ComponentType{v1beta1.EngineComponent} // For serverless, we only have one component, and we need to propagate its route and deployment readiness. // For other modes, these are handled by the component-specific reconcilers. 
- r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.RoutesReady) - r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.LatestDeploymentReady) + r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta1.RoutesReady) + r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta1.LatestDeploymentReady) } else { // For other modes (RawDeployment, etc.), we check all defined components. if mergedEngine != nil { - componentList = append(componentList, v1beta2.EngineComponent) + componentList = append(componentList, v1beta1.EngineComponent) } if mergedDecoder != nil { - componentList = append(componentList, v1beta2.DecoderComponent) + componentList = append(componentList, v1beta1.DecoderComponent) } if mergedRouter != nil { - componentList = append(componentList, v1beta2.RouterComponent) + componentList = append(componentList, v1beta1.RouterComponent) } } @@ -464,7 +464,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, nil } -func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta2.InferenceService) (ctrl.Result, error) { +func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta1.InferenceService) (ctrl.Result, error) { // We directly set URL and inference service status to Ready in VirtualDeployment mode // Set URL across all Status components @@ -478,8 +478,8 @@ func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta2.Infer } isvc.Status.URL = openAIURL isvc.Status.Address = addressURL - isvc.Status.Components = map[v1beta2.ComponentType]v1beta2.ComponentStatusSpec{ - v1beta2.PredictorComponent: { + isvc.Status.Components = map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec{ + v1beta1.PredictorComponent: { URL: openAIURL, }, } @@ -500,7 +500,7 @@ func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta2.Infer return ctrl.Result{}, 
nil } -func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta2.InferenceService) (ctrl.Result, error) { +func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1.InferenceService) (ctrl.Result, error) { // Abort early if the resolved deployment mode is Serverless, but Knative Services are not available ksvcAvailable, err := utils.IsCrdAvailable(r.ClientConfig, knservingv1.SchemeGroupVersion.String(), constants.KnativeServiceKind) if err != nil { @@ -517,8 +517,8 @@ func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta2 return ctrl.Result{}, nil } -func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta2.InferenceService, deploymentMode constants.DeploymentModeType) error { - existingService := &v1beta2.InferenceService{} +func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.InferenceService, deploymentMode constants.DeploymentModeType) error { + existingService := &v1beta1.InferenceService{} namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace} if err := r.Get(context.TODO(), namespacedName, existingService); err != nil { return err @@ -548,13 +548,13 @@ func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta2.Infere return nil } -func inferenceServiceReadiness(status v1beta2.InferenceServiceStatus) bool { +func inferenceServiceReadiness(status v1beta1.InferenceServiceStatus) bool { return status.Conditions != nil && status.GetCondition(knapis.ConditionReady) != nil && status.GetCondition(knapis.ConditionReady).Status == v1.ConditionTrue } -func inferenceServiceStatusEqual(s1, s2 v1beta2.InferenceServiceStatus) bool { +func inferenceServiceStatusEqual(s1, s2 v1beta1.InferenceServiceStatus) bool { return equality.Semantic.DeepEqual(s1, s2) } @@ -590,7 +590,7 @@ func (r *InferenceServiceReconciler) SetupWithManager(mgr ctrl.Manager, deployCo } ctrlBuilder := ctrl.NewControllerManagedBy(mgr). 
- For(&v1beta2.InferenceService{}). + For(&v1beta1.InferenceService{}). Owns(&appsv1.Deployment{}). Owns(&v1.Service{}). Owns(&v1.ConfigMap{}). @@ -631,7 +631,7 @@ func (r *InferenceServiceReconciler) SetupWithManager(mgr ctrl.Manager, deployCo return ctrlBuilder.Complete(r) } -func (r *InferenceServiceReconciler) setExternalServiceURL(ctx context.Context, isvc *v1beta2.InferenceService, ingressConfig *controllerconfig.IngressConfig) error { +func (r *InferenceServiceReconciler) setExternalServiceURL(ctx context.Context, isvc *v1beta1.InferenceService, ingressConfig *controllerconfig.IngressConfig) error { // Get the external service externalService := &v1.Service{} if err := r.Get(ctx, types.NamespacedName{Name: isvc.Name, Namespace: isvc.Namespace}, externalService); err != nil { @@ -659,23 +659,23 @@ type existingComponents struct { Router bool } -func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context, isvc *v1beta2.InferenceService) (existingComponents, error) { +func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context, isvc *v1beta1.InferenceService) (existingComponents, error) { existing := existingComponents{} // Check status for existing components - this is more reliable than querying deployments if isvc.Status.Components != nil { // Check if engine component exists in status - if _, hasEngine := isvc.Status.Components[v1beta2.EngineComponent]; hasEngine { + if _, hasEngine := isvc.Status.Components[v1beta1.EngineComponent]; hasEngine { existing.Engine = true } // Check if decoder component exists in status - if _, hasDecoder := isvc.Status.Components[v1beta2.DecoderComponent]; hasDecoder { + if _, hasDecoder := isvc.Status.Components[v1beta1.DecoderComponent]; hasDecoder { existing.Decoder = true } // Check if router component exists in status - if _, hasRouter := isvc.Status.Components[v1beta2.RouterComponent]; hasRouter { + if _, hasRouter := isvc.Status.Components[v1beta1.RouterComponent]; hasRouter { 
existing.Router = true } } @@ -684,6 +684,6 @@ func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context } // migratePredictorToNewArchitecture delegates to the migration utility -func (r *InferenceServiceReconciler) migratePredictorToNewArchitecture(isvc *v1beta2.InferenceService) error { +func (r *InferenceServiceReconciler) migratePredictorToNewArchitecture(isvc *v1beta1.InferenceService) error { return isvcutils.MigratePredictorToNewArchitecture(context.Background(), r.Client, r.Log, isvc) } From b4b80513c1232cd90b1a323f6c8a168b2f14f1c5 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Mon, 30 Jun 2025 15:24:02 -0700 Subject: [PATCH 09/10] Revert "fix API version" This reverts commit 326f4b6e0ca8e8aaf84db8ba955edc50bebdcca7. --- .../v1beta1/inferenceservice/controller.go | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go index d1d9e627..4f3dbbb6 100644 --- a/pkg/controller/v1beta1/inferenceservice/controller.go +++ b/pkg/controller/v1beta1/inferenceservice/controller.go @@ -10,7 +10,7 @@ import ( duckv1 "knative.dev/pkg/apis/duck/v1" "knative.dev/pkg/network" - v1beta1 "github.com/sgl-project/ome/pkg/apis/ome/v1beta1" + v1beta2 "github.com/sgl-project/ome/pkg/apis/ome/v1beta1" autoscalingv2 "k8s.io/api/autoscaling/v2" lws "sigs.k8s.io/lws/api/leaderworkerset/v1" @@ -110,7 +110,7 @@ type InferenceServiceReconciler struct { func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // Fetch the InferenceService instance - isvc := &v1beta1.InferenceService{} + isvc := &v1beta2.InferenceService{} if err := r.Get(ctx, req.NamespacedName, isvc); err != nil { if apierrors.IsNotFound(err) { // Object not found, return. Created objects are automatically garbage collected. 
@@ -175,7 +175,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req // Initialize status if not already initialized if isvc.Status.Components == nil { - isvc.Status.Components = make(map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec) + isvc.Status.Components = make(map[v1beta2.ComponentType]v1beta2.ComponentStatusSpec) } // Setup reconcilers @@ -432,25 +432,25 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req } // Propagate status for all components - var componentList []v1beta1.ComponentType + var componentList []v1beta2.ComponentType if deploymentMode == constants.Serverless { // In Serverless mode, we only care about the engine component which is a Knative service. - componentList = []v1beta1.ComponentType{v1beta1.EngineComponent} + componentList = []v1beta2.ComponentType{v1beta2.EngineComponent} // For serverless, we only have one component, and we need to propagate its route and deployment readiness. // For other modes, these are handled by the component-specific reconcilers. - r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta1.RoutesReady) - r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta1.LatestDeploymentReady) + r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.RoutesReady) + r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.LatestDeploymentReady) } else { // For other modes (RawDeployment, etc.), we check all defined components. 
if mergedEngine != nil { - componentList = append(componentList, v1beta1.EngineComponent) + componentList = append(componentList, v1beta2.EngineComponent) } if mergedDecoder != nil { - componentList = append(componentList, v1beta1.DecoderComponent) + componentList = append(componentList, v1beta2.DecoderComponent) } if mergedRouter != nil { - componentList = append(componentList, v1beta1.RouterComponent) + componentList = append(componentList, v1beta2.RouterComponent) } } @@ -464,7 +464,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, nil } -func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta1.InferenceService) (ctrl.Result, error) { +func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta2.InferenceService) (ctrl.Result, error) { // We directly set URL and inference service status to Ready in VirtualDeployment mode // Set URL across all Status components @@ -478,8 +478,8 @@ func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta1.Infer } isvc.Status.URL = openAIURL isvc.Status.Address = addressURL - isvc.Status.Components = map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec{ - v1beta1.PredictorComponent: { + isvc.Status.Components = map[v1beta2.ComponentType]v1beta2.ComponentStatusSpec{ + v1beta2.PredictorComponent: { URL: openAIURL, }, } @@ -500,7 +500,7 @@ func (r *InferenceServiceReconciler) handleVirtualDeployment(isvc *v1beta1.Infer return ctrl.Result{}, nil } -func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1.InferenceService) (ctrl.Result, error) { +func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta2.InferenceService) (ctrl.Result, error) { // Abort early if the resolved deployment mode is Serverless, but Knative Services are not available ksvcAvailable, err := utils.IsCrdAvailable(r.ClientConfig, knservingv1.SchemeGroupVersion.String(), constants.KnativeServiceKind) if err != nil 
{ @@ -517,8 +517,8 @@ func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1 return ctrl.Result{}, nil } -func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.InferenceService, deploymentMode constants.DeploymentModeType) error { - existingService := &v1beta1.InferenceService{} +func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta2.InferenceService, deploymentMode constants.DeploymentModeType) error { + existingService := &v1beta2.InferenceService{} namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace} if err := r.Get(context.TODO(), namespacedName, existingService); err != nil { return err @@ -548,13 +548,13 @@ func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.Infere return nil } -func inferenceServiceReadiness(status v1beta1.InferenceServiceStatus) bool { +func inferenceServiceReadiness(status v1beta2.InferenceServiceStatus) bool { return status.Conditions != nil && status.GetCondition(knapis.ConditionReady) != nil && status.GetCondition(knapis.ConditionReady).Status == v1.ConditionTrue } -func inferenceServiceStatusEqual(s1, s2 v1beta1.InferenceServiceStatus) bool { +func inferenceServiceStatusEqual(s1, s2 v1beta2.InferenceServiceStatus) bool { return equality.Semantic.DeepEqual(s1, s2) } @@ -590,7 +590,7 @@ func (r *InferenceServiceReconciler) SetupWithManager(mgr ctrl.Manager, deployCo } ctrlBuilder := ctrl.NewControllerManagedBy(mgr). - For(&v1beta1.InferenceService{}). + For(&v1beta2.InferenceService{}). Owns(&appsv1.Deployment{}). Owns(&v1.Service{}). Owns(&v1.ConfigMap{}). 
@@ -631,7 +631,7 @@ func (r *InferenceServiceReconciler) SetupWithManager(mgr ctrl.Manager, deployCo return ctrlBuilder.Complete(r) } -func (r *InferenceServiceReconciler) setExternalServiceURL(ctx context.Context, isvc *v1beta1.InferenceService, ingressConfig *controllerconfig.IngressConfig) error { +func (r *InferenceServiceReconciler) setExternalServiceURL(ctx context.Context, isvc *v1beta2.InferenceService, ingressConfig *controllerconfig.IngressConfig) error { // Get the external service externalService := &v1.Service{} if err := r.Get(ctx, types.NamespacedName{Name: isvc.Name, Namespace: isvc.Namespace}, externalService); err != nil { @@ -659,23 +659,23 @@ type existingComponents struct { Router bool } -func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context, isvc *v1beta1.InferenceService) (existingComponents, error) { +func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context, isvc *v1beta2.InferenceService) (existingComponents, error) { existing := existingComponents{} // Check status for existing components - this is more reliable than querying deployments if isvc.Status.Components != nil { // Check if engine component exists in status - if _, hasEngine := isvc.Status.Components[v1beta1.EngineComponent]; hasEngine { + if _, hasEngine := isvc.Status.Components[v1beta2.EngineComponent]; hasEngine { existing.Engine = true } // Check if decoder component exists in status - if _, hasDecoder := isvc.Status.Components[v1beta1.DecoderComponent]; hasDecoder { + if _, hasDecoder := isvc.Status.Components[v1beta2.DecoderComponent]; hasDecoder { existing.Decoder = true } // Check if router component exists in status - if _, hasRouter := isvc.Status.Components[v1beta1.RouterComponent]; hasRouter { + if _, hasRouter := isvc.Status.Components[v1beta2.RouterComponent]; hasRouter { existing.Router = true } } @@ -684,6 +684,6 @@ func (r *InferenceServiceReconciler) checkExistingComponents(ctx context.Context } // 
migratePredictorToNewArchitecture delegates to the migration utility -func (r *InferenceServiceReconciler) migratePredictorToNewArchitecture(isvc *v1beta1.InferenceService) error { +func (r *InferenceServiceReconciler) migratePredictorToNewArchitecture(isvc *v1beta2.InferenceService) error { return isvcutils.MigratePredictorToNewArchitecture(context.Background(), r.Client, r.Log, isvc) } From 34dcf58c4ed69efca74bd6f2a0f7c0fcb09223d7 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Mon, 30 Jun 2025 15:25:02 -0700 Subject: [PATCH 10/10] Revert "fix inference service status propagation" This reverts commit 46fe3098555c7cbd0387887e38ba4c4f8e6c0426. --- .../v1beta1/inferenceservice/controller.go | 21 +-------- .../status/status_reconciler.go | 45 ------------------- 2 files changed, 1 insertion(+), 65 deletions(-) diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go index 4f3dbbb6..cd35b373 100644 --- a/pkg/controller/v1beta1/inferenceservice/controller.go +++ b/pkg/controller/v1beta1/inferenceservice/controller.go @@ -431,31 +431,12 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req } } - // Propagate status for all components - var componentList []v1beta2.ComponentType if deploymentMode == constants.Serverless { - // In Serverless mode, we only care about the engine component which is a Knative service. - componentList = []v1beta2.ComponentType{v1beta2.EngineComponent} - - // For serverless, we only have one component, and we need to propagate its route and deployment readiness. - // For other modes, these are handled by the component-specific reconcilers. 
+ componentList := []v1beta2.ComponentType{v1beta2.EngineComponent} r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.RoutesReady) r.StatusManager.PropagateCrossComponentStatus(&isvc.Status, componentList, v1beta2.LatestDeploymentReady) - } else { - // For other modes (RawDeployment, etc.), we check all defined components. - if mergedEngine != nil { - componentList = append(componentList, v1beta2.EngineComponent) - } - if mergedDecoder != nil { - componentList = append(componentList, v1beta2.DecoderComponent) - } - if mergedRouter != nil { - componentList = append(componentList, v1beta2.RouterComponent) - } } - r.StatusManager.AggregateComponentReadyCondition(&isvc.Status, componentList) - if err = r.updateStatus(isvc, deploymentMode); err != nil { r.Recorder.Event(isvc, v1.EventTypeWarning, "InternalError", err.Error()) return reconcile.Result{}, err diff --git a/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go b/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go index c4227ea1..3290f77e 100644 --- a/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/status/status_reconciler.go @@ -241,51 +241,6 @@ func (sr *StatusReconciler) SetModelFailureInfo(status *v1beta1.InferenceService } // PropagateCrossComponentStatus aggregates conditions across components -// AggregateComponentReadyCondition creates the top-level Ready condition -// based on the readiness of all specified components. -func (sr *StatusReconciler) AggregateComponentReadyCondition( - status *v1beta1.InferenceServiceStatus, - componentList []v1beta1.ComponentType) { - - // If there are no components, the service is not ready. 
- if len(componentList) == 0 { - status.SetCondition(apis.ConditionReady, &apis.Condition{ - Type: apis.ConditionReady, - Status: v1.ConditionFalse, - Reason: "NoComponents", - Message: "No components are defined for this InferenceService.", - }) - return - } - - readyCondition := &apis.Condition{ - Type: apis.ConditionReady, - Status: v1.ConditionTrue, - Reason: "AllComponentsReady", - Message: "All components are ready", - } - - readyConditionsMap := sr.getReadyConditionsMap() - - for _, component := range componentList { - componentReadyCondition := readyConditionsMap[component] - if !status.IsConditionReady(componentReadyCondition) { - readyCondition.Status = v1.ConditionFalse - readyCondition.Reason = string(component) + "NotReady" - // Get the actual condition to propagate the message - compCond := status.GetCondition(componentReadyCondition) - if compCond != nil { - readyCondition.Message = compCond.Message - } else { - readyCondition.Message = "Component " + string(component) + " is not ready" - } - break // one not ready is enough - } - } - - status.SetCondition(apis.ConditionReady, readyCondition) -} - func (sr *StatusReconciler) PropagateCrossComponentStatus( status *v1beta1.InferenceServiceStatus, componentList []v1beta1.ComponentType,