diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml index f2eb26a4..92afd37e 100644 --- a/config/models/kustomization.yaml +++ b/config/models/kustomization.yaml @@ -3,7 +3,6 @@ kind: Kustomization resources: - meta/Llama-3.3-70B-instruct.yaml - - meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml - meta/llama-4-maverick-17b-128e-instruct-fp8.yaml - meta/llama-4-scout-17b-16e-instruct.yaml - intfloat/e5-mistral-7b-instruct.yaml diff --git a/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml b/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml deleted file mode 100644 index 4db9e60b..00000000 --- a/config/models/meta/Llama-3.3-70B-instruct-FP8-Dynamic.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterBaseModel -metadata: - name: llama-3-3-70b-instruct-fp8-dynamic -spec: - disabled: false - displayName: meta.llama-3.3-70b-instruct-fp8-dynamic - storage: - storageUri: hf://meta-llama/Llama-3.3-70B-Instruct - path: /raid/models/meta/llama-3-3-70b-instruct-fp8-dynamic - vendor: meta - version: "1.0.0" \ No newline at end of file diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml index a5ec1663..8925c6af 100644 --- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml @@ -63,7 +63,7 @@ spec: MC_TE_METRIC=true; SGLANG_TBO_DEBUG=1; python3 -m sglang.launch_server - --port 30000 + --port 8080 --host 0.0.0.0 --model-path ${MODEL_PATH} --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_3,mlx5_4 @@ -158,7 +158,7 @@ spec: --dist-init-addr $(LWS_LEADER_ADDRESS):5000 --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} - --port 30000 + --port 8080 --trust-remote-code --ep-num-redundant-experts 32 --moe-dense-tp-size 1 @@ -214,7 +214,7 @@ spec: - -c - > python3 -m sglang.launch_server - --port 30000 + --port 8080 --host 0.0.0.0 --chunked-prefill-size 262144 --page-size 64 @@ -302,7 +302,7 @@ spec: --dist-init-addr $(LWS_LEADER_ADDRESS):5000 --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} - --port 30000 + --port 8080 --decode-log-interval 1 --host 0.0.0.0 --trust-remote-code @@ -325,4 +325,38 @@ spec: - name: SGL_ENABLE_JIT_DEEPGEMM value: "1" - name: GLOO_SOCKET_IFNAME - value: eth0 \ No newline at end of file + value: eth0 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host 0.0.0.0 + --port 8080 + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml index 55f20a7e..a628eca1 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.42.3" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: LlamaForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml deleted file mode 100644 index ec8764a6..00000000 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml +++ /dev/null @@ -1,226 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterServingRuntime -metadata: - name: srt-llama-3-2-11b-vision-instruct-pd -spec: - disabled: false - supportedModelFormats: - - modelFramework: - name: transformers - version: "4.45.0.dev0" - modelFormat: - name: safetensors - version: "1" - modelArchitecture: MllamaForConditionalGeneration - autoSelect: false - priority: 1 - protocolVersions: - - openAI - modelSizeRange: - min: 10B - max: 12B - engineConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 1 \ - --mem-frac=0.9 \ - --chat-template llama_3_vision \ - --disaggregation-mode prefill \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 10 - memory: 30Gi - nvidia.com/gpu: 1 - limits: - cpu: 10 - memory: 30Gi - nvidia.com/gpu: 1 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - decoderConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory -# affinity: -# nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 1 \ - --mem-frac=0.9 \ - --chat-template llama_3_vision \ - --disaggregation-mode decode \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 10 - memory: 30Gi - nvidia.com/gpu: 1 - limits: - cpu: 10 - memory: 30Gi - nvidia.com/gpu: 1 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - routerConfig: - runner: - name: router - image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 - resources: - limits: - cpu: "1" - memory: "2Gi" - ports: - - containerPort: 8080 - name: http - command: - - sh - - -c - - > - python3 -m sglang_router.launch_router - --host "0.0.0.0" - --port "8080" - --pd-disaggregation - --policy power_of_two - --service-discovery - --service-discovery-namespace "${NAMESPACE}" - --service-discovery-port 8080 - --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} - --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: INFERENCESERVICE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml index 21797d18..b2d20a9f 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.45.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: LlamaForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml index 0231121e..6fa60107 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.45.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: LlamaForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml deleted file mode 100644 index 9052b5b0..00000000 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml +++ /dev/null @@ -1,217 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterServingRuntime -metadata: - name: srt-llama-3-2-90b-vision-instruct-fp8-pd -spec: - disabled: false - supportedModelFormats: - - modelFramework: - name: transformers - version: "4.46.0.dev0" - modelFormat: - name: safetensors - version: "1" - modelArchitecture: MllamaForConditionalGeneration - autoSelect: false - priority: 1 - protocolVersions: - - openAI - modelSizeRange: - min: 85B - max: 95B - engineConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 4 \ - --mem-frac=0.9 \ - --chat-template llama_3_vision \ - --disaggregation-mode prefill \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 30 - memory: 100Gi - nvidia.com/gpu: 4 - limits: - cpu: 30 - memory: 100Gi - nvidia.com/gpu: 4 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - decoderConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 4 \ - --mem-frac=0.9 \ - --chat-template llama_3_vision \ - --disaggregation-mode decode \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 30 - memory: 100Gi - nvidia.com/gpu: 4 - limits: - cpu: 30 - memory: 100Gi - nvidia.com/gpu: 4 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - routerConfig: - runner: - name: router - image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 - resources: - limits: - cpu: "1" - memory: "2Gi" - ports: - - containerPort: 8080 - name: http - command: - - sh - - -c - - > - python3 -m sglang_router.launch_router - --host "0.0.0.0" - --port "8080" - --pd-disaggregation - --policy power_of_two - --service-discovery - --service-discovery-namespace "${NAMESPACE}" - --service-discovery-port 8080 - --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} - --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: INFERENCESERVICE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml similarity index 98% rename from config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml rename to config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml index 2ef9d4d4..5adecd0d 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-2-90b-vision-instruct-fp8 + name: srt-llama-3-2-90b-vision-instruct spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml index 3f02b061..2e78f6cf 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.47.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: LlamaForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml index 8c64685d..9def6d83 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.51.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: Llama4ForConditionalGeneration autoSelect: true priority: 2 diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml index 5869bd53..6685b99e 100644 --- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.36.2" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: MistralForCausalLM autoSelect: false priority: 1 diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml index 8f75eed8..cb4a2110 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -10,7 +10,7 @@ spec: version: "4.36.0.dev0" modelFormat: name: safetensors - version: "1" + version: "1.0.0" modelArchitecture: MixtralForCausalLM autoSelect: false priority: 1 diff --git a/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml b/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml index e8bfa4bf..0adeb33f 100644 --- a/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml +++ b/config/samples/isvc/deepseek-ai/deepseek-v3-pd.yaml @@ -2,22 +2,26 @@ apiVersion: v1 kind: Namespace metadata: - name: deepseek-v3-pd + name: deepseek-v3 --- apiVersion: ome.io/v1beta1 kind: InferenceService metadata: name: deepseek-v3 - namespace: deepseek-v3-pd + namespace: deepseek-v3 spec: model: name: deepseek-v3 runtime: - name: srt-deepseek-pd-rdma + name: srt-deepseek-rdma-pd engine: minReplicas: 1 maxReplicas: 1 decoder: - minReplicas: 2 - maxReplicas: 2 \ No newline at end of file + minReplicas: 1 + maxReplicas: 1 + router: + minReplicas: 1 + maxReplicas: 1 + \ No newline at end of file diff --git a/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml b/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml index 9e45bec6..b221a8fa 100644 --- a/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml +++ b/config/samples/isvc/meta/llama-4-maverick-17b-128e-instruct-fp8.yaml @@ -16,5 +16,11 @@ spec: runtime: name: srt-llama-4-maverick-17b-128e-instruct-fp8-pd engine: - minReplicas: 3 - maxReplicas: 3 \ No newline at end of file + minReplicas: 1 + maxReplicas: 1 + decoder: + minReplicas: 1 + maxReplicas: 1 + router: + minReplicas: 1 + maxReplicas: 1 \ No newline at end of file diff --git a/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml b/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml deleted file mode 100644 index c82d7da4..00000000 --- a/config/samples/isvc/meta/llama3-3-70b-instruct-fp8-dynamic.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: v1 -kind: Namespace -metadata: - name: llama-3-3-70b-instruct-fp8-dynamic ---- - -apiVersion: ome.io/v1beta1 -kind: InferenceService -metadata: - name: llama-3-3-70b-instruct-fp8-dynamic - namespace: llama-3-3-70b-instruct-fp8-dynamic -spec: - model: - name: llama-3-3-70b-instruct-fp8-dynamic - engine: - minReplicas: 1 - maxReplicas: 1 \ No newline at end of file diff --git a/config/samples/isvc/meta/llama3-3-70b-instruct.yaml b/config/samples/isvc/meta/llama3-3-70b-instruct.yaml new file mode 100644 index 00000000..4734ea75 --- /dev/null +++ b/config/samples/isvc/meta/llama3-3-70b-instruct.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: llama-3-3-70b-instruct +--- + +apiVersion: ome.io/v1beta1 +kind: InferenceService +metadata: + name: llama-3-3-70b-instruct + namespace: llama-3-3-70b-instruct +spec: + model: + name: llama-3-3-70b-instruct + engine: + minReplicas: 1 + maxReplicas: 1 + runtime: + name: srt-llama-3-3-70b-instruct-pd + decoder: + minReplicas: 1 + maxReplicas: 1 + # router: + # minReplicas: 1 + # maxReplicas: 1 + \ No newline at end of file