diff --git a/charts/ome-serving/README.md b/charts/ome-serving/README.md index c19bfd73..b3b8cfae 100644 --- a/charts/ome-serving/README.md +++ b/charts/ome-serving/README.md @@ -5,9 +5,10 @@ Deploy ClusterBaseModels, ClusterServingRuntimes, and InferenceServices for LLM ## Features - **Auto-detection**: Model architecture, transformers version, size range, and served name are automatically detected from the model name -- **176 Models**: Built-in registry supports Qwen, Llama, DeepSeek, Mistral, Gemma, Phi, and more +- **165 Models**: Built-in registry supports Qwen, Llama, DeepSeek, Mistral, Gemma, Phi, and more - **Simplified Storage**: Use `hfModelId` for HuggingFace or `oci` for OCI Object Storage - **Scope Options**: Create ClusterBaseModel (cluster-wide) or BaseModel (namespace-scoped) +- **PD Mode**: Support for Prefill-Decode disaggregated serving with `pdMode: true` ## Installation @@ -202,9 +203,37 @@ models: gpus: 2 ``` +### PD Mode (Prefill-Decode Disaggregated) + +For models that support disaggregated serving, enable PD mode to deploy with separate prefill (engine) and decode (decoder) components: + +```yaml +models: + kimi-k2-instruct: + enabled: true + pdMode: true # Enable PD mode + vendor: moonshot + capabilities: [TEXT_TO_TEXT] + hfModelId: moonshotai/Kimi-K2-Instruct + runtime: + gpus: 8 + # Optional: customize replicas for each component + engine: + minReplicas: 1 + maxReplicas: 2 + decoder: + minReplicas: 1 + maxReplicas: 2 + router: + minReplicas: 1 + maxReplicas: 1 +``` + +Models that support PD mode: `kimi-k2-instruct`, `deepseek-rdma`, `llama-3-1-70b-instruct`, `llama-3-2-1b-instruct`, `llama-3-2-3b-instruct`, `llama-3-3-70b-instruct`, `llama-4-maverick-17b-128e-instruct-fp8`, `llama-4-scout-17b-16e-instruct`, `mistral-7b-instruct`, `mixtral-8x7b-instruct` + ## Supported Models -The chart includes a built-in registry of **176 models**. Model names in values.yaml must match registry entries exactly. 
+The chart includes a built-in registry of **165 models**. Model names in values.yaml must match registry entries exactly. ### Qwen3 `qwen3-0-6b`, `qwen3-32b`, `qwen3-4b`, `qwen3-8b`, `qwen3-embedding-0-6b`, `qwen3-embedding-4b`, `qwen3-next-80b-a3b-instruct` @@ -222,19 +251,19 @@ The chart includes a built-in registry of **176 models**. Model names in values. `deepseek-r1-distill-qwen-1-5b`, `deepseek-r1-distill-qwen-14b`, `deepseek-r1-distill-qwen-32b`, `deepseek-r1-distill-qwen-7b`, `gte-qwen2-7b-instruct`, `qwen-7b-chat`, `qwen1-5-110b-chat`, `qwen1-5-32b-chat`, `qwen1-5-72b-chat`, `qwen1-5-7b-chat`, `qwen2-5-1-5b`, `qwen2-5-14b`, `qwen2-5-32b-instruct`, `qwen2-5-3b`, `qwen2-5-72b-instruct`, `qwen2-5-7b`, `qwen2-5-coder-32b-instruct`, `qwen2-5-coder-7b-instruct`, `qwen2-72b-instruct`, `qwen2-7b-instruct`, `skywork-or1-7b-preview` ### Meta Llama 4 -`llama-4-maverick-17b-128e-instruct`, `llama-4-maverick-17b-128e-instruct-fp8`, `llama-4-maverick-17b-128e-instruct-fp8-grpc`, `llama-4-maverick-17b-128e-instruct-fp8-pd`, `llama-4-maverick-17b-128e-instruct-fp8-pd-grpc`, `llama-4-scout-17b-16e-instruct`, `llama-4-scout-17b-16e-instruct-pd` +`llama-4-maverick-17b-128e-instruct`, `llama-4-maverick-17b-128e-instruct-fp8`, `llama-4-maverick-17b-128e-instruct-fp8-grpc`, `llama-4-maverick-17b-128e-instruct-fp8-pd-grpc`, `llama-4-scout-17b-16e-instruct` ### Meta Llama Vision `llama-3-2-11b-vision-instruct`, `llama-3-2-90b-vision-instruct`, `llama-3-2-90b-vision-instruct-fp8` ### Meta Llama -`deepseek-coder-7b-instruct-v1-5`, `deepseek-llm-7b-chat`, `deepseek-r1-distill-llama-70b`, `deepseek-r1-distill-llama-8b`, `falcon3-10b-instruct`, `hermes-2-pro-llama-3-8b`, `llama-2-13b`, `llama-2-13b-chat-hf`, `llama-2-70b`, `llama-2-70b-chat-hf`, `llama-2-7b`, `llama-2-7b-chat-hf`, `llama-3-1-405b-instruct-fp8`, `llama-3-1-70b-instruct`, `llama-3-1-70b-instruct-pd`, `llama-3-1-8b-instruct`, `llama-3-1-8b-instruct-grpc`, `llama-3-1-nemotron-70b-instruct-hf`, `llama-3-1-nemotron-nano-8b-v1`, `llama-3-1-nemotron-ultra-253b-v1`, 
`llama-3-2-1b-instruct`, `llama-3-2-1b-instruct-pd`, `llama-3-2-3b-instruct`, `llama-3-2-3b-instruct-pd`, `llama-3-3-70b-instruct`, `llama-3-3-70b-instruct-fp8-dynamic`, `llama-3-3-70b-instruct-pd`, `llama-3-70b-instruct`, `llama-3-8b-instruct`, `llama-guard-3-8b`, `smollm-1-7b`, `smollm2-1-7b-instruct`, `solar-10-7b-instruct-v1-0`, `vicuna-13b-v1-5`, `vicuna-7b-v1-5` +`deepseek-coder-7b-instruct-v1-5`, `deepseek-llm-7b-chat`, `deepseek-r1-distill-llama-70b`, `deepseek-r1-distill-llama-8b`, `falcon3-10b-instruct`, `hermes-2-pro-llama-3-8b`, `llama-2-13b`, `llama-2-13b-chat-hf`, `llama-2-70b`, `llama-2-70b-chat-hf`, `llama-2-7b`, `llama-2-7b-chat-hf`, `llama-3-1-405b-instruct-fp8`, `llama-3-1-70b-instruct`, `llama-3-1-8b-instruct`, `llama-3-1-8b-instruct-grpc`, `llama-3-1-nemotron-70b-instruct-hf`, `llama-3-1-nemotron-nano-8b-v1`, `llama-3-1-nemotron-ultra-253b-v1`, `llama-3-2-1b-instruct`, `llama-3-2-3b-instruct`, `llama-3-3-70b-instruct`, `llama-3-3-70b-instruct-fp8-dynamic`, `llama-3-70b-instruct`, `llama-3-8b-instruct`, `llama-guard-3-8b`, `smollm-1-7b`, `smollm2-1-7b-instruct`, `solar-10-7b-instruct-v1-0`, `vicuna-13b-v1-5`, `vicuna-7b-v1-5` ### LLaVA `llava-next-72b`, `llava-onevision-qwen2-7b-ov`, `llava-v1-5-13b`, `nvila-8b` ### DeepSeek V3 -`deepseek-rdma`, `deepseek-rdma-pd`, `deepseek-v3`, `deepseek-v3-0324`, `kimi-k2-instruct`, `kimi-k2-pd` +`deepseek-rdma`, `deepseek-v3`, `deepseek-v3-0324`, `kimi-k2-instruct` ### DeepSeek V2 `deepseek-v2-lite-chat` @@ -249,10 +278,10 @@ The chart includes a built-in registry of **176 models**. Model names in values. 
`mistral-small-3-1-24b-instruct-2503` ### Mistral (Mixtral) -`mixtral-8x22b`, `mixtral-8x7b`, `mixtral-8x7b-instruct`, `mixtral-8x7b-instruct-pd` +`mixtral-8x22b`, `mixtral-8x7b`, `mixtral-8x7b-instruct` ### Mistral -`e5-7b-mistral-instruct`, `e5-mistral-7b-instruct`, `mistral-7b-instruct`, `mistral-7b-instruct-pd`, `mistral-7b-instruct-v0-2`, `mistral-7b-instruct-v0-3`, `mistral-nemo-instruct-2407` +`e5-mistral-7b-instruct`, `mistral-7b-instruct`, `mistral-7b-instruct-v0-2`, `mistral-7b-instruct-v0-3`, `mistral-nemo-instruct-2407` ### Google Gemma 3 `gemma-3-12b-it`, `gemma-3-1b-it`, `gemma-3-4b-it` diff --git a/charts/ome-serving/templates/_helpers.tpl b/charts/ome-serving/templates/_helpers.tpl index d8b0fa91..e59313b6 100644 --- a/charts/ome-serving/templates/_helpers.tpl +++ b/charts/ome-serving/templates/_helpers.tpl @@ -297,13 +297,6 @@ llama-4-maverick-17b-128e-instruct-fp8-grpc: priority: 2 sizeRange: ["400B", "402B"] servedName: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -llama-4-maverick-17b-128e-instruct-fp8-pd: - architecture: Llama4ForConditionalGeneration - transformersVersion: "4.51.0.dev0" - autoSelect: false - priority: 2 - sizeRange: ["400B", "402B"] - servedName: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 llama-4-maverick-17b-128e-instruct-fp8-pd-grpc: architecture: Llama4ForConditionalGeneration transformersVersion: "4.51.0" @@ -318,13 +311,6 @@ llama-4-scout-17b-16e-instruct: priority: 2 sizeRange: ["100B", "109B"] servedName: meta-llama/Llama-4-Scout-17B-16E-Instruct -llama-4-scout-17b-16e-instruct-pd: - architecture: Llama4ForConditionalGeneration - transformersVersion: "4.51.0" - autoSelect: true - priority: 2 - sizeRange: ["100B", "109B"] - servedName: meta-llama/Llama-4-Scout-17B-16E-Instruct # Llama models deepseek-coder-7b-instruct-v1-5: @@ -425,13 +411,6 @@ llama-3-1-70b-instruct: priority: 1 sizeRange: ["60B", "75B"] servedName: meta-llama/Meta-Llama-3.1-70B-Instruct -llama-3-1-70b-instruct-pd: - architecture: 
LlamaForCausalLM - transformersVersion: "4.43.0" - autoSelect: false - priority: 1 - sizeRange: ["60B", "75B"] - servedName: meta-llama/Llama-3.1-70B-Instruct llama-3-1-8b-instruct: architecture: LlamaForCausalLM transformersVersion: "4.42.3" @@ -481,13 +460,6 @@ llama-3-2-1b-instruct: priority: 1 sizeRange: ["500M", "2B"] servedName: meta-llama/Llama-3.2-1B-Instruct -llama-3-2-1b-instruct-pd: - architecture: LlamaForCausalLM - transformersVersion: "4.43.0" - autoSelect: false - priority: 1 - sizeRange: ["500M", "2B"] - servedName: meta-llama/Llama-3.2-1B-Instruct llama-3-2-3b-instruct: architecture: LlamaForCausalLM transformersVersion: "4.45.0.dev0" @@ -495,13 +467,6 @@ llama-3-2-3b-instruct: priority: 1 sizeRange: ["2B", "4B"] servedName: meta-llama/Llama-3.2-3B-Instruct -llama-3-2-3b-instruct-pd: - architecture: LlamaForCausalLM - transformersVersion: "4.43.0" - autoSelect: false - priority: 1 - sizeRange: ["2B", "4B"] - servedName: meta-llama/Llama-3.2-3B-Instruct llama-3-2-90b-vision-instruct: architecture: MllamaForConditionalGeneration transformersVersion: "4.45.0" @@ -530,13 +495,6 @@ llama-3-3-70b-instruct-fp8-dynamic: priority: 1 sizeRange: ["60B", "75B"] servedName: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic -llama-3-3-70b-instruct-pd: - architecture: LlamaForCausalLM - transformersVersion: "4.45.0" - autoSelect: false - priority: 1 - sizeRange: ["60B", "75B"] - servedName: meta-llama/Llama-3.3-70B-Instruct llama-3-70b-instruct: architecture: LlamaForCausalLM transformersVersion: "4.40.0.dev0" @@ -616,13 +574,6 @@ deepseek-rdma: priority: 1 sizeRange: ["650B", "700B"] servedName: deepseek-rdma -deepseek-rdma-pd: - architecture: DeepseekV3ForCausalLM - transformersVersion: "4.33.1" - autoSelect: false - priority: 1 - sizeRange: ["650B", "700B"] - servedName: deepseek-rdma-pd deepseek-v2-lite-chat: architecture: DeepseekV2ForCausalLM transformersVersion: "4.33.1" @@ -658,13 +609,6 @@ kimi-k2-instruct: priority: 1 sizeRange: ["900B", "1100B"] servedName: 
moonshotai/Kimi-K2-Instruct -kimi-k2-pd: - architecture: DeepseekV3ForCausalLM - transformersVersion: "4.48.3" - autoSelect: true - priority: 1 - sizeRange: ["1T", "1.5T"] - servedName: kimi-k2-pd # Mistral models e5-mistral-7b-instruct: @@ -681,13 +625,6 @@ mistral-7b-instruct: priority: 1 sizeRange: ["5B", "9B"] servedName: mistralai/Mistral-7B-Instruct-v0.2 -mistral-7b-instruct-pd: - architecture: MistralForCausalLM - transformersVersion: "4.36.0" - autoSelect: false - priority: 1 - sizeRange: ["5B", "9B"] - servedName: mistralai/Mistral-7B-Instruct-v0.2 mistral-7b-instruct-v0-2: architecture: MistralForCausalLM transformersVersion: "4.36.0" @@ -737,13 +674,6 @@ mixtral-8x7b-instruct: priority: 1 sizeRange: ["40B", "50B"] servedName: mistralai/Mixtral-8x7B-Instruct-v0.1 -mixtral-8x7b-instruct-pd: - architecture: MixtralForCausalLM - transformersVersion: "4.36.0.dev0" - autoSelect: false - priority: 1 - sizeRange: ["45B", "50B"] - servedName: mistralai/Mixtral-8x7B-Instruct-v0.1 # Gemma models gemma-2-27b-it: diff --git a/charts/ome-serving/templates/inferenceservice.yaml b/charts/ome-serving/templates/inferenceservice.yaml index a69ee192..58f48ffc 100644 --- a/charts/ome-serving/templates/inferenceservice.yaml +++ b/charts/ome-serving/templates/inferenceservice.yaml @@ -4,6 +4,21 @@ {{- if $model.namespaceScope }} {{- $namespace = $model.namespace | default $modelName }} {{- end }} +{{/* + PD mode is enabled explicitly through the model's pdMode field + In PD mode the decoder and router sections are always rendered + Outside PD mode the router is rendered only when configured; decoder settings are ignored +*/}} +{{- $isPdMode := $model.pdMode }} +{{/* + Replica values: engine accepts nested (engine.minReplicas) or flat (minReplicas) keys; decoder and router accept nested keys only. coalesce skips empty values, so an explicit 0 falls back to the chart default +*/}} +{{- $engineMinReplicas := coalesce (dig "engine" "minReplicas" nil $model) $model.minReplicas $.Values.defaults.minReplicas }} +{{- $engineMaxReplicas := coalesce (dig "engine" "maxReplicas" nil $model) $model.maxReplicas $.Values.defaults.maxReplicas }} +{{- 
$decoderMinReplicas := coalesce (dig "decoder" "minReplicas" nil $model) $.Values.defaults.minReplicas }} +{{- $decoderMaxReplicas := coalesce (dig "decoder" "maxReplicas" nil $model) $.Values.defaults.maxReplicas }} +{{- $routerMinReplicas := coalesce (dig "router" "minReplicas" nil $model) $.Values.defaults.minReplicas }} +{{- $routerMaxReplicas := coalesce (dig "router" "maxReplicas" nil $model) $.Values.defaults.maxReplicas }} --- apiVersion: v1 kind: Namespace @@ -22,10 +37,23 @@ metadata: spec: model: name: {{ $modelName }} - runtime: - name: srt-{{ $modelName }} engine: - minReplicas: {{ $model.minReplicas | default $.Values.defaults.minReplicas }} - maxReplicas: {{ $model.maxReplicas | default $.Values.defaults.maxReplicas }} + minReplicas: {{ $engineMinReplicas }} + maxReplicas: {{ $engineMaxReplicas }} +{{- if $isPdMode }} + decoder: + minReplicas: {{ $decoderMinReplicas }} + maxReplicas: {{ $decoderMaxReplicas }} + router: + minReplicas: {{ $routerMinReplicas }} + maxReplicas: {{ $routerMaxReplicas }} +{{- else }} +{{/* Non-PD mode: router is rendered only when configured; decoder settings are ignored */}} +{{- if $model.router }} + router: + minReplicas: {{ $routerMinReplicas }} + maxReplicas: {{ $routerMaxReplicas }} +{{- end }} +{{- end }} {{- end }} {{- end }} diff --git a/charts/ome-serving/values.yaml b/charts/ome-serving/values.yaml index 138cc329..2bd09a3e 100644 --- a/charts/ome-serving/values.yaml +++ b/charts/ome-serving/values.yaml @@ -190,14 +190,6 @@ models: runtime: gpus: 1 - deepseek-rdma-pd: - enabled: false - vendor: other - capabilities: [TEXT_TO_TEXT] - hfModelId: deepseek-rdma-pd - runtime: - gpus: 1 - deepseek-v2-lite-chat: enabled: false vendor: deepseek @@ -526,14 +518,6 @@ models: runtime: gpus: 8 - kimi-k2-pd: - enabled: false - vendor: other - capabilities: [TEXT_TO_TEXT] - hfModelId: kimi-k2-pd - runtime: - gpus: 1 - kimi-vl-a3b-instruct: enabled: false vendor: moonshot @@ -622,14 +606,6 @@ models: runtime: gpus: 4 - llama-3-1-70b-instruct-pd: - 
enabled: false - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-3.1-70B-Instruct - runtime: - gpus: 4 - llama-3-1-8b-instruct: enabled: false vendor: meta @@ -686,14 +662,6 @@ models: runtime: gpus: 1 - llama-3-2-1b-instruct-pd: - enabled: false - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-3.2-1B-Instruct - runtime: - gpus: 1 - llama-3-2-3b-instruct: enabled: false vendor: meta @@ -702,14 +670,6 @@ models: runtime: gpus: 1 - llama-3-2-3b-instruct-pd: - enabled: false - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-3.2-3B-Instruct - runtime: - gpus: 1 - llama-3-2-90b-vision-instruct: enabled: false vendor: meta @@ -742,14 +702,6 @@ models: runtime: gpus: 2 - llama-3-3-70b-instruct-pd: - enabled: false - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-3.3-70B-Instruct - runtime: - gpus: 4 - llama-3-3-nemotron-super-49b-v1: enabled: false vendor: meta @@ -798,14 +750,6 @@ models: runtime: gpus: 8 - llama-4-maverick-17b-128e-instruct-fp8-pd: - enabled: false - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - runtime: - gpus: 8 - llama-4-maverick-17b-128e-instruct-fp8-pd-grpc: enabled: false vendor: meta @@ -822,14 +766,6 @@ models: runtime: gpus: 4 - llama-4-scout-17b-16e-instruct-pd: - enabled: false - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-4-Scout-17B-16E-Instruct - runtime: - gpus: 4 - llama-guard-3-8b: enabled: false vendor: meta @@ -918,14 +854,6 @@ models: runtime: gpus: 2 - mistral-7b-instruct-pd: - enabled: false - vendor: mistral - capabilities: [TEXT_TO_TEXT] - hfModelId: mistralai/Mistral-7B-Instruct-v0.2 - runtime: - gpus: 2 - mistral-7b-instruct-v0-2: enabled: false vendor: mistral @@ -982,14 +910,6 @@ models: runtime: gpus: 4 - mixtral-8x7b-instruct-pd: - enabled: false - vendor: mistral - capabilities: [TEXT_TO_TEXT] - hfModelId: 
mistralai/Mixtral-8x7B-Instruct-v0.1 - runtime: - gpus: 2 - mpt-7b: enabled: false vendor: other