ome-projects · slin1237 · Dec 13, 2025 · Dec 12, 2025 · Dec 13, 2025
@@ -5,9 +5,10 @@ Deploy ClusterBaseModels, ClusterServingRuntimes, and InferenceServices for LLM
 ## Features
 
 - **Auto-detection**: Model architecture, transformers version, size range, and served name are automatically detected from the model name
-- **176 Models**: Built-in registry supports Qwen, Llama, DeepSeek, Mistral, Gemma, Phi, and more
+- **165 Models**: Built-in registry supports Qwen, Llama, DeepSeek, Mistral, Gemma, Phi, and more
 - **Simplified Storage**: Use `hfModelId` for HuggingFace or `oci` for OCI Object Storage
 - **Scope Options**: Create ClusterBaseModel (cluster-wide) or BaseModel (namespace-scoped)
+- **PD Mode**: Support for Prefill-Decode disaggregated serving with `pdMode: true`
 
 ## Installation
 
@@ -202,9 +203,37 @@ models:
       gpus: 2
 ```
 
+### PD Mode (Prefill-Decode Disaggregated)
+
+For models that support disaggregated serving, enable PD mode to deploy separate prefill (`engine`) and decode (`decoder`) components together with a `router`:
+
+```yaml
+models:
+  kimi-k2-instruct:
+    enabled: true
+    pdMode: true           # Enable PD mode
+    vendor: moonshot
+    capabilities: [TEXT_TO_TEXT]
+    hfModelId: moonshotai/Kimi-K2-Instruct
+    runtime:
+      gpus: 8
+    # Optional: customize replicas for each component
+    engine:
+      minReplicas: 1
+      maxReplicas: 2
+    decoder:
+      minReplicas: 1
+      maxReplicas: 2
+    router:
+      minReplicas: 1
+      maxReplicas: 1
+```
+
+Models that support PD mode: `kimi-k2-instruct`, `deepseek-rdma`, `llama-3-1-70b-instruct`, `llama-3-2-1b-instruct`, `llama-3-2-3b-instruct`, `llama-3-3-70b-instruct`, `llama-4-maverick-17b-128e-instruct-fp8`, `llama-4-scout-17b-16e-instruct`, `mistral-7b-instruct`, `mixtral-8x7b-instruct`
+
 ## Supported Models
 
-The chart includes a built-in registry of **176 models**. Model names in values.yaml must match registry entries exactly.
+The chart includes a built-in registry of **165 models**. Model names in values.yaml must match registry entries exactly.
 
 ### Qwen3
 `qwen3-0-6b`, `qwen3-32b`, `qwen3-4b`, `qwen3-8b`, `qwen3-embedding-0-6b`, `qwen3-embedding-4b`, `qwen3-next-80b-a3b-instruct`
@@ -222,19 +251,19 @@ The chart includes a built-in registry of **176 models**. Model names in values.
 `deepseek-r1-distill-qwen-1-5b`, `deepseek-r1-distill-qwen-14b`, `deepseek-r1-distill-qwen-32b`, `deepseek-r1-distill-qwen-7b`, `gte-qwen2-7b-instruct`, `qwen-7b-chat`, `qwen1-5-110b-chat`, `qwen1-5-32b-chat`, `qwen1-5-72b-chat`, `qwen1-5-7b-chat`, `qwen2-5-1-5b`, `qwen2-5-14b`, `qwen2-5-32b-instruct`, `qwen2-5-3b`, `qwen2-5-72b-instruct`, `qwen2-5-7b`, `qwen2-5-coder-32b-instruct`, `qwen2-5-coder-7b-instruct`, `qwen2-72b-instruct`, `qwen2-7b-instruct`, `skywork-or1-7b-preview`
 
 ### Meta Llama 4
-`llama-4-maverick-17b-128e-instruct`, `llama-4-maverick-17b-128e-instruct-fp8`, `llama-4-maverick-17b-128e-instruct-fp8-grpc`, `llama-4-maverick-17b-128e-instruct-fp8-pd`, `llama-4-maverick-17b-128e-instruct-fp8-pd-grpc`, `llama-4-scout-17b-16e-instruct`, `llama-4-scout-17b-16e-instruct-pd`
+`llama-4-maverick-17b-128e-instruct`, `llama-4-maverick-17b-128e-instruct-fp8`, `llama-4-maverick-17b-128e-instruct-fp8-grpc`, `llama-4-scout-17b-16e-instruct`
 
 ### Meta Llama Vision
 `llama-3-2-11b-vision-instruct`, `llama-3-2-90b-vision-instruct`, `llama-3-2-90b-vision-instruct-fp8`
 
 ### Meta Llama
-`deepseek-coder-7b-instruct-v1-5`, `deepseek-llm-7b-chat`, `deepseek-r1-distill-llama-70b`, `deepseek-r1-distill-llama-8b`, `falcon3-10b-instruct`, `hermes-2-pro-llama-3-8b`, `llama-2-13b`, `llama-2-13b-chat-hf`, `llama-2-70b`, `llama-2-70b-chat-hf`, `llama-2-7b`, `llama-2-7b-chat-hf`, `llama-3-1-405b-instruct-fp8`, `llama-3-1-70b-instruct`, `llama-3-1-70b-instruct-pd`, `llama-3-1-8b-instruct`, `llama-3-1-8b-instruct-grpc`, `llama-3-1-nemotron-70b-instruct-hf`, `llama-3-1-nemotron-nano-8b-v1`, `llama-3-1-nemotron-ultra-253b-v1`, `llama-3-2-1b-instruct`, `llama-3-2-1b-instruct-pd`, `llama-3-2-3b-instruct`, `llama-3-2-3b-instruct-pd`, `llama-3-3-70b-instruct`, `llama-3-3-70b-instruct-fp8-dynamic`, `llama-3-3-70b-instruct-pd`, `llama-3-70b-instruct`, `llama-3-8b-instruct`, `llama-guard-3-8b`, `smollm-1-7b`, `smollm2-1-7b-instruct`, `solar-10-7b-instruct-v1-0`, `vicuna-13b-v1-5`, `vicuna-7b-v1-5`
+`deepseek-coder-7b-instruct-v1-5`, `deepseek-llm-7b-chat`, `deepseek-r1-distill-llama-70b`, `deepseek-r1-distill-llama-8b`, `falcon3-10b-instruct`, `hermes-2-pro-llama-3-8b`, `llama-2-13b`, `llama-2-13b-chat-hf`, `llama-2-70b`, `llama-2-70b-chat-hf`, `llama-2-7b`, `llama-2-7b-chat-hf`, `llama-3-1-405b-instruct-fp8`, `llama-3-1-70b-instruct`, `llama-3-1-8b-instruct`, `llama-3-1-8b-instruct-grpc`, `llama-3-1-nemotron-70b-instruct-hf`, `llama-3-1-nemotron-nano-8b-v1`, `llama-3-1-nemotron-ultra-253b-v1`, `llama-3-2-1b-instruct`, `llama-3-2-3b-instruct`, `llama-3-3-70b-instruct`, `llama-3-3-70b-instruct-fp8-dynamic`, `llama-3-70b-instruct`, `llama-3-8b-instruct`, `llama-guard-3-8b`, `smollm-1-7b`, `smollm2-1-7b-instruct`, `solar-10-7b-instruct-v1-0`, `vicuna-13b-v1-5`, `vicuna-7b-v1-5`
 
 ### LLaVA
 `llava-next-72b`, `llava-onevision-qwen2-7b-ov`, `llava-v1-5-13b`, `nvila-8b`
 
 ### DeepSeek V3
-`deepseek-rdma`, `deepseek-rdma-pd`, `deepseek-v3`, `deepseek-v3-0324`, `kimi-k2-instruct`, `kimi-k2-pd`
+`deepseek-rdma`, `deepseek-v3`, `deepseek-v3-0324`, `kimi-k2-instruct`
 
 ### DeepSeek V2
 `deepseek-v2-lite-chat`
@@ -249,10 +278,10 @@ The chart includes a built-in registry of **176 models**. Model names in values.
 `mistral-small-3-1-24b-instruct-2503`
 
 ### Mistral (Mixtral)
-`mixtral-8x22b`, `mixtral-8x7b`, `mixtral-8x7b-instruct`, `mixtral-8x7b-instruct-pd`
+`mixtral-8x22b`, `mixtral-8x7b`, `mixtral-8x7b-instruct`
 
 ### Mistral
-`e5-7b-mistral-instruct`, `e5-mistral-7b-instruct`, `mistral-7b-instruct`, `mistral-7b-instruct-pd`, `mistral-7b-instruct-v0-2`, `mistral-7b-instruct-v0-3`, `mistral-nemo-instruct-2407`
+`e5-mistral-7b-instruct`, `mistral-7b-instruct`, `mistral-7b-instruct-v0-2`, `mistral-7b-instruct-v0-3`, `mistral-nemo-instruct-2407`
 
 ### Google Gemma 3
 `gemma-3-12b-it`, `gemma-3-1b-it`, `gemma-3-4b-it`

@@ -297,13 +297,6 @@ llama-4-maverick-17b-128e-instruct-fp8-grpc:
   priority: 2
   sizeRange: ["400B", "402B"]
   servedName: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-llama-4-maverick-17b-128e-instruct-fp8-pd:
-  architecture: Llama4ForConditionalGeneration
-  transformersVersion: "4.51.0.dev0"
-  autoSelect: false
-  priority: 2
-  sizeRange: ["400B", "402B"]
-  servedName: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
 llama-4-maverick-17b-128e-instruct-fp8-pd-grpc:
   architecture: Llama4ForConditionalGeneration
   transformersVersion: "4.51.0"
@@ -318,13 +311,6 @@ llama-4-scout-17b-16e-instruct:
   priority: 2
   sizeRange: ["100B", "109B"]
   servedName: meta-llama/Llama-4-Scout-17B-16E-Instruct
-llama-4-scout-17b-16e-instruct-pd:
-  architecture: Llama4ForConditionalGeneration
-  transformersVersion: "4.51.0"
-  autoSelect: true
-  priority: 2
-  sizeRange: ["100B", "109B"]
-  servedName: meta-llama/Llama-4-Scout-17B-16E-Instruct
 
 # Llama models
 deepseek-coder-7b-instruct-v1-5:
@@ -425,13 +411,6 @@ llama-3-1-70b-instruct:
   priority: 1
   sizeRange: ["60B", "75B"]
   servedName: meta-llama/Meta-Llama-3.1-70B-Instruct
-llama-3-1-70b-instruct-pd:
-  architecture: LlamaForCausalLM
-  transformersVersion: "4.43.0"
-  autoSelect: false
-  priority: 1
-  sizeRange: ["60B", "75B"]
-  servedName: meta-llama/Llama-3.1-70B-Instruct
 llama-3-1-8b-instruct:
   architecture: LlamaForCausalLM
   transformersVersion: "4.42.3"
@@ -481,27 +460,13 @@ llama-3-2-1b-instruct:
   priority: 1
   sizeRange: ["500M", "2B"]
   servedName: meta-llama/Llama-3.2-1B-Instruct
-llama-3-2-1b-instruct-pd:
-  architecture: LlamaForCausalLM
-  transformersVersion: "4.43.0"
-  autoSelect: false
-  priority: 1
-  sizeRange: ["500M", "2B"]
-  servedName: meta-llama/Llama-3.2-1B-Instruct
 llama-3-2-3b-instruct:
   architecture: LlamaForCausalLM
   transformersVersion: "4.45.0.dev0"
   autoSelect: false
   priority: 1
   sizeRange: ["2B", "4B"]
   servedName: meta-llama/Llama-3.2-3B-Instruct
-llama-3-2-3b-instruct-pd:
-  architecture: LlamaForCausalLM
-  transformersVersion: "4.43.0"
-  autoSelect: false
-  priority: 1
-  sizeRange: ["2B", "4B"]
-  servedName: meta-llama/Llama-3.2-3B-Instruct
 llama-3-2-90b-vision-instruct:
   architecture: MllamaForConditionalGeneration
   transformersVersion: "4.45.0"
@@ -530,13 +495,6 @@ llama-3-3-70b-instruct-fp8-dynamic:
   priority: 1
   sizeRange: ["60B", "75B"]
   servedName: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
-llama-3-3-70b-instruct-pd:
-  architecture: LlamaForCausalLM
-  transformersVersion: "4.45.0"
-  autoSelect: false
-  priority: 1
-  sizeRange: ["60B", "75B"]
-  servedName: meta-llama/Llama-3.3-70B-Instruct
 llama-3-70b-instruct:
   architecture: LlamaForCausalLM
   transformersVersion: "4.40.0.dev0"
@@ -616,13 +574,6 @@ deepseek-rdma:
   priority: 1
   sizeRange: ["650B", "700B"]
   servedName: deepseek-rdma
-deepseek-rdma-pd:
-  architecture: DeepseekV3ForCausalLM
-  transformersVersion: "4.33.1"
-  autoSelect: false
-  priority: 1
-  sizeRange: ["650B", "700B"]
-  servedName: deepseek-rdma-pd
 deepseek-v2-lite-chat:
   architecture: DeepseekV2ForCausalLM
   transformersVersion: "4.33.1"
@@ -658,13 +609,6 @@ kimi-k2-instruct:
   priority: 1
   sizeRange: ["900B", "1100B"]
   servedName: moonshotai/Kimi-K2-Instruct
-kimi-k2-pd:
-  architecture: DeepseekV3ForCausalLM
-  transformersVersion: "4.48.3"
-  autoSelect: true
-  priority: 1
-  sizeRange: ["1T", "1.5T"]
-  servedName: kimi-k2-pd
 
 # Mistral models
 e5-mistral-7b-instruct:
@@ -681,13 +625,6 @@ mistral-7b-instruct:
   priority: 1
   sizeRange: ["5B", "9B"]
   servedName: mistralai/Mistral-7B-Instruct-v0.2
-mistral-7b-instruct-pd:
-  architecture: MistralForCausalLM
-  transformersVersion: "4.36.0"
-  autoSelect: false
-  priority: 1
-  sizeRange: ["5B", "9B"]
-  servedName: mistralai/Mistral-7B-Instruct-v0.2
 mistral-7b-instruct-v0-2:
   architecture: MistralForCausalLM
   transformersVersion: "4.36.0"
@@ -737,13 +674,6 @@ mixtral-8x7b-instruct:
   priority: 1
   sizeRange: ["40B", "50B"]
   servedName: mistralai/Mixtral-8x7B-Instruct-v0.1
-mixtral-8x7b-instruct-pd:
-  architecture: MixtralForCausalLM
-  transformersVersion: "4.36.0.dev0"
-  autoSelect: false
-  priority: 1
-  sizeRange: ["45B", "50B"]
-  servedName: mistralai/Mixtral-8x7B-Instruct-v0.1
 
 # Gemma models
 gemma-2-27b-it:

@@ -4,6 +4,21 @@
 {{- if $model.namespaceScope }}
 {{-   $namespace = $model.namespace | default $modelName }}
 {{- end }}
+{{/*
+  PD mode: explicitly set via pdMode field
+  PD mode requires both decoder and router
+  Non-PD mode can optionally include router (but not decoder)
+*/}}
+{{- $isPdMode := $model.pdMode }}
+{{/*
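+  Replica values: engine accepts nested (engine.minReplicas) or flat (minReplicas) keys; decoder and router accept nested keys only. Each falls back to .Values.defaults; note coalesce skips empty values, so an explicit 0 also falls back.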
+*/}}
+{{- $engineMinReplicas := coalesce (dig "engine" "minReplicas" nil $model) $model.minReplicas $.Values.defaults.minReplicas }}
+{{- $engineMaxReplicas := coalesce (dig "engine" "maxReplicas" nil $model) $model.maxReplicas $.Values.defaults.maxReplicas }}
+{{- $decoderMinReplicas := coalesce (dig "decoder" "minReplicas" nil $model) $.Values.defaults.minReplicas }}
+{{- $decoderMaxReplicas := coalesce (dig "decoder" "maxReplicas" nil $model) $.Values.defaults.maxReplicas }}
+{{- $routerMinReplicas := coalesce (dig "router" "minReplicas" nil $model) $.Values.defaults.minReplicas }}
+{{- $routerMaxReplicas := coalesce (dig "router" "maxReplicas" nil $model) $.Values.defaults.maxReplicas }}
 ---
 apiVersion: v1
 kind: Namespace
@@ -22,10 +37,23 @@ metadata:
 spec:
   model:
     name: {{ $modelName }}
-  runtime:
-    name: srt-{{ $modelName }}
   engine:
-    minReplicas: {{ $model.minReplicas | default $.Values.defaults.minReplicas }}
-    maxReplicas: {{ $model.maxReplicas | default $.Values.defaults.maxReplicas }}
+    minReplicas: {{ $engineMinReplicas }}
+    maxReplicas: {{ $engineMaxReplicas }}
+{{- if $isPdMode }}
+  decoder:
+    minReplicas: {{ $decoderMinReplicas }}
+    maxReplicas: {{ $decoderMaxReplicas }}
+  router:
+    minReplicas: {{ $routerMinReplicas }}
+    maxReplicas: {{ $routerMaxReplicas }}
+{{- else }}
+{{/* Non-PD mode: router is emitted only when configured; decoder settings are ignored */}}
+{{- if $model.router }}
+  router:
+    minReplicas: {{ $routerMinReplicas }}
+    maxReplicas: {{ $routerMaxReplicas }}
+{{- end }}
+{{- end }}
 {{- end }}
 {{- end }}