diff --git a/charts/ome-serving/values.yaml b/charts/ome-serving/values.yaml index 7d2256b3..df8e713d 100644 --- a/charts/ome-serving/values.yaml +++ b/charts/ome-serving/values.yaml @@ -204,15 +204,6 @@ models: runtime: gpus: 1 - deepseek-rdma: - enabled: false - createRuntime: true - vendor: other - capabilities: [TEXT_TO_TEXT] - hfModelId: deepseek-rdma - runtime: - gpus: 1 - deepseek-v2-lite-chat: enabled: false createRuntime: true @@ -420,24 +411,6 @@ models: runtime: gpus: 4 - gpt-oss-120b-bf16: - enabled: false - createRuntime: true - vendor: lmsys - capabilities: [TEXT_TO_TEXT] - hfModelId: lmsys/gpt-oss-120b-bf16 - runtime: - gpus: 8 - - gpt-oss-120b-grpc: - enabled: false - createRuntime: true - vendor: openai - capabilities: [TEXT_TO_TEXT] - hfModelId: openai/gpt-oss-120b - runtime: - gpus: 2 - gpt-oss-20b: enabled: false createRuntime: true @@ -447,24 +420,6 @@ models: runtime: gpus: 2 - gpt-oss-20b-bf16: - enabled: false - createRuntime: true - vendor: lmsys - capabilities: [TEXT_TO_TEXT] - hfModelId: lmsys/gpt-oss-20b-bf16 - runtime: - gpus: 2 - - gpt-oss-20b-grpc: - enabled: false - createRuntime: true - vendor: openai - capabilities: [TEXT_TO_TEXT] - hfModelId: openai/gpt-oss-20b - runtime: - gpus: 1 - granite-3-0-3b-a800m-instruct: enabled: false createRuntime: true @@ -609,7 +564,7 @@ models: runtime: gpus: 4 - llama-2-13b: + llama-2-13b-hf: enabled: false createRuntime: true vendor: meta @@ -627,7 +582,7 @@ models: runtime: gpus: 1 - llama-2-70b: + llama-2-70b-hf: enabled: false createRuntime: true vendor: meta @@ -690,15 +645,6 @@ models: runtime: gpus: 1 - llama-3-1-8b-instruct-grpc: - enabled: false - createRuntime: true - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-3.1-8B-Instruct - runtime: - gpus: 1 - llama-3-1-nemotron-70b-instruct-hf: enabled: false createRuntime: true @@ -834,24 +780,6 @@ models: runtime: gpus: 8 - llama-4-maverick-17b-128e-instruct-fp8-grpc: - enabled: false - createRuntime: true - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - runtime: - gpus: 8 - - llama-4-maverick-17b-128e-instruct-fp8-pd-grpc: - enabled: false - createRuntime: true - vendor: meta - capabilities: [TEXT_TO_TEXT] - hfModelId: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - runtime: - gpus: 8 - llama-4-scout-17b-16e-instruct: enabled: false createRuntime: true @@ -951,15 +879,6 @@ models: runtime: gpus: 4 - mistral-7b-instruct: - enabled: false - createRuntime: true - vendor: mistral - capabilities: [TEXT_TO_TEXT] - hfModelId: mistralai/Mistral-7B-Instruct-v0.2 - runtime: - gpus: 2 - mistral-7b-instruct-v0-2: enabled: false createRuntime: true @@ -967,7 +886,7 @@ models: capabilities: [TEXT_TO_TEXT] hfModelId: mistralai/Mistral-7B-Instruct-v0.2 runtime: - gpus: 1 + gpus: 2 mistral-7b-instruct-v0-3: enabled: false @@ -996,7 +915,7 @@ models: runtime: gpus: 2 - mixtral-8x22b: + mixtral-8x22b-v0-1: enabled: false createRuntime: true vendor: mistral @@ -1005,7 +924,7 @@ models: runtime: gpus: 8 - mixtral-8x7b: + mixtral-8x7b-v0-1: enabled: false createRuntime: true vendor: mistral @@ -1014,7 +933,7 @@ models: runtime: gpus: 4 - mixtral-8x7b-instruct: + mixtral-8x7b-instruct-v0-1: enabled: false createRuntime: true vendor: mistral diff --git a/config/models/Qwen/Qwen1.5-110B-Chat.yaml b/config/models/Qwen/Qwen1.5-110B-Chat.yaml new file mode 100644 index 00000000..978a84ac --- /dev/null +++ b/config/models/Qwen/Qwen1.5-110B-Chat.yaml @@ -0,0 +1,23 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: qwen1-5-110b-chat +spec: + modelCapabilities: + - TEXT_TO_TEXT + vendor: Qwen + displayName: qwen.qwen1-5-110b-chat + disabled: false + version: "1.0.0" + modelArchitecture: Qwen2ForCausalLM + modelFormat: + name: safetensors + version: "1.0.0" + modelFramework: + name: transformers + version: "4.41.2" + modelParameterSize: 110B + storage: + storageUri: hf://Qwen/Qwen1.5-110B-Chat + path: /raid/models/Qwen/Qwen1.5-110B-Chat + key: "hf-token" diff --git a/config/models/Qwen/Qwen1.5-32B-Chat.yaml b/config/models/Qwen/Qwen1.5-32B-Chat.yaml new file mode 100644 index 00000000..403a1835 --- /dev/null +++ b/config/models/Qwen/Qwen1.5-32B-Chat.yaml @@ -0,0 +1,23 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: qwen1-5-32b-chat +spec: + modelCapabilities: + - TEXT_TO_TEXT + vendor: Qwen + displayName: qwen.qwen1-5-32b-chat + disabled: false + version: "1.0.0" + modelArchitecture: Qwen2ForCausalLM + modelFormat: + name: safetensors + version: "1.0.0" + modelFramework: + name: transformers + version: "4.41.2" + modelParameterSize: 32B + storage: + storageUri: hf://Qwen/Qwen1.5-32B-Chat + path: /raid/models/Qwen/Qwen1.5-32B-Chat + key: "hf-token" diff --git a/config/models/Qwen/Qwen1.5-72B-Chat.yaml b/config/models/Qwen/Qwen1.5-72B-Chat.yaml new file mode 100644 index 00000000..b6509b09 --- /dev/null +++ b/config/models/Qwen/Qwen1.5-72B-Chat.yaml @@ -0,0 +1,23 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: qwen1-5-72b-chat +spec: + modelCapabilities: + - TEXT_TO_TEXT + vendor: Qwen + displayName: qwen.qwen1-5-72b-chat + disabled: false + version: "1.0.0" + modelArchitecture: Qwen2ForCausalLM + modelFormat: + name: safetensors + version: "1.0.0" + modelFramework: + name: transformers + version: "4.41.2" + modelParameterSize: 72B + storage: + storageUri: hf://Qwen/Qwen1.5-72B-Chat + path: /raid/models/Qwen/Qwen1.5-72B-Chat + key: "hf-token" diff --git a/config/models/Qwen/Qwen1.5-7B-Chat.yaml b/config/models/Qwen/Qwen1.5-7B-Chat.yaml new file mode 100644 index 00000000..e59fd274 --- /dev/null +++ b/config/models/Qwen/Qwen1.5-7B-Chat.yaml @@ -0,0 +1,23 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: qwen1-5-7b-chat +spec: + modelCapabilities: + - TEXT_TO_TEXT + vendor: Qwen + displayName: qwen.qwen1-5-7b-chat + disabled: false + version: "1.0.0" + modelArchitecture: Qwen2ForCausalLM + modelFormat: + name: safetensors + version: "1.0.0" + modelFramework: + name: transformers + version: "4.41.2" + modelParameterSize: 7B + storage: + storageUri: hf://Qwen/Qwen1.5-7B-Chat + path: /raid/models/Qwen/Qwen1.5-7B-Chat + key: "hf-token" diff --git a/config/models/google/gemma-2-27b-it.yaml b/config/models/google/gemma-2-27b-it.yaml new file mode 100644 index 00000000..4aac08c9 --- /dev/null +++ b/config/models/google/gemma-2-27b-it.yaml @@ -0,0 +1,16 @@ +# This model is gated on HuggingFace +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: gemma-2-27b-it +spec: + modelCapabilities: + - TEXT_TO_TEXT + vendor: google + disabled: false + version: "1.0.0" + displayName: google.gemma-2-27b-it + storage: + storageUri: hf://google/gemma-2-27b-it + path: /raid/models/google/gemma-2-27b-it + key: "hf-token"