From e31e494cae7efa0b421c94184abcf0437473bd00 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 13:49:23 -0700 Subject: [PATCH 01/13] clean up deps --- go.mod | 4 ---- go.sum | 4 ---- 2 files changed, 8 deletions(-) diff --git a/go.mod b/go.mod index 62595c07..7fe6a39a 100644 --- a/go.mod +++ b/go.mod @@ -63,7 +63,6 @@ require ( github.com/NYTimes/gziphandler v1.1.1 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/antonmedv/expr v1.15.3 // indirect - github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/blendle/zapdriver v1.3.1 // indirect @@ -101,7 +100,6 @@ require ( github.com/google/cel-go v0.23.2 // indirect github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-containerregistry v0.16.1 // indirect - github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect @@ -109,12 +107,10 @@ require ( github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/hashicorp/hcl v1.0.0 // indirect - github.com/imdario/mergo v0.3.16 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kelseyhightower/envconfig v1.4.0 // indirect - github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/cpuid/v2 v2.2.7 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/leodido/go-urn v1.4.0 // indirect diff --git a/go.sum b/go.sum index 6906429c..cab24b93 100644 --- a/go.sum +++ b/go.sum @@ -58,8 +58,6 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 
v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/antonmedv/expr v1.15.3 h1:q3hOJZNvLvhqE8OHBs1cFRdbXFNKuA+bHmRaI+AmRmI= github.com/antonmedv/expr v1.15.3/go.mod h1:0E/6TxnOlRNp81GMzX9QfDPAmHo2Phg00y4JUv1ihsE= -github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= -github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -287,8 +285,6 @@ github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= -github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jarcoal/httpmock v1.2.0 h1:gSvTxxFR/MEMfsGrvRbdfpRUMBStovlSRLw0Ep1bwwc= From f1d13d51992aa0d3ed2da9d17447676ec39db5f1 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 14:17:57 -0700 Subject: [PATCH 02/13] update router image --- config/runtimes/srt/deepseek-rdma-rt.yaml | 2 +- config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml | 2 +- config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml | 2 +- 
config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml | 2 +- config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml | 2 +- .../runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml | 2 +- config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml index 99b21eb3..33e4eb3f 100644 --- a/config/runtimes/srt/deepseek-rdma-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-rt.yaml @@ -35,7 +35,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml index 14d6b5a3..d661af78 100644 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml @@ -108,7 +108,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml index e3d7616e..42c6a0cb 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml @@ -110,7 +110,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml index 
a484dc2e..448e1980 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml @@ -108,7 +108,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml index ce55124c..71a957b7 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml @@ -110,7 +110,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml index 6d1d6842..3bc1b8b8 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml @@ -110,7 +110,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml index 4a1d0e7b..8686bac5 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml @@ -108,7 +108,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml index d7096f4f..ebcad544 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml +++ 
b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml @@ -110,7 +110,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index 70e1d0f2..bc66768a 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -109,7 +109,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml index 07be4bad..d9a86c63 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml @@ -110,7 +110,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" From 3ea784c75bf29a9f67031cc295f9fa7018d897da Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 14:33:05 -0700 Subject: [PATCH 03/13] add router config to sglang runtimes --- .../runtimes/srt/mistral-7b-instruct-rt.yaml | 33 ++++++++++++++++++- .../srt/mixtral-8x7b-instruct-rt.yaml | 33 ++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml index 72262542..53ff611b 100644 --- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml @@ -106,4 +106,35 @@ spec: successThreshold: 1 periodSeconds: 6 initialDelaySeconds: 60 - timeoutSeconds: 30 \ 
No newline at end of file + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml index 0a18d556..441de166 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml @@ -106,4 +106,35 @@ spec: successThreshold: 1 periodSeconds: 6 initialDelaySeconds: 60 - timeoutSeconds: 30 \ No newline at end of file + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file From 
d8e123a932712493f6c9f702031a2ad6f9ee76de Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 15:05:10 -0700 Subject: [PATCH 04/13] add pd runtimes --- .../llama-3-1-405b-instruct-fp8-pd-rt.yaml | 141 +++++++++++++++++ .../srt/llama-3-1-70b-instruct-pd-rt.yaml | 143 ++++++++++++++++++ .../llama-3-2-11b-vision-instruct-pd-rt.yaml | 141 +++++++++++++++++ .../srt/llama-3-2-1b-instruct-pd-rt.yaml | 143 ++++++++++++++++++ .../srt/llama-3-2-3b-instruct-pd-rt.yaml | 143 ++++++++++++++++++ ...ama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 141 +++++++++++++++++ .../srt/llama-3-3-70b-instruct-pd-rt.yaml | 143 ++++++++++++++++++ ...-maverick-17b-128e-instruct-fp8-pd-rt.yaml | 2 +- .../llama-4-scout-17b-16e-instruct-pd-rt.yaml | 143 ++++++++++++++++++ .../srt/mistral-7b-instruct-pd-rt.yaml | 143 ++++++++++++++++++ .../srt/mixtral-8x7b-instruct-pd-rt.yaml | 143 ++++++++++++++++++ 11 files changed, 1425 insertions(+), 1 deletion(-) create mode 100644 config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml create mode 100644 config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml create mode 100644 config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml create mode 100644 config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml create mode 100644 config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml create mode 100644 config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml create mode 100644 config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml create mode 100644 config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml create mode 100644 config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml create mode 100644 config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml new file mode 100644 index 00000000..6517ba8f --- /dev/null +++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml @@ -0,0 +1,141 @@ +apiVersion: 
ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-1-405b-instruct-fp8 +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.43.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 400B + max: 410B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 8 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 128 + memory: 216Gi + nvidia.com/gpu: 8 + limits: + cpu: 128 + memory: 216Gi + nvidia.com/gpu: 8 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + + routerConfig: + runner: + name: router + image: 
ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml new file mode 100644 index 00000000..dbf58ea5 --- /dev/null +++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml @@ -0,0 +1,143 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-1-70b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.42.3" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 75B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + 
runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=4 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml 
b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml new file mode 100644 index 00000000..14a8e52e --- /dev/null +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml @@ -0,0 +1,141 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-2-11b-vision-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.45.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 10B + max: 12B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 1 \ + --mem-frac=0.9 \ + --chat-template llama_3_vision + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 
60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml new file mode 100644 index 00000000..05432454 --- /dev/null +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -0,0 +1,143 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-2-1b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.45.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 500M + max: 2B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + 
emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 1 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: 
metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml new file mode 100644 index 00000000..e7903afd --- /dev/null +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -0,0 +1,143 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-2-3b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.45.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 2B + max: 4B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 1 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + 
failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml new file mode 100644 index 00000000..fde24e3e --- /dev/null +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml @@ -0,0 +1,141 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-2-90b-vision-instruct-fp8 +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.46.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 85B + max: 95B + 
engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 4 \ + --mem-frac=0.9 \ + --chat-template llama_3_vision + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + 
--service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml new file mode 100644 index 00000000..ce194b7a --- /dev/null +++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml @@ -0,0 +1,143 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-3-70b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.47.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 75B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + 
--tp-size=4 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml index b62ae96c..e57b1323 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml @@ -210,7 +210,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev13 + image: 
ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml new file mode 100644 index 00000000..757336a2 --- /dev/null +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -0,0 +1,143 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-4-scout-17b-16e-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.51.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: Llama4ForConditionalGeneration + autoSelect: true + priority: 2 + protocolVersions: + - openAI + modelSizeRange: + min: 100B + max: 109B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --model-path="$MODEL_PATH" \ + --tp 4 \ + --mem-frac=0.95 \ + --context-length=128000 \ + --chat-template llama-4 \ + --attention-backend fa3 \ + --log-requests + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + limits: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + + readinessProbe: + httpGet: + path: /health_generate + port: 
8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml new file mode 100644 index 00000000..69cab117 --- /dev/null +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -0,0 +1,143 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-mistral-7b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.36.2" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: MistralForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 5B + max: 9B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + 
prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model="$MODEL_PATH"\ + --tp 2 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + 
--decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml new file mode 100644 index 00000000..6dcec754 --- /dev/null +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -0,0 +1,143 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-mmixtral-8x7b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.36.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: MixtralForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 45B + max: 50B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model="$MODEL_PATH"\ + --tp 2 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi 
+ nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file From fe28d78a6bd4797ae28a3de3b14cd53b9f359fd8 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 15:52:22 -0700 Subject: [PATCH 05/13] add decoderconfig to pd runtimes --- .../llama-3-1-405b-instruct-fp8-pd-rt.yaml | 102 ++++++++++++++++- .../srt/llama-3-1-70b-instruct-pd-rt.yaml | 104 +++++++++++++++++- .../llama-3-2-11b-vision-instruct-pd-rt.yaml | 102 ++++++++++++++++- .../srt/llama-3-2-1b-instruct-pd-rt.yaml | 104 +++++++++++++++++- .../srt/llama-3-2-3b-instruct-pd-rt.yaml | 104 +++++++++++++++++- ...ama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 102 ++++++++++++++++- 
.../srt/llama-3-3-70b-instruct-pd-rt.yaml | 104 +++++++++++++++++- ...-maverick-17b-128e-instruct-fp8-pd-rt.yaml | 6 - ...a-4-maverick-17b-128e-instruct-fp8-rt.yaml | 3 - .../llama-4-scout-17b-16e-instruct-pd-rt.yaml | 104 +++++++++++++++++- .../srt/mistral-7b-instruct-pd-rt.yaml | 104 +++++++++++++++++- .../srt/mixtral-8x7b-instruct-pd-rt.yaml | 104 +++++++++++++++++- 12 files changed, 993 insertions(+), 50 deletions(-) diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml index 6517ba8f..f131b54c 100644 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 410B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -43,6 +46,8 @@ spec: operator: In values: - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -63,7 +68,98 @@ spec: --log-requests \ --model-path="$MODEL_PATH" \ --tp 8 \ - --mem-frac=0.9 + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 128 + memory: 216Gi + nvidia.com/gpu: 8 + limits: + cpu: 128 + memory: 216Gi + nvidia.com/gpu: 8 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + 
timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 8 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -76,7 +172,6 @@ spec: cpu: 128 memory: 216Gi nvidia.com/gpu: 8 - readinessProbe: httpGet: path: /health_generate @@ -85,7 +180,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -94,7 +188,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate @@ -104,7 +197,6 @@ spec: periodSeconds: 6 initialDelaySeconds: 60 timeoutSeconds: 30 - routerConfig: runner: name: router diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml index dbf58ea5..2906b2f2 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 75B engineConfig: 
annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -46,6 +49,8 @@ spec: - BM.GPU4.8 - BM.GPU.A100-v2.8 - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -66,7 +71,101 @@ spec: --log-requests \ --model-path="$MODEL_PATH" \ --tp-size=4 \ - --mem-frac=0.9 + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + 
name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=4 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -79,7 +178,6 @@ spec: cpu: 10 memory: 160Gi nvidia.com/gpu: 4 - readinessProbe: httpGet: path: /health_generate @@ -88,7 +186,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -97,7 +194,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml index 14a8e52e..5c7ac9da 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 12B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -43,6 +46,8 @@ spec: operator: In values: - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -64,7 +69,99 @@ spec: --model-path="$MODEL_PATH" \ --tp 1 \ --mem-frac=0.9 \ - --chat-template llama_3_vision + --chat-template llama_3_vision \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + 
readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 1 \ + --mem-frac=0.9 \ + --chat-template llama_3_vision \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -77,7 +174,6 @@ spec: cpu: 10 memory: 30Gi nvidia.com/gpu: 1 - readinessProbe: httpGet: path: /health_generate @@ -86,7 +182,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -95,7 +190,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: 
httpGet: path: /health_generate diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml index 05432454..055d7a2a 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 2B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -46,6 +49,8 @@ spec: - BM.GPU4.8 - BM.GPU.A100-v2.8 - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -66,7 +71,101 @@ spec: --log-requests \ --model-path="$MODEL_PATH" \ --tp 1 \ - --mem-frac=0.9 + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + 
medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 1 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -79,7 +178,6 @@ spec: cpu: 10 memory: 30Gi nvidia.com/gpu: 1 - readinessProbe: httpGet: path: /health_generate @@ -88,7 +186,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -97,7 +194,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml index e7903afd..b019ec2e 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 4B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -46,6 +49,8 @@ spec: - BM.GPU4.8 - BM.GPU.A100-v2.8 - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -66,7 +71,101 @@ spec: --log-requests \ --model-path="$MODEL_PATH" \ --tp 1 \ - 
--mem-frac=0.9 + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 1 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -79,7 +178,6 @@ spec: cpu: 10 
memory: 30Gi nvidia.com/gpu: 1 - readinessProbe: httpGet: path: /health_generate @@ -88,7 +186,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -97,7 +194,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml index fde24e3e..49b54544 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 95B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -43,6 +46,8 @@ spec: operator: In values: - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -64,7 +69,99 @@ spec: --model-path="$MODEL_PATH" \ --tp 4 \ --mem-frac=0.9 \ - --chat-template llama_3_vision + --chat-template llama_3_vision \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + 
annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp 4 \ + --mem-frac=0.9 \ + --chat-template llama_3_vision \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -77,7 +174,6 @@ spec: cpu: 30 memory: 100Gi nvidia.com/gpu: 4 - readinessProbe: httpGet: path: /health_generate @@ -86,7 +182,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -95,7 +190,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml index ce194b7a..191181d2 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 75B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" 
prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -46,6 +49,8 @@ spec: - BM.GPU4.8 - BM.GPU.A100-v2.8 - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -66,7 +71,101 @@ spec: --log-requests \ --model-path="$MODEL_PATH" \ --tp-size=4 \ - --mem-frac=0.9 + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + 
command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=4 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -79,7 +178,6 @@ spec: cpu: 10 memory: 160Gi nvidia.com/gpu: 4 - readinessProbe: httpGet: path: /health_generate @@ -88,7 +186,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -97,7 +194,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml index e57b1323..976ec907 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml @@ -85,7 +85,6 @@ spec: cpu: 128 memory: 512Gi nvidia.com/gpu: 8 - readinessProbe: httpGet: path: /health_generate @@ -94,7 +93,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -103,7 +101,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate @@ -179,7 +176,6 @@ spec: cpu: 128 memory: 512Gi nvidia.com/gpu: 8 - readinessProbe: httpGet: path: /health_generate @@ -188,7 +184,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -197,7 +192,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index bc66768a..8c750599 100644 --- 
a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -78,7 +78,6 @@ spec: cpu: 128 memory: 512Gi nvidia.com/gpu: 8 - readinessProbe: httpGet: path: /health_generate @@ -87,7 +86,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -96,7 +94,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml index 757336a2..19510eea 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 109B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -43,6 +46,8 @@ spec: operator: In values: - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -66,7 +71,101 @@ spec: --context-length=128000 \ --chat-template llama-4 \ --attention-backend fa3 \ - --log-requests + --log-requests \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + limits: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate 
+ port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --model-path="$MODEL_PATH" \ + --tp 4 \ + --mem-frac=0.95 \ + --context-length=128000 \ + --chat-template llama-4 \ + --attention-backend fa3 \ + --log-requests \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -79,7 +178,6 @@ spec: cpu: 64 memory: 256Gi nvidia.com/gpu: 4 - readinessProbe: httpGet: path: /health_generate @@ -88,7 +186,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -97,7 +194,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml index 69cab117..c594751b 100644 --- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml +++ 
b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 9B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -46,6 +49,8 @@ spec: - BM.GPU4.8 - BM.GPU.A100-v2.8 - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -66,7 +71,101 @@ spec: --log-requests \ --model="$MODEL_PATH"\ --tp 2 \ - --mem-frac=0.9 + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + 
dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model="$MODEL_PATH"\ + --tp 2 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -79,7 +178,6 @@ spec: cpu: 10 memory: 30Gi nvidia.com/gpu: 2 - readinessProbe: httpGet: path: /health @@ -88,7 +186,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -97,7 +194,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml index 6dcec754..9b8da644 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -21,6 +21,9 @@ spec: max: 50B engineConfig: annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" @@ -46,6 +49,8 @@ spec: - BM.GPU4.8 - BM.GPU.A100-v2.8 - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true runner: name: ome-container image: ghcr.io/moirai-internal/sgl:dev2 @@ -66,7 +71,101 @@ spec: --log-requests \ --model="$MODEL_PATH"\ --tp 2 \ - --mem-frac=0.9 + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + readinessProbe: + httpGet: + 
path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: ghcr.io/moirai-internal/sgl:dev2 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model="$MODEL_PATH"\ + --tp 2 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 volumeMounts: - mountPath: /dev/shm name: dshm @@ -79,7 +178,6 @@ spec: cpu: 10 memory: 30Gi nvidia.com/gpu: 2 - readinessProbe: httpGet: path: /health @@ -88,7 +186,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -97,7 +194,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health From 
407aada8cae9ba5571b5123c1341de2e6756d49d Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 16:16:19 -0700 Subject: [PATCH 06/13] add -pd as suffix to runtime names --- config/runtimes/srt/deepseek-rdma-pd-rt.yaml | 2 +- config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml | 2 +- config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml | 2 +- .../runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 2 +- config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml | 2 +- config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml | 2 +- config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml | 2 +- config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml | 2 +- config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml index 2d05d410..5a54d7c8 100644 --- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-deepseek-pd-rdma + name: srt-deepseek-rdma-pd spec: disabled: false modelSizeRange: diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml index f131b54c..52b5022d 100644 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-1-405b-instruct-fp8 + name: srt-llama-3-1-405b-instruct-fp8-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml 
b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml index 2906b2f2..2753bdad 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-1-70b-instruct + name: srt-llama-3-1-70b-instruct-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml index 5c7ac9da..d7e8766e 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-2-11b-vision-instruct + name: srt-llama-3-2-11b-vision-instruct-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml index 055d7a2a..61e82e48 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-2-1b-instruct + name: srt-llama-3-2-1b-instruct-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml index b019ec2e..72b38ebc 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-2-3b-instruct + name: srt-llama-3-2-3b-instruct-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml 
b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml index 49b54544..fc34160f 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-2-90b-vision-instruct-fp8 + name: srt-llama-3-2-90b-vision-instruct-fp8-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml index 191181d2..1bcb0328 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-3-3-70b-instruct + name: srt-llama-3-3-70b-instruct-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml index 19510eea..6b8dc2b2 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-llama-4-scout-17b-16e-instruct + name: srt-llama-4-scout-17b-16e-instruct-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml index c594751b..b2bc39e7 100644 --- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-mistral-7b-instruct + name: srt-mistral-7b-instruct-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml 
b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml index 9b8da644..c4aea6ef 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-mmixtral-8x7b-instruct + name: srt-mixtral-8x7b-instruct-pd spec: disabled: false supportedModelFormats: diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml index 441de166..62251b91 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-mmixtral-8x7b-instruct + name: srt-mixtral-8x7b-instruct spec: disabled: false supportedModelFormats: From e479760d9d381e42e6c2e78932e4de796b531c83 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 16:42:21 -0700 Subject: [PATCH 07/13] use --model-path for consistency --- config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/mistral-7b-instruct-rt.yaml | 2 +- config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml index b2bc39e7..89ebc20a 100644 --- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -69,7 +69,7 @@ spec: --port=8080 \ --enable-metrics \ --log-requests \ - --model="$MODEL_PATH"\ + --model-path="$MODEL_PATH"\ --tp 2 \ --mem-frac=0.9 \ --disaggregation-mode prefill \ @@ -161,7 +161,7 @@ spec: --port=8080 \ --enable-metrics \ --log-requests \ - --model="$MODEL_PATH"\ + --model-path="$MODEL_PATH"\ --tp 2 \ --mem-frac=0.9 \ --disaggregation-mode decode \ --disaggregation-ib-device mlx5_0 diff --git 
a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml index 53ff611b..231c1842 100644 --- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml @@ -64,7 +64,7 @@ spec: --port=8080 \ --enable-metrics \ --log-requests \ - --model="$MODEL_PATH"\ + --model-path="$MODEL_PATH"\ --tp 2 \ --mem-frac=0.9 volumeMounts: diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml index c4aea6ef..ad2a4908 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -69,7 +69,7 @@ spec: --port=8080 \ --enable-metrics \ --log-requests \ - --model="$MODEL_PATH"\ + --model-path="$MODEL_PATH"\ --tp 2 \ --mem-frac=0.9 \ --disaggregation-mode prefill \ @@ -161,7 +161,7 @@ spec: --port=8080 \ --enable-metrics \ --log-requests \ - --model="$MODEL_PATH"\ + --model-path="$MODEL_PATH"\ --tp 2 \ --mem-frac=0.9 \ --disaggregation-mode decode \ diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml index 62251b91..c3e26913 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml @@ -64,7 +64,7 @@ spec: --port=8080 \ --enable-metrics \ --log-requests \ - --model="$MODEL_PATH"\ + --model-path="$MODEL_PATH"\ --tp 2 \ --mem-frac=0.9 volumeMounts: From db94f2bfb6f34db24dfa3f6ab56770d554c87e97 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 16:49:53 -0700 Subject: [PATCH 08/13] replace --tp with --tp-size for consistency --- config/runtimes/srt/deepseek-rdma-rt.yaml | 4 ++-- config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml | 2 +- config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml | 4 ++-- 
config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml | 2 +- .../runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml | 2 +- .../srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml | 4 ++-- .../srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml | 2 +- config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml | 2 +- config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/mistral-7b-instruct-rt.yaml | 2 +- config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml | 2 +- 20 files changed, 30 insertions(+), 30 deletions(-) diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml index 33e4eb3f..2c3bd513 100644 --- a/config/runtimes/srt/deepseek-rdma-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-rt.yaml @@ -98,7 +98,7 @@ spec: python3 -m sglang.launch_server --host 0.0.0.0 --port 8080 --model-path ${MODEL_PATH} - --tp 16 + --tp-size 16 --nccl-init $(LWS_LEADER_ADDRESS):5000 --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} @@ -169,7 +169,7 @@ spec: --host 0.0.0.0 --port 8080 --model-path ${MODEL_PATH} - --tp 16 + --tp-size 16 --nccl-init $(LWS_LEADER_ADDRESS):5000 --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} diff --git a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml index 2f9104ff..d139ddd9 100644 --- a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml @@ -38,7 +38,7 @@ spec: --port=8080 \ --enable-metrics \ 
--model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --is-embedding volumeMounts: - mountPath: /dev/shm diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml index 52b5022d..1d005591 100644 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml @@ -67,7 +67,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 8 \ + --tp-size 8 \ --mem-frac=0.9 \ --disaggregation-mode prefill \ --disaggregation-ib-device mlx5_0 @@ -156,7 +156,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 8 \ + --tp-size 8 \ --mem-frac=0.9 \ --disaggregation-mode decode \ --disaggregation-ib-device mlx5_0 diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml index d661af78..c152183b 100644 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml @@ -62,7 +62,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 8 \ + --tp-size 8 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml index d7e8766e..b12b0c96 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml @@ -67,7 +67,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 \ --chat-template llama_3_vision \ --disaggregation-mode prefill \ @@ -157,7 +157,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 \ --chat-template llama_3_vision \ --disaggregation-mode decode \ diff --git 
a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml index 448e1980..d141e791 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml @@ -62,7 +62,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 \ --chat-template llama_3_vision volumeMounts: diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml index 61e82e48..58c9b505 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -70,7 +70,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 \ --disaggregation-mode prefill \ --disaggregation-ib-device mlx5_0 @@ -162,7 +162,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 \ --disaggregation-mode decode \ --disaggregation-ib-device mlx5_0 diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml index 71a957b7..38fe8459 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml @@ -65,7 +65,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml index 72b38ebc..c45678e5 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -70,7 +70,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 \ --disaggregation-mode prefill \ 
--disaggregation-ib-device mlx5_0 @@ -162,7 +162,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 \ --disaggregation-mode decode \ --disaggregation-ib-device mlx5_0 diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml index 3bc1b8b8..3c78795e 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml @@ -65,7 +65,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml index fc34160f..0e3b2ca1 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml @@ -67,7 +67,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 4 \ + --tp-size 4 \ --mem-frac=0.9 \ --chat-template llama_3_vision \ --disaggregation-mode prefill \ @@ -157,7 +157,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 4 \ + --tp-size 4 \ --mem-frac=0.9 \ --chat-template llama_3_vision \ --disaggregation-mode decode \ diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml index 8686bac5..3b38f67a 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml @@ -62,7 +62,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 4 \ + --tp-size 4 \ --mem-frac=0.9 \ --chat-template llama_3_vision volumeMounts: diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml 
b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml index 976ec907..d8f2c123 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml @@ -68,7 +68,7 @@ spec: --model-path="$MODEL_PATH" \ --disaggregation-mode prefill \ --disaggregation-ib-device mlx5_0 \ - --tp 8 \ + --tp-size 8 \ --context-length=430000 \ --chat-template llama-4 \ --attention-backend fa3 \ @@ -159,7 +159,7 @@ spec: --model-path="$MODEL_PATH" \ --disaggregation-mode decode \ --disaggregation-ib-device mlx5_0 \ - --tp 8 \ + --tp-size 8 \ --context-length=430000 \ --chat-template llama-4 \ --attention-backend fa3 \ diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index 8c750599..12fd0ae9 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -61,7 +61,7 @@ spec: --port=8080 \ --enable-metrics \ --model-path="$MODEL_PATH" \ - --tp 8 \ + --tp-size 8 \ --context-length=430000 \ --chat-template llama-4 \ --attention-backend fa3 \ diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml index 6b8dc2b2..de4ca76b 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -66,7 +66,7 @@ spec: --port=8080 \ --enable-metrics \ --model-path="$MODEL_PATH" \ - --tp 4 \ + --tp-size 4 \ --mem-frac=0.95 \ --context-length=128000 \ --chat-template llama-4 \ @@ -158,7 +158,7 @@ spec: --port=8080 \ --enable-metrics \ --model-path="$MODEL_PATH" \ - --tp 4 \ + --tp-size 4 \ --mem-frac=0.95 \ --context-length=128000 \ --chat-template llama-4 \ diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml 
b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml index d9a86c63..06f028e1 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml @@ -61,7 +61,7 @@ spec: --port=8080 \ --enable-metrics \ --model-path="$MODEL_PATH" \ - --tp 4 \ + --tp-size 4 \ --mem-frac=0.95 \ --context-length=128000 \ --chat-template llama-4 \ diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml index 89ebc20a..f8d9b501 100644 --- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -70,7 +70,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH"\ - --tp 2 \ + --tp-size 2 \ --mem-frac=0.9 \ --disaggregation-mode prefill \ --disaggregation-ib-device mlx5_0 @@ -162,7 +162,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH"\ - --tp 2 \ + --tp-size 2 \ --mem-frac=0.9 \ --disaggregation-mode decode \ --disaggregation-ib-device mlx5_0 diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml index 231c1842..12812e7e 100644 --- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml @@ -65,7 +65,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH"\ - --tp 2 \ + --tp-size 2 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml index ad2a4908..9d15dcce 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -70,7 +70,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH"\ - --tp 2 \ + --tp-size 2 \ --mem-frac=0.9 \ --disaggregation-mode prefill \ --disaggregation-ib-device mlx5_0 @@ -162,7 +162,7 @@ spec: --enable-metrics \ 
--log-requests \ --model-path="$MODEL_PATH"\ - --tp 2 \ + --tp-size 2 \ --mem-frac=0.9 \ --disaggregation-mode decode \ --disaggregation-ib-device mlx5_0 diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml index c3e26913..973045c4 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml @@ -65,7 +65,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH"\ - --tp 2 \ + --tp-size 2 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm From 2393a8bff413f0f9f874777a5338e863db0b5d50 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 17:19:26 -0700 Subject: [PATCH 09/13] use latest sgl image in runtimes --- config/runtimes/srt/deepseek-rdma-pd-rt.yaml | 8 ++++---- config/runtimes/srt/deepseek-rdma-rt.yaml | 4 ++-- config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml | 2 +- .../runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml | 2 +- config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml | 2 +- .../runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml | 2 +- config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml | 2 +- .../srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 4 ++-- .../srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml | 2 +- config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml | 2 +- .../srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml | 4 ++-- .../srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml | 2 +- .../srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml | 4 ++-- 
.../runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml | 2 +- config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/mistral-7b-instruct-rt.yaml | 2 +- config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml | 4 ++-- config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml | 2 +- 25 files changed, 40 insertions(+), 40 deletions(-) diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml index 5a54d7c8..4226b0d9 100644 --- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml @@ -59,7 +59,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 env: - name: NVSHMEM_ENABLE_NIC_PE_MAPPING value: "1" @@ -158,7 +158,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -240,7 +240,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 env: - name: SGLANG_MOONCAKE_TRANS_THREAD value: "16" @@ -330,7 +330,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml index 2c3bd513..e65af7fa 100644 --- a/config/runtimes/srt/deepseek-rdma-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-rt.yaml @@ -90,7 +90,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -160,7 +160,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c diff --git a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml 
b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml index d139ddd9..b74a82e4 100644 --- a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml @@ -22,7 +22,7 @@ spec: engineConfig: runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml index 1d005591..c72f3a62 100644 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml @@ -50,7 +50,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -139,7 +139,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml index c152183b..3855879a 100644 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml @@ -45,7 +45,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml index 2753bdad..e94c62bc 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml @@ -53,7 +53,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: 
- containerPort: 8080 name: http1 @@ -145,7 +145,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml index 42c6a0cb..b93995ae 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml @@ -48,7 +48,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml index b12b0c96..6742148d 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml @@ -50,7 +50,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -140,7 +140,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml index d141e791..009e84f9 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml @@ -45,7 +45,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml 
index 58c9b505..9728c48e 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -53,7 +53,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -145,7 +145,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml index 38fe8459..5dab9f3b 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml @@ -48,7 +48,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml index c45678e5..8720109d 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -53,7 +53,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -145,7 +145,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml index 3c78795e..1e73f347 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml @@ -48,7 +48,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - 
image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml index 0e3b2ca1..e5882b60 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml @@ -50,7 +50,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -140,7 +140,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml index 3b38f67a..a3d334f6 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml @@ -45,7 +45,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml index 1bcb0328..43a18ca9 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml @@ -53,7 +53,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -145,7 +145,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 
8080 name: http1 diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml index ebcad544..977e5c88 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml @@ -48,7 +48,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml index d8f2c123..416a82f0 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml @@ -50,7 +50,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -141,7 +141,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index 12fd0ae9..a8cd4997 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -45,7 +45,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml index de4ca76b..9d1f61e1 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml +++ 
b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -50,7 +50,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -142,7 +142,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml index 06f028e1..8d50f476 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml @@ -45,7 +45,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml index f8d9b501..961f0a4c 100644 --- a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -53,7 +53,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -145,7 +145,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml index 12812e7e..d8a25498 100644 --- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml @@ -48,7 +48,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: 
lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml index 9d15dcce..961278d9 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -53,7 +53,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -145,7 +145,7 @@ spec: hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml index 973045c4..1897089d 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml @@ -48,7 +48,7 @@ spec: - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 From d63c7c2df335e5845fc9a89936e842c0e4dd968b Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 17:36:20 -0700 Subject: [PATCH 10/13] fix image repo --- Makefile | 2 +- charts/ome-resources/values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index eea9a0ed..fb0d4f39 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CHARTS_DIR := ./charts # Define the registry and image tagging -REGISTRY ?= ghcr.io/moirai-internal/ome +REGISTRY ?= ghcr.io/moirai-internal TAG ?= $(GIT_TAG) ARCH ?= linux/amd64 MANAGER_IMG ?= $(REGISTRY)/manager:$(TAG) diff --git a/charts/ome-resources/values.yaml b/charts/ome-resources/values.yaml index 193b1b91..aa5c8729 100644 --- a/charts/ome-resources/values.yaml +++ b/charts/ome-resources/values.yaml @@ 
-56,7 +56,7 @@ ome: cpu: 2 memory: 4Gi omeAgent: - image: ghcr.io/moirai-internal/genai-ome-agent + image: ghcr.io/moirai-internal/ome-agent tag: *defaultVersion authType: InstancePrincipal compartmentId: ocid1.compartment.oc1..dummy-compartment From b0c7d9eabf44f9f112be65e0eae57db026b243aa Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 17:47:13 -0700 Subject: [PATCH 11/13] fix ome manager img name --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fb0d4f39..10124723 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ CHARTS_DIR := ./charts REGISTRY ?= ghcr.io/moirai-internal TAG ?= $(GIT_TAG) ARCH ?= linux/amd64 -MANAGER_IMG ?= $(REGISTRY)/manager:$(TAG) +MANAGER_IMG ?= $(REGISTRY)/ome-manager:$(TAG) # Git version and commit information for build version_pkg = github.com/sgl-project/ome/pkg/version From 838f12edef98749fbae54f5aa0b47cf9ab20bed1 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 17:48:17 -0700 Subject: [PATCH 12/13] cleanup dep --- hack/internal/tools/go.sum | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/hack/internal/tools/go.sum b/hack/internal/tools/go.sum index 8f3b5ba9..4410b7e0 100644 --- a/hack/internal/tools/go.sum +++ b/hack/internal/tools/go.sum @@ -737,8 +737,7 @@ github.com/frankban/quicktest v1.14.4/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7z github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= -github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= -github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify 
v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= @@ -797,8 +796,7 @@ github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5x github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/gohugoio/hugo v0.142.0 h1:gOVP52kHxr5dByyKgo/74s35tLIcHiHVwojQ4fmd3A4= -github.com/gohugoio/hugo v0.142.0/go.mod h1:G0uwM5aRUXN4cbnqrDQx9Dlgmf/ukUpPADajL8FbL9M= +github.com/gohugoio/hugo v0.147.7 h1:7qQKI8wsPgF1ipYBcXgM8wFmqTyFpkmzqLEf3hpzpT8= github.com/gohugoio/hugo v0.147.7/go.mod h1:gBn9Oi4LomFk1XS9raAPHdxaPrhPoF8ZfRrEcZZFGpo= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= @@ -1093,8 +1091,7 @@ github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4= -github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= -github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= +github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/peterbourgon/diskv v2.0.1+incompatible 
h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= @@ -1164,13 +1161,11 @@ github.com/spf13/afero v1.3.3/go.mod h1:5KUK8ByomD5Ti5Artl0RtHeI5pTF7MIDuXL3yY52 github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= github.com/spf13/afero v1.9.2/go.mod h1:iUV7ddyEEZPO5gA3zD4fJt6iStLlL+Lg4m2cihcDf8Y= github.com/spf13/afero v1.9.5/go.mod h1:UBogFpq8E9Hx+xc5CNTTEpTnuHVmXDwZcZcE1eb/UhQ= -github.com/spf13/afero v1.12.0 h1:UcOPyRBYczmFn6yvphxkn9ZEOY65cpwGKb5mL36mrqs= -github.com/spf13/afero v1.12.0/go.mod h1:ZTlWwG4/ahT8W7T0WQ5uYmjI9duaLQGy3Q2OAl4sk/4= +github.com/spf13/afero v1.14.0 h1:9tH6MapGnn/j0eb0yIXiLjERO8RB6xIVZRDCX7PtqWA= github.com/spf13/afero v1.14.0/go.mod h1:acJQ8t0ohCGuMN3O+Pv0V0hgMxNYDlvdk+VTfyZmbYo= github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cast v1.5.1/go.mod h1:b9PdjNptOpzXr7Rq1q9gJML/2cdGQAo69NKzQ10KN48= -github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y= -github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/cast v1.8.0 h1:gEN9K4b8Xws4EX0+a0reLmhq8moKn7ntRlQYgjPeCDk= github.com/spf13/cast v1.8.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= @@ -1447,8 +1442,7 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220819030929-7fc1605a5dde/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= -golang.org/x/sync 
v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1578,8 +1572,7 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= -golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1587,8 +1580,7 @@ golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/time v0.0.0-20220922220347-f3bd1da661af/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.1.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.10.0 h1:3usCWA8tQn0L8+hFJQNgzpWbd89begxN66o1Ojdn5L4= golang.org/x/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= From 6fdd662d96a0a88f2fe6d840c29f53da33f9e2d7 Mon Sep 17 00:00:00 2001 From: Arthur Cheng Date: Thu, 26 Jun 2025 17:51:49 -0700 Subject: [PATCH 13/13] add pd to runtime kustomize --- config/runtimes/kustomization.yaml | 3 + config/runtimes/srt/deepseek-rdma-pd-rt.yaml | 66 +---- config/runtimes/srt/deepseek-rdma-rt.yaml | 30 +-- .../srt/e5-mistral-7b-instruct-rt.yaml | 16 +- .../llama-3-1-405b-instruct-fp8-pd-rt.yaml | 233 ------------------ .../srt/llama-3-1-405b-instruct-fp8-rt.yaml | 138 ----------- .../srt/llama-3-1-70b-instruct-pd-rt.yaml | 28 +-- .../srt/llama-3-1-70b-instruct-rt.yaml | 14 +- .../llama-3-2-11b-vision-instruct-pd-rt.yaml | 17 +- .../srt/llama-3-2-11b-vision-instruct-rt.yaml | 11 +- .../srt/llama-3-2-1b-instruct-pd-rt.yaml | 28 +-- .../srt/llama-3-2-1b-instruct-rt.yaml | 14 +- .../srt/llama-3-2-3b-instruct-pd-rt.yaml | 20 +- .../srt/llama-3-2-3b-instruct-rt.yaml | 14 +- ...ama-3-2-90b-vision-instruct-fp8-pd-rt.yaml | 22 +- .../llama-3-2-90b-vision-instruct-fp8-rt.yaml | 11 +- .../srt/llama-3-3-70b-instruct-pd-rt.yaml | 28 +-- .../srt/llama-3-3-70b-instruct-rt.yaml | 14 +- ...-maverick-17b-128e-instruct-fp8-pd-rt.yaml | 22 +- ...a-4-maverick-17b-128e-instruct-fp8-rt.yaml | 11 +- .../llama-4-scout-17b-16e-instruct-pd-rt.yaml | 22 +- .../llama-4-scout-17b-16e-instruct-rt.yaml | 11 +- .../srt/mistral-7b-instruct-pd-rt.yaml | 28 +-- .../runtimes/srt/mistral-7b-instruct-rt.yaml | 14 +- .../srt/mixtral-8x7b-instruct-pd-rt.yaml | 28 +-- .../srt/mixtral-8x7b-instruct-rt.yaml | 14 +- config/runtimes/vllm/deepseek-v3-rdma-rt.yaml | 13 - .../vllm/e5-mistral-7b-instruct-rt.yaml | 12 - .../vllm/llama-3-1-405b-instruct-fp8-rt.yaml | 12 - .../vllm/llama-3-1-70b-instruct-rt.yaml | 12 - .../llama-3-1-nemotron-nano-8b-v1-rt.yaml | 12 - 
.../llama-3-1-nemotron-ultra-253b-v1-rt.yaml | 9 - .../llama-3-2-11b-vision-instruct-rt.yaml | 9 - .../vllm/llama-3-2-1b-instruct-rt.yaml | 12 - .../vllm/llama-3-2-3b-instruct-rt.yaml | 12 - .../llama-3-2-90b-vision-instruct-fp8-rt.yaml | 9 - ...llama-3-3-70b-instruct-fp8-dynamic-rt.yaml | 10 - .../vllm/llama-3-3-70b-instruct-rt.yaml | 12 - .../llama-3-3-nemotron-super-49b-v1-rt.yaml | 12 - .../vllm/llama-3-70b-instruct-rt.yaml | 12 - ...a-4-maverick-17b-128e-instruct-fp8-rt.yaml | 9 - .../llama-4-scout-17b-16e-instruct-rt.yaml | 9 - .../runtimes/vllm/mistral-7b-instruct-rt.yaml | 12 - .../vllm/mixtral-8x7b-instruct-rt.yaml | 12 - .../vllm/phi-3-vision-128k-instruct-rt.yaml | 12 - 45 files changed, 48 insertions(+), 1021 deletions(-) delete mode 100644 config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml delete mode 100644 config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml index 9da0e862..111d1161 100644 --- a/config/runtimes/kustomization.yaml +++ b/config/runtimes/kustomization.yaml @@ -5,6 +5,9 @@ resources: - srt/deepseek-rdma-pd-rt.yaml - srt/deepseek-rdma-rt.yaml - srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +- srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml - srt/llama-4-scout-17b-16e-instruct-rt.yaml +- srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml - srt/e5-mistral-7b-instruct-rt.yaml - srt/llama-3-3-70b-instruct-rt.yaml +- srt/llama-3-3-70b-instruct-pd-rt.yaml diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml index 4226b0d9..a5ec1663 100644 --- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml @@ -44,22 +44,9 @@ spec: effect: "NoSchedule" dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network - 
affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 env: - name: NVSHMEM_ENABLE_NIC_PE_MAPPING value: "1" @@ -137,28 +124,15 @@ spec: timeoutSeconds: 30 worker: size: 1 - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network tolerations: - key: nvidia.com/gpu operator: Exists effect: NoSchedule dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -205,7 +179,7 @@ spec: value: "0" - name: NVSHMEM_ENABLE_NIC_PE_MAPPING value: "1" - - name: SGLANG_MOONCAKE_TRANS_THREAD + - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE value: "8" - name: SGL_ENABLE_JIT_DEEPGEMM value: "1" @@ -225,24 +199,11 @@ spec: effect: "NoSchedule" dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 env: - - name: SGLANG_MOONCAKE_TRANS_THREAD + - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE value: "16" 
- name: SGL_ENABLE_JIT_DEEPGEMM value: "1" @@ -309,28 +270,15 @@ spec: timeoutSeconds: 30 worker: size: 1 - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network tolerations: - key: nvidia.com/gpu operator: Exists effect: NoSchedule dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -372,7 +320,7 @@ spec: value: "0" - name: NVSHMEM_IB_TRAFFIC_CLASS value: "16" - - name: SGLANG_MOONCAKE_TRANS_THREAD + - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE value: "16" - name: SGL_ENABLE_JIT_DEEPGEMM value: "1" diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml index e65af7fa..a791a7b1 100644 --- a/config/runtimes/srt/deepseek-rdma-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-rt.yaml @@ -75,22 +75,9 @@ spec: effect: "NoSchedule" dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -139,28 +126,15 @@ spec: timeoutSeconds: 30 worker: size: 1 - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: 
cluster-network tolerations: - key: nvidia.com/gpu operator: Exists effect: NoSchedule dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c diff --git a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml index b74a82e4..0b22c744 100644 --- a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml @@ -22,7 +22,7 @@ spec: engineConfig: runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -93,16 +93,4 @@ spec: volumes: - name: dshm emptyDir: - medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 \ No newline at end of file + medium: Memory \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml deleted file mode 100644 index c72f3a62..00000000 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-pd-rt.yaml +++ /dev/null @@ -1,233 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterServingRuntime -metadata: - name: srt-llama-3-1-405b-instruct-fp8-pd -spec: - disabled: false - supportedModelFormats: - - modelFramework: - name: transformers - version: "4.43.0.dev0" - modelFormat: - name: safetensors - version: "1" - modelArchitecture: LlamaForCausalLM - autoSelect: false - priority: 1 - 
protocolVersions: - - openAI - modelSizeRange: - min: 400B - max: 410B - engineConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 8 \ - --mem-frac=0.9 \ - --disaggregation-mode prefill \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 128 - memory: 216Gi - nvidia.com/gpu: 8 - limits: - cpu: 128 - memory: 216Gi - nvidia.com/gpu: 8 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - decoderConfig: - annotations: - rdma.ome.io/auto-inject: "true" - rdma.ome.io/profile: "oci-roce" - rdma.ome.io/container-name: "ome-container" - 
prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 - dnsPolicy: ClusterFirstWithHostNet - hostNetwork: true - runner: - name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 8 \ - --mem-frac=0.9 \ - --disaggregation-mode decode \ - --disaggregation-ib-device mlx5_0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 128 - memory: 216Gi - nvidia.com/gpu: 8 - limits: - cpu: 128 - memory: 216Gi - nvidia.com/gpu: 8 - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - routerConfig: - runner: - name: router - image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 - resources: - limits: - cpu: "1" - memory: "2Gi" - ports: - - containerPort: 8080 - name: http - command: - - sh - - -c - - > - python3 -m sglang_router.launch_router - --host "0.0.0.0" - --port "8080" - --pd-disaggregation - --policy power_of_two - --service-discovery - 
--service-discovery-namespace "${NAMESPACE}" - --service-discovery-port 8080 - --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} - --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: INFERENCESERVICE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['ome.io/inferenceservice'] diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml deleted file mode 100644 index 3855879a..00000000 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml +++ /dev/null @@ -1,138 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterServingRuntime -metadata: - name: srt-llama-3-1-405b-instruct-fp8 -spec: - disabled: false - supportedModelFormats: - - modelFramework: - name: transformers - version: "4.43.0.dev0" - modelFormat: - name: safetensors - version: "1.0.0" - modelArchitecture: LlamaForCausalLM - autoSelect: false - priority: 1 - protocolVersions: - - openAI - modelSizeRange: - min: 400B - max: 410B - engineConfig: - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 - runner: - name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp-size 8 
\ - --mem-frac=0.9 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 128 - memory: 216Gi - nvidia.com/gpu: 8 - limits: - cpu: 128 - memory: 216Gi - nvidia.com/gpu: 8 - - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - - routerConfig: - runner: - name: router - image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 - resources: - limits: - cpu: "1" - memory: "2Gi" - ports: - - containerPort: 8080 - name: http - command: - - sh - - -c - - > - python3 -m sglang_router.launch_router - --host "0.0.0.0" - --port "8080" - --service-discovery - --service-discovery-namespace "${NAMESPACE}" - --service-discovery-port 8080 - --selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: INFERENCESERVICE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['ome.io/inferenceservice'] diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml index e94c62bc..55f20a7e 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml @@ -37,23 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet 
hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -129,23 +117,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml index b93995ae..43636215 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml index 6742148d..ec8764a6 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml @@ -37,20 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: 
node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -127,8 +118,8 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: +# affinity: +# nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: @@ -140,7 +131,7 @@ spec: hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml index 009e84f9..a27e52fb 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml @@ -34,18 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml index 9728c48e..21797d18 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -37,23 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - 
BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -129,23 +117,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml index 5dab9f3b..412206c3 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml index 8720109d..0231121e 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -37,23 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: 
- - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -129,8 +117,8 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: +# affinity: +# nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: @@ -145,7 +133,7 @@ spec: hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml index 1e73f347..b7e7d36d 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml index e5882b60..9052b5b0 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml @@ -37,20 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - 
nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -127,20 +118,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml index a3d334f6..2ef9d4d4 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml @@ -34,18 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml index 43a18ca9..3f02b061 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml @@ -37,23 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - 
nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -129,23 +117,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml index 977e5c88..3337e988 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml index 416a82f0..40cbe8d8 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml +++ 
b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml @@ -37,20 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -128,20 +119,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index a8cd4997..8e78c01c 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -34,18 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml index 9d1f61e1..8c64685d 100644 --- 
a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -37,20 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -129,20 +120,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml index 8d50f476..2609ed15 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml @@ -34,18 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml index 961f0a4c..5869bd53 100644 --- 
a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -37,23 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -129,23 +117,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml index d8a25498..7b193fa5 100644 --- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml 
b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml index 961278d9..8f75eed8 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -37,23 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -129,23 +117,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml index 1897089d..0382daa3 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: lmsysorg/sglang:v0.4.8.post1-cu126 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 
ports: - containerPort: 8080 name: http1 diff --git a/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml b/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml index 8b29a3b0..dc1fc2b2 100644 --- a/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml +++ b/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml @@ -33,19 +33,6 @@ spec: effect: "NoSchedule" dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml index 2c92af6f..92f869d5 100644 --- a/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml +++ b/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml index 88f44125..296b386e 100644 --- a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml @@ -31,18 +31,6 @@ spec: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - 
- BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml index 125a6c17..10d42567 100644 --- a/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml b/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml index b76ece6d..e0f122f8 100644 --- a/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml b/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml index 0628b078..96b1ad4e 100644 --- a/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml @@ -31,15 +31,6 @@ spec: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: 
node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml index 82d8d1c1..09a00a44 100644 --- a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml @@ -45,15 +45,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml index 1d4d074e..bc873896 100644 --- a/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml index f72a2170..e7030b78 100644 --- a/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: 
ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml index a5dac905..88c7c02a 100644 --- a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml @@ -35,15 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml index cb676165..9f97cd3e 100644 --- a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml +++ b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml @@ -36,16 +36,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml index c4679b18..e43fbb09 100644 --- a/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml @@ -39,18 +39,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container 
image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml b/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml index 90a9a004..9de1e31b 100644 --- a/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml +++ b/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml index e1f74933..2823e997 100644 --- a/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index 5a6a851b..e05a7d10 100644 --- a/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -33,15 +33,6 @@ spec: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - 
BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml index 6fd01f5c..1d8d0f55 100644 --- a/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml @@ -32,15 +32,6 @@ spec: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/mistral-7b-instruct-rt.yaml b/config/runtimes/vllm/mistral-7b-instruct-rt.yaml index 18404026..39d1c175 100644 --- a/config/runtimes/vllm/mistral-7b-instruct-rt.yaml +++ b/config/runtimes/vllm/mistral-7b-instruct-rt.yaml @@ -34,18 +34,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml index 73c46c41..5279859f 100644 --- a/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml @@ -34,18 +34,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git 
a/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml b/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml index 2b4c159c..c555ac27 100644 --- a/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml +++ b/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml @@ -34,18 +34,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1