diff --git a/Makefile b/Makefile index eea9a0ed..10124723 100644 --- a/Makefile +++ b/Makefile @@ -2,10 +2,10 @@ CHARTS_DIR := ./charts # Define the registry and image tagging -REGISTRY ?= ghcr.io/moirai-internal/ome +REGISTRY ?= ghcr.io/moirai-internal TAG ?= $(GIT_TAG) ARCH ?= linux/amd64 -MANAGER_IMG ?= $(REGISTRY)/manager:$(TAG) +MANAGER_IMG ?= $(REGISTRY)/ome-manager:$(TAG) # Git version and commit information for build version_pkg = github.com/sgl-project/ome/pkg/version diff --git a/charts/ome-resources/values.yaml b/charts/ome-resources/values.yaml index 193b1b91..aa5c8729 100644 --- a/charts/ome-resources/values.yaml +++ b/charts/ome-resources/values.yaml @@ -56,7 +56,7 @@ ome: cpu: 2 memory: 4Gi omeAgent: - image: ghcr.io/moirai-internal/genai-ome-agent + image: ghcr.io/moirai-internal/ome-agent tag: *defaultVersion authType: InstancePrincipal compartmentId: ocid1.compartment.oc1..dummy-compartment diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml index 9da0e862..111d1161 100644 --- a/config/runtimes/kustomization.yaml +++ b/config/runtimes/kustomization.yaml @@ -5,6 +5,9 @@ resources: - srt/deepseek-rdma-pd-rt.yaml - srt/deepseek-rdma-rt.yaml - srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +- srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml - srt/llama-4-scout-17b-16e-instruct-rt.yaml +- srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml - srt/e5-mistral-7b-instruct-rt.yaml - srt/llama-3-3-70b-instruct-rt.yaml +- srt/llama-3-3-70b-instruct-pd-rt.yaml diff --git a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml index 2d05d410..a5ec1663 100644 --- a/config/runtimes/srt/deepseek-rdma-pd-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-pd-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-deepseek-pd-rdma + name: srt-deepseek-rdma-pd spec: disabled: false modelSizeRange: @@ -44,22 +44,9 @@ spec: effect: "NoSchedule" dnsPolicy: 
ClusterFirstWithHostNet hostNetwork: true - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 env: - name: NVSHMEM_ENABLE_NIC_PE_MAPPING value: "1" @@ -137,28 +124,15 @@ spec: timeoutSeconds: 30 worker: size: 1 - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network tolerations: - key: nvidia.com/gpu operator: Exists effect: NoSchedule dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -205,7 +179,7 @@ spec: value: "0" - name: NVSHMEM_ENABLE_NIC_PE_MAPPING value: "1" - - name: SGLANG_MOONCAKE_TRANS_THREAD + - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE value: "8" - name: SGL_ENABLE_JIT_DEEPGEMM value: "1" @@ -225,24 +199,11 @@ spec: effect: "NoSchedule" dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: 
ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 env: - - name: SGLANG_MOONCAKE_TRANS_THREAD + - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE value: "16" - name: SGL_ENABLE_JIT_DEEPGEMM value: "1" @@ -309,28 +270,15 @@ spec: timeoutSeconds: 30 worker: size: 1 - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network tolerations: - key: nvidia.com/gpu operator: Exists effect: NoSchedule dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -372,7 +320,7 @@ spec: value: "0" - name: NVSHMEM_IB_TRAFFIC_CLASS value: "16" - - name: SGLANG_MOONCAKE_TRANS_THREAD + - name: SGLANG_DISAGGREGATION_THREAD_POOL_SIZE value: "16" - name: SGL_ENABLE_JIT_DEEPGEMM value: "1" diff --git a/config/runtimes/srt/deepseek-rdma-rt.yaml b/config/runtimes/srt/deepseek-rdma-rt.yaml index 99b21eb3..a791a7b1 100644 --- a/config/runtimes/srt/deepseek-rdma-rt.yaml +++ b/config/runtimes/srt/deepseek-rdma-rt.yaml @@ -35,7 +35,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" @@ -75,22 +75,9 @@ spec: effect: "NoSchedule" dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - 
operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -98,7 +85,7 @@ spec: python3 -m sglang.launch_server --host 0.0.0.0 --port 8080 --model-path ${MODEL_PATH} - --tp 16 + --tp-size 16 --nccl-init $(LWS_LEADER_ADDRESS):5000 --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} @@ -139,28 +126,15 @@ spec: timeoutSeconds: 30 worker: size: 1 - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network tolerations: - key: nvidia.com/gpu operator: Exists effect: NoSchedule dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 command: - sh - -c @@ -169,7 +143,7 @@ spec: --host 0.0.0.0 --port 8080 --model-path ${MODEL_PATH} - --tp 16 + --tp-size 16 --nccl-init $(LWS_LEADER_ADDRESS):5000 --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} diff --git a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml index 2f9104ff..0b22c744 100644 --- a/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/e5-mistral-7b-instruct-rt.yaml @@ -22,7 +22,7 @@ spec: engineConfig: runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -38,7 +38,7 @@ spec: --port=8080 \ --enable-metrics \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --is-embedding volumeMounts: - mountPath: /dev/shm @@ -93,16 +93,4 @@ spec: volumes: - name: dshm emptyDir: - 
medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 \ No newline at end of file + medium: Memory \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml deleted file mode 100644 index 14d6b5a3..00000000 --- a/config/runtimes/srt/llama-3-1-405b-instruct-fp8-rt.yaml +++ /dev/null @@ -1,138 +0,0 @@ -apiVersion: ome.io/v1beta1 -kind: ClusterServingRuntime -metadata: - name: srt-llama-3-1-405b-instruct-fp8 -spec: - disabled: false - supportedModelFormats: - - modelFramework: - name: transformers - version: "4.43.0.dev0" - modelFormat: - name: safetensors - version: "1.0.0" - modelArchitecture: LlamaForCausalLM - autoSelect: false - priority: 1 - protocolVersions: - - openAI - modelSizeRange: - min: 400B - max: 410B - engineConfig: - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - labels: - logging-forward: enabled - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - volumes: - - name: dshm - emptyDir: - medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 - runner: - name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 - ports: - - containerPort: 8080 - name: http1 - protocol: TCP - command: - - /bin/bash - - '-lc' - - -- - args: - - | - python3 -m sglang.launch_server \ - --host=0.0.0.0 \ - --port=8080 \ - --enable-metrics \ - --log-requests \ - --model-path="$MODEL_PATH" \ - --tp 8 \ - --mem-frac=0.9 - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - requests: - cpu: 128 - 
memory: 216Gi - nvidia.com/gpu: 8 - limits: - cpu: 128 - memory: 216Gi - nvidia.com/gpu: 8 - - readinessProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 3 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 200 - - livenessProbe: - httpGet: - path: /health - port: 8080 - failureThreshold: 5 - successThreshold: 1 - periodSeconds: 60 - timeoutSeconds: 60 - - startupProbe: - httpGet: - path: /health_generate - port: 8080 - failureThreshold: 150 - successThreshold: 1 - periodSeconds: 6 - initialDelaySeconds: 60 - timeoutSeconds: 30 - - routerConfig: - runner: - name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 - resources: - limits: - cpu: "1" - memory: "2Gi" - ports: - - containerPort: 8080 - name: http - command: - - sh - - -c - - > - python3 -m sglang_router.launch_router - --host "0.0.0.0" - --port "8080" - --service-discovery - --service-discovery-namespace "${NAMESPACE}" - --service-discovery-port 8080 - --selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: INFERENCESERVICE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['ome.io/inferenceservice'] diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml new file mode 100644 index 00000000..55f20a7e --- /dev/null +++ b/config/runtimes/srt/llama-3-1-70b-instruct-pd-rt.yaml @@ -0,0 +1,215 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-1-70b-instruct-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.42.3" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 75B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: 
"oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=4 \ + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + 
image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=4 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml 
b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml index e3d7616e..43636215 100644 --- a/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-1-70b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -110,7 +98,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml new file mode 100644 index 00000000..ec8764a6 --- /dev/null +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-pd-rt.yaml @@ -0,0 +1,226 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-2-11b-vision-instruct-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.45.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 10B + max: 12B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + 
medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 1 \ + --mem-frac=0.9 \ + --chat-template llama_3_vision \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory +# affinity: +# nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - 
containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 1 \ + --mem-frac=0.9 \ + --chat-template llama_3_vision \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml 
b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml index a484dc2e..a27e52fb 100644 --- a/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-11b-vision-instruct-rt.yaml @@ -34,18 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -62,7 +53,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 \ --chat-template llama_3_vision volumeMounts: @@ -108,7 +99,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml new file mode 100644 index 00000000..21797d18 --- /dev/null +++ b/config/runtimes/srt/llama-3-2-1b-instruct-pd-rt.yaml @@ -0,0 +1,215 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-2-1b-instruct-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.45.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 500M + max: 2B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: 
"nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 1 \ + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + 
--host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 1 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml index ce55124c..412206c3 100644 --- a/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-2-1b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: 
dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -65,7 +53,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm @@ -110,7 +98,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml new file mode 100644 index 00000000..0231121e --- /dev/null +++ b/config/runtimes/srt/llama-3-2-3b-instruct-pd-rt.yaml @@ -0,0 +1,227 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-2-3b-instruct-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.45.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 2B + max: 4B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: 
docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 1 \ + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory +# affinity: +# nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.B4.8 + - BM.GPU4.8 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + 
python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 1 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml index 6d1d6842..b7e7d36d 100644 --- a/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml +++ 
b/config/runtimes/srt/llama-3-2-3b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -65,7 +53,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 1 \ + --tp-size 1 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm @@ -110,7 +98,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml new file mode 100644 index 00000000..9052b5b0 --- /dev/null +++ b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-pd-rt.yaml @@ -0,0 +1,217 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-2-90b-vision-instruct-fp8-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.46.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 85B + max: 95B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + 
volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 4 \ + --mem-frac=0.9 \ + --chat-template llama_3_vision \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + 
--enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 4 \ + --mem-frac=0.9 \ + --chat-template llama_3_vision \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml index 4a1d0e7b..2ef9d4d4 100644 --- a/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ 
b/config/runtimes/srt/llama-3-2-90b-vision-instruct-fp8-rt.yaml @@ -34,18 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -62,7 +53,7 @@ spec: --enable-metrics \ --log-requests \ --model-path="$MODEL_PATH" \ - --tp 4 \ + --tp-size 4 \ --mem-frac=0.9 \ --chat-template llama_3_vision volumeMounts: @@ -108,7 +99,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml new file mode 100644 index 00000000..3f02b061 --- /dev/null +++ b/config/runtimes/srt/llama-3-3-70b-instruct-pd-rt.yaml @@ -0,0 +1,215 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-3-70b-instruct-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.47.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 75B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet 
+ hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=4 \ + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=4 \ + --mem-frac=0.9 \ + 
--disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml index d7096f4f..3337e988 100644 --- a/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml +++ b/config/runtimes/srt/llama-3-3-70b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - 
matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -110,7 +98,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml index b62ae96c..40cbe8d8 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml @@ -37,20 +37,11 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -68,7 +59,7 @@ spec: --model-path="$MODEL_PATH" \ --disaggregation-mode prefill \ --disaggregation-ib-device mlx5_0 \ - --tp 8 \ + --tp-size 8 \ --context-length=430000 \ --chat-template llama-4 \ --attention-backend fa3 \ @@ -85,7 +76,6 @@ spec: cpu: 128 memory: 512Gi nvidia.com/gpu: 8 - readinessProbe: httpGet: path: /health_generate @@ -94,7 +84,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -103,7 +92,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate @@ -131,20 +119,11 @@ spec: - name: dshm emptyDir: medium: Memory 
- affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 dnsPolicy: ClusterFirstWithHostNet hostNetwork: true runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -162,7 +141,7 @@ spec: --model-path="$MODEL_PATH" \ --disaggregation-mode decode \ --disaggregation-ib-device mlx5_0 \ - --tp 8 \ + --tp-size 8 \ --context-length=430000 \ --chat-template llama-4 \ --attention-backend fa3 \ @@ -179,7 +158,6 @@ spec: cpu: 128 memory: 512Gi nvidia.com/gpu: 8 - readinessProbe: httpGet: path: /health_generate @@ -188,7 +166,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -197,7 +174,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate @@ -210,7 +186,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev13 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index 70e1d0f2..8e78c01c 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -34,18 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -61,7 +52,7 @@ 
spec: --port=8080 \ --enable-metrics \ --model-path="$MODEL_PATH" \ - --tp 8 \ + --tp-size 8 \ --context-length=430000 \ --chat-template llama-4 \ --attention-backend fa3 \ @@ -78,7 +69,6 @@ spec: cpu: 128 memory: 512Gi nvidia.com/gpu: 8 - readinessProbe: httpGet: path: /health_generate @@ -87,7 +77,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 200 - livenessProbe: httpGet: path: /health @@ -96,7 +85,6 @@ spec: successThreshold: 1 periodSeconds: 60 timeoutSeconds: 60 - startupProbe: httpGet: path: /health_generate @@ -109,7 +97,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml new file mode 100644 index 00000000..8c64685d --- /dev/null +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml @@ -0,0 +1,221 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-4-scout-17b-16e-instruct-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.51.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: Llama4ForConditionalGeneration + autoSelect: true + priority: 2 + protocolVersions: + - openAI + modelSizeRange: + min: 100B + max: 109B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: 
docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --model-path="$MODEL_PATH" \ + --tp-size 4 \ + --mem-frac=0.95 \ + --context-length=128000 \ + --chat-template llama-4 \ + --attention-backend fa3 \ + --log-requests \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + limits: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --model-path="$MODEL_PATH" \ + --tp-size 4 \ + --mem-frac=0.95 \ + 
--context-length=128000 \ + --chat-template llama-4 \ + --attention-backend fa3 \ + --log-requests \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + limits: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml index 07be4bad..2609ed15 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml @@ -34,18 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory 
- affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -61,7 +52,7 @@ spec: --port=8080 \ --enable-metrics \ --model-path="$MODEL_PATH" \ - --tp 4 \ + --tp-size 4 \ --mem-frac=0.95 \ --context-length=128000 \ --chat-template llama-4 \ @@ -110,7 +101,7 @@ spec: routerConfig: runner: name: router - image: ghcr.io/moirai-internal/sgl-router:dev2 + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 resources: limits: cpu: "1" diff --git a/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml new file mode 100644 index 00000000..5869bd53 --- /dev/null +++ b/config/runtimes/srt/mistral-7b-instruct-pd-rt.yaml @@ -0,0 +1,215 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-mistral-7b-instruct-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.36.2" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: MistralForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 5B + max: 9B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 
+ name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: # keep a space before each trailing backslash so adjacent flags never fuse + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 2 \ + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: # keep a space before each trailing backslash so adjacent flags never fuse + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 2 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + 
nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/mistral-7b-instruct-rt.yaml b/config/runtimes/srt/mistral-7b-instruct-rt.yaml index 72262542..7b193fa5 100644 --- a/config/runtimes/srt/mistral-7b-instruct-rt.yaml +++ b/config/runtimes/srt/mistral-7b-instruct-rt.yaml @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: 
ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -64,8 +52,8 @@ spec: --port=8080 \ --enable-metrics \ --log-requests \ - --model="$MODEL_PATH"\ - --tp 2 \ + --model-path="$MODEL_PATH" \ + --tp-size 2 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm @@ -106,4 +94,35 @@ spec: successThreshold: 1 periodSeconds: 6 initialDelaySeconds: 60 - timeoutSeconds: 30 \ No newline at end of file + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml new file mode 100644 index 00000000..8f75eed8 --- /dev/null +++ b/config/runtimes/srt/mixtral-8x7b-instruct-pd-rt.yaml @@ -0,0 +1,215 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-mixtral-8x7b-instruct-pd +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.36.0.dev0" + modelFormat: + name: safetensors + version: "1" + modelArchitecture: MixtralForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 45B + max: 50B + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + 
rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: # keep a space before each trailing backslash so adjacent flags never fuse + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 2 \ + --mem-frac=0.9 \ + --disaggregation-mode prefill \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + decoderConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + runner: + name: ome-container + 
image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: # keep a space before each trailing backslash so adjacent flags never fuse + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size 2 \ + --mem-frac=0.9 \ + --disaggregation-mode decode \ + --disaggregation-ib-device mlx5_0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 2 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --pd-disaggregation + --policy power_of_two + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --prefill-selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + --decode-selector component=decoder ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml 
b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml index 0a18d556..0382daa3 100644 --- a/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/srt/mixtral-8x7b-instruct-rt.yaml @@ -1,7 +1,7 @@ apiVersion: ome.io/v1beta1 kind: ClusterServingRuntime metadata: - name: srt-mmixtral-8x7b-instruct + name: srt-mixtral-8x7b-instruct spec: disabled: false supportedModelFormats: @@ -34,21 +34,9 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container - image: ghcr.io/moirai-internal/sgl:dev2 + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 ports: - containerPort: 8080 name: http1 @@ -64,8 +52,8 @@ spec: --port=8080 \ --enable-metrics \ --log-requests \ - --model="$MODEL_PATH"\ - --tp 2 \ + --model-path="$MODEL_PATH"\ + --tp-size 2 \ --mem-frac=0.9 volumeMounts: - mountPath: /dev/shm @@ -106,4 +94,35 @@ spec: successThreshold: 1 periodSeconds: 6 initialDelaySeconds: 60 - timeoutSeconds: 30 \ No newline at end of file + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff 
--git a/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml b/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml index 8b29a3b0..dc1fc2b2 100644 --- a/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml +++ b/config/runtimes/vllm/deepseek-v3-rdma-rt.yaml @@ -33,19 +33,6 @@ spec: effect: "NoSchedule" dnsPolicy: ClusterFirstWithHostNet hostNetwork: true - nodeSelector: - oci.oraclecloud.com/rdma.authenticated: "16" - oci.oraclecloud.com/rdma.mlx_issues: "0" - oke.oraclecloud.com/pool.mode: cluster-network - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml b/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml index 2c92af6f..92f869d5 100644 --- a/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml +++ b/config/runtimes/vllm/e5-mistral-7b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml index 88f44125..296b386e 100644 --- a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml @@ -31,18 +31,6 @@ spec: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - 
BM.GPU.A100-v2.8 - - BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml index 125a6c17..10d42567 100644 --- a/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-70b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml b/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml index b76ece6d..e0f122f8 100644 --- a/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-nemotron-nano-8b-v1-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml b/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml index 0628b078..96b1ad4e 100644 --- a/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-nemotron-ultra-253b-v1-rt.yaml @@ -31,15 +31,6 @@ spec: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - 
BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml index 82d8d1c1..09a00a44 100644 --- a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml @@ -45,15 +45,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml index 1d4d074e..bc873896 100644 --- a/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-1b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml index f72a2170..e7030b78 100644 --- a/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-3b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git 
a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml index a5dac905..88c7c02a 100644 --- a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-rt.yaml @@ -35,15 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml index cb676165..9f97cd3e 100644 --- a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml +++ b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml @@ -36,16 +36,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml index c4679b18..e43fbb09 100644 --- a/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-3-70b-instruct-rt.yaml @@ -39,18 +39,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git 
a/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml b/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml index 90a9a004..9de1e31b 100644 --- a/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml +++ b/config/runtimes/vllm/llama-3-3-nemotron-super-49b-v1-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml index e1f74933..2823e997 100644 --- a/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-70b-instruct-rt.yaml @@ -35,18 +35,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index 5a6a851b..e05a7d10 100644 --- a/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/vllm/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -33,15 +33,6 @@ spec: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff 
--git a/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml index 6fd01f5c..1d8d0f55 100644 --- a/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-4-scout-17b-16e-instruct-rt.yaml @@ -32,15 +32,6 @@ spec: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.H100.8 volumes: - name: dshm emptyDir: diff --git a/config/runtimes/vllm/mistral-7b-instruct-rt.yaml b/config/runtimes/vllm/mistral-7b-instruct-rt.yaml index 18404026..39d1c175 100644 --- a/config/runtimes/vllm/mistral-7b-instruct-rt.yaml +++ b/config/runtimes/vllm/mistral-7b-instruct-rt.yaml @@ -34,18 +34,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml b/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml index 73c46c41..5279859f 100644 --- a/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml +++ b/config/runtimes/vllm/mixtral-8x7b-instruct-rt.yaml @@ -34,18 +34,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml 
b/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml index 2b4c159c..c555ac27 100644 --- a/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml +++ b/config/runtimes/vllm/phi-3-vision-128k-instruct-rt.yaml @@ -34,18 +34,6 @@ spec: - name: dshm emptyDir: medium: Memory - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.A100-v2.8 - - BM.GPU.H100.8 runner: name: ome-container image: docker.io/vllm/vllm-openai:v0.9.0.1 diff --git a/go.mod b/go.mod index 62595c07..7fe6a39a 100644 --- a/go.mod +++ b/go.mod @@ -63,7 +63,6 @@ require ( github.com/NYTimes/gziphandler v1.1.1 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/antonmedv/expr v1.15.3 // indirect - github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/blendle/zapdriver v1.3.1 // indirect @@ -101,7 +100,6 @@ require ( github.com/google/cel-go v0.23.2 // indirect github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-containerregistry v0.16.1 // indirect - github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect @@ -109,12 +107,10 @@ require ( github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/hashicorp/hcl v1.0.0 // indirect - github.com/imdario/mergo v0.3.16 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kelseyhightower/envconfig v1.4.0 // indirect - github.com/klauspost/compress v1.18.0 
// indirect github.com/klauspost/cpuid/v2 v2.2.7 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/leodido/go-urn v1.4.0 // indirect diff --git a/go.sum b/go.sum index 6906429c..cab24b93 100644 --- a/go.sum +++ b/go.sum @@ -58,8 +58,6 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/antonmedv/expr v1.15.3 h1:q3hOJZNvLvhqE8OHBs1cFRdbXFNKuA+bHmRaI+AmRmI= github.com/antonmedv/expr v1.15.3/go.mod h1:0E/6TxnOlRNp81GMzX9QfDPAmHo2Phg00y4JUv1ihsE= -github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= -github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -287,8 +285,6 @@ github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= -github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jarcoal/httpmock v1.2.0 h1:gSvTxxFR/MEMfsGrvRbdfpRUMBStovlSRLw0Ep1bwwc= diff --git 
a/hack/internal/tools/go.sum b/hack/internal/tools/go.sum index 8f3b5ba9..4410b7e0 100644 --- a/hack/internal/tools/go.sum +++ b/hack/internal/tools/go.sum @@ -737,8 +737,7 @@ github.com/frankban/quicktest v1.14.4/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7z github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= -github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= -github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= @@ -797,8 +796,7 @@ github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5x github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/gohugoio/hugo v0.142.0 h1:gOVP52kHxr5dByyKgo/74s35tLIcHiHVwojQ4fmd3A4= -github.com/gohugoio/hugo v0.142.0/go.mod h1:G0uwM5aRUXN4cbnqrDQx9Dlgmf/ukUpPADajL8FbL9M= +github.com/gohugoio/hugo v0.147.7 h1:7qQKI8wsPgF1ipYBcXgM8wFmqTyFpkmzqLEf3hpzpT8= github.com/gohugoio/hugo v0.147.7/go.mod h1:gBn9Oi4LomFk1XS9raAPHdxaPrhPoF8ZfRrEcZZFGpo= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= @@ -1093,8 
+1091,7 @@ github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4= -github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= -github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= +github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= @@ -1164,13 +1161,11 @@ github.com/spf13/afero v1.3.3/go.mod h1:5KUK8ByomD5Ti5Artl0RtHeI5pTF7MIDuXL3yY52 github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= github.com/spf13/afero v1.9.2/go.mod h1:iUV7ddyEEZPO5gA3zD4fJt6iStLlL+Lg4m2cihcDf8Y= github.com/spf13/afero v1.9.5/go.mod h1:UBogFpq8E9Hx+xc5CNTTEpTnuHVmXDwZcZcE1eb/UhQ= -github.com/spf13/afero v1.12.0 h1:UcOPyRBYczmFn6yvphxkn9ZEOY65cpwGKb5mL36mrqs= -github.com/spf13/afero v1.12.0/go.mod h1:ZTlWwG4/ahT8W7T0WQ5uYmjI9duaLQGy3Q2OAl4sk/4= +github.com/spf13/afero v1.14.0 h1:9tH6MapGnn/j0eb0yIXiLjERO8RB6xIVZRDCX7PtqWA= github.com/spf13/afero v1.14.0/go.mod h1:acJQ8t0ohCGuMN3O+Pv0V0hgMxNYDlvdk+VTfyZmbYo= github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cast v1.5.1/go.mod h1:b9PdjNptOpzXr7Rq1q9gJML/2cdGQAo69NKzQ10KN48= -github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y= -github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= 
+github.com/spf13/cast v1.8.0 h1:gEN9K4b8Xws4EX0+a0reLmhq8moKn7ntRlQYgjPeCDk= github.com/spf13/cast v1.8.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= @@ -1447,8 +1442,7 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220819030929-7fc1605a5dde/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= -golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1578,8 +1572,7 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= -golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod 
h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1587,8 +1580,7 @@ golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/time v0.0.0-20220922220347-f3bd1da661af/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.1.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.10.0 h1:3usCWA8tQn0L8+hFJQNgzpWbd89begxN66o1Ojdn5L4= golang.org/x/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=