From a5f763f464a8d26b864e56ec39bb3940240b6ed7 Mon Sep 17 00:00:00 2001 From: xinyzzha Date: Thu, 11 Dec 2025 14:00:57 -0800 Subject: [PATCH] Add node affinity for benchmark job pod --- .../v1beta1/benchmark/controller.go | 36 ++ .../v1beta1/benchmark/controller_test.go | 440 ++++++++++++++++++ .../inferenceservice/components/base.go | 59 +-- .../v1beta1/inferenceservice/utils/utils.go | 74 +++ .../inferenceservice/utils/utils_test.go | 162 +++++++ 5 files changed, 714 insertions(+), 57 deletions(-) diff --git a/pkg/controller/v1beta1/benchmark/controller.go b/pkg/controller/v1beta1/benchmark/controller.go index 7b080caf..c3bca0a2 100644 --- a/pkg/controller/v1beta1/benchmark/controller.go +++ b/pkg/controller/v1beta1/benchmark/controller.go @@ -215,12 +215,47 @@ func (r *BenchmarkJobReconciler) createPodSpec(ctx context.Context, benchmarkJob podSpec := r.buildBasePodSpec(container, volumes) + // Add node selector for InferenceService base model if specified + if benchmarkJob.Spec.Endpoint.InferenceService != nil { + if err := r.addNodeSelectorFromInferenceService(ctx, benchmarkJob, podSpec); err != nil { + r.Log.Error(err, "Failed to add node selector from InferenceService, continuing without it") + // Don't fail the whole reconciliation, just log the error + } + } + if benchmarkJob.Spec.PodOverride != nil { return r.applyPodOverrides(podSpec, benchmarkJob.Spec.PodOverride) } return podSpec, nil } +// addNodeSelectorFromInferenceService adds node affinity based on the InferenceService's base model +func (r *BenchmarkJobReconciler) addNodeSelectorFromInferenceService(ctx context.Context, benchmarkJob *v1beta1.BenchmarkJob, podSpec *v1.PodSpec) error { + ref := benchmarkJob.Spec.Endpoint.InferenceService + inferenceService, err := benchmarkutils.GetInferenceService(ctx, r.Client, ref) + if err != nil { + return err + } + + baseModelName := benchmarkutils.GetBaseModelName(inferenceService) + if baseModelName == "" { + return fmt.Errorf("InferenceService %s/%s has no Model defined", inferenceService.Namespace, inferenceService.Name) + } + + _, baseModelMeta, err := isvcutils.GetBaseModel(r.Client, baseModelName, inferenceService.Namespace) + if err != nil { + return err + } + + isvcutils.AddPreferredNodeAffinityForModel(podSpec, baseModelMeta) + r.Log.Info("Added node affinity for benchmark job", + "baseModel", baseModelMeta.Name, + "namespace", baseModelMeta.Namespace, + "benchmarkJob", benchmarkJob.Name) + + return nil +} + // buildDefaultContainer creates the default benchmark container with resources and env vars func (r *BenchmarkJobReconciler) buildDefaultContainer(ctx context.Context, benchmarkJob *v1beta1.BenchmarkJob, config *controllerconfig.BenchmarkJobConfig) (*v1.Container, error) { resources := v1.ResourceRequirements{ @@ -387,6 +422,7 @@ func (r *BenchmarkJobReconciler) applyPodOverrides(podSpec *v1.PodSpec, override Containers: []v1.Container{*mergedContainer}, Volumes: podSpec.Volumes, Tolerations: podSpec.Tolerations, + Affinity: podSpec.Affinity, RestartPolicy: v1.RestartPolicyNever, } diff --git a/pkg/controller/v1beta1/benchmark/controller_test.go b/pkg/controller/v1beta1/benchmark/controller_test.go index f2a1b98f..9b1c3819 100644 --- a/pkg/controller/v1beta1/benchmark/controller_test.go +++ b/pkg/controller/v1beta1/benchmark/controller_test.go @@ -516,6 +516,446 @@ func TestBenchmarkJobReconciler_buildBenchmarkCommand(t *testing.T) { } } +func TestBenchmarkJobReconciler_addNodeSelectorFromInferenceService(t *testing.T) { + scheme := runtime.NewScheme() + _ = v1beta1.AddToScheme(scheme) + _ = corev1.AddToScheme(scheme) + + tests := []struct { + name string + benchmarkJob *v1beta1.BenchmarkJob + inferenceService *v1beta1.InferenceService + baseModel *v1beta1.BaseModel + clusterBaseModel *v1beta1.ClusterBaseModel + expectAffinity bool + expectLabelKey string + expectErr bool + }{ + { + name: "adds node affinity for BaseModel (namespace-scoped)", + benchmarkJob: &v1beta1.BenchmarkJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-benchmark", + Namespace: "default", + }, + Spec: v1beta1.BenchmarkJobSpec{ + Endpoint: v1beta1.EndpointSpec{ + InferenceService: &v1beta1.InferenceServiceReference{ + Name: "test-isvc", + Namespace: "default", + }, + }, + }, + }, + inferenceService: &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-isvc", + Namespace: "default", + }, + Spec: v1beta1.InferenceServiceSpec{ + Predictor: v1beta1.PredictorSpec{ + Model: &v1beta1.ModelSpec{ + BaseModel: StringPtr("test-base-model"), + }, + }, + }, + }, + baseModel: &v1beta1.BaseModel{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-base-model", + Namespace: "default", + }, + Spec: v1beta1.BaseModelSpec{ + ModelFormat: v1beta1.ModelFormat{Name: "pytorch"}, + Storage: &v1beta1.StorageSpec{Path: StringPtr("/models/test")}, + }, + }, + expectAffinity: true, + expectLabelKey: "models.ome.io/default.basemodel.test-base-model", + expectErr: false, + }, + { + name: "adds node affinity for ClusterBaseModel", + benchmarkJob: &v1beta1.BenchmarkJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-benchmark", + Namespace: "default", + }, + Spec: v1beta1.BenchmarkJobSpec{ + Endpoint: v1beta1.EndpointSpec{ + InferenceService: &v1beta1.InferenceServiceReference{ + Name: "test-isvc", + Namespace: "default", + }, + }, + }, + }, + inferenceService: &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-isvc", + Namespace: "default", + }, + Spec: v1beta1.InferenceServiceSpec{ + Predictor: v1beta1.PredictorSpec{ + Model: &v1beta1.ModelSpec{ + BaseModel: StringPtr("test-cluster-model"), + }, + }, + }, + }, + clusterBaseModel: &v1beta1.ClusterBaseModel{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster-model", + }, + Spec: v1beta1.BaseModelSpec{ + ModelFormat: v1beta1.ModelFormat{Name: "pytorch"}, + Storage: &v1beta1.StorageSpec{Path: StringPtr("/models/cluster-test")}, + }, + }, + expectAffinity: true, + expectLabelKey: "models.ome.io/clusterbasemodel.test-cluster-model", + expectErr: false, + }, + { + name: "error when InferenceService not found", + benchmarkJob: &v1beta1.BenchmarkJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-benchmark", + Namespace: "default", + }, + Spec: v1beta1.BenchmarkJobSpec{ + Endpoint: v1beta1.EndpointSpec{ + InferenceService: &v1beta1.InferenceServiceReference{ + Name: "non-existent-isvc", + Namespace: "default", + }, + }, + }, + }, + expectAffinity: false, + expectErr: true, + }, + { + name: "error when BaseModel not found", + benchmarkJob: &v1beta1.BenchmarkJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-benchmark", + Namespace: "default", + }, + Spec: v1beta1.BenchmarkJobSpec{ + Endpoint: v1beta1.EndpointSpec{ + InferenceService: &v1beta1.InferenceServiceReference{ + Name: "test-isvc", + Namespace: "default", + }, + }, + }, + }, + inferenceService: &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-isvc", + Namespace: "default", + }, + Spec: v1beta1.InferenceServiceSpec{ + Predictor: v1beta1.PredictorSpec{ + Model: &v1beta1.ModelSpec{ + BaseModel: StringPtr("non-existent-model"), + }, + }, + }, + }, + expectAffinity: false, + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + clientBuilder := cfake.NewClientBuilder().WithScheme(scheme) + + if tt.inferenceService != nil { + clientBuilder = clientBuilder.WithObjects(tt.inferenceService) + } + if tt.baseModel != nil { + clientBuilder = clientBuilder.WithObjects(tt.baseModel) + } + if tt.clusterBaseModel != nil { + clientBuilder = clientBuilder.WithObjects(tt.clusterBaseModel) + } + + client := clientBuilder.Build() + + r := &BenchmarkJobReconciler{ + Client: client, + Scheme: scheme, + Log: zap.New(), + } + + podSpec := &corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "benchmark", Image: "test-image"}, + }, + } + + err := r.addNodeSelectorFromInferenceService(context.TODO(), tt.benchmarkJob, podSpec) + + if tt.expectErr { + assert.Error(t, err) + return + } + + assert.NoError(t, err) + + if tt.expectAffinity { + assert.NotNil(t, podSpec.Affinity, "Affinity should not be nil") + assert.NotNil(t, podSpec.Affinity.NodeAffinity, "NodeAffinity should not be nil") + + preferredTerms := podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution + assert.NotEmpty(t, preferredTerms, "PreferredDuringSchedulingIgnoredDuringExecution should not be empty") + + // Check that the expected label key exists + found := false + for _, term := range preferredTerms { + for _, expr := range term.Preference.MatchExpressions { + if expr.Key == tt.expectLabelKey { + found = true + assert.Equal(t, corev1.NodeSelectorOpIn, expr.Operator) + assert.Contains(t, expr.Values, "Ready") + assert.Equal(t, int32(100), term.Weight) + break + } + } + } + assert.True(t, found, "Expected label key %s not found in affinity terms", tt.expectLabelKey) + } + }) + } +} + +func TestBenchmarkJobReconciler_createPodSpec_NodeAffinity(t *testing.T) { + scheme := runtime.NewScheme() + _ = v1beta1.AddToScheme(scheme) + _ = corev1.AddToScheme(scheme) + _ = batchv1.AddToScheme(scheme) + + benchmarkJob := &v1beta1.BenchmarkJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-benchmark", + Namespace: "default", + }, + Spec: v1beta1.BenchmarkJobSpec{ + Endpoint: v1beta1.EndpointSpec{ + InferenceService: &v1beta1.InferenceServiceReference{ + Name: "test-isvc", + Namespace: "default", + }, + }, + Task: "chat", + MaxTimePerIteration: IntPtr(60), + MaxRequestsPerIteration: IntPtr(100), + OutputLocation: &v1beta1.StorageSpec{ + StorageUri: StringPtr("oci://n/my-namespace/b/my-bucket/o/results"), + }, + }, + } + + inferenceService := &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-isvc", + Namespace: "default", + }, + Spec: v1beta1.InferenceServiceSpec{ + Predictor: v1beta1.PredictorSpec{ + Model: &v1beta1.ModelSpec{ + BaseModel: StringPtr("test-model"), + }, + }, + }, + Status: v1beta1.InferenceServiceStatus{ + URL: &apis.URL{ + Scheme: "http", + Host: "test-isvc.default.svc.cluster.local", + }, + }, + } + + baseModel := &v1beta1.BaseModel{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-model", + Namespace: "default", + }, + Spec: v1beta1.BaseModelSpec{ + ModelFormat: v1beta1.ModelFormat{Name: "pytorch"}, + Storage: &v1beta1.StorageSpec{Path: StringPtr("/models/test")}, + }, + } + + client := cfake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(benchmarkJob, inferenceService, baseModel). + Build() + + r := &BenchmarkJobReconciler{ + Client: client, + Scheme: scheme, + Log: zap.New(), + } + + benchmarkConfig := &controllerconfig.BenchmarkJobConfig{ + PodConfig: controllerconfig.PodConfig{ + Image: "test-image", + CPURequest: "100m", + CPULimit: "200m", + MemoryRequest: "100Mi", + MemoryLimit: "200Mi", + }, + } + + podSpec, err := r.createPodSpec(context.TODO(), benchmarkJob, benchmarkConfig) + + assert.NoError(t, err) + assert.NotNil(t, podSpec) + + // Verify node affinity was added + assert.NotNil(t, podSpec.Affinity, "Affinity should not be nil") + assert.NotNil(t, podSpec.Affinity.NodeAffinity, "NodeAffinity should not be nil") + + preferredTerms := podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution + assert.NotEmpty(t, preferredTerms, "PreferredDuringSchedulingIgnoredDuringExecution should not be empty") + + // Check that the expected label key exists + expectedLabelKey := "models.ome.io/default.basemodel.test-model" + found := false + for _, term := range preferredTerms { + for _, expr := range term.Preference.MatchExpressions { + if expr.Key == expectedLabelKey { + found = true + assert.Equal(t, corev1.NodeSelectorOpIn, expr.Operator) + assert.Contains(t, expr.Values, "Ready") + assert.Equal(t, int32(100), term.Weight) + break + } + } + } + assert.True(t, found, "Expected label key %s not found in affinity terms", expectedLabelKey) +} + +func TestBenchmarkJobReconciler_createPodSpec_NodeAffinity_WithPodOverride(t *testing.T) { + scheme := runtime.NewScheme() + _ = v1beta1.AddToScheme(scheme) + _ = corev1.AddToScheme(scheme) + _ = batchv1.AddToScheme(scheme) + + // BenchmarkJob with PodOverride (this triggered the bug) + benchmarkJob := &v1beta1.BenchmarkJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-benchmark", + Namespace: "default", + }, + Spec: v1beta1.BenchmarkJobSpec{ + Endpoint: v1beta1.EndpointSpec{ + InferenceService: &v1beta1.InferenceServiceReference{ + Name: "test-isvc", + Namespace: "default", + }, + }, + Task: "chat", + MaxTimePerIteration: IntPtr(60), + MaxRequestsPerIteration: IntPtr(100), + OutputLocation: &v1beta1.StorageSpec{ + StorageUri: StringPtr("oci://n/my-namespace/b/my-bucket/o/results"), + }, + // PodOverride triggers the applyPodOverrides path + PodOverride: &v1beta1.PodOverride{ + Image: "custom-image:latest", + }, + }, + } + + inferenceService := &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-isvc", + Namespace: "default", + }, + Spec: v1beta1.InferenceServiceSpec{ + Predictor: v1beta1.PredictorSpec{ + Model: &v1beta1.ModelSpec{ + BaseModel: StringPtr("test-model"), + }, + }, + }, + Status: v1beta1.InferenceServiceStatus{ + URL: &apis.URL{ + Scheme: "http", + Host: "test-isvc.default.svc.cluster.local", + }, + }, + } + + baseModel := &v1beta1.BaseModel{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-model", + Namespace: "default", + }, + Spec: v1beta1.BaseModelSpec{ + ModelFormat: v1beta1.ModelFormat{Name: "pytorch"}, + Storage: &v1beta1.StorageSpec{Path: StringPtr("/models/test")}, + }, + } + + client := cfake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(benchmarkJob, inferenceService, baseModel). + Build() + + r := &BenchmarkJobReconciler{ + Client: client, + Scheme: scheme, + Log: zap.New(), + } + + benchmarkConfig := &controllerconfig.BenchmarkJobConfig{ + PodConfig: controllerconfig.PodConfig{ + Image: "test-image", + CPURequest: "100m", + CPULimit: "200m", + MemoryRequest: "100Mi", + MemoryLimit: "200Mi", + }, + } + + podSpec, err := r.createPodSpec(context.TODO(), benchmarkJob, benchmarkConfig) + + assert.NoError(t, err) + assert.NotNil(t, podSpec) + + // Verify PodOverride was applied (custom image) + assert.Equal(t, "custom-image:latest", podSpec.Containers[0].Image) + + // Verify node affinity was preserved after PodOverride was applied + assert.NotNil(t, podSpec.Affinity, "Affinity should not be nil after PodOverride") + assert.NotNil(t, podSpec.Affinity.NodeAffinity, "NodeAffinity should not be nil after PodOverride") + + preferredTerms := podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution + assert.NotEmpty(t, preferredTerms, "PreferredDuringSchedulingIgnoredDuringExecution should not be empty after PodOverride") + + // Check that the expected label key exists + expectedLabelKey := "models.ome.io/default.basemodel.test-model" + found := false + for _, term := range preferredTerms { + for _, expr := range term.Preference.MatchExpressions { + if expr.Key == expectedLabelKey { + found = true + assert.Equal(t, corev1.NodeSelectorOpIn, expr.Operator) + assert.Contains(t, expr.Values, "Ready") + assert.Equal(t, int32(100), term.Weight) + break + } + } + } + assert.True(t, found, "Expected label key %s not found in affinity terms after PodOverride", expectedLabelKey) +} + func TestBenchmarkJobReconciler_updateStatus(t *testing.T) { scheme := runtime.NewScheme() _ = v1beta1.AddToScheme(scheme) diff --git a/pkg/controller/v1beta1/inferenceservice/components/base.go b/pkg/controller/v1beta1/inferenceservice/components/base.go index 32ced0e1..15889277 100644 --- a/pkg/controller/v1beta1/inferenceservice/components/base.go +++ b/pkg/controller/v1beta1/inferenceservice/components/base.go @@ -223,62 +223,8 @@ func UpdatePodSpecNodeSelector(b *BaseComponentFields, isvc *v1beta1.InferenceSe return } - // Determine if this is a ClusterBaseModel or BaseModel - var labelKey string - isClusterScoped := b.BaseModelMeta.Namespace == "" - - if isClusterScoped { - // ClusterBaseModel - labelKey = constants.GetClusterBaseModelLabel(b.BaseModelMeta.Name) - } else { - // BaseModel (namespace-scoped) - labelKey = constants.GetBaseModelLabel(b.BaseModelMeta.Namespace, b.BaseModelMeta.Name) - } - - // Add preferred node affinity for model readiness label. - // This allows cluster autoscaler to scale up nodes that don't yet have the model label - // (which is dynamically added after nodes join), while still preferring nodes where - // the model is already ready for optimal performance. - if podSpec.Affinity == nil { - podSpec.Affinity = &corev1.Affinity{} - } - if podSpec.Affinity.NodeAffinity == nil { - podSpec.Affinity.NodeAffinity = &corev1.NodeAffinity{} - } - - // Check if this model affinity term already exists to avoid duplicates during reconciliation - affinityExists := false - for _, term := range podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution { - for _, expr := range term.Preference.MatchExpressions { - if expr.Key == labelKey { - affinityExists = true - break - } - } - if affinityExists { - break - } - } - - if !affinityExists { - // Use max weight (100) to strongly prefer nodes with ready models - preferredTerm := corev1.PreferredSchedulingTerm{ - Weight: 100, - Preference: corev1.NodeSelectorTerm{ - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: labelKey, - Operator: corev1.NodeSelectorOpIn, - Values: []string{"Ready"}, - }, - }, - }, - } - podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution = append( - podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution, - preferredTerm, - ) - } + // Add preferred node affinity for model readiness using the shared utility function + isvcutils.AddPreferredNodeAffinityForModel(podSpec, b.BaseModelMeta) // Add node selector merged from AcceleratorClass if applicable // Only add mergedNodeSelector to engine and decoder component. @@ -293,7 +239,6 @@ func UpdatePodSpecNodeSelector(b *BaseComponentFields, isvc *v1beta1.InferenceSe } b.Log.Info("Added preferred node affinity for model scheduling", - "labelKey", labelKey, "modelName", b.BaseModelMeta.Name, "namespace", b.BaseModelMeta.Namespace, "inferenceService", isvc.Name) diff --git a/pkg/controller/v1beta1/inferenceservice/utils/utils.go b/pkg/controller/v1beta1/inferenceservice/utils/utils.go index 56954ffe..5cba6a96 100644 --- a/pkg/controller/v1beta1/inferenceservice/utils/utils.go +++ b/pkg/controller/v1beta1/inferenceservice/utils/utils.go @@ -6,6 +6,7 @@ import ( "fmt" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" @@ -92,3 +93,76 @@ func GetTargetServicePort(ctx context.Context, c client.Client, isvc *v1beta1.In return port, nil } + +// AddPreferredNodeAffinityForModel adds a preferred node affinity term to the pod spec +// for scheduling pods on nodes where the base model is ready. +// This is used by both InferenceService and BenchmarkJob controllers to ensure pods +// are scheduled on nodes with the model available. +// +// Parameters: +// - podSpec: The pod spec to update (must not be nil) +// - baseModelMeta: The metadata of the base model (ClusterBaseModel or BaseModel) +// +// The function: +// - Determines the label key based on whether it's a ClusterBaseModel (empty namespace) or BaseModel +// - Adds a preferred node affinity with weight 100 to prefer nodes with "Ready" model status +// - Avoids adding duplicate affinity terms if one already exists for the same model +func AddPreferredNodeAffinityForModel(podSpec *corev1.PodSpec, baseModelMeta *metav1.ObjectMeta) { + if podSpec == nil || baseModelMeta == nil { + return + } + + // Determine if this is a ClusterBaseModel or BaseModel based on namespace + var labelKey string + isClusterScoped := baseModelMeta.Namespace == "" + + if isClusterScoped { + // ClusterBaseModel + labelKey = constants.GetClusterBaseModelLabel(baseModelMeta.Name) + } else { + // BaseModel (namespace-scoped) + labelKey = constants.GetBaseModelLabel(baseModelMeta.Namespace, baseModelMeta.Name) + } + + // Initialize affinity structures if nil + if podSpec.Affinity == nil { + podSpec.Affinity = &corev1.Affinity{} + } + if podSpec.Affinity.NodeAffinity == nil { + podSpec.Affinity.NodeAffinity = &corev1.NodeAffinity{} + } + + // Check if this model affinity term already exists to avoid duplicates + affinityExists := false + for _, term := range podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution { + for _, expr := range term.Preference.MatchExpressions { + if expr.Key == labelKey { + affinityExists = true + break + } + } + if affinityExists { + break + } + } + + if !affinityExists { + // Use max weight (100) to strongly prefer nodes with ready models + preferredTerm := corev1.PreferredSchedulingTerm{ + Weight: 100, + Preference: corev1.NodeSelectorTerm{ + MatchExpressions: []corev1.NodeSelectorRequirement{ + { + Key: labelKey, + Operator: corev1.NodeSelectorOpIn, + Values: []string{"Ready"}, + }, + }, + }, + } + podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution = append( + podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution, + preferredTerm, + ) + } +} diff --git a/pkg/controller/v1beta1/inferenceservice/utils/utils_test.go b/pkg/controller/v1beta1/inferenceservice/utils/utils_test.go index 99432ee9..e9df951a 100644 --- a/pkg/controller/v1beta1/inferenceservice/utils/utils_test.go +++ b/pkg/controller/v1beta1/inferenceservice/utils/utils_test.go @@ -1869,3 +1869,165 @@ func TestGetTargetServicePort_ServiceNameResolution(t *testing.T) { }) } } + +func TestAddPreferredNodeAffinityForModel(t *testing.T) { + tests := []struct { + name string + podSpec *v1.PodSpec + baseModelMeta *metav1.ObjectMeta + wantAffinity bool + wantLabelKey string + }{ + { + name: "ClusterBaseModel - adds node affinity", + podSpec: &v1.PodSpec{}, + baseModelMeta: &metav1.ObjectMeta{ + Name: "test-cluster-model", + Namespace: "", // Empty namespace indicates ClusterBaseModel + }, + wantAffinity: true, + wantLabelKey: "models.ome.io/clusterbasemodel.test-cluster-model", + }, + { + name: "BaseModel (namespace-scoped) - adds node affinity", + podSpec: &v1.PodSpec{}, + baseModelMeta: &metav1.ObjectMeta{ + Name: "test-model", + Namespace: "default", + }, + wantAffinity: true, + wantLabelKey: "models.ome.io/default.basemodel.test-model", + }, + { + name: "nil podSpec - no panic", + podSpec: nil, + baseModelMeta: &metav1.ObjectMeta{Name: "test-model", Namespace: "default"}, + wantAffinity: false, + }, + { + name: "nil baseModelMeta - no panic", + podSpec: &v1.PodSpec{}, + baseModelMeta: nil, + wantAffinity: false, + }, + { + name: "existing affinity - appends without duplicating", + podSpec: &v1.PodSpec{ + Affinity: &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ + { + Weight: 50, + Preference: v1.NodeSelectorTerm{ + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "existing-label", + Operator: v1.NodeSelectorOpIn, + Values: []string{"value"}, + }, + }, + }, + }, + }, + }, + }, + }, + baseModelMeta: &metav1.ObjectMeta{ + Name: "test-model", + Namespace: "default", + }, + wantAffinity: true, + wantLabelKey: "models.ome.io/default.basemodel.test-model", + }, + { + name: "duplicate affinity check - does not add duplicate", + podSpec: &v1.PodSpec{ + Affinity: &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ + { + Weight: 100, + Preference: v1.NodeSelectorTerm{ + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "models.ome.io/default.basemodel.test-model", + Operator: v1.NodeSelectorOpIn, + Values: []string{"Ready"}, + }, + }, + }, + }, + }, + }, + }, + }, + baseModelMeta: &metav1.ObjectMeta{ + Name: "test-model", + Namespace: "default", + }, + wantAffinity: true, + wantLabelKey: "models.ome.io/default.basemodel.test-model", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + initialTermCount := 0 + if tt.podSpec != nil && tt.podSpec.Affinity != nil && + tt.podSpec.Affinity.NodeAffinity != nil { + initialTermCount = len(tt.podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution) + } + + AddPreferredNodeAffinityForModel(tt.podSpec, tt.baseModelMeta) + + if !tt.wantAffinity { + // For nil cases, just verify no panic occurred + return + } + + assert.NotNil(t, tt.podSpec.Affinity, "Affinity should not be nil") + assert.NotNil(t, tt.podSpec.Affinity.NodeAffinity, "NodeAffinity should not be nil") + + preferredTerms := tt.podSpec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution + assert.NotEmpty(t, preferredTerms, "PreferredDuringSchedulingIgnoredDuringExecution should not be empty") + + // Check that the expected label key exists in one of the terms + found := false + for _, term := range preferredTerms { + for _, expr := range term.Preference.MatchExpressions { + if expr.Key == tt.wantLabelKey { + found = true + assert.Equal(t, v1.NodeSelectorOpIn, expr.Operator) + assert.Contains(t, expr.Values, "Ready") + assert.Equal(t, int32(100), term.Weight) + break + } + } + } + assert.True(t, found, "Expected label key %s not found in affinity terms", tt.wantLabelKey) + + // For the duplicate test case, verify no additional term was added + if tt.name == "duplicate affinity check - does not add duplicate" { + assert.Equal(t, initialTermCount, len(preferredTerms), + "Should not add duplicate affinity term") + } + + // For the existing affinity case, verify the existing term is preserved + if tt.name == "existing affinity - appends without duplicating" { + assert.Equal(t, initialTermCount+1, len(preferredTerms), + "Should append new affinity term") + // Check the existing term is still there + existingFound := false + for _, term := range preferredTerms { + for _, expr := range term.Preference.MatchExpressions { + if expr.Key == "existing-label" { + existingFound = true + break + } + } + } + assert.True(t, existingFound, "Existing affinity term should be preserved") + } + }) + } +}