diff --git a/components/manifests/base/kustomization.yaml b/components/manifests/base/kustomization.yaml
index 7ad363c44..9667a57c5 100644
--- a/components/manifests/base/kustomization.yaml
+++ b/components/manifests/base/kustomization.yaml
@@ -15,6 +15,7 @@ resources:
 - public-api-deployment.yaml
 - workspace-pvc.yaml
 - minio-deployment.yaml
+- session-limitrange.yaml
 
 # Default images (can be overridden by overlays)
 images:
diff --git a/components/manifests/base/operator-deployment.yaml b/components/manifests/base/operator-deployment.yaml
index fe6a7b08e..b01a5675a 100644
--- a/components/manifests/base/operator-deployment.yaml
+++ b/components/manifests/base/operator-deployment.yaml
@@ -117,6 +117,31 @@ spec:
           value: "http://minio.ambient-code.svc:9000" # In-cluster MinIO (change for external S3)
         - name: S3_BUCKET
           value: "ambient-sessions" # Create this bucket in MinIO console
+        # Session container resources (externalized from sessions.go; same values as the LoadConfig defaults)
+        - name: RUNNER_CPU_REQUEST
+          value: "500m"
+        - name: RUNNER_CPU_LIMIT
+          value: "2000m"
+        - name: RUNNER_MEMORY_REQUEST
+          value: "1Gi"
+        - name: RUNNER_MEMORY_LIMIT
+          value: "4Gi"
+        - name: CONTENT_CPU_REQUEST
+          value: "100m"
+        - name: CONTENT_CPU_LIMIT
+          value: "500m"
+        - name: CONTENT_MEMORY_REQUEST
+          value: "1Gi"
+        - name: CONTENT_MEMORY_LIMIT
+          value: "2Gi"
+        - name: STATE_SYNC_CPU_REQUEST
+          value: "100m"
+        - name: STATE_SYNC_CPU_LIMIT
+          value: "1000m"
+        - name: STATE_SYNC_MEMORY_REQUEST
+          value: "1Gi"
+        - name: STATE_SYNC_MEMORY_LIMIT
+          value: "2Gi"
         # OpenTelemetry configuration
         - name: OTEL_EXPORTER_OTLP_ENDPOINT
           value: "otel-collector.ambient-code.svc:4317" # Deploy OTel collector separately
diff --git a/components/manifests/base/session-limitrange.yaml b/components/manifests/base/session-limitrange.yaml
new file mode 100644
index 000000000..bdf8a1e4c
--- /dev/null
+++ b/components/manifests/base/session-limitrange.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: LimitRange
+metadata:
+  name: session-container-defaults
+spec:
+  limits:
+  - type: Container
+    default: # applied as limit if container omits limits
+      cpu: "500m"
+      memory: "2Gi"
+    defaultRequest: # applied as request if container omits requests
+      cpu: "100m"
+      memory: "1Gi"
+    min: # API server rejects containers below these (low enough for the kind overlay's 128Mi requests / 256Mi limits)
+      cpu: "50m"
+      memory: "128Mi"
+    max: # API server rejects containers above these
+      cpu: "4"
+      memory: "8Gi"
diff --git a/components/manifests/overlays/kind/operator-config.yaml b/components/manifests/overlays/kind/operator-config.yaml
index e0158c20f..636611c1a 100644
--- a/components/manifests/overlays/kind/operator-config.yaml
+++ b/components/manifests/overlays/kind/operator-config.yaml
@@ -8,6 +8,13 @@ metadata:
 data:
   # Use local runner image loaded into kind cluster (not quay.io)
   AMBIENT_CODE_RUNNER_IMAGE: "vteam_claude_runner:latest"
+  # Session container resource overrides (smaller for local dev).
+  # Requests are lowered together with limits: a request above its limit fails pod validation.
+  RUNNER_MEMORY_LIMIT: "1Gi"
+  CONTENT_MEMORY_REQUEST: "128Mi"
+  CONTENT_MEMORY_LIMIT: "256Mi"
+  STATE_SYNC_MEMORY_REQUEST: "128Mi"
+  STATE_SYNC_MEMORY_LIMIT: "256Mi"
   # Vertex AI Configuration - Disabled for e2e testing
   CLAUDE_CODE_USE_VERTEX: "0"
   CLOUD_ML_REGION: ""
diff --git a/components/operator/internal/config/config.go b/components/operator/internal/config/config.go
index 306076c57..53d76f562 100644
--- a/components/operator/internal/config/config.go
+++ b/components/operator/internal/config/config.go
@@ -30,6 +30,19 @@ type Config struct {
 	S3Endpoint string
 	S3Bucket   string
 	PodFSGroup *int64
+	// Session container resources (externalized for per-environment tuning)
+	RunnerCPURequest       string
+	RunnerCPULimit         string
+	RunnerMemoryRequest    string
+	RunnerMemoryLimit      string
+	ContentCPURequest      string
+	ContentCPULimit        string
+	ContentMemoryRequest   string
+	ContentMemoryLimit     string
+	StateSyncCPURequest    string
+	StateSyncCPULimit      string
+	StateSyncMemoryRequest string
+	StateSyncMemoryLimit   string
 }
 
 // InitK8sClients initializes the Kubernetes clients
@@ -70,6 +83,15 @@ func InitK8sClients() error {
 	return nil
 }
 
+// envOrDefault returns the value of the named environment variable or the
+// provided fallback when the variable is unset or empty.
+func envOrDefault(key, fallback string) string {
+	if v := os.Getenv(key); v != "" {
+		return v
+	}
+	return fallback
+}
+
 // LoadConfig loads the operator configuration from environment variables
 func LoadConfig() *Config {
 	// Get namespace from environment or use default
@@ -138,5 +160,17 @@ func LoadConfig() *Config {
 		S3Endpoint: s3Endpoint,
 		S3Bucket:   s3Bucket,
 		PodFSGroup: podFSGroup,
+		RunnerCPURequest:       envOrDefault("RUNNER_CPU_REQUEST", "500m"),
+		RunnerCPULimit:         envOrDefault("RUNNER_CPU_LIMIT", "2000m"),
+		RunnerMemoryRequest:    envOrDefault("RUNNER_MEMORY_REQUEST", "1Gi"),
+		RunnerMemoryLimit:      envOrDefault("RUNNER_MEMORY_LIMIT", "4Gi"),
+		ContentCPURequest:      envOrDefault("CONTENT_CPU_REQUEST", "100m"),
+		ContentCPULimit:        envOrDefault("CONTENT_CPU_LIMIT", "500m"),
+		ContentMemoryRequest:   envOrDefault("CONTENT_MEMORY_REQUEST", "1Gi"),
+		ContentMemoryLimit:     envOrDefault("CONTENT_MEMORY_LIMIT", "2Gi"),
+		StateSyncCPURequest:    envOrDefault("STATE_SYNC_CPU_REQUEST", "100m"),
+		StateSyncCPULimit:      envOrDefault("STATE_SYNC_CPU_LIMIT", "1000m"),
+		StateSyncMemoryRequest: envOrDefault("STATE_SYNC_MEMORY_REQUEST", "1Gi"),
+		StateSyncMemoryLimit:   envOrDefault("STATE_SYNC_MEMORY_LIMIT", "2Gi"),
 	}
 }
diff --git a/components/operator/internal/handlers/sessions.go b/components/operator/internal/handlers/sessions.go
index 5183c13f6..15acfb5c7 100644
--- a/components/operator/internal/handlers/sessions.go
+++ b/components/operator/internal/handlers/sessions.go
@@ -851,6 +851,16 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
 						PeriodSeconds: 5,
 					},
 					VolumeMounts: []corev1.VolumeMount{{Name: "workspace", MountPath: "/workspace"}},
+					Resources: corev1.ResourceRequirements{
+						Requests: corev1.ResourceList{
+							corev1.ResourceCPU:    quantityOrDefault(appConfig.ContentCPURequest, "100m"),
+							corev1.ResourceMemory: quantityOrDefault(appConfig.ContentMemoryRequest, "1Gi"),
+						},
+						Limits: corev1.ResourceList{
+							corev1.ResourceCPU:    quantityOrDefault(appConfig.ContentCPULimit, "500m"),
+							corev1.ResourceMemory: quantityOrDefault(appConfig.ContentMemoryLimit, "2Gi"),
+						},
+					},
 				},
 				{
 					Name: "ambient-code-runner",
@@ -1138,12 +1148,12 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
 
 					Resources: corev1.ResourceRequirements{
 						Requests: corev1.ResourceList{
-							corev1.ResourceCPU:    resource.MustParse("500m"),
-							corev1.ResourceMemory: resource.MustParse("512Mi"),
+							corev1.ResourceCPU:    quantityOrDefault(appConfig.RunnerCPURequest, "500m"),
+							corev1.ResourceMemory: quantityOrDefault(appConfig.RunnerMemoryRequest, "1Gi"),
 						},
 						Limits: corev1.ResourceList{
-							corev1.ResourceCPU:    resource.MustParse("2000m"), // 2 cores for MCP + Claude SDK
-							corev1.ResourceMemory: resource.MustParse("4Gi"),   // Increased for Playwright/Chromium + dev server
+							corev1.ResourceCPU:    quantityOrDefault(appConfig.RunnerCPULimit, "2000m"),
+							corev1.ResourceMemory: quantityOrDefault(appConfig.RunnerMemoryLimit, "4Gi"),
 						},
 					},
 				},
@@ -1177,12 +1187,12 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
 					},
 					Resources: corev1.ResourceRequirements{
 						Requests: corev1.ResourceList{
-							corev1.ResourceCPU:    resource.MustParse("100m"),
-							corev1.ResourceMemory: resource.MustParse("128Mi"),
+							corev1.ResourceCPU:    quantityOrDefault(appConfig.StateSyncCPURequest, "100m"),
+							corev1.ResourceMemory: quantityOrDefault(appConfig.StateSyncMemoryRequest, "1Gi"),
 						},
 						Limits: corev1.ResourceList{
-							corev1.ResourceCPU:    resource.MustParse("1000m"), // Increased from 200m for MCP startup
-							corev1.ResourceMemory: resource.MustParse("1Gi"),   // Increased from 256Mi
+							corev1.ResourceCPU:    quantityOrDefault(appConfig.StateSyncCPULimit, "1000m"),
+							corev1.ResourceMemory: quantityOrDefault(appConfig.StateSyncMemoryLimit, "2Gi"),
 						},
 					},
 				},
@@ -1256,6 +1266,11 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
 
 	// Do not mount runner Secret volume; runner fetches tokens on demand
 
+	// Ensure LimitRange exists in the session namespace (guardrail for future containers)
+	if err := ensureLimitRange(sessionNamespace); err != nil {
+		log.Printf("Warning: failed to ensure LimitRange in %s: %v", sessionNamespace, err)
+	}
+
 	// Create the pod
 	createdPod, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Create(context.TODO(), pod, v1.CreateOptions{})
 	if err != nil {
@@ -2028,6 +2043,70 @@ func deletePodAndPerPodService(namespace, podName, sessionName string) error {
 	return nil
 }
 
+// quantityOrDefault parses a Kubernetes resource quantity that came from operator
+// configuration. A malformed value (for example "4GB") is logged and replaced by
+// fallback instead of panicking inside the event handler, which is what
+// resource.MustParse would do. fallback must be a valid quantity literal.
+func quantityOrDefault(value, fallback string) resource.Quantity {
+	q, err := resource.ParseQuantity(value)
+	if err != nil {
+		log.Printf("Warning: invalid resource quantity %q, using %q: %v", value, fallback, err)
+		return resource.MustParse(fallback)
+	}
+	return q
+}
+
+// ensureLimitRange creates the "session-container-defaults" LimitRange in namespace,
+// or overwrites it when it already exists. Containers that omit requests/limits get
+// the defaults below, and the API server rejects containers outside [Min, Max].
+// Min memory is kept at 128Mi so that reduced per-environment settings (such as the
+// kind overlay's 256Mi limits) are still admitted.
+func ensureLimitRange(namespace string) error {
+	lr := &corev1.LimitRange{
+		ObjectMeta: v1.ObjectMeta{
+			Name:      "session-container-defaults",
+			Namespace: namespace,
+		},
+		Spec: corev1.LimitRangeSpec{
+			Limits: []corev1.LimitRangeItem{
+				{
+					Type: corev1.LimitTypeContainer,
+					Default: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("500m"),
+						corev1.ResourceMemory: resource.MustParse("2Gi"),
+					},
+					DefaultRequest: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("100m"),
+						corev1.ResourceMemory: resource.MustParse("1Gi"),
+					},
+					Min: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("50m"),
+						corev1.ResourceMemory: resource.MustParse("128Mi"),
+					},
+					Max: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("4"),
+						corev1.ResourceMemory: resource.MustParse("8Gi"),
+					},
+				},
+			},
+		},
+	}
+
+	limitRanges := config.K8sClient.CoreV1().LimitRanges(namespace)
+	_, err := limitRanges.Create(context.TODO(), lr, v1.CreateOptions{})
+	if err == nil {
+		return nil
+	}
+	if !errors.IsAlreadyExists(err) {
+		return fmt.Errorf("create LimitRange: %w", err)
+	}
+	// Already present: overwrite so that spec changes shipped with a newer operator take effect.
+	if _, err := limitRanges.Update(context.TODO(), lr, v1.UpdateOptions{}); err != nil {
+		return fmt.Errorf("update LimitRange: %w", err)
+	}
+	return nil
+}
+
 // copySecretToNamespace copies a secret to a target namespace with owner references
 func copySecretToNamespace(ctx context.Context, sourceSecret *corev1.Secret, targetNamespace string, ownerObj *unstructured.Unstructured) error {
 	// Check if secret already exists in target namespace