Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions components/manifests/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ resources:
- public-api-deployment.yaml
- workspace-pvc.yaml
- minio-deployment.yaml
- session-limitrange.yaml

# Default images (can be overridden by overlays)
images:
Expand Down
25 changes: 25 additions & 0 deletions components/manifests/base/operator-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,31 @@ spec:
value: "http://minio.ambient-code.svc:9000" # In-cluster MinIO (change for external S3)
- name: S3_BUCKET
value: "ambient-sessions" # Create this bucket in MinIO console
# Session container resources (externalized from sessions.go)
- name: RUNNER_CPU_REQUEST
value: "500m"
- name: RUNNER_CPU_LIMIT
value: "2000m"
- name: RUNNER_MEMORY_REQUEST
value: "1Gi"
- name: RUNNER_MEMORY_LIMIT
value: "4Gi"
- name: CONTENT_CPU_REQUEST
value: "100m"
- name: CONTENT_CPU_LIMIT
value: "500m"
- name: CONTENT_MEMORY_REQUEST
value: "1Gi"
- name: CONTENT_MEMORY_LIMIT
value: "2Gi"
- name: STATE_SYNC_CPU_REQUEST
value: "100m"
- name: STATE_SYNC_CPU_LIMIT
value: "1000m"
- name: STATE_SYNC_MEMORY_REQUEST
value: "1Gi"
- name: STATE_SYNC_MEMORY_LIMIT
value: "2Gi"
# OpenTelemetry configuration
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "otel-collector.ambient-code.svc:4317" # Deploy OTel collector separately
Expand Down
19 changes: 19 additions & 0 deletions components/manifests/base/session-limitrange.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
apiVersion: v1
kind: LimitRange
metadata:
name: session-container-defaults
spec:
limits:
- type: Container
default: # applied as limit if container omits limits
cpu: "500m"
memory: "2Gi"
defaultRequest: # applied as request if container omits requests
cpu: "100m"
memory: "1Gi"
min: # API server rejects containers below these
cpu: "50m"
memory: "1Gi"
max: # API server rejects containers above these
cpu: "4"
memory: "8Gi"
4 changes: 4 additions & 0 deletions components/manifests/overlays/kind/operator-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ metadata:
data:
# Use local runner image loaded into kind cluster (not quay.io)
AMBIENT_CODE_RUNNER_IMAGE: "vteam_claude_runner:latest"
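# Session container resource overrides (smaller for local dev)
# NOTE(review): only the memory limits are lowered here. The matching
# *_MEMORY_REQUEST values still default to 1Gi, and the session LimitRange
# minimum for memory is also 1Gi, so the 256Mi limits below would sit under
# both the request and the minimum — confirm, and override the requests
# (and the LimitRange minimum) for local dev if so.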
RUNNER_MEMORY_LIMIT: "1Gi"
CONTENT_MEMORY_LIMIT: "256Mi"
STATE_SYNC_MEMORY_LIMIT: "256Mi"
# Vertex AI Configuration - Disabled for e2e testing
CLAUDE_CODE_USE_VERTEX: "0"
CLOUD_ML_REGION: ""
Expand Down
34 changes: 34 additions & 0 deletions components/operator/internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,19 @@ type Config struct {
S3Endpoint string
S3Bucket string
PodFSGroup *int64
// Session container resources (externalized for per-environment tuning)
RunnerCPURequest string
RunnerCPULimit string
RunnerMemoryRequest string
RunnerMemoryLimit string
ContentCPURequest string
ContentCPULimit string
ContentMemoryRequest string
ContentMemoryLimit string
StateSyncCPURequest string
StateSyncCPULimit string
StateSyncMemoryRequest string
StateSyncMemoryLimit string
}

// InitK8sClients initializes the Kubernetes clients
Expand Down Expand Up @@ -70,6 +83,15 @@ func InitK8sClients() error {
return nil
}

// envOrDefault looks up the environment variable named by key and hands back
// its value. If the variable is not set, or is set to the empty string, the
// supplied fallback is returned instead.
func envOrDefault(key, fallback string) string {
	value, found := os.LookupEnv(key)
	if !found || value == "" {
		return fallback
	}
	return value
}

// LoadConfig loads the operator configuration from environment variables
func LoadConfig() *Config {
// Get namespace from environment or use default
Expand Down Expand Up @@ -138,5 +160,17 @@ func LoadConfig() *Config {
S3Endpoint: s3Endpoint,
S3Bucket: s3Bucket,
PodFSGroup: podFSGroup,
RunnerCPURequest: envOrDefault("RUNNER_CPU_REQUEST", "500m"),
RunnerCPULimit: envOrDefault("RUNNER_CPU_LIMIT", "2000m"),
RunnerMemoryRequest: envOrDefault("RUNNER_MEMORY_REQUEST", "1Gi"),
RunnerMemoryLimit: envOrDefault("RUNNER_MEMORY_LIMIT", "4Gi"),
ContentCPURequest: envOrDefault("CONTENT_CPU_REQUEST", "100m"),
ContentCPULimit: envOrDefault("CONTENT_CPU_LIMIT", "500m"),
ContentMemoryRequest: envOrDefault("CONTENT_MEMORY_REQUEST", "1Gi"),
ContentMemoryLimit: envOrDefault("CONTENT_MEMORY_LIMIT", "2Gi"),
StateSyncCPURequest: envOrDefault("STATE_SYNC_CPU_REQUEST", "100m"),
StateSyncCPULimit: envOrDefault("STATE_SYNC_CPU_LIMIT", "1000m"),
StateSyncMemoryRequest: envOrDefault("STATE_SYNC_MEMORY_REQUEST", "1Gi"),
StateSyncMemoryLimit: envOrDefault("STATE_SYNC_MEMORY_LIMIT", "2Gi"),
}
}
79 changes: 71 additions & 8 deletions components/operator/internal/handlers/sessions.go
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,16 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
PeriodSeconds: 5,
},
VolumeMounts: []corev1.VolumeMount{{Name: "workspace", MountPath: "/workspace"}},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse(appConfig.ContentCPURequest),
corev1.ResourceMemory: resource.MustParse(appConfig.ContentMemoryRequest),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse(appConfig.ContentCPULimit),
corev1.ResourceMemory: resource.MustParse(appConfig.ContentMemoryLimit),
},
},
},
{
Name: "ambient-code-runner",
Expand Down Expand Up @@ -1138,12 +1148,12 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {

Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("500m"),
corev1.ResourceMemory: resource.MustParse("512Mi"),
corev1.ResourceCPU: resource.MustParse(appConfig.RunnerCPURequest),
corev1.ResourceMemory: resource.MustParse(appConfig.RunnerMemoryRequest),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("2000m"), // 2 cores for MCP + Claude SDK
corev1.ResourceMemory: resource.MustParse("4Gi"), // Increased for Playwright/Chromium + dev server
corev1.ResourceCPU: resource.MustParse(appConfig.RunnerCPULimit),
corev1.ResourceMemory: resource.MustParse(appConfig.RunnerMemoryLimit),
},
},
},
Expand Down Expand Up @@ -1177,12 +1187,12 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("100m"),
corev1.ResourceMemory: resource.MustParse("128Mi"),
corev1.ResourceCPU: resource.MustParse(appConfig.StateSyncCPURequest),
corev1.ResourceMemory: resource.MustParse(appConfig.StateSyncMemoryRequest),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1000m"), // Increased from 200m for MCP startup
corev1.ResourceMemory: resource.MustParse("1Gi"), // Increased from 256Mi
corev1.ResourceCPU: resource.MustParse(appConfig.StateSyncCPULimit),
corev1.ResourceMemory: resource.MustParse(appConfig.StateSyncMemoryLimit),
},
},
},
Expand Down Expand Up @@ -1256,6 +1266,11 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {

// Do not mount runner Secret volume; runner fetches tokens on demand

// Ensure LimitRange exists in the session namespace (guardrail for future containers)
if err := ensureLimitRange(sessionNamespace); err != nil {
log.Printf("Warning: failed to ensure LimitRange in %s: %v", sessionNamespace, err)
}

// Create the pod
createdPod, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Create(context.TODO(), pod, v1.CreateOptions{})
if err != nil {
Expand Down Expand Up @@ -2028,6 +2043,54 @@ func deletePodAndPerPodService(namespace, podName, sessionName string) error {
return nil
}

// ensureLimitRange makes sure the "session-container-defaults" LimitRange exists in
// namespace with the spec built below. It first attempts a Create; if the object is
// already present it is overwritten with an Update so the stored spec matches.
//
// The single Container-type item carries:
//   - Default / DefaultRequest: values filled in for containers that omit their own
//     limits / requests (500m CPU + 2Gi memory, and 100m CPU + 1Gi memory).
//   - Min / Max: bounds outside of which the API server rejects a container
//     (50m CPU + 1Gi memory up to 4 CPU + 8Gi memory).
//
// Returns a wrapped error when the Create fails for any reason other than
// "already exists", or when the follow-up Update fails; nil otherwise.
func ensureLimitRange(namespace string) error {
	containerItem := corev1.LimitRangeItem{
		Type: corev1.LimitTypeContainer,
		Default: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("500m"),
			corev1.ResourceMemory: resource.MustParse("2Gi"),
		},
		DefaultRequest: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("100m"),
			corev1.ResourceMemory: resource.MustParse("1Gi"),
		},
		Min: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("50m"),
			corev1.ResourceMemory: resource.MustParse("1Gi"),
		},
		Max: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("4"),
			corev1.ResourceMemory: resource.MustParse("8Gi"),
		},
	}

	desired := &corev1.LimitRange{
		ObjectMeta: v1.ObjectMeta{
			Name:      "session-container-defaults",
			Namespace: namespace,
		},
		Spec: corev1.LimitRangeSpec{
			Limits: []corev1.LimitRangeItem{containerItem},
		},
	}

	limitRanges := config.K8sClient.CoreV1().LimitRanges(namespace)

	_, createErr := limitRanges.Create(context.TODO(), desired, v1.CreateOptions{})
	if createErr == nil {
		return nil
	}
	if !errors.IsAlreadyExists(createErr) {
		return fmt.Errorf("create LimitRange: %w", createErr)
	}

	// The object is already there: replace it so its spec matches the one above.
	if _, updateErr := limitRanges.Update(context.TODO(), desired, v1.UpdateOptions{}); updateErr != nil {
		return fmt.Errorf("update LimitRange: %w", updateErr)
	}
	return nil
}

// copySecretToNamespace copies a secret to a target namespace with owner references
func copySecretToNamespace(ctx context.Context, sourceSecret *corev1.Secret, targetNamespace string, ownerObj *unstructured.Unstructured) error {
// Check if secret already exists in target namespace
Expand Down
Loading