diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go index 950c0e2f3..aaa73b3f0 100644 --- a/pkg/evaluation/eval.go +++ b/pkg/evaluation/eval.go @@ -309,7 +309,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res return result, fmt.Errorf("building eval image: %w", err) } - events, err := r.runCagentInContainer(ctx, imageID, getUserMessages(evalSess.Session)) + events, err := r.runCagentInContainer(ctx, imageID, getUserMessages(evalSess.Session), evals.Setup) if err != nil { return result, fmt.Errorf("running cagent in container: %w", err) } @@ -346,7 +346,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res return result, nil } -func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, questions []string) ([]map[string]any, error) { +func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, questions []string, setup string) ([]map[string]any, error) { agentDir := r.agentSource.ParentDir() agentFile := filepath.Base(r.agentSource.Name()) containerName := fmt.Sprintf("cagent-eval-%d", uuid.New().ID()) @@ -396,7 +396,31 @@ func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, quest } } - args = append(args, imageID, "/configs/"+agentFile) + // When a setup script is provided, mount it into the container and + // override the entrypoint to run it before cagent exec. + // The default entrypoint is: /run.sh /cagent exec --yolo --json + // /run.sh starts dockerd then exec's "$@". + if setup != "" { + setupFile := filepath.Join(os.TempDir(), fmt.Sprintf("cagent-eval-setup-%d.sh", uuid.New().ID())) + if err := os.WriteFile(setupFile, []byte(setup), 0o600); err != nil { + return nil, fmt.Errorf("writing setup script: %w", err) + } + defer os.Remove(setupFile) + + args = append(args, + "-v", setupFile+":/setup.sh:ro", + "--entrypoint", "/run.sh", + ) + } + + args = append(args, imageID) + + if setup != "" { + // Run setup script, then cagent exec with the original arguments. + args = append(args, "sh", "-c", "sh /setup.sh && exec /cagent exec --yolo --json \"$@\"", "--", "/configs/"+agentFile) + } else { + args = append(args, "/configs/"+agentFile) + } args = append(args, questions...) cmd := exec.CommandContext(ctx, "docker", args...) diff --git a/pkg/session/session.go b/pkg/session/session.go index 731901003..5ccd32424 100644 --- a/pkg/session/session.go +++ b/pkg/session/session.go @@ -207,6 +207,7 @@ type EvalCriteria struct { Relevance []string `json:"relevance"` // Statements that should be true about the response WorkingDir string `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/ Size string `json:"size,omitempty"` // Expected response size: S, M, L, XL + Setup string `json:"setup,omitempty"` // Optional sh script to run in the container before cagent exec } // Session helper methods