Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions pkg/evaluation/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
return result, fmt.Errorf("building eval image: %w", err)
}

events, err := r.runCagentInContainer(ctx, imageID, getUserMessages(evalSess.Session))
events, err := r.runCagentInContainer(ctx, imageID, getUserMessages(evalSess.Session), evals.Setup)
if err != nil {
return result, fmt.Errorf("running cagent in container: %w", err)
}
Expand Down Expand Up @@ -346,7 +346,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
return result, nil
}

func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, questions []string) ([]map[string]any, error) {
func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, questions []string, setup string) ([]map[string]any, error) {
agentDir := r.agentSource.ParentDir()
agentFile := filepath.Base(r.agentSource.Name())
containerName := fmt.Sprintf("cagent-eval-%d", uuid.New().ID())
Expand Down Expand Up @@ -396,7 +396,31 @@ func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, quest
}
}

args = append(args, imageID, "/configs/"+agentFile)
// When a setup script is provided, mount it into the container and
// override the entrypoint to run it before cagent exec.
// The default entrypoint is: /run.sh /cagent exec --yolo --json
// /run.sh starts dockerd then exec's "$@".
if setup != "" {
setupFile := filepath.Join(os.TempDir(), fmt.Sprintf("cagent-eval-setup-%d.sh", uuid.New().ID()))
if err := os.WriteFile(setupFile, []byte(setup), 0o600); err != nil {
return nil, fmt.Errorf("writing setup script: %w", err)
}
defer os.Remove(setupFile)

args = append(args,
"-v", setupFile+":/setup.sh:ro",
"--entrypoint", "/run.sh",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing error handling for file removal

The defer os.Remove(setupFile) call discards the error returned while cleaning up the temporary file. Although os.Remove() failures are uncommon on Unix-like systems, silently dropping the error is poor error-handling practice: if the file cannot be removed because of permission or file-system problems, temporary files could accumulate on disk over repeated evaluations.

Consider logging cleanup errors:

defer func() {
    if err := os.Remove(setupFile); err != nil && !os.IsNotExist(err) {
        slog.Debug("failed to remove setup file", "path", setupFile, "error", err)
    }
}()

This makes cleanup failures visible for debugging without failing the evaluation.

)
}

args = append(args, imageID)

if setup != "" {
// Run setup script, then cagent exec with the original arguments.
args = append(args, "sh", "-c", "sh /setup.sh && exec /cagent exec --yolo --json \"$@\"", "--", "/configs/"+agentFile)
} else {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error handling issue for setup script failures

When a container exits with an error, the code cannot distinguish between setup script failure and cagent execution failure. Both result in the same error path (lines 446-453), making debugging difficult. The error message "container failed" doesn't indicate whether the failure occurred in the setup script or the cagent exec command.

Consider:

  1. Emitting a marker/log line immediately after the setup script finishes, to indicate successful completion
  2. Checking for this marker to provide clearer error messages
  3. Alternatively, wrapping the setup script execution to capture its specific exit code

Example:

sh /setup.sh && echo '__SETUP_COMPLETE__' && exec /cagent exec --yolo --json "$@"

Then parse the output to determine whether setup completed before reporting errors, so the message can state which stage failed.

args = append(args, "/configs/"+agentFile)
}
args = append(args, questions...)

cmd := exec.CommandContext(ctx, "docker", args...)
Expand Down
1 change: 1 addition & 0 deletions pkg/session/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ type EvalCriteria struct {
Relevance []string `json:"relevance"` // Statements that should be true about the response
WorkingDir string `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
Size string `json:"size,omitempty"` // Expected response size: S, M, L, XL
Setup string `json:"setup,omitempty"` // Optional sh script to run in the container before cagent exec
}

// Session helper methods
Expand Down
Loading