-
Notifications
You must be signed in to change notification settings - Fork 251
eval: add optional setup script support for eval sessions #1673
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -309,7 +309,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res | |
| return result, fmt.Errorf("building eval image: %w", err) | ||
| } | ||
|
|
||
| events, err := r.runCagentInContainer(ctx, imageID, getUserMessages(evalSess.Session)) | ||
| events, err := r.runCagentInContainer(ctx, imageID, getUserMessages(evalSess.Session), evals.Setup) | ||
| if err != nil { | ||
| return result, fmt.Errorf("running cagent in container: %w", err) | ||
| } | ||
|
|
@@ -346,7 +346,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res | |
| return result, nil | ||
| } | ||
|
|
||
| func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, questions []string) ([]map[string]any, error) { | ||
| func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, questions []string, setup string) ([]map[string]any, error) { | ||
| agentDir := r.agentSource.ParentDir() | ||
| agentFile := filepath.Base(r.agentSource.Name()) | ||
| containerName := fmt.Sprintf("cagent-eval-%d", uuid.New().ID()) | ||
|
|
@@ -396,7 +396,31 @@ func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, quest | |
| } | ||
| } | ||
|
|
||
| args = append(args, imageID, "/configs/"+agentFile) | ||
| // When a setup script is provided, mount it into the container and | ||
| // override the entrypoint to run it before cagent exec. | ||
| // The default entrypoint is: /run.sh /cagent exec --yolo --json | ||
| // /run.sh starts dockerd then exec's "$@". | ||
| if setup != "" { | ||
| setupFile := filepath.Join(os.TempDir(), fmt.Sprintf("cagent-eval-setup-%d.sh", uuid.New().ID())) | ||
| if err := os.WriteFile(setupFile, []byte(setup), 0o600); err != nil { | ||
| return nil, fmt.Errorf("writing setup script: %w", err) | ||
| } | ||
| defer os.Remove(setupFile) | ||
|
|
||
| args = append(args, | ||
| "-v", setupFile+":/setup.sh:ro", | ||
| "--entrypoint", "/run.sh", | ||
| ) | ||
| } | ||
|
|
||
| args = append(args, imageID) | ||
|
|
||
| if setup != "" { | ||
| // Run setup script, then cagent exec with the original arguments. | ||
| args = append(args, "sh", "-c", "sh /setup.sh && exec /cagent exec --yolo --json \"$@\"", "--", "/configs/"+agentFile) | ||
| } else { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Error handling issue for setup script failures When a container exits with an error, the code cannot distinguish between setup script failure and cagent execution failure. Both result in the same error path (lines 446-453), making debugging difficult. The error message "container failed" doesn't indicate whether the failure occurred in the setup script or the cagent exec command. Consider:
Example: sh /setup.sh && echo '__SETUP_COMPLETE__' && exec /cagent exec --yolo --json "$@"Then parse the output to determine if setup completed before reporting errors. |
||
| args = append(args, "/configs/"+agentFile) | ||
| } | ||
| args = append(args, questions...) | ||
|
|
||
| cmd := exec.CommandContext(ctx, "docker", args...) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing error handling for file removal
The
defer os.Remove(setupFile)silently ignores any errors during temporary file cleanup. While os.Remove() failures are uncommon on Unix-like systems, this is poor error handling practice. If the file cannot be removed due to permissions or file system issues, temporary files could accumulate on disk over repeated evaluations.Consider logging cleanup errors:
This makes cleanup failures visible for debugging without failing the evaluation.