diff --git a/pkg/evaluation/Dockerfile.template b/pkg/evaluation/Dockerfile.template index dbc6bd9cf..f96002be7 100644 --- a/pkg/evaluation/Dockerfile.template +++ b/pkg/evaluation/Dockerfile.template @@ -1,34 +1,17 @@ # syntax=docker/dockerfile:1 -FROM docker:dind AS dind -RUN rm -f /usr/local/bin/docker-compose /usr/local/libexec/docker/cli-plugins/docker-compose /usr/local/libexec/docker/cli-plugins/docker-buildx 2>/dev/null || true +FROM alpine:latest +LABEL "io.docker.agent.evals.image"="default" +COPY --from=docker/docker-agent:edge /docker-agent / RUN cat <<-'EOF' >/run.sh #!/usr/bin/env sh -set -euxo pipefail -( - echo "Starting dockerd..." - export TINI_SUBREAPER=1 - export DOCKER_DRIVER=vfs - dockerd-entrypoint.sh dockerd & - - until docker info > /dev/null 2>&1 - do - echo "Waiting for dockerd..." - sleep 1 - done - echo "dockerd is ready!" -) >/dev/null 2>&1 - +set -euo pipefail exec "$@" EOF RUN chmod +x /run.sh - -FROM scratch -COPY --from=dind / / -COPY --from=docker/docker-agent:edge /docker-agent / WORKDIR /working_dir ENV TELEMETRY_ENABLED=false ENV DOCKER_AGENT_HIDE_TELEMETRY_BANNER=1 ENTRYPOINT ["/run.sh", "/docker-agent", "run", "--exec", "--yolo", "--json"] {{if .CopyWorkingDir}}COPY . ./ -{{end}} \ No newline at end of file +{{end}} diff --git a/pkg/evaluation/build.go b/pkg/evaluation/build.go index a5ed77fc2..58c786d78 100644 --- a/pkg/evaluation/build.go +++ b/pkg/evaluation/build.go @@ -11,6 +11,8 @@ import ( "path/filepath" "strings" "text/template" + + "github.com/docker/docker-agent/pkg/session" ) var ( @@ -24,30 +26,38 @@ var ( dockerfileCustomTemplate = template.Must(template.New("DockerfileCustom").Parse(dockerfileCustomTmpl)) ) +// imageKey uniquely identifies a Docker image build configuration. +type imageKey struct { + workingDir string + image string +} + +// String returns a stable string representation for use as a singleflight key. +func (k imageKey) String() string { + return k.workingDir + "\x00" + k.image +} + // getOrBuildImage returns a cached image ID or builds a new one. -// Images are cached by working directory to avoid redundant builds. -// Concurrent calls for the same working directory are deduplicated +// Concurrent calls for the same (workingDir, image) pair are deduplicated // using singleflight so that only one build runs at a time per key. -func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string, error) { +func (r *Runner) getOrBuildImage(ctx context.Context, evals *session.EvalCriteria) (string, error) { + key := imageKey{workingDir: evals.WorkingDir, image: evals.Image} + r.imageCacheMu.Lock() - if imageID, ok := r.imageCache[workingDir]; ok { + if imageID, ok := r.imageCache[key]; ok { r.imageCacheMu.Unlock() return imageID, nil } r.imageCacheMu.Unlock() - // singleflight ensures only one build per working directory runs at a time. - // The cache write inside the callback guarantees the result is available - // before singleflight releases the key, so subsequent callers always - // hit the cache above. - v, err, _ := r.imageBuildGroup.Do(workingDir, func() (any, error) { - imageID, err := r.buildEvalImage(ctx, workingDir) + v, err, _ := r.imageBuildGroup.Do(key.String(), func() (any, error) { + imageID, err := r.buildEvalImage(ctx, evals) if err != nil { return "", err } r.imageCacheMu.Lock() - r.imageCache[workingDir] = imageID + r.imageCache[key] = imageID r.imageCacheMu.Unlock() return imageID, nil @@ -59,18 +69,28 @@ func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string return v.(string), nil } -func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string, error) { +// resolveBaseImage returns the effective base image for an eval. +// The per-eval image takes priority over the global --base-image flag. +func (r *Runner) resolveBaseImage(evals *session.EvalCriteria) string { + if evals.Image != "" { + return evals.Image + } + return r.BaseImage +} + +// buildEvalImage builds a Docker image for an evaluation. +func (r *Runner) buildEvalImage(ctx context.Context, evals *session.EvalCriteria) (string, error) { var buildContext string var data struct { CopyWorkingDir bool BaseImage string } - if workingDir == "" { + if evals.WorkingDir == "" { buildContext = r.EvalsDir data.CopyWorkingDir = false } else { - buildContext = filepath.Join(r.EvalsDir, "working_dirs", workingDir) + buildContext = filepath.Join(r.EvalsDir, "working_dirs", evals.WorkingDir) if _, err := os.Stat(buildContext); os.IsNotExist(err) { return "", fmt.Errorf("working directory not found: %s", buildContext) } @@ -79,9 +99,9 @@ func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string, // Choose template based on whether a custom base image is provided tmpl := dockerfileTemplate - if r.BaseImage != "" { + if baseImage := r.resolveBaseImage(evals); baseImage != "" { tmpl = dockerfileCustomTemplate - data.BaseImage = r.BaseImage + data.BaseImage = baseImage } var dockerfile bytes.Buffer diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go index 37d912891..867934888 100644 --- a/pkg/evaluation/eval.go +++ b/pkg/evaluation/eval.go @@ -36,12 +36,11 @@ type Runner struct { judge *Judge runConfig *config.RuntimeConfig - // imageCache caches built Docker images by working directory. - // Key is the working directory (empty string for no working dir). - imageCache map[string]string + // imageCache caches built Docker images by (workingDir, image) pair. + imageCache map[imageKey]string imageCacheMu sync.Mutex - // imageBuildGroup deduplicates concurrent image builds for the same working directory. + // imageBuildGroup deduplicates concurrent image builds for the same (workingDir, image) pair. imageBuildGroup singleflight.Group } @@ -56,7 +55,7 @@ func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judge agentSource: agentSource, judge: judge, runConfig: runConfig, - imageCache: make(map[string]string), + imageCache: make(map[imageKey]string), } } @@ -230,63 +229,68 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) { } // preBuildImages pre-builds all unique Docker images needed for the evaluations. -// This is done in parallel to avoid serialized builds during evaluation. +// Concurrent calls for the same (workingDir, image) pair are deduplicated by +// getOrBuildImage's singleflight, so we simply iterate over all evals. func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []InputSession) error { - // Collect unique working directories - workingDirs := make(map[string]struct{}) + if len(evals) == 0 { + return nil + } + + // Count unique images to report an accurate number. + unique := make(map[imageKey]struct{}) for _, eval := range evals { + var key imageKey if eval.Evals != nil { - workingDirs[eval.Evals.WorkingDir] = struct{}{} + key = imageKey{workingDir: eval.Evals.WorkingDir, image: eval.Evals.Image} } + unique[key] = struct{}{} } - if len(workingDirs) == 0 { - return nil - } - - fmt.Fprintf(out, "Pre-building %d Docker image(s)...\n", len(workingDirs)) + fmt.Fprintf(out, "Pre-building %d Docker image(s)...\n", len(unique)) - // Build images in parallel with limited concurrency type buildResult struct { - workingDir string - err error + title string + err error } - work := make(chan string, len(workingDirs)) - for wd := range workingDirs { - work <- wd + work := make(chan InputSession, len(evals)) + for _, eval := range evals { + work <- eval } close(work) - results := make(chan buildResult, len(workingDirs)) + results := make(chan buildResult, len(evals)) - // Use same concurrency as evaluation runs for image builds - buildWorkers := min(r.Concurrency, len(workingDirs)) + buildWorkers := min(r.Concurrency, len(evals)) var wg sync.WaitGroup for range buildWorkers { wg.Go(func() { - for wd := range work { + for eval := range work { if ctx.Err() != nil { - results <- buildResult{workingDir: wd, err: ctx.Err()} + results <- buildResult{title: eval.Title, err: ctx.Err()} continue } - _, err := r.getOrBuildImage(ctx, wd) - results <- buildResult{workingDir: wd, err: err} + + criteria := eval.Evals + if criteria == nil { + criteria = &session.EvalCriteria{} + } + + _, err := r.getOrBuildImage(ctx, criteria) + results <- buildResult{title: eval.Title, err: err} } }) } - // Wait for all builds to complete go func() { wg.Wait() close(results) }() - // Collect errors var errs []error for result := range results { if result.err != nil { - errs = append(errs, fmt.Errorf("building image for %q: %w", result.workingDir, result.err)) + errs = append(errs, fmt.Errorf("building image for %q: %w", result.title, result.err)) } } @@ -323,9 +327,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res result.ToolCallsExpected = 1.0 } - workingDir := evals.WorkingDir - - imageID, err := r.getOrBuildImage(ctx, workingDir) + imageID, err := r.getOrBuildImage(ctx, evals) if err != nil { return result, fmt.Errorf("building eval image: %w", err) } diff --git a/pkg/session/session.go b/pkg/session/session.go index accf93a3a..ff52efd29 100644 --- a/pkg/session/session.go +++ b/pkg/session/session.go @@ -222,6 +222,7 @@ type EvalCriteria struct { WorkingDir string `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/ Size string `json:"size,omitempty"` // Expected response size: S, M, L, XL Setup string `json:"setup,omitempty"` // Optional sh script to run in the container before docker agent run --exec + Image string `json:"image,omitempty"` // Custom Docker image for this eval (overrides --base-image) } // UnmarshalJSON implements custom JSON unmarshaling for EvalCriteria that