docker · dgageot · Mar 18, 2026 · Mar 18, 2026
@@ -1,34 +1,17 @@
 # syntax=docker/dockerfile:1
 
-FROM docker:dind AS dind
-RUN rm -f /usr/local/bin/docker-compose /usr/local/libexec/docker/cli-plugins/docker-compose /usr/local/libexec/docker/cli-plugins/docker-buildx 2>/dev/null || true
+FROM alpine:latest
+LABEL "io.docker.agent.evals.image"="default"
+COPY --from=docker/docker-agent:edge /docker-agent /
 RUN cat <<-'EOF' >/run.sh
 #!/usr/bin/env sh
-set -euxo pipefail
-(
-    echo "Starting dockerd..."
-    export TINI_SUBREAPER=1
-    export DOCKER_DRIVER=vfs
-    dockerd-entrypoint.sh dockerd &
-
-    until docker info > /dev/null 2>&1
-    do
-        echo "Waiting for dockerd..."
-        sleep 1
-    done
-    echo "dockerd is ready!"
-) >/dev/null 2>&1
-
+set -euo pipefail
 exec "$@"
 EOF
 RUN chmod +x /run.sh
-
-FROM scratch
-COPY --from=dind / /
-COPY --from=docker/docker-agent:edge /docker-agent /
 WORKDIR /working_dir
 ENV TELEMETRY_ENABLED=false
 ENV DOCKER_AGENT_HIDE_TELEMETRY_BANNER=1
 ENTRYPOINT ["/run.sh", "/docker-agent", "run", "--exec", "--yolo", "--json"]
 {{if .CopyWorkingDir}}COPY . ./
-{{end}}
+{{end}}
@@ -11,6 +11,8 @@ import (
 	"path/filepath"
 	"strings"
 	"text/template"
+
+	"github.com/docker/docker-agent/pkg/session"
 )
 
 var (
@@ -24,30 +26,38 @@ var (
 	dockerfileCustomTemplate = template.Must(template.New("DockerfileCustom").Parse(dockerfileCustomTmpl))
 )
 
+// imageKey uniquely identifies a Docker image build configuration; it is the key type of Runner.imageCache.
+type imageKey struct {
+	workingDir string // copied from EvalCriteria.WorkingDir; empty when the eval has no working directory
+	image      string // copied from EvalCriteria.Image; empty when the eval sets no per-eval image
+}
+
+// String returns workingDir and image joined by a NUL byte, a stable string for use as a singleflight key.
+func (k imageKey) String() string {
+	return k.workingDir + "\x00" + k.image
+}
+
 // getOrBuildImage returns a cached image ID or builds a new one.
-// Images are cached by working directory to avoid redundant builds.
-// Concurrent calls for the same working directory are deduplicated
+// Concurrent calls for the same (workingDir, image) pair are deduplicated
 // using singleflight so that only one build runs at a time per key.
-func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string, error) {
+func (r *Runner) getOrBuildImage(ctx context.Context, evals *session.EvalCriteria) (string, error) {
+	key := imageKey{workingDir: evals.WorkingDir, image: evals.Image}
+
 	r.imageCacheMu.Lock()
-	if imageID, ok := r.imageCache[workingDir]; ok {
+	if imageID, ok := r.imageCache[key]; ok {
 		r.imageCacheMu.Unlock()
 		return imageID, nil
 	}
 	r.imageCacheMu.Unlock()
 
-	// singleflight ensures only one build per working directory runs at a time.
-	// The cache write inside the callback guarantees the result is available
-	// before singleflight releases the key, so subsequent callers always
-	// hit the cache above.
-	v, err, _ := r.imageBuildGroup.Do(workingDir, func() (any, error) {
-		imageID, err := r.buildEvalImage(ctx, workingDir)
+	v, err, _ := r.imageBuildGroup.Do(key.String(), func() (any, error) {
+		imageID, err := r.buildEvalImage(ctx, evals)
 		if err != nil {
 			return "", err
 		}
 
 		r.imageCacheMu.Lock()
-		r.imageCache[workingDir] = imageID
+		r.imageCache[key] = imageID
 		r.imageCacheMu.Unlock()
 
 		return imageID, nil
@@ -59,18 +69,28 @@ func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string
 	return v.(string), nil
 }
 
-func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string, error) {
+// resolveBaseImage returns the effective base image for an eval; evals must be non-nil.
+// The per-eval image takes priority over the global --base-image flag (r.BaseImage); the result is empty when neither is set.
+func (r *Runner) resolveBaseImage(evals *session.EvalCriteria) string {
+	if evals.Image != "" {
+		return evals.Image // per-eval override wins
+	}
+	return r.BaseImage // runner-wide fallback, possibly empty
+}
+
+// buildEvalImage builds a Docker image for an evaluation.
+func (r *Runner) buildEvalImage(ctx context.Context, evals *session.EvalCriteria) (string, error) {
 	var buildContext string
 	var data struct {
 		CopyWorkingDir bool
 		BaseImage      string
 	}
 
-	if workingDir == "" {
+	if evals.WorkingDir == "" {
 		buildContext = r.EvalsDir
 		data.CopyWorkingDir = false
 	} else {
-		buildContext = filepath.Join(r.EvalsDir, "working_dirs", workingDir)
+		buildContext = filepath.Join(r.EvalsDir, "working_dirs", evals.WorkingDir)
 		if _, err := os.Stat(buildContext); os.IsNotExist(err) {
 			return "", fmt.Errorf("working directory not found: %s", buildContext)
 		}
@@ -79,9 +99,9 @@ func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string,
 
 	// Choose template based on whether a custom base image is provided
 	tmpl := dockerfileTemplate
-	if r.BaseImage != "" {
+	if baseImage := r.resolveBaseImage(evals); baseImage != "" {
 		tmpl = dockerfileCustomTemplate
-		data.BaseImage = r.BaseImage
+		data.BaseImage = baseImage
 	}
 
 	var dockerfile bytes.Buffer

@@ -36,12 +36,11 @@ type Runner struct {
 	judge       *Judge
 	runConfig   *config.RuntimeConfig
 
-	// imageCache caches built Docker images by working directory.
-	// Key is the working directory (empty string for no working dir).
-	imageCache   map[string]string
+	// imageCache caches built Docker images by (workingDir, image) pair.
+	imageCache   map[imageKey]string
 	imageCacheMu sync.Mutex
 
-	// imageBuildGroup deduplicates concurrent image builds for the same working directory.
+	// imageBuildGroup deduplicates concurrent image builds for the same (workingDir, image) pair.
 	imageBuildGroup singleflight.Group
 }
 
@@ -56,7 +55,7 @@ func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judge
 		agentSource: agentSource,
 		judge:       judge,
 		runConfig:   runConfig,
-		imageCache:  make(map[string]string),
+		imageCache:  make(map[imageKey]string),
 	}
 }
 
@@ -230,63 +229,68 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
 }
 
 // preBuildImages pre-builds all unique Docker images needed for the evaluations.
-// This is done in parallel to avoid serialized builds during evaluation.
+// Concurrent calls for the same (workingDir, image) pair are deduplicated by
+// getOrBuildImage's singleflight, so we simply iterate over all evals.
 func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []InputSession) error {
-	// Collect unique working directories
-	workingDirs := make(map[string]struct{})
+	if len(evals) == 0 {
+		return nil
+	}
+
+	// Count unique images to report an accurate number.
+	unique := make(map[imageKey]struct{})
 	for _, eval := range evals {
+		var key imageKey
 		if eval.Evals != nil {
-			workingDirs[eval.Evals.WorkingDir] = struct{}{}
+			key = imageKey{workingDir: eval.Evals.WorkingDir, image: eval.Evals.Image}
 		}
+		unique[key] = struct{}{}
 	}
 
-	if len(workingDirs) == 0 {
-		return nil
-	}
-
-	fmt.Fprintf(out, "Pre-building %d Docker image(s)...\n", len(workingDirs))
+	fmt.Fprintf(out, "Pre-building %d Docker image(s)...\n", len(unique))
 
-	// Build images in parallel with limited concurrency
 	type buildResult struct {
-		workingDir string
-		err        error
+		title string
+		err   error
 	}
 
-	work := make(chan string, len(workingDirs))
-	for wd := range workingDirs {
-		work <- wd
+	work := make(chan InputSession, len(evals))
+	for _, eval := range evals {
+		work <- eval
 	}
 	close(work)
 
-	results := make(chan buildResult, len(workingDirs))
+	results := make(chan buildResult, len(evals))
 
-	// Use same concurrency as evaluation runs for image builds
-	buildWorkers := min(r.Concurrency, len(workingDirs))
+	buildWorkers := min(r.Concurrency, len(evals))
 	var wg sync.WaitGroup
 	for range buildWorkers {
 		wg.Go(func() {
-			for wd := range work {
+			for eval := range work {
 				if ctx.Err() != nil {
-					results <- buildResult{workingDir: wd, err: ctx.Err()}
+					results <- buildResult{title: eval.Title, err: ctx.Err()}
 					continue
 				}
-				_, err := r.getOrBuildImage(ctx, wd)
-				results <- buildResult{workingDir: wd, err: err}
+
+				criteria := eval.Evals
+				if criteria == nil {
+					criteria = &session.EvalCriteria{}
+				}
+
+				_, err := r.getOrBuildImage(ctx, criteria)
+				results <- buildResult{title: eval.Title, err: err}
 			}
 		})
 	}
 
-	// Wait for all builds to complete
 	go func() {
 		wg.Wait()
 		close(results)
 	}()
 
-	// Collect errors
 	var errs []error
 	for result := range results {
 		if result.err != nil {
-			errs = append(errs, fmt.Errorf("building image for %q: %w", result.workingDir, result.err))
+			errs = append(errs, fmt.Errorf("building image for %q: %w", result.title, result.err))
 		}
 	}
 
@@ -323,9 +327,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 		result.ToolCallsExpected = 1.0
 	}
 
-	workingDir := evals.WorkingDir
-
-	imageID, err := r.getOrBuildImage(ctx, workingDir)
+	imageID, err := r.getOrBuildImage(ctx, evals)
 	if err != nil {
 		return result, fmt.Errorf("building eval image: %w", err)
 	}

@@ -222,6 +222,7 @@ type EvalCriteria struct {
 	WorkingDir string   `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
 	Size       string   `json:"size,omitempty"`        // Expected response size: S, M, L, XL
 	Setup      string   `json:"setup,omitempty"`       // Optional sh script to run in the container before docker agent run --exec
+	Image      string   `json:"image,omitempty"`       // Custom Docker image for this eval (overrides --base-image)
 }
 
 // UnmarshalJSON implements custom JSON unmarshaling for EvalCriteria that