Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 5 additions & 22 deletions pkg/evaluation/Dockerfile.template
Original file line number Diff line number Diff line change
@@ -1,34 +1,17 @@
# syntax=docker/dockerfile:1

FROM docker:dind AS dind
RUN rm -f /usr/local/bin/docker-compose /usr/local/libexec/docker/cli-plugins/docker-compose /usr/local/libexec/docker/cli-plugins/docker-buildx 2>/dev/null || true
FROM alpine:latest
LABEL "io.docker.agent.evals.image"="default"
COPY --from=docker/docker-agent:edge /docker-agent /
RUN cat <<-'EOF' >/run.sh
#!/usr/bin/env sh
set -euxo pipefail
(
echo "Starting dockerd..."
export TINI_SUBREAPER=1
export DOCKER_DRIVER=vfs
dockerd-entrypoint.sh dockerd &

until docker info > /dev/null 2>&1
do
echo "Waiting for dockerd..."
sleep 1
done
echo "dockerd is ready!"
) >/dev/null 2>&1

set -euo pipefail
exec "$@"
EOF
RUN chmod +x /run.sh

FROM scratch
COPY --from=dind / /
COPY --from=docker/docker-agent:edge /docker-agent /
WORKDIR /working_dir
ENV TELEMETRY_ENABLED=false
ENV DOCKER_AGENT_HIDE_TELEMETRY_BANNER=1
ENTRYPOINT ["/run.sh", "/docker-agent", "run", "--exec", "--yolo", "--json"]
{{if .CopyWorkingDir}}COPY . ./
{{end}}
{{end}}
52 changes: 36 additions & 16 deletions pkg/evaluation/build.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"path/filepath"
"strings"
"text/template"

"github.com/docker/docker-agent/pkg/session"
)

var (
Expand All @@ -24,30 +26,38 @@ var (
dockerfileCustomTemplate = template.Must(template.New("DockerfileCustom").Parse(dockerfileCustomTmpl))
)

// imageKey identifies one Docker image build configuration: the working
// directory used for the build together with the image requested for it.
type imageKey struct {
	workingDir string
	image      string
}

// String renders the key as the working directory and the image joined by a
// NUL byte, giving a stable string that can serve as a singleflight key.
func (k imageKey) String() string {
	const sep = "\x00"
	return k.workingDir + sep + k.image
}

// getOrBuildImage returns a cached image ID or builds a new one.
// Images are cached by (workingDir, image) pair to avoid redundant builds.
// Concurrent calls for the same working directory are deduplicated
// Concurrent calls for the same (workingDir, image) pair are deduplicated
// using singleflight so that only one build runs at a time per key.
func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string, error) {
func (r *Runner) getOrBuildImage(ctx context.Context, evals *session.EvalCriteria) (string, error) {
key := imageKey{workingDir: evals.WorkingDir, image: evals.Image}

r.imageCacheMu.Lock()
if imageID, ok := r.imageCache[workingDir]; ok {
if imageID, ok := r.imageCache[key]; ok {
r.imageCacheMu.Unlock()
return imageID, nil
}
r.imageCacheMu.Unlock()

// singleflight ensures only one build per working directory runs at a time.
// The cache write inside the callback guarantees the result is available
// before singleflight releases the key, so subsequent callers always
// hit the cache above.
v, err, _ := r.imageBuildGroup.Do(workingDir, func() (any, error) {
imageID, err := r.buildEvalImage(ctx, workingDir)
v, err, _ := r.imageBuildGroup.Do(key.String(), func() (any, error) {
imageID, err := r.buildEvalImage(ctx, evals)
if err != nil {
return "", err
}

r.imageCacheMu.Lock()
r.imageCache[workingDir] = imageID
r.imageCache[key] = imageID
r.imageCacheMu.Unlock()

return imageID, nil
Expand All @@ -59,18 +69,28 @@ func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string
return v.(string), nil
}

func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string, error) {
// resolveBaseImage picks the base image to use for an eval.
// An image set on the eval itself wins; when the eval does not set one, the
// Runner's BaseImage (the global --base-image value) is returned, which may
// be empty.
func (r *Runner) resolveBaseImage(evals *session.EvalCriteria) string {
	perEval := evals.Image
	if perEval == "" {
		return r.BaseImage
	}
	return perEval
}

// buildEvalImage builds a Docker image for an evaluation.
func (r *Runner) buildEvalImage(ctx context.Context, evals *session.EvalCriteria) (string, error) {
var buildContext string
var data struct {
CopyWorkingDir bool
BaseImage string
}

if workingDir == "" {
if evals.WorkingDir == "" {
buildContext = r.EvalsDir
data.CopyWorkingDir = false
} else {
buildContext = filepath.Join(r.EvalsDir, "working_dirs", workingDir)
buildContext = filepath.Join(r.EvalsDir, "working_dirs", evals.WorkingDir)
if _, err := os.Stat(buildContext); os.IsNotExist(err) {
return "", fmt.Errorf("working directory not found: %s", buildContext)
}
Expand All @@ -79,9 +99,9 @@ func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string,

// Choose template based on whether a custom base image is provided
tmpl := dockerfileTemplate
if r.BaseImage != "" {
if baseImage := r.resolveBaseImage(evals); baseImage != "" {
tmpl = dockerfileCustomTemplate
data.BaseImage = r.BaseImage
data.BaseImage = baseImage
}

var dockerfile bytes.Buffer
Expand Down
68 changes: 35 additions & 33 deletions pkg/evaluation/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,11 @@ type Runner struct {
judge *Judge
runConfig *config.RuntimeConfig

// imageCache caches built Docker images by working directory.
// Key is the working directory (empty string for no working dir).
imageCache map[string]string
// imageCache caches built Docker images by (workingDir, image) pair.
imageCache map[imageKey]string
imageCacheMu sync.Mutex

// imageBuildGroup deduplicates concurrent image builds for the same working directory.
// imageBuildGroup deduplicates concurrent image builds for the same (workingDir, image) pair.
imageBuildGroup singleflight.Group
}

Expand All @@ -56,7 +55,7 @@ func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judge
agentSource: agentSource,
judge: judge,
runConfig: runConfig,
imageCache: make(map[string]string),
imageCache: make(map[imageKey]string),
}
}

Expand Down Expand Up @@ -230,63 +229,68 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
}

// preBuildImages pre-builds all unique Docker images needed for the evaluations.
// This is done in parallel to avoid serialized builds during evaluation.
// Concurrent calls for the same (workingDir, image) pair are deduplicated by
// getOrBuildImage's singleflight, so we simply iterate over all evals.
func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []InputSession) error {
// Collect unique working directories
workingDirs := make(map[string]struct{})
if len(evals) == 0 {
return nil
}

// Count unique images to report an accurate number.
unique := make(map[imageKey]struct{})
for _, eval := range evals {
var key imageKey
if eval.Evals != nil {
workingDirs[eval.Evals.WorkingDir] = struct{}{}
key = imageKey{workingDir: eval.Evals.WorkingDir, image: eval.Evals.Image}
}
unique[key] = struct{}{}
}

if len(workingDirs) == 0 {
return nil
}

fmt.Fprintf(out, "Pre-building %d Docker image(s)...\n", len(workingDirs))
fmt.Fprintf(out, "Pre-building %d Docker image(s)...\n", len(unique))

// Build images in parallel with limited concurrency
type buildResult struct {
workingDir string
err error
title string
err error
}

work := make(chan string, len(workingDirs))
for wd := range workingDirs {
work <- wd
work := make(chan InputSession, len(evals))
for _, eval := range evals {
work <- eval
}
close(work)

results := make(chan buildResult, len(workingDirs))
results := make(chan buildResult, len(evals))

// Use same concurrency as evaluation runs for image builds
buildWorkers := min(r.Concurrency, len(workingDirs))
buildWorkers := min(r.Concurrency, len(evals))
var wg sync.WaitGroup
for range buildWorkers {
wg.Go(func() {
for wd := range work {
for eval := range work {
if ctx.Err() != nil {
results <- buildResult{workingDir: wd, err: ctx.Err()}
results <- buildResult{title: eval.Title, err: ctx.Err()}
continue
}
_, err := r.getOrBuildImage(ctx, wd)
results <- buildResult{workingDir: wd, err: err}

criteria := eval.Evals
if criteria == nil {
criteria = &session.EvalCriteria{}
}

_, err := r.getOrBuildImage(ctx, criteria)
results <- buildResult{title: eval.Title, err: err}
}
})
}

// Wait for all builds to complete
go func() {
wg.Wait()
close(results)
}()

// Collect errors
var errs []error
for result := range results {
if result.err != nil {
errs = append(errs, fmt.Errorf("building image for %q: %w", result.workingDir, result.err))
errs = append(errs, fmt.Errorf("building image for %q: %w", result.title, result.err))
}
}

Expand Down Expand Up @@ -323,9 +327,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
result.ToolCallsExpected = 1.0
}

workingDir := evals.WorkingDir

imageID, err := r.getOrBuildImage(ctx, workingDir)
imageID, err := r.getOrBuildImage(ctx, evals)
if err != nil {
return result, fmt.Errorf("building eval image: %w", err)
}
Expand Down
1 change: 1 addition & 0 deletions pkg/session/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ type EvalCriteria struct {
WorkingDir string `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
Size string `json:"size,omitempty"` // Expected response size: S, M, L, XL
Setup string `json:"setup,omitempty"` // Optional sh script to run in the container before docker agent run --exec
Image string `json:"image,omitempty"` // Custom Docker image for this eval (overrides --base-image)
}

// UnmarshalJSON implements custom JSON unmarshaling for EvalCriteria that
Expand Down
Loading