diff --git a/.github/workflows/ai-moderator.lock.yml b/.github/workflows/ai-moderator.lock.yml index 7cfceecc80d..62b84983a55 100644 --- a/.github/workflows/ai-moderator.lock.yml +++ b/.github/workflows/ai-moderator.lock.yml @@ -406,6 +406,7 @@ jobs: env: GH_AW_CACHE_DIR: /tmp/gh-aw/cache-memory GH_AW_MIN_INTEGRITY: none + GH_AW_ALLOWED_EXTENSIONS: '.json' run: bash "${RUNNER_TEMP}/gh-aw/actions/setup_cache_memory_git.sh" - name: Checkout PR branch id: checkout-pr diff --git a/actions/setup/sh/setup_cache_memory_git.sh b/actions/setup/sh/setup_cache_memory_git.sh index 36634b7ea0c..c15d5ecffd2 100644 --- a/actions/setup/sh/setup_cache_memory_git.sh +++ b/actions/setup/sh/setup_cache_memory_git.sh @@ -5,10 +5,18 @@ # This script is run AFTER the cache is restored and BEFORE the agent executes. # It ensures the cache directory contains a git repository with integrity branches # and checks out the correct branch for the current run's integrity level. +# After git setup it applies pre-agent security sanitization: strips execute bits from +# all working-tree files, and removes files with disallowed extensions when +# GH_AW_ALLOWED_EXTENSIONS is set. # # Required environment variables: -# GH_AW_CACHE_DIR: Path to the cache-memory directory (e.g. /tmp/gh-aw/cache-memory) -# GH_AW_MIN_INTEGRITY: Integrity level for this run (merged|approved|unapproved|none) +# GH_AW_CACHE_DIR: Path to the cache-memory directory (e.g. /tmp/gh-aw/cache-memory) +# GH_AW_MIN_INTEGRITY: Integrity level for this run (merged|approved|unapproved|none) +# +# Optional environment variables: +# GH_AW_ALLOWED_EXTENSIONS: Colon-separated list of allowed file extensions for pre-agent +# sanitization (e.g. .json:.md:.txt). When set, any restored file +# whose extension is not in this list is removed before the agent runs. set -euo pipefail @@ -101,3 +109,59 @@ for level in "${LEVELS[@]}"; do done echo "Cache memory git setup complete (integrity: $INTEGRITY)" + +# --- Security: pre-agent working-tree sanitization --- +# 1. Delete all working-tree symlinks so that a prior run cannot plant links to files +# outside the cache (e.g. secrets) that would bypass the regular-file checks below. +find . -not -path './.git/*' -type l -delete 2>/dev/null || true +echo "Pre-agent sanitization: deleted all working-tree symlinks" + +# 2. Strip execute bits from all working-tree files so that a prior run cannot plant +# executable scripts (e.g. helper.sh) that the agent or runner could invoke before +# any validation gate fires. +find . -not -path './.git/*' -type f -exec chmod a-x {} + 2>/dev/null || true +echo "Pre-agent sanitization: stripped execute permissions from all working-tree files" + +# 3. If GH_AW_ALLOWED_EXTENSIONS is set (colon-separated, e.g. .json:.md:.txt), remove +# any restored file whose extension is not in the allowed list. This ensures the agent +# never encounters unexpected file types planted by a prior compromised run. +if [ -n "${GH_AW_ALLOWED_EXTENSIONS:-}" ]; then + echo "Pre-agent sanitization: enforcing allowed extensions: ${GH_AW_ALLOWED_EXTENSIONS}" + # Build a normalized (lowercase, whitespace-trimmed) allowed list for case-insensitive + # comparison. Pre-computing this once avoids re-parsing it for every file. + _normalized_allowed="" + IFS=: read -ra _raw_exts <<< "$GH_AW_ALLOWED_EXTENSIONS" + for _e in "${_raw_exts[@]}"; do + # Trim all whitespace and convert to lowercase + _e="$(printf '%s' "$_e" | tr -d '[:space:]' | tr '[:upper:]' '[:lower:]')" + if [ -n "$_e" ]; then + _normalized_allowed="${_normalized_allowed}${_e}:" + fi + done + removed=0 + # Use NUL-delimited output so filenames containing newlines are handled correctly. + while IFS= read -r -d '' file; do + filename="$(basename "$file")" + # Extract the last dot-prefixed segment as the extension, or empty if no dot. + # Normalize to lowercase for case-insensitive comparison against the allowed list. + case "$filename" in + *.*) ext=".$(printf '%s' "${filename##*.}" | tr '[:upper:]' '[:lower:]')" ;; + *) ext="" ;; + esac + # Check whether this extension appears in the normalized allowed list + found=0 + IFS=: read -ra _ALLOWED_EXTS <<< "${_normalized_allowed%:}" + for _a in "${_ALLOWED_EXTS[@]}"; do + if [ "$ext" = "$_a" ]; then + found=1 + break + fi + done + if [ "$found" -eq 0 ]; then + echo "Removing disallowed file: $file (extension: '${ext:-none}')" + rm -f "$file" + removed=$((removed + 1)) + fi + done < <(find . -not -path './.git/*' -type f -print0) + echo "Pre-agent sanitization complete: removed ${removed} file(s) with disallowed extensions" +fi diff --git a/actions/setup/sh/setup_cache_memory_git_test.sh b/actions/setup/sh/setup_cache_memory_git_test.sh new file mode 100644 index 00000000000..cdab13c42df --- /dev/null +++ b/actions/setup/sh/setup_cache_memory_git_test.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +# Tests for setup_cache_memory_git.sh — pre-agent sanitization block +# Run: bash setup_cache_memory_git_test.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT="${SCRIPT_DIR}/setup_cache_memory_git.sh" + +# Test counters +TESTS_PASSED=0 +TESTS_FAILED=0 + +# Temporary workspace for all tests +WORKSPACE=$(mktemp -d) + +cleanup() { + rm -rf "${WORKSPACE}" +} +trap cleanup EXIT + +# Helper: assert a condition +assert() { + local name="$1" + local condition="$2" + if eval "${condition}" 2>/dev/null; then + echo " ✓ ${name}" + TESTS_PASSED=$((TESTS_PASSED + 1)) + else + echo " ✗ ${name}" + TESTS_FAILED=$((TESTS_FAILED + 1)) + fi +} + +# Helper: create a fresh git cache dir with the given files already committed. +# Usage: make_cache_dir [ ...] +# Files are created and committed to the 'none' branch (the lowest-trust default). +make_cache_dir() { + local dir="$1" + shift + mkdir -p "${dir}" + pushd "${dir}" >/dev/null + git init -b merged -q + git config user.email "test@example.com" + git config user.name "test" + git config core.hooksPath /dev/null + git commit --allow-empty -m "initial" -q + for level in approved unapproved none; do + git branch "${level}" 2>/dev/null || true + done + git checkout -q none + for f in "$@"; do + mkdir -p "$(dirname "${f}")" + echo "content" > "${f}" + done + git add -A + git commit --allow-empty -m "test-files" -q + popd >/dev/null +} + +# Run the script, capturing stdout and ignoring the exit code. +run_script() { + local dir="$1" + local integrity="${2:-none}" + local allowed_exts="${3:-}" + GH_AW_CACHE_DIR="${dir}" \ + GH_AW_MIN_INTEGRITY="${integrity}" \ + GH_AW_ALLOWED_EXTENSIONS="${allowed_exts}" \ + bash "${SCRIPT}" 2>&1 || true +} + +echo "Testing setup_cache_memory_git.sh — pre-agent sanitization" +echo "" + +# ── Test 1: Execute bits are stripped from restored files ──────────────────── +echo "Test 1: Execute bits are stripped unconditionally" +D="${WORKSPACE}/test1" +make_cache_dir "${D}" "script.sh" "data.json" +# Make files executable before the script runs +chmod +x "${D}/script.sh" "${D}/data.json" +run_script "${D}" none >/dev/null +assert "script.sh is not executable" "[ ! -x '${D}/script.sh' ]" +assert "data.json is not executable" "[ ! -x '${D}/data.json' ]" +assert "script.sh still exists" "[ -f '${D}/script.sh' ]" +assert "data.json still exists" "[ -f '${D}/data.json' ]" +echo "" + +# ── Test 2: .git directory files are NOT touched (sanity check) ────────────── +echo "Test 2: .git directory is not affected by chmod" +D="${WORKSPACE}/test2" +make_cache_dir "${D}" "file.txt" +HOOK_FILE="${D}/.git/hooks/pre-commit" +echo "#!/bin/bash" > "${HOOK_FILE}" +chmod +x "${HOOK_FILE}" +run_script "${D}" none >/dev/null +# The hook file cleanup happens earlier in the script but the .git dir itself is +# excluded from find. Verify find exclusion by checking the .git dir is intact. +assert ".git directory still exists" "[ -d '${D}/.git' ]" +echo "" + +# ── Test 3: No extension filter — all files kept when GH_AW_ALLOWED_EXTENSIONS is empty ─ +echo "Test 3: No extension filter when GH_AW_ALLOWED_EXTENSIONS is unset" +D="${WORKSPACE}/test3" +make_cache_dir "${D}" "file.json" "file.md" "helper.sh" "binary" +run_script "${D}" none "" +assert "file.json kept" "[ -f '${D}/file.json' ]" +assert "file.md kept" "[ -f '${D}/file.md' ]" +assert "helper.sh kept" "[ -f '${D}/helper.sh' ]" +assert "binary kept" "[ -f '${D}/binary' ]" +echo "" + +# ── Test 4: Extension filter removes disallowed files ──────────────────────── +echo "Test 4: Extension filter removes disallowed file types" +D="${WORKSPACE}/test4" +make_cache_dir "${D}" "data.json" "notes.md" "helper.sh" "archive.zip" +run_script "${D}" none ".json:.md" +assert "data.json kept" "[ -f '${D}/data.json' ]" +assert "notes.md kept" "[ -f '${D}/notes.md' ]" +assert "helper.sh removed" "[ ! -f '${D}/helper.sh' ]" +assert "archive.zip removed" "[ ! -f '${D}/archive.zip' ]" +echo "" + +# ── Test 5: Extension filter removes files without any extension ───────────── +echo "Test 5: Extension filter removes files with no extension" +D="${WORKSPACE}/test5" +make_cache_dir "${D}" "data.json" "noext" +run_script "${D}" none ".json" +assert "data.json kept" "[ -f '${D}/data.json' ]" +assert "noext removed" "[ ! -f '${D}/noext' ]" +echo "" + +# ── Test 6: Extension filter with single extension ─────────────────────────── +echo "Test 6: Extension filter with a single allowed extension" +D="${WORKSPACE}/test6" +make_cache_dir "${D}" "report.json" "notes.txt" "image.png" +run_script "${D}" none ".json" +assert "report.json kept" "[ -f '${D}/report.json' ]" +assert "notes.txt removed" "[ ! -f '${D}/notes.txt' ]" +assert "image.png removed" "[ ! -f '${D}/image.png' ]" +echo "" + +# ── Test 7: Execute bits stripped AND disallowed files removed together ─────── +echo "Test 7: Execute-bit stripping and extension filtering both apply" +D="${WORKSPACE}/test7" +make_cache_dir "${D}" "keep.json" "drop.sh" +chmod +x "${D}/keep.json" "${D}/drop.sh" +run_script "${D}" none ".json" +assert "keep.json exists" "[ -f '${D}/keep.json' ]" +assert "keep.json not executable" "[ ! -x '${D}/keep.json' ]" +assert "drop.sh removed" "[ ! -f '${D}/drop.sh' ]" +echo "" + +# ── Test 8: Extension matching is case-insensitive ─────────────────────────── +echo "Test 8: Extension matching is case-insensitive" +D="${WORKSPACE}/test8" +make_cache_dir "${D}" "data.json" "data.JSON" "notes.MD" +# Allow list uses lowercase; both .json and .JSON files, and .MD files, should be kept +run_script "${D}" none ".json:.md" +assert "data.json kept (exact match)" "[ -f '${D}/data.json' ]" +assert "data.JSON kept (uppercase file)" "[ -f '${D}/data.JSON' ]" +assert "notes.MD kept (uppercase file)" "[ -f '${D}/notes.MD' ]" +echo "" + +# ── Test 9: Whitespace in GH_AW_ALLOWED_EXTENSIONS is trimmed ──────────────── +echo "Test 9: Whitespace in allowed extensions list is trimmed" +D="${WORKSPACE}/test9" +make_cache_dir "${D}" "data.json" "note.md" "drop.sh" +# Extensions with leading/trailing spaces should still match +run_script "${D}" none " .json : .md " +assert "data.json kept (trimmed .json)" "[ -f '${D}/data.json' ]" +assert "note.md kept (trimmed .md)" "[ -f '${D}/note.md' ]" +assert "drop.sh removed" "[ ! -f '${D}/drop.sh' ]" +echo "" + +# ── Test 10: Symlinks are deleted unconditionally ──────────────────────────── +echo "Test 10: Symlinks in working tree are deleted" +D="${WORKSPACE}/test10" +make_cache_dir "${D}" "real.json" +# Plant a symlink (simulating a compromised prior run) +ln -s /etc/passwd "${D}/evil-link" +assert "symlink exists before script" "[ -L '${D}/evil-link' ]" +run_script "${D}" none >/dev/null +assert "symlink removed by script" "[ ! -L '${D}/evil-link' ]" +assert "real file still exists" "[ -f '${D}/real.json' ]" +echo "" + +# ── Test 11: Files with spaces in name are handled correctly ───────────────── +echo "Test 11: Files with spaces in names are handled correctly" +D="${WORKSPACE}/test11" +make_cache_dir "${D}" "my data.json" "my script.sh" +run_script "${D}" none ".json" +assert "file with space and .json kept" "[ -f '${D}/my data.json' ]" +assert "file with space and .sh removed" "[ ! -f '${D}/my script.sh' ]" +echo "" + +# ── Summary ────────────────────────────────────────────────────────────────── +echo "Tests passed: ${TESTS_PASSED}" +echo "Tests failed: ${TESTS_FAILED}" + +if [ "${TESTS_FAILED}" -gt 0 ]; then + exit 1 +fi + +echo "✓ All tests passed!" diff --git a/docs/adr/26587-pre-agent-cache-memory-working-tree-sanitization.md b/docs/adr/26587-pre-agent-cache-memory-working-tree-sanitization.md new file mode 100644 index 00000000000..985a764b2cc --- /dev/null +++ b/docs/adr/26587-pre-agent-cache-memory-working-tree-sanitization.md @@ -0,0 +1,93 @@ +# ADR-26587: Pre-Agent Cache-Memory Working-Tree Sanitization + +**Date**: 2026-04-16 +**Status**: Draft +**Deciders**: pelikhan, Copilot + +--- + +## Part 1 — Narrative (Human-Friendly) + +### Context + +The cache-memory system restores files from a prior agent run into a working directory before the current agent executes. With `integrity: none` (the default configuration), this restoration happens before any threat-detection gate fires. A compromised prior run could therefore plant executable scripts (e.g. `helper.sh`) or unexpected file types in the cache, which the next agent would encounter without any validation. The system needed a deterministic security gate that runs *after* the cache is restored but *before* the agent can access the working tree. + +### Decision + +We will extend the existing `setup_cache_memory_git.sh` shell script—which already runs in the correct position (after cache restore, before agent execution)—to perform two sanitization steps: (1) unconditionally strip execute bits from every working-tree file so planted executables cannot run, and (2) when `GH_AW_ALLOWED_EXTENSIONS` is configured (a colon-separated list passed by the Go code in `generateCacheMemoryGitSetupStep`), remove any file whose extension is not on the allowed list. This approach adds a zero-new-step security gate without restructuring workflow templates. + +### Alternatives Considered + +#### Alternative 1: Separate Dedicated "Sanitize Cache" Workflow Step + +A standalone step could be inserted into every generated workflow immediately after the cache-restore step. This would be visible in the YAML and independently auditable. However, it requires patching every generated workflow template, increases generated-YAML complexity, and creates a step-ordering dependency that is harder to enforce correctly across all cache configurations. Centralising the logic in the existing git-setup script avoids these problems. + +#### Alternative 2: Go-Level File System Sanitization + +The sanitization could be implemented in Go within `cache.go` at workflow-generation time or injected as a Go binary invoked from the step. This would keep all business logic in a single language and allow richer error handling. It was rejected because the shell script already executes in the exact lifecycle position required, and introducing a compiled Go binary just for chmod/rm operations would add build and distribution overhead with no material benefit over shell primitives. + +#### Alternative 3: Content-Based Threat Scanning + +Rather than filtering by permission bits or file extension, the gate could scan file content for known-malicious patterns (YARA rules, signature checks, etc.). This provides stronger detection for obfuscated threats but introduces false-positive risk, brittle pattern maintenance, and significant latency. The permission-strip + extension-filter approach provides meaningful defence-in-depth with deterministic, auditable behaviour and no false positives. + +#### Alternative 4: Rely on Existing Threat Detection + +The existing integrity and threat-detection pipeline could be trusted to catch malicious content. This was rejected because threat detection fires concurrently with or after agent execution—not unconditionally before it—so a planted executable could be invoked before detection fires. The pre-agent sanitization gate closes this race condition. + +### Consequences + +#### Positive +- Eliminates a class of supply-chain-style attacks where a compromised prior run plants executable files in cache-memory. +- The execute-bit strip provides unconditional defence even when no `allowed-extensions` list is configured. +- Extension filtering allows teams to explicitly declare a minimal allowed surface area for cached files. +- No additional workflow step is emitted; existing generated workflows are not structurally changed. +- Unit tests in `cache_integrity_test.go` cover the env-var emission contract, making the behaviour regression-safe. + +#### Negative +- `chmod a-x` runs on every cache-memory-enabled workflow unconditionally, even when no security concern exists—a small overhead on each run. +- Any legitimate workflow that relies on executable bits being preserved in cached files (unlikely given cache-memory is data-only by design) would break silently—files are de-executed without notification. +- The extension filtering uses a colon-separated string passed through an env var, which is a slightly brittle interface compared to a structured config file; misconfigured extension strings (e.g. missing leading dot) would silently allow all files. + +#### Neutral +- The sanitization is implemented in Bash within the existing setup script rather than as a new architectural component, keeping the footprint minimal. +- The `GH_AW_ALLOWED_EXTENSIONS` env var is only emitted when `AllowedExtensions` is non-empty; the default (all extensions allowed) requires no configuration change. +- The design naturally separates concerns: Go code owns the configuration contract, the shell script owns the filesystem operations. + +--- + +## Part 2 — Normative Specification (RFC 2119) + +> The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, **SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this section are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119). + +### Pre-Agent Sanitization Execution Order + +1. The sanitization gate **MUST** execute after the cache-restore step and before any agent step can access the cache working tree. +2. The sanitization gate **MUST NOT** be skipped or made conditional on the `GH_AW_MIN_INTEGRITY` level; it **MUST** run for all integrity configurations, including `none`. +3. The sanitization logic **MUST** be implemented in `setup_cache_memory_git.sh` or a script invoked by it, so that it runs in the same step as the git-setup work. + +### Execute-Bit Stripping + +1. Implementations **MUST** strip execute permissions (`chmod a-x`) from every file in the cache working tree, excluding files inside `.git/`. +2. Implementations **MUST NOT** require any configuration to enable execute-bit stripping; it **MUST** be active for all cache-memory workflows. +3. Implementations **SHOULD** log a confirmation message after stripping execute permissions so that audit logs confirm the step ran. + +### Extension-Based File Filtering + +1. When the `GH_AW_ALLOWED_EXTENSIONS` environment variable is set and non-empty, implementations **MUST** remove any working-tree file (excluding `.git/`) whose extension is not present in the colon-separated allowed list. +2. Extensions in `GH_AW_ALLOWED_EXTENSIONS` **MUST** include the leading dot (e.g. `.json`, `.md`); files with no extension are treated as if their extension is an empty string. +3. When `GH_AW_ALLOWED_EXTENSIONS` is absent or empty, implementations **MUST NOT** remove any files based on extension (all extensions are allowed). +4. Implementations **SHOULD** log each removed file path and its extension, and **SHOULD** log a final count of removed files. + +### Go-to-Shell Configuration Contract + +1. The Go code in `generateCacheMemoryGitSetupStep` **MUST** emit `GH_AW_ALLOWED_EXTENSIONS` as an environment variable to the git-setup step when `cache.AllowedExtensions` is non-empty. +2. The env-var value **MUST** be the `AllowedExtensions` slice joined with `:` as separator. +3. The Go code **MUST NOT** emit `GH_AW_ALLOWED_EXTENSIONS` when `AllowedExtensions` is nil or empty; the absence of the variable is the signal for "allow all extensions." + +### Conformance + +An implementation is considered conformant with this ADR if it satisfies all **MUST** and **MUST NOT** requirements above. Failure to meet any **MUST** or **MUST NOT** requirement constitutes non-conformance. + +--- + +*This is a DRAFT ADR generated by the [Design Decision Gate](https://github.com/github/gh-aw/actions/runs/24514620900) workflow. The PR author must review, complete, and finalize this document before the PR can merge.* diff --git a/pkg/testutil/spec_test.go b/pkg/testutil/spec_test.go index 63f49c4e1e5..c0cf356e2bc 100644 --- a/pkg/testutil/spec_test.go +++ b/pkg/testutil/spec_test.go @@ -134,8 +134,8 @@ func TestSpec_PublicAPI_StripYAMLCommentHeader(t *testing.T) { expected string }{ { - name: "strips leading comment block before YAML content", - input: "# Auto-generated by gh-aw\n# Do not edit manually\nruns-on: ubuntu-latest\n", + name: "strips leading comment block before YAML content", + input: "# Auto-generated by gh-aw\n# Do not edit manually\nruns-on: ubuntu-latest\n", expected: "runs-on: ubuntu-latest\n", }, { @@ -144,8 +144,8 @@ func TestSpec_PublicAPI_StripYAMLCommentHeader(t *testing.T) { expected: "runs-on: ubuntu-latest\n", }, { - name: "strips multi-line comment block before YAML document separator", - input: "# Header\n# More header\n---\njobs:\n build:\n runs-on: ubuntu-latest\n", + name: "strips multi-line comment block before YAML document separator", + input: "# Header\n# More header\n---\njobs:\n build:\n runs-on: ubuntu-latest\n", expected: "---\njobs:\n build:\n runs-on: ubuntu-latest\n", }, } diff --git a/pkg/workflow/cache.go b/pkg/workflow/cache.go index 50bee620744..2198a68a6cd 100644 --- a/pkg/workflow/cache.go +++ b/pkg/workflow/cache.go @@ -18,6 +18,24 @@ var cacheLog = logger.New("workflow:cache") // validCacheMemoryScopes defines the allowed values for cache-memory scope var validCacheMemoryScopes = []string{"workflow", "repo"} +// isValidFileExtension reports whether s is a valid file extension of the form ^\.[A-Za-z0-9]+$ +// (e.g. ".json", ".md"). This strict pattern prevents YAML injection when extensions are +// embedded in generated workflow YAML as single-quoted scalars. +func isValidFileExtension(s string) bool { + if len(s) < 2 || s[0] != '.' { + return false + } + for _, c := range s[1:] { + isLower := c >= 'a' && c <= 'z' + isUpper := c >= 'A' && c <= 'Z' + isDigit := c >= '0' && c <= '9' + if !isLower && !isUpper && !isDigit { + return false + } + } + return true +} + // CacheMemoryConfig holds configuration for cache-memory functionality type CacheMemoryConfig struct { Caches []CacheMemoryEntry `yaml:"caches,omitempty"` // cache configurations @@ -164,6 +182,11 @@ func parseCacheMemoryEntry(cacheMap map[string]any, defaultID string) (CacheMemo entry.AllowedExtensions = make([]string, 0, len(extArray)) for _, ext := range extArray { if extStr, ok := ext.(string); ok { + // Validate: must be of the form ^\.[A-Za-z0-9]+$ to prevent YAML injection + // and ensure the shell sanitization script handles them correctly. + if !isValidFileExtension(extStr) { + return entry, fmt.Errorf("invalid allowed-extension %q: must start with '.' followed by alphanumeric characters only (e.g. .json)", extStr) + } entry.AllowedExtensions = append(entry.AllowedExtensions, extStr) } } @@ -512,6 +535,9 @@ func generateCacheMemorySteps(builder *strings.Builder, data *WorkflowData) { // generateCacheMemoryGitSetupStep emits a pre-agent step that sets up the git-backed integrity // repository inside the given cache directory. It must run after the cache is restored so that // any previous git history is available for the merge-down step. +// The step also performs pre-agent security sanitization: it strips execute bits from all +// working-tree files and, when allowed extensions are configured, removes files with +// disallowed extensions before the agent can access them. func generateCacheMemoryGitSetupStep(builder *strings.Builder, cache CacheMemoryEntry, cacheDir, integrityLevel string, useBackwardCompatiblePaths bool) { if useBackwardCompatiblePaths { builder.WriteString(" - name: Setup cache-memory git repository\n") @@ -521,6 +547,14 @@ func generateCacheMemoryGitSetupStep(builder *strings.Builder, cache CacheMemory builder.WriteString(" env:\n") fmt.Fprintf(builder, " GH_AW_CACHE_DIR: %s\n", cacheDir) fmt.Fprintf(builder, " GH_AW_MIN_INTEGRITY: %s\n", integrityLevel) + // Pass colon-separated allowed extensions so the setup script can remove disallowed files + // before the agent runs (pre-agent sanitization). Skip when the list is empty (allow all). + // Single quotes in the value are escaped ('' in YAML single-quoted scalars) as defense-in-depth, + // even though isValidFileExtension already rejects values containing single quotes at parse time. + if len(cache.AllowedExtensions) > 0 { + escaped := strings.ReplaceAll(strings.Join(cache.AllowedExtensions, ":"), "'", "''") + fmt.Fprintf(builder, " GH_AW_ALLOWED_EXTENSIONS: '%s'\n", escaped) + } builder.WriteString(" run: bash \"${RUNNER_TEMP}/gh-aw/actions/setup_cache_memory_git.sh\"\n") } diff --git a/pkg/workflow/cache_integrity_test.go b/pkg/workflow/cache_integrity_test.go index b9b40b73da8..2dfa6594552 100644 --- a/pkg/workflow/cache_integrity_test.go +++ b/pkg/workflow/cache_integrity_test.go @@ -535,3 +535,166 @@ func TestCacheMemoryGitCommitSteps_RestoreOnlySkipped(t *testing.T) { assert.Empty(t, output, "Restore-only caches should not generate a git commit step") } + +// TestCacheMemoryGitSetupStep_AllowedExtensionsEnvVar verifies that the git setup step +// emits GH_AW_ALLOWED_EXTENSIONS when allowed extensions are configured, and omits it +// when the extensions list is empty (all allowed). +func TestCacheMemoryGitSetupStep_AllowedExtensionsEnvVar(t *testing.T) { + tests := []struct { + name string + allowedExtensions []string + expectEnvVar bool + expectedValue string + }{ + { + name: "empty extensions (all allowed) - no env var", + allowedExtensions: []string{}, + expectEnvVar: false, + }, + { + name: "nil extensions (all allowed) - no env var", + allowedExtensions: nil, + expectEnvVar: false, + }, + { + name: "specific extensions - env var emitted", + allowedExtensions: []string{".json", ".md", ".txt"}, + expectEnvVar: true, + expectedValue: "GH_AW_ALLOWED_EXTENSIONS: '.json:.md:.txt'", + }, + { + name: "single extension - env var emitted", + allowedExtensions: []string{".json"}, + expectEnvVar: true, + expectedValue: "GH_AW_ALLOWED_EXTENSIONS: '.json'", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cache := CacheMemoryEntry{ + ID: "default", + AllowedExtensions: tt.allowedExtensions, + } + var builder strings.Builder + generateCacheMemoryGitSetupStep(&builder, cache, "/tmp/gh-aw/cache-memory", "none", true) + output := builder.String() + + if tt.expectEnvVar { + assert.Contains(t, output, tt.expectedValue, + "Should emit GH_AW_ALLOWED_EXTENSIONS with colon-separated extensions") + } else { + assert.NotContains(t, output, "GH_AW_ALLOWED_EXTENSIONS", + "Should NOT emit GH_AW_ALLOWED_EXTENSIONS when all extensions are allowed") + } + + // Always verify the other env vars are present + assert.Contains(t, output, "GH_AW_CACHE_DIR: /tmp/gh-aw/cache-memory", + "Should always set cache dir env var") + assert.Contains(t, output, "GH_AW_MIN_INTEGRITY: none", + "Should always set integrity level env var") + }) + } +} + +// TestCacheMemorySteps_PreAgentSanitizationEnvVar verifies that generateCacheMemorySteps +// emits GH_AW_ALLOWED_EXTENSIONS in the git setup step when allowed extensions are configured. +func TestCacheMemorySteps_PreAgentSanitizationEnvVar(t *testing.T) { + toolsMap := map[string]any{ + "cache-memory": map[string]any{ + "allowed-extensions": []any{".json", ".md", ".txt"}, + }, + } + + toolsConfig, err := ParseToolsConfig(toolsMap) + require.NoError(t, err, "Should parse tools config") + + compiler := NewCompiler() + cacheMemoryConfig, err := compiler.extractCacheMemoryConfig(toolsConfig) + require.NoError(t, err, "Should extract cache-memory config") + + data := &WorkflowData{ + CacheMemoryConfig: cacheMemoryConfig, + } + + var builder strings.Builder + generateCacheMemorySteps(&builder, data) + output := builder.String() + + assert.Contains(t, output, "GH_AW_ALLOWED_EXTENSIONS: '.json:.md:.txt'", + "Should pass allowed extensions to git setup step for pre-agent sanitization") +} + +// TestCacheMemorySteps_NoExtensionsNoSanitizationEnvVar verifies that the default +// empty extensions list (all allowed) does NOT emit GH_AW_ALLOWED_EXTENSIONS, +// so the sanitization step does not remove any files. +func TestCacheMemorySteps_NoExtensionsNoSanitizationEnvVar(t *testing.T) { + toolsMap := map[string]any{ + "cache-memory": true, + } + + toolsConfig, err := ParseToolsConfig(toolsMap) + require.NoError(t, err, "Should parse tools config") + + compiler := NewCompiler() + cacheMemoryConfig, err := compiler.extractCacheMemoryConfig(toolsConfig) + require.NoError(t, err, "Should extract cache-memory config") + + data := &WorkflowData{ + CacheMemoryConfig: cacheMemoryConfig, + } + + var builder strings.Builder + generateCacheMemorySteps(&builder, data) + output := builder.String() + + assert.NotContains(t, output, "GH_AW_ALLOWED_EXTENSIONS", + "Should NOT pass GH_AW_ALLOWED_EXTENSIONS when all extensions are allowed (empty list)") +} + +// TestCacheMemoryAllowedExtensions_ValidationAndEscaping verifies that: +// - Valid extensions (e.g. ".json", ".md") are accepted. +// - Extensions not matching ^\.[A-Za-z0-9]+$ (e.g. containing single quotes, spaces, +// or missing a leading dot) are rejected with a descriptive error. +// - When emitting the env var, single quotes in extension values are doubled (”) for +// safe embedding in YAML single-quoted scalars. +func TestCacheMemoryAllowedExtensions_ValidationAndEscaping(t *testing.T) { + t.Run("isValidFileExtension helper", func(t *testing.T) { + valid := []string{".json", ".md", ".txt", ".csv", ".JSON", ".MD", ".1"} + for _, ext := range valid { + assert.True(t, isValidFileExtension(ext), "Expected %q to be valid", ext) + } + invalid := []string{"json", ".json.bak", ".json ", ".", ".json'evil", ".json\nevil", "", "."} + for _, ext := range invalid { + assert.False(t, isValidFileExtension(ext), "Expected %q to be invalid", ext) + } + }) + + t.Run("invalid extension rejected at parse time", func(t *testing.T) { + toolsMap := map[string]any{ + "cache-memory": map[string]any{ + "allowed-extensions": []any{".json", "no-leading-dot"}, + }, + } + toolsConfig, err := ParseToolsConfig(toolsMap) + require.NoError(t, err, "Should parse tools config") + compiler := NewCompiler() + _, err = compiler.extractCacheMemoryConfig(toolsConfig) + require.Error(t, err, "Should reject invalid extension at parse time") + assert.Contains(t, err.Error(), "no-leading-dot", "Error should identify the bad value") + }) + + t.Run("single-quote escaping in emitted YAML", func(t *testing.T) { + // This bypasses parse-time validation to confirm the emit escaping is independent. + cache := CacheMemoryEntry{ + ID: "default", + AllowedExtensions: []string{".json", ".it'"}, + } + var builder strings.Builder + generateCacheMemoryGitSetupStep(&builder, cache, "/tmp/gh-aw/cache-memory", "none", true) + output := builder.String() + // Single quote must be doubled for safe YAML single-quoted scalar embedding + assert.Contains(t, output, "GH_AW_ALLOWED_EXTENSIONS: '.json:.it'''", + "Single quotes in extension values must be escaped as '' in YAML output") + }) +}