diff --git a/pkg/workflow/compiler.go b/pkg/workflow/compiler.go index fe8584c31e3..7ef27ec662c 100644 --- a/pkg/workflow/compiler.go +++ b/pkg/workflow/compiler.go @@ -26,6 +26,14 @@ const ( // This includes environment variable values, if conditions, and other expression contexts // See: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration MaxExpressionSize = 21000 // 21KB in bytes + + // MaxPromptChunkSize is the target maximum size in bytes (indent and newline included) of each chunk when splitting prompt text (20KB) + // This limit is meant to keep each heredoc block under GitHub Actions step size limits (21KB); a single oversized line or the final chunk can still exceed it + MaxPromptChunkSize = 20000 // 20KB limit for each chunk + + // MaxPromptChunks is the maximum number of chunks allowed when splitting prompt text + // This caps the number of heredoc blocks emitted for extremely large prompt texts; lines beyond the cap go into the last chunk + MaxPromptChunks = 5 // Maximum number of chunks ) //go:embed schemas/github-workflow.json diff --git a/pkg/workflow/sh.go b/pkg/workflow/sh.go index bd50f5cac63..9f93dc732cd 100644 --- a/pkg/workflow/sh.go +++ b/pkg/workflow/sh.go @@ -59,12 +59,67 @@ func WriteShellScriptToYAML(yaml *strings.Builder, script string, indent string) } } -// WritePromptTextToYAML writes prompt text to a YAML heredoc with proper indentation +// WritePromptTextToYAML writes prompt text to yaml as one or more shell heredocs that append to $GITHUB_AW_PROMPT, prefixing every emitted line with indent. +// It splits the text on newlines and groups the lines into at most MaxPromptChunks chunks of at most MaxPromptChunkSize bytes each (indent and newline included); no text is dropped, so a single oversized line or the final chunk may exceed that size. +// Each chunk is written as a separate heredoc to avoid GitHub Actions step size limits (21KB). 
func WritePromptTextToYAML(yaml *strings.Builder, text string, indent string) { - yaml.WriteString(indent + "cat >> $GITHUB_AW_PROMPT << 'EOF'\n") textLines := strings.Split(text, "\n") - for _, line := range textLines { - fmt.Fprintf(yaml, "%s%s\n", indent, line) + chunks := chunkLines(textLines, indent, MaxPromptChunkSize, MaxPromptChunks) + + // Write each chunk as a separate heredoc; the quoted 'EOF' delimiter keeps the shell from expanding the prompt text + for _, chunk := range chunks { + yaml.WriteString(indent + "cat >> $GITHUB_AW_PROMPT << 'EOF'\n") + for _, line := range chunk { + fmt.Fprintf(yaml, "%s%s\n", indent, line) + } + yaml.WriteString(indent + "EOF\n") + } +} + +// chunkLines groups lines, in order, into chunks whose total size (len(indent) + len(line) + 1 per line) is at most maxSize; a line that is larger than maxSize on its own is still kept whole and makes its chunk exceed maxSize. +// Returns at most maxChunks chunks. Nothing is truncated: once the cap is reached, all remaining lines are appended to the last chunk, which may then exceed maxSize. Empty input yields one empty chunk. +func chunkLines(lines []string, indent string, maxSize int, maxChunks int) [][]string { + if len(lines) == 0 { + return [][]string{{}} + } + + var chunks [][]string + var currentChunk []string + currentSize := 0 + + for _, line := range lines { + // Calculate size including indent and newline + lineSize := len(indent) + len(line) + 1 + + // If adding this line would exceed the limit, start a new chunk (never split before the first line of a chunk) + if currentSize+lineSize > maxSize && len(currentChunk) > 0 { + // Check if we've reached the maximum number of chunks + if len(chunks) >= maxChunks-1 { + // We're at the last allowed chunk, so keep adding lines to the current chunk even though it exceeds maxSize + currentChunk = append(currentChunk, line) + currentSize += lineSize + continue + } + + // Start a new chunk + chunks = append(chunks, currentChunk) + currentChunk = []string{line} + currentSize = lineSize + } else { + currentChunk = append(currentChunk, line) + currentSize += lineSize + } + } + + // Add the last chunk if there's content + if len(currentChunk) > 0 { + chunks = append(chunks, currentChunk) } - yaml.WriteString(indent + "EOF\n") + + // Defensive fallback: not reachable for non-empty input, since every loop iteration leaves currentChunk non-empty + if len(chunks) == 0 { + 
return [][]string{{}} + } + + return chunks } diff --git a/pkg/workflow/sh_integration_test.go b/pkg/workflow/sh_integration_test.go new file mode 100644 index 00000000000..14b81fe2171 --- /dev/null +++ b/pkg/workflow/sh_integration_test.go @@ -0,0 +1,369 @@ +package workflow + +import ( + "strings" + "testing" +) + +// TestWritePromptTextToYAML_IntegrationWithCompiler verifies that WritePromptTextToYAML +// correctly handles large prompt text that would be used in actual workflow compilation. +// This test simulates what would happen if an embedded prompt file was very large. +func TestWritePromptTextToYAML_IntegrationWithCompiler(t *testing.T) { + // Create a realistic scenario: a very long help text or documentation + // that might be included as prompt instructions + section := strings.Repeat("This is an important instruction line that provides guidance to the AI agent on how to perform its task correctly. ", 10) + + // Create 200 lines to ensure we exceed 20KB + lines := make([]string, 200) + for i := range lines { + lines[i] = section + } + largePromptText := strings.Join(lines, "\n") + + // Calculate total size + totalSize := len(largePromptText) + if totalSize < 20000 { + t.Fatalf("Test setup error: prompt text should be at least 20000 bytes, got %d", totalSize) + } + + var yaml strings.Builder + indent := " " // Standard indent used in workflow generation + + // Call the function as it would be called in real compilation + WritePromptTextToYAML(&yaml, largePromptText, indent) + + result := yaml.String() + + // Verify multiple heredoc blocks were created + heredocCount := strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") + if heredocCount < 2 { + t.Errorf("Expected multiple heredoc blocks for large text (%d bytes), got %d", totalSize, heredocCount) + } + + // Verify we didn't exceed 5 chunks + if heredocCount > 5 { + t.Errorf("Expected at most 5 heredoc blocks (max limit), got %d", heredocCount) + } + + // Verify each heredoc is closed + eofCount 
:= strings.Count(result, indent+"EOF") + if eofCount != heredocCount { + t.Errorf("Expected %d EOF markers to match %d heredoc blocks, got %d", heredocCount, heredocCount, eofCount) + } + + // Verify the content is preserved (check first and last sections) + firstSection := section[:100] + lastSection := section[len(section)-100:] + if !strings.Contains(result, firstSection) { + t.Error("Expected to find beginning of original text in output") + } + if !strings.Contains(result, lastSection) { + t.Error("Expected to find end of original text in output") + } + + // Verify the YAML structure is valid (basic check) + if !strings.Contains(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") { + t.Error("Expected proper heredoc syntax in output") + } + + t.Logf("Successfully chunked %d bytes into %d heredoc blocks", totalSize, heredocCount) + + // Verify no lines are lost - extract content from heredoc blocks and compare + extractedLines := extractLinesFromYAML(result, indent) + originalLines := strings.Split(largePromptText, "\n") + + if len(extractedLines) != len(originalLines) { + t.Errorf("Line count mismatch: expected %d lines, got %d lines", len(originalLines), len(extractedLines)) + } + + // Verify content integrity by checking line-by-line + mismatchCount := 0 + for i := 0; i < len(originalLines) && i < len(extractedLines); i++ { + if originalLines[i] != extractedLines[i] { + mismatchCount++ + if mismatchCount <= 3 { // Only report first 3 mismatches + t.Errorf("Line %d mismatch:\nExpected: %q\nGot: %q", i+1, originalLines[i], extractedLines[i]) + } + } + } + + if mismatchCount > 0 { + t.Errorf("Total line mismatches: %d", mismatchCount) + } +} + +// TestWritePromptTextToYAML_RealWorldSizeSimulation simulates various real-world scenarios +// to ensure chunking works correctly across different text sizes. 
+func TestWritePromptTextToYAML_RealWorldSizeSimulation(t *testing.T) { + tests := []struct { + name string + textSize int // approximate size in bytes + linesCount int // number of lines + expectedChunks int // minimum number of chunks expected + maxChunks int // should not exceed this + }{ + { + name: "small prompt (< 1KB)", + textSize: 500, + linesCount: 10, + expectedChunks: 1, + maxChunks: 1, + }, + { + name: "medium prompt (~10KB)", + textSize: 10000, + linesCount: 100, + expectedChunks: 1, + maxChunks: 1, + }, + { + name: "large prompt (~25KB)", + textSize: 25000, + linesCount: 250, + expectedChunks: 2, + maxChunks: 2, + }, + { + name: "very large prompt (~50KB)", + textSize: 50000, + linesCount: 500, + expectedChunks: 3, + maxChunks: 3, + }, + { + name: "extremely large prompt (~120KB)", + textSize: 120000, + linesCount: 1200, + expectedChunks: 5, + maxChunks: 5, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create text of approximately the desired size + // Total size = linesCount * (lineSize + 1) - 1 (no trailing newline), so the text ends up slightly larger than textSize + lineSize := (tt.textSize + 1) / tt.linesCount // Approximate per-line length; the newline byte is not subtracted + if lineSize < 1 { + lineSize = 1 + } + line := strings.Repeat("x", lineSize) + lines := make([]string, tt.linesCount) + for i := range lines { + lines[i] = line + } + text := strings.Join(lines, "\n") + + var yaml strings.Builder + indent := " " + + WritePromptTextToYAML(&yaml, text, indent) + + result := yaml.String() + heredocCount := strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") + + if heredocCount < tt.expectedChunks { + t.Errorf("Expected at least %d chunks for %s, got %d", tt.expectedChunks, tt.name, heredocCount) + } + + if heredocCount > tt.maxChunks { + t.Errorf("Expected at most %d chunks for %s, got %d", tt.maxChunks, tt.name, heredocCount) + } + + eofCount := strings.Count(result, indent+"EOF") + if eofCount != heredocCount { + t.Errorf("EOF count (%d) doesn't match heredoc count (%d) for 
%s", eofCount, heredocCount, tt.name) + } + + t.Logf("%s: %d bytes chunked into %d blocks", tt.name, len(text), heredocCount) + + // Verify no lines are lost + extractedLines := extractLinesFromYAML(result, indent) + originalLines := strings.Split(text, "\n") + + if len(extractedLines) != len(originalLines) { + t.Errorf("%s: Line count mismatch - expected %d lines, got %d lines", tt.name, len(originalLines), len(extractedLines)) + } + }) + } +} + +// extractLinesFromYAML extracts the actual content lines from a YAML heredoc output +// by parsing the heredoc blocks and removing the indent; the lines of all blocks are returned in order as one slice +func extractLinesFromYAML(yamlOutput string, indent string) []string { + var lines []string + inHeredoc := false + + for _, line := range strings.Split(yamlOutput, "\n") { + // Check if we're starting a heredoc block + if strings.Contains(line, "cat >> $GITHUB_AW_PROMPT << 'EOF'") { + inHeredoc = true + continue + } + + // Check if we're ending a heredoc block (a content line that is only "EOF" would also match) + if strings.TrimSpace(line) == "EOF" { + inHeredoc = false + continue + } + + // If we're in a heredoc block, extract the content line + if inHeredoc { + // Remove the indent from the line; lines without the indent prefix are dropped + if strings.HasPrefix(line, indent) { + contentLine := strings.TrimPrefix(line, indent) + lines = append(lines, contentLine) + } + } + } + + return lines +} + +// TestWritePromptTextToYAML_NoDataLoss verifies that no lines or chunks are lost +// during the chunking process, even with edge cases. 
+func TestWritePromptTextToYAML_NoDataLoss(t *testing.T) { + tests := []struct { + name string + lines []string + expectLoss bool // false in every case; not read by the test body + }{ + { + name: "single line", + lines: []string{"Single line of text"}, + expectLoss: false, + }, + { + name: "multiple short lines", + lines: []string{"Line 1", "Line 2", "Line 3", "Line 4", "Line 5"}, + expectLoss: false, + }, + { + name: "empty lines", + lines: []string{"Line 1", "", "Line 3", "", "Line 5"}, + expectLoss: false, + }, + { + name: "very long single line", + lines: []string{strings.Repeat("x", 25000)}, + expectLoss: false, + }, + { + name: "exactly at chunk boundary", + lines: func() []string { + // Create 180 lines of 100 chars, which stays just under 20000 bytes once indent and newline are counted + line := strings.Repeat("x", 100) + lines := make([]string, 180) + for i := range lines { + lines[i] = line + } + return lines + }(), + expectLoss: false, + }, + { + name: "large number of lines requiring max chunks", + lines: func() []string { + line := strings.Repeat("y", 1000) + lines := make([]string, 600) + for i := range lines { + lines[i] = line + } + return lines + }(), + expectLoss: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + text := strings.Join(tt.lines, "\n") + var yaml strings.Builder + indent := " " + + WritePromptTextToYAML(&yaml, text, indent) + + result := yaml.String() + + // Extract lines from the YAML output + extractedLines := extractLinesFromYAML(result, indent) + + // Verify line count + if len(extractedLines) != len(tt.lines) { + t.Errorf("Line count mismatch: expected %d lines, got %d lines", len(tt.lines), len(extractedLines)) + t.Logf("Original lines: %d", len(tt.lines)) + t.Logf("Extracted lines: %d", len(extractedLines)) + } + + // Verify content integrity + mismatchCount := 0 + for i := 0; i < len(tt.lines) && i < len(extractedLines); i++ { + if tt.lines[i] != extractedLines[i] { + mismatchCount++ + if mismatchCount <= 3 { + t.Errorf("Line %d mismatch:\nExpected: %q\nGot: %q", i+1, tt.lines[i], 
extractedLines[i]) + } + } + } + + if mismatchCount > 0 { + t.Errorf("Total line mismatches: %d", mismatchCount) + } + }) + } +} + +// TestWritePromptTextToYAML_ChunkIntegrity verifies that chunks are properly formed +// and that the chunking process maintains data integrity. +func TestWritePromptTextToYAML_ChunkIntegrity(t *testing.T) { + // Create a large text that will require multiple chunks + line := strings.Repeat("Test line with some content. ", 50) + lines := make([]string, 300) + for i := range lines { + lines[i] = line + } + text := strings.Join(lines, "\n") + + var yaml strings.Builder + indent := " " + + WritePromptTextToYAML(&yaml, text, indent) + + result := yaml.String() + + // Count heredoc blocks + heredocCount := strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") + + t.Logf("Created %d heredoc blocks for %d lines (%d bytes)", heredocCount, len(lines), len(text)) + + // Verify we have multiple chunks but not exceeding max + if heredocCount < 2 { + t.Errorf("Expected multiple chunks for large text, got %d", heredocCount) + } + + if heredocCount > MaxPromptChunks { + t.Errorf("Expected at most %d chunks, got %d", MaxPromptChunks, heredocCount) + } + + // Verify all heredocs are properly closed + eofCount := strings.Count(result, indent+"EOF") + if eofCount != heredocCount { + t.Errorf("Heredoc closure mismatch: %d opens, %d closes", heredocCount, eofCount) + } + + // Verify no data loss + extractedLines := extractLinesFromYAML(result, indent) + if len(extractedLines) != len(lines) { + t.Errorf("Line count mismatch: expected %d, got %d", len(lines), len(extractedLines)) + } + + // Verify content integrity by checking samples at fixed positions (first, quarter points, last) + sampleIndices := []int{0, len(lines) / 4, len(lines) / 2, len(lines) * 3 / 4, len(lines) - 1} + for _, idx := range sampleIndices { + if idx < len(lines) && idx < len(extractedLines) { + if lines[idx] != extractedLines[idx] { + t.Errorf("Content mismatch at line %d:\nExpected: %q\nGot: %q", idx+1, 
lines[idx], extractedLines[idx]) + } + } + } +} diff --git a/pkg/workflow/sh_test.go b/pkg/workflow/sh_test.go new file mode 100644 index 00000000000..73bec34a15f --- /dev/null +++ b/pkg/workflow/sh_test.go @@ -0,0 +1,287 @@ +package workflow + +import ( + "strings" + "testing" +) + +func TestWritePromptTextToYAML_SmallText(t *testing.T) { + var yaml strings.Builder + text := "This is a small text\nWith a few lines\nThat doesn't need chunking" + indent := " " + + WritePromptTextToYAML(&yaml, text, indent) + + result := yaml.String() + + // Should have exactly one heredoc block + if strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") != 1 { + t.Errorf("Expected 1 heredoc block for small text, got %d", strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'")) + } + + // Should contain all original lines + if !strings.Contains(result, "This is a small text") { + t.Error("Expected to find original text in output") + } + if !strings.Contains(result, "With a few lines") { + t.Error("Expected to find original text in output") + } + if !strings.Contains(result, "That doesn't need chunking") { + t.Error("Expected to find original text in output") + } + + // Should have proper EOF markers + if strings.Count(result, indent+"EOF") != 1 { + t.Errorf("Expected 1 EOF marker, got %d", strings.Count(result, indent+"EOF")) + } +} + +func TestWritePromptTextToYAML_LargeText(t *testing.T) { + var yaml strings.Builder + // Create text that exceeds 20000 characters + longLine := strings.Repeat("This is a very long line of content that will be repeated many times to exceed the character limit. 
", 10) + lines := make([]string, 50) + for i := range lines { + lines[i] = longLine + } + text := strings.Join(lines, "\n") + indent := " " + + // Calculate expected size + totalSize := 0 + for _, line := range lines { + totalSize += len(indent) + len(line) + 1 + } + + // This should create multiple chunks since each line is ~1000 chars and we have 50 lines + WritePromptTextToYAML(&yaml, text, indent) + + result := yaml.String() + + // Should have multiple heredoc blocks + heredocCount := strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") + if heredocCount < 2 { + t.Errorf("Expected at least 2 heredoc blocks for large text (total size ~%d bytes), got %d", totalSize, heredocCount) + } + + // Should not exceed 5 chunks (max limit) + if heredocCount > 5 { + t.Errorf("Expected at most 5 heredoc blocks, got %d", heredocCount) + } + + // Should have matching EOF markers + eofCount := strings.Count(result, indent+"EOF") + if eofCount != heredocCount { + t.Errorf("Expected %d EOF markers to match %d heredoc blocks, got %d", heredocCount, heredocCount, eofCount) + } + + // Should contain the beginning of the original content (only the first 50 bytes are checked) + firstLine := strings.Split(text, "\n")[0] + if !strings.Contains(result, firstLine[:50]) { + t.Error("Expected to find beginning of original text in output") + } +} + +func TestWritePromptTextToYAML_ExactChunkBoundary(t *testing.T) { + var yaml strings.Builder + indent := " " + + // Create text that's exactly at the 20000 character boundary + // Each line: indent (10) + line (100) + newline (1) = 111 bytes + // 180 lines = 19,980 bytes (just under 20000) + line := strings.Repeat("x", 100) + lines := make([]string, 180) + for i := range lines { + lines[i] = line + } + text := strings.Join(lines, "\n") + + WritePromptTextToYAML(&yaml, text, indent) + + result := yaml.String() + + // Should have exactly 1 heredoc block since we're just under the limit + heredocCount := strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") 
+ if heredocCount != 1 { + t.Errorf("Expected 1 heredoc block for text just under limit, got %d", heredocCount) + } +} + +func TestWritePromptTextToYAML_MaxChunksLimit(t *testing.T) { + var yaml strings.Builder + indent := " " + + // Create text that would need more than 5 chunks (if we allowed it) + // Each line: indent (10) + line (1000) + newline (1) = 1011 bytes + // 600 lines = ~606,600 bytes + // At 20000 bytes per chunk, this would need ~31 chunks, but we limit to 5 + line := strings.Repeat("y", 1000) + lines := make([]string, 600) + for i := range lines { + lines[i] = line + } + text := strings.Join(lines, "\n") + + WritePromptTextToYAML(&yaml, text, indent) + + result := yaml.String() + + // Should have exactly 5 heredoc blocks (the maximum) + heredocCount := strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") + if heredocCount != 5 { + t.Errorf("Expected exactly 5 heredoc blocks (max limit), got %d", heredocCount) + } + + // Should have matching EOF markers + eofCount := strings.Count(result, indent+"EOF") + if eofCount != 5 { + t.Errorf("Expected 5 EOF markers, got %d", eofCount) + } +} + +func TestWritePromptTextToYAML_EmptyText(t *testing.T) { + var yaml strings.Builder + text := "" + indent := " " + + WritePromptTextToYAML(&yaml, text, indent) + + result := yaml.String() + + // Should have at least one heredoc block (even for empty text) + if strings.Count(result, "cat >> $GITHUB_AW_PROMPT << 'EOF'") < 1 { + t.Error("Expected at least 1 heredoc block even for empty text") + } + + // Should have matching EOF markers + if strings.Count(result, indent+"EOF") < 1 { + t.Error("Expected at least 1 EOF marker") + } +} + +func TestChunkLines_SmallInput(t *testing.T) { + lines := []string{"line1", "line2", "line3"} + indent := " " + maxSize := 20000 + maxChunks := 5 + + chunks := chunkLines(lines, indent, maxSize, maxChunks) + + if len(chunks) != 1 { + t.Errorf("Expected 1 chunk for small input, got %d", len(chunks)) + } + + if len(chunks[0]) != 3 { + 
t.Errorf("Expected chunk to contain 3 lines, got %d", len(chunks[0])) + } +} + +func TestChunkLines_ExceedsSize(t *testing.T) { + // Create lines that will exceed maxSize + line := strings.Repeat("x", 1000) + lines := make([]string, 50) + for i := range lines { + lines[i] = line + } + + indent := " " + maxSize := 20000 + maxChunks := 5 + + chunks := chunkLines(lines, indent, maxSize, maxChunks) + + // Should have multiple chunks + if len(chunks) < 2 { + t.Errorf("Expected at least 2 chunks, got %d", len(chunks)) + } + + // Verify each chunk (except possibly the last) stays within size limit + for i, chunk := range chunks { + size := 0 + for _, line := range chunk { + size += len(indent) + len(line) + 1 + } + + // Last chunk might exceed if we hit maxChunks limit + if i < len(chunks)-1 && size > maxSize { + t.Errorf("Chunk %d exceeds size limit: %d > %d", i, size, maxSize) + } + } + + // Verify total lines are preserved + totalLines := 0 + for _, chunk := range chunks { + totalLines += len(chunk) + } + if totalLines != len(lines) { + t.Errorf("Expected %d total lines, got %d", len(lines), totalLines) + } +} + +func TestChunkLines_MaxChunksEnforced(t *testing.T) { + // Create many lines that would need more than maxChunks + line := strings.Repeat("x", 1000) + lines := make([]string, 600) + for i := range lines { + lines[i] = line + } + + indent := " " + maxSize := 20000 + maxChunks := 5 + + chunks := chunkLines(lines, indent, maxSize, maxChunks) + + // Should have exactly maxChunks + if len(chunks) != maxChunks { + t.Errorf("Expected exactly %d chunks (max limit), got %d", maxChunks, len(chunks)) + } + + // Verify all lines are included (even if last chunk is large) + totalLines := 0 + for _, chunk := range chunks { + totalLines += len(chunk) + } + if totalLines != len(lines) { + t.Errorf("Expected %d total lines, got %d", len(lines), totalLines) + } +} + +func TestChunkLines_EmptyInput(t *testing.T) { + lines := []string{} + indent := " " + maxSize := 20000 + 
maxChunks := 5 + + chunks := chunkLines(lines, indent, maxSize, maxChunks) + + // Should return at least one empty chunk + if len(chunks) != 1 { + t.Errorf("Expected 1 chunk for empty input, got %d", len(chunks)) + } + + if len(chunks[0]) != 0 { + t.Errorf("Expected empty chunk, got %d lines", len(chunks[0])) + } +} + +func TestChunkLines_SingleLineExceedsLimit(t *testing.T) { + // Single line that exceeds maxSize + line := strings.Repeat("x", 25000) + lines := []string{line} + + indent := " " + maxSize := 20000 + maxChunks := 5 + + chunks := chunkLines(lines, indent, maxSize, maxChunks) + + // Should still have one chunk with that single line + if len(chunks) != 1 { + t.Errorf("Expected 1 chunk, got %d", len(chunks)) + } + + if len(chunks[0]) != 1 { + t.Errorf("Expected 1 line in chunk, got %d", len(chunks[0])) + } +}