diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index aa4bd862e09e4..2c050fddc413f 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -16,13 +16,47 @@ # under the License. name: Dev -on: [push, pull_request] +on: [push, pull_request, workflow_dispatch] concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true jobs: + breaking-changes: + name: Breaking Changes Detection + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + permissions: + contents: read + pull-requests: write + issues: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + # Need full history to access base branch for git diff operations + # GitHub Actions only fetches single commit by default (fetch-depth: 1) + # Our script uses: git diff "$BASE_REF"..HEAD which requires base branch + fetch-depth: 0 + + - name: Run breaking changes detection + shell: bash + run: | + chmod +x ./ci/scripts/detect-breaking-changes.sh + ./ci/scripts/detect-breaking-changes.sh + timeout-minutes: 5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload breaking changes report + if: failure() + uses: actions/upload-artifact@v4 + with: + name: breaking-changes-report + path: breaking-changes-report.md + retention-days: 30 + license-header-check: runs-on: ubuntu-latest name: Check License Header @@ -35,6 +69,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history + token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/setup-node@v4 with: node-version: "20" diff --git a/ci/scripts/README.md b/ci/scripts/README.md new file mode 100644 index 0000000000000..147ed266072af --- /dev/null +++ b/ci/scripts/README.md @@ -0,0 +1,49 @@ +# CI Scripts + +This directory contains scripts used in the DataFusion CI/CD pipeline. + +## Breaking Changes Detection + +### `detect-breaking-changes.sh` + +Automatically detects potential breaking changes in DataFusion pull requests using git-based analysis. + +**Features:** +- Detects changes to LogicalPlan enums and variants +- Monitors DataFrame API modifications +- Checks SQL parser keyword changes +- Generates detailed reports with diff snippets +- Integrates with GitHub to post PR comments +- Lightweight git-based analysis (no compilation required) + +**Usage:** +```bash +# Run from repository root +./ci/scripts/detect-breaking-changes.sh + +# With custom base ref +GITHUB_BASE_REF=v42.0.0 ./ci/scripts/detect-breaking-changes.sh +``` + +**Environment Variables:** +- `GITHUB_BASE_REF`: Base branch/tag to compare against (default: `main`) +- `GITHUB_HEAD_REF`: Head ref being tested (default: `HEAD`) +- `GITHUB_TOKEN`: Required for posting PR comments +- `GITHUB_ENV`: Set by GitHub Actions for CI integration + +**Output:** +- Exit code 0: No breaking changes detected +- Exit code 1: Breaking changes detected +- Generates `breaking-changes-report.md` when issues found + +### `breaking-changes-report.md` + +Template/example of the report generated when breaking changes are detected. Contains: +- Summary of detected changes +- Links to DataFusion API stability guidelines +- Required actions for PR authors +- Approval requirements + +## Integration + +The breaking changes detection is integrated into the GitHub Actions workflow (`.github/workflows/dev.yml`) and runs automatically on all pull requests. \ No newline at end of file diff --git a/ci/scripts/detect-breaking-changes.sh b/ci/scripts/detect-breaking-changes.sh new file mode 100755 index 0000000000000..288a78b39ff60 --- /dev/null +++ b/ci/scripts/detect-breaking-changes.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +set -euo pipefail # Add -u for undefined variables and -o pipefail for pipe failures + +# Ensure we're in the repository root +if [ ! -f "Cargo.toml" ] || [ ! -d ".git" ]; then + echo "❌ Error: This script must be run from the repository root directory" + echo "Current directory: $(pwd)" + echo "Expected: A directory containing Cargo.toml and .git" + exit 1 +fi + +BASE_REF="${GITHUB_BASE_REF:-main}" +HEAD_REF="${GITHUB_HEAD_REF:-HEAD}" + +# In GitHub Actions PR context, use the proper base reference +if [ -n "$GITHUB_BASE_REF" ]; then + # For PRs, GitHub provides GITHUB_BASE_REF (e.g., "main") + # We need to reference it as origin/$GITHUB_BASE_REF + BASE_REF="origin/$GITHUB_BASE_REF" +fi + +echo "🔍 DataFusion Breaking Changes Detection" +echo "Comparing against: $BASE_REF" +echo "Current ref: $HEAD_REF" + +# Skip installation - we're only using git-based analysis +echo "📋 Using git-based analysis for breaking change detection..." + +detect_datafusion_breaking_changes() { + local breaking_changes_found=1 # 1 = false in bash + + echo "📋 Checking DataFusion-specific API rules..." + + # Validate that we can access git and the base ref exists + if ! git rev-parse --verify "$BASE_REF" >/dev/null 2>&1; then + echo "⚠️ Warning: Base ref '$BASE_REF' not found, skipping git-based analysis" + return 1 # No breaking changes (can't detect them) + fi + + # Only run git-based checks if we have a valid base ref + if [ -n "$BASE_REF" ]; then + echo "Checking for Rust source file changes..." + if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "\.rs$"; then + echo "Rust files modified - running detailed checks..." + + echo "Checking LogicalPlan stability..." + if check_logical_plan_changes; then + echo "❌ Breaking changes detected in LogicalPlan" + breaking_changes_found=0 + else + echo "✅ No LogicalPlan breaking changes detected" + fi + + echo "Checking DataFrame API..." + if check_dataframe_api_changes; then + echo "❌ Breaking changes detected in DataFrame API" + breaking_changes_found=0 + else + echo "✅ No DataFrame API breaking changes detected" + fi + + echo "Checking SQL parser compatibility..." + if check_sql_parser_changes; then + echo "❌ Breaking changes detected in SQL parser" + breaking_changes_found=0 + else + echo "✅ No SQL parser breaking changes detected" + fi + else + echo "✅ No Rust source files modified - no breaking changes possible" + fi + else + echo "⚠️ Skipping git-based checks (no base ref available)" + fi + + return $breaking_changes_found +} + +check_logical_plan_changes() { + # Check for changes in LogicalPlan files using git diff + if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "(logical_plan|logical.*expr)"; then + echo " LogicalPlan-related files modified" + + # Check for enum variant removals (potential breaking change) + if git diff "$BASE_REF"..HEAD 2>/dev/null | grep -qE "^-.*enum.*LogicalPlan|^-.*pub.*enum"; then + echo " Enum definitions modified" + return 0 # Breaking change detected + fi + + # Check for removed enum variants + if git diff "$BASE_REF"..HEAD 2>/dev/null | grep -qE "^-\s*[A-Z][a-zA-Z]*\s*[\(\{,]"; then + echo " Enum variants potentially removed" + return 0 # Breaking change detected + fi + fi + + return 1 # No breaking changes +} + +check_dataframe_api_changes() { + # Check if DataFrame public methods were changed + if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "dataframe.*\.rs$"; then + echo " DataFrame files modified, checking for API changes..." + + # Check for ANY changes to public methods (additions, removals, modifications) + # Only look at actual DataFrame source files, not test files + if git diff "$BASE_REF"..HEAD 2>/dev/null -- "datafusion/core/src/dataframe/" | grep -qE "^[-+].*pub fn"; then + echo " Public DataFrame API methods modified" + return 0 # Breaking change detected + fi + fi + + return 1 # No breaking changes +} + +check_sql_parser_changes() { + # Check for SQL keyword removals + if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "keywords"; then + if git diff "$BASE_REF"..HEAD 2>/dev/null | grep -qE "^-.*,"; then + echo " SQL keywords potentially removed" + return 0 # Breaking change detected + fi + fi + + return 1 # No breaking changes +} + +generate_breaking_changes_report() { + local output_file="breaking-changes-report.md" + local timestamp=$(date -u +"%Y-%m-%d %H:%M:%S UTC") + + cat > "$output_file" << EOF +# 🚨 Breaking Changes Report + +**Generated:** $timestamp +**Base Ref:** $BASE_REF +**Head Ref:** $HEAD_REF + +## Summary +Breaking changes detected in this PR that require the \`api-change\` label. + +## DataFusion API Stability Guidelines +Per the [API Health Policy](https://datafusion.apache.org/contributor-guide/specification/api-health-policy.html): + +EOF + + echo "### Git-Based Analysis:" >> "$output_file" + + # Only add git-based analysis if we have a base ref + if [ -n "$BASE_REF" ]; then + local changes_found=false + + # Check for DataFrame API changes with details - EXCLUDE script files + if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "dataframe.*\.rs$"; then + # Only look at .rs files in DataFrame source directories + local dataframe_diff=$(git diff "$BASE_REF"..HEAD 2>/dev/null -- "datafusion/core/src/dataframe/" | grep -E "^[-+].*pub fn") + if [ -n "$dataframe_diff" ]; then + echo "- ⚠️ **DataFrame API changes detected:**" >> "$output_file" + echo "\`\`\`diff" >> "$output_file" + echo "$dataframe_diff" | head -10 >> "$output_file" + echo "\`\`\`" >> "$output_file" + echo "" >> "$output_file" + changes_found=true + fi + fi + + # Check for LogicalPlan changes with details - EXCLUDE script files + local logical_plan_files=$(git diff "$BASE_REF" --name-only 2>/dev/null | grep -E "(logical_plan|logical.*expr).*\.rs$") + if [ -n "$logical_plan_files" ]; then + echo "- ⚠️ **LogicalPlan changes detected in:**" >> "$output_file" + echo "$logical_plan_files" | sed 's/^/ - /' >> "$output_file" + echo "" >> "$output_file" + changes_found=true + fi + + # Show general public API changes - ONLY in .rs files, EXCLUDE ci/scripts and tests + local api_changes=$(git diff "$BASE_REF"..HEAD 2>/dev/null -- "datafusion/" "ballista/" | grep -E "^[-+].*pub (fn|struct|enum)" | grep -v "/tests/") + if [ -n "$api_changes" ]; then + echo "- ⚠️ **Public API signature changes:**" >> "$output_file" + echo "\`\`\`diff" >> "$output_file" + echo "$api_changes" | head -10 >> "$output_file" + echo "\`\`\`" >> "$output_file" + echo "" >> "$output_file" + changes_found=true + fi + + if [ "$changes_found" = false ]; then + echo "- ⚠️ Breaking changes detected but specific patterns not identified" >> "$output_file" + echo "- Review the modified files below for potential API changes" >> "$output_file" + fi + + # Add list of changed SOURCE files only + echo "" >> "$output_file" + echo "### Source Files Modified:" >> "$output_file" + git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -E "\.rs$" | head -20 | sed 's/^/- /' >> "$output_file" || { + echo "- No Rust source files modified" >> "$output_file" + } + + echo "" >> "$output_file" + echo "### All Files Modified:" >> "$output_file" + git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | head -20 | sed 's/^/- /' >> "$output_file" + else + echo "- ⚠️ Git-based analysis skipped (no base ref available)" >> "$output_file" + fi + + cat >> "$output_file" << EOF + +## Required Actions: +1. Add the \`api-change\` label to this PR +2. Update CHANGELOG.md with breaking change details +3. Consider adding deprecation warnings before removal +4. Update migration guide if needed + +## Approval Requirements: +- Breaking changes require approval from a DataFusion maintainer +- Consider if this change is necessary or if a deprecation path exists + +## Note: +This analysis uses git-based pattern detection to identify common breaking change patterns. +Review the specific changes carefully to determine if they truly constitute breaking changes. +EOF + + echo "📋 Report generated: $output_file" +} + +main() { + if detect_datafusion_breaking_changes; then + echo "" + echo "🚨 FINAL RESULT: Breaking changes detected!" + # Only set GITHUB_ENV if it exists (in CI) + if [ -n "$GITHUB_ENV" ]; then + echo "BREAKING_CHANGES_DETECTED=true" >> "$GITHUB_ENV" + fi + + generate_breaking_changes_report + + # Only try to comment if we have GitHub CLI and are in a PR context + if command -v gh >/dev/null 2>&1 && [ -n "$GITHUB_TOKEN" ] && [ -n "$GITHUB_REPOSITORY" ] && [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then + echo "💬 Adding PR comment..." + gh pr comment --body-file breaking-changes-report.md || { + echo "⚠️ Could not add PR comment, but report was generated" + } + else + echo "⚠️ Skipping PR comment (not in PR context or missing GitHub CLI)" + fi + + exit 1 + else + echo "" + echo "🎉 FINAL RESULT: No breaking changes detected" + # Only set GITHUB_ENV if it exists (in CI) + if [ -n "$GITHUB_ENV" ]; then + echo "BREAKING_CHANGES_DETECTED=false" >> "$GITHUB_ENV" + fi + fi +} + +main "$@" \ No newline at end of file