From 1e5b557459c82383af65090404750032abbad599 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Tue, 30 Dec 2025 18:08:54 +0800 Subject: [PATCH 1/5] auto fix for all non-functional tests in dev/rust_lint.sh --- ci/scripts/doc_prettier_check.sh | 85 ++++++++++++++------- ci/scripts/license_header.sh | 54 ++++++++++++- ci/scripts/rust_clippy.sh | 59 ++++++++++++++- ci/scripts/rust_docs.sh | 1 + ci/scripts/rust_fmt.sh | 51 ++++++++++++- ci/scripts/rust_toml_fmt.sh | 55 ++++++++++++-- ci/scripts/typos_check.sh | 55 +++++++++++++- ci/scripts/utils/git.sh | 27 +++++++ dev/rust_lint.sh | 125 ++++++++++++++++++++++++------- 9 files changed, 441 insertions(+), 71 deletions(-) create mode 100644 ci/scripts/utils/git.sh diff --git a/ci/scripts/doc_prettier_check.sh b/ci/scripts/doc_prettier_check.sh index d94a0d1c9617..f074ab928355 100755 --- a/ci/scripts/doc_prettier_check.sh +++ b/ci/scripts/doc_prettier_check.sh @@ -17,41 +17,68 @@ # specific language governing permissions and limitations # under the License. -SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" - -MODE="--check" -ACTION="Checking" -if [ $# -gt 0 ]; then - if [ "$1" = "--write" ]; then - MODE="--write" - ACTION="Formatting" - else - echo "Usage: $0 [--write]" >&2 - exit 1 - fi +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")" +PRETTIER_VERSION="2.7.1" +PRETTIER_TARGETS=( + '{datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md' + '!datafusion/CHANGELOG.md' + README.md + CONTRIBUTING.md +) + +source "${SCRIPT_DIR}/utils/git.sh" + +MODE="check" +ALLOW_DIRTY=0 + +usage() { + cat >&2 </dev/null 2>&1; then echo "npx is required to run the prettier check. Install Node.js (e.g., brew install node) and re-run." >&2 exit 1 fi - -# Ignore subproject CHANGELOG.md because it is machine generated -npx prettier@2.7.1 $MODE \ - '{datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md' \ - '!datafusion/CHANGELOG.md' \ - README.md \ - CONTRIBUTING.md -status=$? - -if [ $status -ne 0 ]; then - if [ "$MODE" = "--check" ]; then - echo "Prettier check failed. Re-run with --write (e.g., ./ci/scripts/doc_prettier_check.sh --write) to format files, commit the changes, and re-run the check." >&2 - else - echo "Prettier format failed. Files may have been modified; commit any changes and re-run." >&2 - fi - exit $status + +PRETTIER_MODE=(--check) +if [[ "$MODE" == "write" ]]; then + PRETTIER_MODE=(--write) fi + +# Ignore subproject CHANGELOG.md because it is machine generated +npx "prettier@${PRETTIER_VERSION}" "${PRETTIER_MODE[@]}" "${PRETTIER_TARGETS[@]}" diff --git a/ci/scripts/license_header.sh b/ci/scripts/license_header.sh index 5345728f9cdf..4894e4727a36 100755 --- a/ci/scripts/license_header.sh +++ b/ci/scripts/license_header.sh @@ -17,6 +17,54 @@ # specific language governing permissions and limitations # under the License. -# Check Apache license header -set -ex -hawkeye check --config licenserc.toml +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")" + +source "${SCRIPT_DIR}/utils/git.sh" + +MODE="check" +ALLOW_DIRTY=0 +HAWKEYE_CONFIG="licenserc.toml" + +usage() { + cat >&2 <&2 <&2 <&2 <&2 <&2 + return 1 + fi +} diff --git a/dev/rust_lint.sh b/dev/rust_lint.sh index 21d461184641..19ad522c5ce6 100755 --- a/dev/rust_lint.sh +++ b/dev/rust_lint.sh @@ -23,30 +23,103 @@ # Note: The installed checking tools (e.g., taplo) are not guaranteed to match # the CI versions for simplicity, there might be some minor differences. Check # `.github/workflows` for the CI versions. +# +# +# +# For each lint scripts: +# +# By default, they run in check mode: +# ./ci/scripts/rust_fmt.sh +# +# With `--write`, scripts perform best-effort auto fixes: +# ./ci/scripts/rust_fmt.sh --write +# +# The `--write` flag assumes a clean git repository (no uncommitted changes); to force +# auto fixes even if there are unstaged changes, use `--allow-dirty`: +# ./ci/scripts/rust_fmt.sh --write --allow-dirty +# +# New scripts can use `rust_fmt.sh` as a reference. + +set -euo pipefail + +usage() { + cat >&2 < /dev/null; then + echo "Installing $cmd using: $install_cmd" + eval "$install_cmd" + fi +} + +MODE="check" +ALLOW_DIRTY=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --write) + MODE="write" + ;; + --allow-dirty) + ALLOW_DIRTY=1 + ;; + -h|--help) + usage + ;; + *) + usage + ;; + esac + shift +done + +SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")" + +ensure_tool "taplo" "cargo install taplo-cli" +ensure_tool "hawkeye" "cargo install hawkeye --locked" +ensure_tool "typos" "cargo install typos-cli --locked" + +run_step() { + local name="$1" + shift + echo "[${SCRIPT_NAME}] Running ${name}" + "$@" +} + +declare -a WRITE_STEPS=( + "ci/scripts/rust_fmt.sh|true" + "ci/scripts/rust_clippy.sh|true" + "ci/scripts/rust_toml_fmt.sh|true" + "ci/scripts/license_header.sh|true" + "ci/scripts/typos_check.sh|true" + "ci/scripts/doc_prettier_check.sh|true" +) + +declare -a READONLY_STEPS=( + "ci/scripts/rust_docs.sh|false" +) -# For `.toml` format checking -set -e -if ! command -v taplo &> /dev/null; then - echo "Installing taplo using cargo" - cargo install taplo-cli -fi - -# For Apache licence header checking -if ! command -v hawkeye &> /dev/null; then - echo "Installing hawkeye using cargo" - cargo install hawkeye --locked -fi - -# For spelling checks -if ! command -v typos &> /dev/null; then - echo "Installing typos using cargo" - cargo install typos-cli --locked -fi - -ci/scripts/rust_fmt.sh -ci/scripts/rust_clippy.sh -ci/scripts/rust_toml_fmt.sh -ci/scripts/rust_docs.sh -ci/scripts/license_header.sh -ci/scripts/typos_check.sh -ci/scripts/doc_prettier_check.sh +for entry in "${WRITE_STEPS[@]}" "${READONLY_STEPS[@]}"; do + IFS='|' read -r script_path supports_write <<<"$entry" + script_name="$(basename "$script_path")" + args=() + if [[ "$supports_write" == "true" && "$MODE" == "write" ]]; then + args+=(--write) + [[ $ALLOW_DIRTY -eq 1 ]] && args+=(--allow-dirty) + fi + if [[ ${#args[@]} -gt 0 ]]; then + run_step "$script_name" "$script_path" "${args[@]}" + else + run_step "$script_name" "$script_path" + fi +done From 8c99417929fcf133423dd3392f1939ab13a0bc93 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Tue, 30 Dec 2025 18:29:39 +0800 Subject: [PATCH 2/5] Introduce all possible lint violations --- .../physical-expr-common/src/physical_expr.rs | 19 +------------------ datafusion/physical-expr/src/physical_expr.rs | 9 ++++++++- datafusion/sql/Cargo.toml | 2 +- docs/source/user-guide/configs.md | 2 +- 4 files changed, 11 insertions(+), 21 deletions(-) diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 2358a2194091..750cb8395deb 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -1,20 +1,3 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - use std::any::Any; use std::fmt; use std::fmt::{Debug, Display, Formatter}; @@ -98,7 +81,7 @@ pub trait PhysicalExpr: Any + Send + Sync + Display + Debug + DynEq + DynHash { /// /// Returns an `Err` if the expression could not be evaluated or if the length of the /// `selection` validity array and the number of row in `batch` is not equal. - fn evaluate_selection( + fn evaluate_selection( &self, batch: &RecordBatch, selection: &BooleanArray, diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index e750bfd79d77..42a7fc4eef3c 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -99,7 +99,7 @@ pub fn physical_exprs_bag_equal( /// /// # Returns /// -/// A vector of lexicographic orderings for physical execution, or an error if +/// A vector of lexicographic orderings for physical execution, or an error iif /// the transformation fails. /// /// # Examples @@ -474,4 +474,11 @@ mod tests { assert!(!stable_composite.is_volatile_node()); assert!(!is_volatile(&stable_composite)); // No volatile children } + + #[test] + fn test_clone_on_copy_violation() { + let value: usize = 42; + let duplicate = value.clone(); // clippy::clone_on_copy (auto-fixable) + assert_eq!(value, duplicate); + } } diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index a814292a3d71..69d41678d566 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -20,7 +20,7 @@ name = "datafusion-sql" description = "DataFusion SQL Query Planner" keywords = ["datafusion", "sql", "parser", "planner"] readme = "README.md" -version = { workspace = true } +version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } repository = { workspace = true } diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index c9222afe8ceb..3c2265e256d1 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -66,7 +66,7 @@ The following configuration settings are available: | key | default | description | | ----------------------------------------------------------------------- | ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. | -| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | +| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | | datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | | datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | | datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | From dbfc3188425cecf5a4dc9ee054547057242a731c Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Tue, 30 Dec 2025 18:50:15 +0800 Subject: [PATCH 3/5] fix license_header.sh bug --- ci/scripts/license_header.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/scripts/license_header.sh b/ci/scripts/license_header.sh index 4894e4727a36..7ab8c9637598 100755 --- a/ci/scripts/license_header.sh +++ b/ci/scripts/license_header.sh @@ -63,7 +63,15 @@ fi if [[ "$MODE" == "write" ]]; then echo "[${SCRIPT_NAME}] \`hawkeye format --config ${HAWKEYE_CONFIG}\`" - hawkeye format --config "${HAWKEYE_CONFIG}" + if ! hawkeye format --config "${HAWKEYE_CONFIG}"; then + status=$? + # hawkeye returns exit code 1 when it applies fixes; treat that as success. + if [[ $status -eq 1 ]]; then + echo "[${SCRIPT_NAME}] hawkeye format applied fixes (exit 1 treated as success)" + else + exit $status + fi + fi else echo "[${SCRIPT_NAME}] \`hawkeye check --config ${HAWKEYE_CONFIG}\`" hawkeye check --config "${HAWKEYE_CONFIG}" From f499533f430c4dd69320b3257464741c7f295277 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Tue, 30 Dec 2025 18:55:54 +0800 Subject: [PATCH 4/5] auto-fix violations --- .../physical-expr-common/src/physical_expr.rs | 19 ++++++++++++++++++- datafusion/physical-expr/src/physical_expr.rs | 9 +-------- datafusion/sql/Cargo.toml | 2 +- docs/source/user-guide/configs.md | 2 +- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 750cb8395deb..2358a2194091 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::any::Any; use std::fmt; use std::fmt::{Debug, Display, Formatter}; @@ -81,7 +98,7 @@ pub trait PhysicalExpr: Any + Send + Sync + Display + Debug + DynEq + DynHash { /// /// Returns an `Err` if the expression could not be evaluated or if the length of the /// `selection` validity array and the number of row in `batch` is not equal. - fn evaluate_selection( + fn evaluate_selection( &self, batch: &RecordBatch, selection: &BooleanArray, diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index 42a7fc4eef3c..e750bfd79d77 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -99,7 +99,7 @@ pub fn physical_exprs_bag_equal( /// /// # Returns /// -/// A vector of lexicographic orderings for physical execution, or an error iif +/// A vector of lexicographic orderings for physical execution, or an error if /// the transformation fails. /// /// # Examples @@ -474,11 +474,4 @@ mod tests { assert!(!stable_composite.is_volatile_node()); assert!(!is_volatile(&stable_composite)); // No volatile children } - - #[test] - fn test_clone_on_copy_violation() { - let value: usize = 42; - let duplicate = value.clone(); // clippy::clone_on_copy (auto-fixable) - assert_eq!(value, duplicate); - } } diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index 69d41678d566..a814292a3d71 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -20,7 +20,7 @@ name = "datafusion-sql" description = "DataFusion SQL Query Planner" keywords = ["datafusion", "sql", "parser", "planner"] readme = "README.md" -version = { workspace = true } +version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } repository = { workspace = true } diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 3c2265e256d1..c9222afe8ceb 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -66,7 +66,7 @@ The following configuration settings are available: | key | default | description | | ----------------------------------------------------------------------- | ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. | -| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | +| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | | datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | | datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | | datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | From ae7fc7007e405e9ed034a45f5fb117053e7b5a66 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Wed, 31 Dec 2025 18:22:22 +0800 Subject: [PATCH 5/5] review --- ci/scripts/rust_clippy.sh | 14 +++++++------- ci/scripts/rust_fmt.sh | 2 +- dev/rust_lint.sh | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/scripts/rust_clippy.sh b/ci/scripts/rust_clippy.sh index d15a8742424f..f8b5c0852fa3 100755 --- a/ci/scripts/rust_clippy.sh +++ b/ci/scripts/rust_clippy.sh @@ -63,14 +63,14 @@ if [[ "$MODE" == "write" && $ALLOW_DIRTY -eq 0 ]]; then require_clean_work_tree "$SCRIPT_NAME" || exit 1 fi +CLIPPY_CMD=(cargo clippy) if [[ "$MODE" == "write" ]]; then - ALLOW_DIRTY_ARGS=() + CLIPPY_CMD+=(--fix) if [[ $ALLOW_DIRTY -eq 1 ]]; then - ALLOW_DIRTY_ARGS+=(--allow-dirty --allow-staged) + CLIPPY_CMD+=(--allow-dirty --allow-staged) fi - echo "[${SCRIPT_NAME}] \`cargo clippy --fix --all-targets --workspace --features ${CLIPPY_FEATURES} -- -D warnings\`" - cargo clippy --fix "${ALLOW_DIRTY_ARGS[@]}" "${CLIPPY_ARGS[@]}" "${CLIPPY_LINT_ARGS[@]}" -else - echo "[${SCRIPT_NAME}] \`cargo clippy --all-targets --workspace --features ${CLIPPY_FEATURES} -- -D warnings\`" - cargo clippy "${CLIPPY_ARGS[@]}" "${CLIPPY_LINT_ARGS[@]}" fi +CLIPPY_CMD+=("${CLIPPY_ARGS[@]}" "${CLIPPY_LINT_ARGS[@]}") + +echo "[${SCRIPT_NAME}] \`${CLIPPY_CMD[*]}\`" +"${CLIPPY_CMD[@]}" diff --git a/ci/scripts/rust_fmt.sh b/ci/scripts/rust_fmt.sh index b72e9d0afa65..16c87cea5e0f 100755 --- a/ci/scripts/rust_fmt.sh +++ b/ci/scripts/rust_fmt.sh @@ -65,4 +65,4 @@ if [[ "$MODE" == "write" ]]; then else echo "[${SCRIPT_NAME}] \`cargo fmt --all -- --check\`" cargo fmt --all -- --check -fi \ No newline at end of file +fi diff --git a/dev/rust_lint.sh b/dev/rust_lint.sh index 19ad522c5ce6..43d29bd88166 100755 --- a/dev/rust_lint.sh +++ b/dev/rust_lint.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -85,7 +85,7 @@ done SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")" -ensure_tool "taplo" "cargo install taplo-cli" +ensure_tool "taplo" "cargo install taplo-cli --locked" ensure_tool "hawkeye" "cargo install hawkeye --locked" ensure_tool "typos" "cargo install typos-cli --locked"