From e2f82b9639210f53a4578a7c70ef4d783f1691bf Mon Sep 17 00:00:00 2001 From: Ganesh Patil <7030871503ganeshpatil@gmail.com> Date: Thu, 15 Jan 2026 16:32:32 +0530 Subject: [PATCH 1/5] Fix regex pattern in regex_replace_posix_groups The regex pattern was using (\d*) which matches zero or more digits, causing a lone backslash to be incorrectly converted to ${}. Changed to (\d+) which requires at least one digit, ensuring only actual POSIX capture group references like \1, \2, etc. are converted to , . Added unit tests to verify the correct behavior. --- .../functions/src/regex/regexpreplace.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index b5ab46f0ec71..e3ce83f4ef82 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -193,7 +193,7 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result { /// used by regexp_replace fn regex_replace_posix_groups(replacement: &str) -> String { static CAPTURE_GROUPS_RE_LOCK: LazyLock = - LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap()); + LazyLock::new(|| Regex::new(r"(\\)(\d+)").unwrap()); CAPTURE_GROUPS_RE_LOCK .replace_all(replacement, "$${$2}") .into_owned() @@ -659,6 +659,23 @@ mod tests { use super::*; + #[test] + fn test_regex_replace_posix_groups() { + // Test that \1, \2, etc. are replaced with ${1}, ${2}, etc. + assert_eq!(regex_replace_posix_groups(r"\1"), "${1}"); + assert_eq!(regex_replace_posix_groups(r"\12"), "${12}"); + assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y"); + assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}"); + + // Test that a lone backslash is NOT replaced (requires at least one digit) + assert_eq!(regex_replace_posix_groups(r"\"), r"\"); + assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar"); + + // Test that backslash followed by non-digit is preserved + assert_eq!(regex_replace_posix_groups(r"\n"), r"\n"); + assert_eq!(regex_replace_posix_groups(r"\t"), r"\t"); + } + macro_rules! static_pattern_regexp_replace { ($name:ident, $T:ty, $O:ty) => { #[test] From c5a2dff81a5d90ebb18a855dd57ebbd0cefa5581 Mon Sep 17 00:00:00 2001 From: Ganesh Patil <7030871503ganeshpatil@gmail.com> Date: Thu, 15 Jan 2026 18:03:24 +0530 Subject: [PATCH 2/5] Fix regex pattern to handle both single and double backslash escapes The regex pattern was only matching single backslash (\1) but SQL string literals with escaped backslashes pass through as double backslash (\\1). Changed the pattern from (\\)(\d+) to \\{1,2}(\d+) to match 1 or 2 backslashes followed by digits, ensuring proper handling of SQL escaped backreferences like 'X\\1Y'. Added unit tests for the double backslash case. --- datafusion/functions/src/regex/regexpreplace.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index e3ce83f4ef82..a96b0d5d68a0 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -189,13 +189,15 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result { } } -/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1}) +/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like ${1}) /// used by regexp_replace +/// Handles both single backslash (\1) and double backslash (\\1) which can occur +/// when SQL strings with escaped backslashes are passed through fn regex_replace_posix_groups(replacement: &str) -> String { static CAPTURE_GROUPS_RE_LOCK: LazyLock = - LazyLock::new(|| Regex::new(r"(\\)(\d+)").unwrap()); + LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap()); CAPTURE_GROUPS_RE_LOCK - .replace_all(replacement, "$${$2}") + .replace_all(replacement, "$${$1}") .into_owned() } @@ -667,6 +669,11 @@ mod tests { assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y"); assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}"); + // Test double backslash (from SQL escaped strings like '\\1') + assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}"); + assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y"); + assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}"); + // Test that a lone backslash is NOT replaced (requires at least one digit) assert_eq!(regex_replace_posix_groups(r"\"), r"\"); assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar"); From 502488e2cbc740ff8eaca936cf3bb6f29bc04793 Mon Sep 17 00:00:00 2001 From: Ganesh Patil <7030871503ganeshpatil@gmail.com> Date: Fri, 16 Jan 2026 17:02:03 +0530 Subject: [PATCH 3/5] Add test cases for 3 and 4 backslashes before digits --- datafusion/functions/src/regex/regexpreplace.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index a96b0d5d68a0..9a4f2219b4d1 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -674,6 +674,11 @@ mod tests { assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y"); assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}"); + // Test 3 or 4 backslashes before digits to document expected behavior + assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}"); + assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}"); + assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}"); + // Test that a lone backslash is NOT replaced (requires at least one digit) assert_eq!(regex_replace_posix_groups(r"\"), r"\"); assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar"); From d41f9bf8aaea4fbdcfc9feaa923d9c1c9abc5b7e Mon Sep 17 00:00:00 2001 From: Ganesh Patil <7030871503ganeshpatil@gmail.com> Date: Tue, 20 Jan 2026 07:32:44 +0530 Subject: [PATCH 4/5] Add test and documentation for \0 behavior in regex_replace_posix_groups --- datafusion/functions/src/regex/regexpreplace.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 9a4f2219b4d1..6505f8d8abcc 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -193,6 +193,10 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result { /// used by regexp_replace /// Handles both single backslash (\1) and double backslash (\\1) which can occur /// when SQL strings with escaped backslashes are passed through +/// +/// Note: \0 is converted to ${0}, which in Rust's regex replacement syntax +/// substitutes the entire match. This is consistent with POSIX behavior where +/// \0 (or &) refers to the entire matched string. fn regex_replace_posix_groups(replacement: &str) -> String { static CAPTURE_GROUPS_RE_LOCK: LazyLock = LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap()); @@ -686,6 +690,12 @@ mod tests { // Test that backslash followed by non-digit is preserved assert_eq!(regex_replace_posix_groups(r"\n"), r"\n"); assert_eq!(regex_replace_posix_groups(r"\t"), r"\t"); + + // Test \0 behavior: \0 is converted to ${0}, which in Rust's regex + // replacement syntax substitutes the entire match. This is consistent + // with POSIX behavior where \0 (or &) refers to the entire matched string. + assert_eq!(regex_replace_posix_groups(r"\0"), "${0}"); + assert_eq!(regex_replace_posix_groups(r"prefix\0suffix"), "prefix${0}suffix"); } macro_rules! static_pattern_regexp_replace { From 9cae656541e0a80aff377161bcfa26b6b84ceb28 Mon Sep 17 00:00:00 2001 From: Ganesh Patil <7030871503ganeshpatil@gmail.com> Date: Tue, 20 Jan 2026 09:20:47 +0530 Subject: [PATCH 5/5] Fix cargo fmt formatting for assert_eq --- datafusion/functions/src/regex/regexpreplace.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 6505f8d8abcc..68e324e21c89 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -695,7 +695,10 @@ mod tests { // replacement syntax substitutes the entire match. This is consistent // with POSIX behavior where \0 (or &) refers to the entire matched string. assert_eq!(regex_replace_posix_groups(r"\0"), "${0}"); - assert_eq!(regex_replace_posix_groups(r"prefix\0suffix"), "prefix${0}suffix"); + assert_eq!( + regex_replace_posix_groups(r"prefix\0suffix"), + "prefix${0}suffix" + ); } macro_rules! static_pattern_regexp_replace {