From 16bff7e8f5a023fa98ac5cb6595250639475ceef Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Tue, 25 Mar 2025 14:21:27 -0400 Subject: [PATCH 01/10] Scan files for secrets in _upload_file_to_s3 --- tests/ci/s3_helper.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 86656e6e7c0c..d83258205599 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -6,6 +6,7 @@ from multiprocessing.dummy import Pool from pathlib import Path from typing import Any, List, Union +import os import boto3 # type: ignore import botocore # type: ignore @@ -19,6 +20,31 @@ S3_URL, ) +sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*") +sensitive_strings = {var: value for var, value in os.environ.items() + if sensitive_var_pattern.match(var)} + +def scan_file_for_sensitive_data(file_content, file_name): + """ + Scan the content of a file for sensitive strings. + Raises ValueError if any sensitive values are found. + """ + matches = [] + for line_number, line in enumerate(file_content.splitlines(), start=1): + for match in sensitive_var_pattern.finditer(line): + matches.append((file_name, line_number, match.group(0))) + for name, value in sensitive_strings.items(): + if value in line: + matches.append((file_name, line_number, f"SECRET[{name}]")) + + if not matches: + return + + logging.error(f"Sensitive values found in {file_name}") + for file_name, line_number, match in matches: + logging.error(f"{file_name}:{line_number}: {match}") + + raise ValueError(f"Sensitive values found in {file_name}") def _flatten_list(lst): result = [] @@ -45,6 +71,8 @@ def __init__(self, client: Any = None, endpoint: str = S3_URL): def _upload_file_to_s3( self, bucket_name: str, file_path: Path, s3_path: str ) -> str: + logging.debug("Checking %s for sensitive values", file_path) + scan_file_for_sensitive_data(file_path.read_text(), file_path.name) logging.debug( "Start uploading %s to bucket=%s path=%s", file_path, bucket_name, s3_path ) From 33cea7b56e7b25fff0773804792c151406e13920 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Wed, 26 Mar 2025 10:11:01 -0400 Subject: [PATCH 02/10] Handle UnicodeDecodeError --- tests/ci/s3_helper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index d83258205599..5573b2f65ec9 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -72,7 +72,11 @@ def _upload_file_to_s3( self, bucket_name: str, file_path: Path, s3_path: str ) -> str: logging.debug("Checking %s for sensitive values", file_path) - scan_file_for_sensitive_data(file_path.read_text(), file_path.name) + try: + file_content = file_path.read_text(encoding="utf-8") + except UnicodeDecodeError: + logging.warning("Failed to read file %s, unknown encoding", file_path) + scan_file_for_sensitive_data(file_content, file_path.name) logging.debug( "Start uploading %s to bucket=%s path=%s", file_path, bucket_name, s3_path ) From 7cc240f44f2076dd5a9811db4f0532feac052f68 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:58:32 -0400 Subject: [PATCH 03/10] fix --- tests/ci/s3_helper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 5573b2f65ec9..97e9007f60e7 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -76,7 +76,9 @@ def _upload_file_to_s3( file_content = file_path.read_text(encoding="utf-8") except UnicodeDecodeError: logging.warning("Failed to read file %s, unknown encoding", file_path) - scan_file_for_sensitive_data(file_content, file_path.name) + else: + scan_file_for_sensitive_data(file_content, file_path.name) + logging.debug( "Start uploading %s to bucket=%s path=%s", file_path, bucket_name, s3_path ) From bcf06ba506196499d9a1907249cd244d898d164e Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Thu, 27 Mar 2025 11:55:57 -0400 Subject: [PATCH 04/10] A variable isn't really secret if the value is 'clickhouse' --- tests/ci/s3_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 97e9007f60e7..f72fb1a7cd20 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -20,7 +20,7 @@ S3_URL, ) -sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*") +sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*(?!=clickhouse\s)") sensitive_strings = {var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var)} From 2332e5a67ba890fefe3a27af6b22ee7ea4acc5dc Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Thu, 27 Mar 2025 14:27:17 -0400 Subject: [PATCH 05/10] fix --- tests/ci/s3_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index f72fb1a7cd20..09302e97337a 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -20,7 +20,7 @@ S3_URL, ) -sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*(?!=clickhouse\s)") +sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*(?!=clickhouse$)") sensitive_strings = {var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var)} From c15db67913f17b6d512226a3d5c5e5b66d468721 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Fri, 28 Mar 2025 10:01:18 -0400 Subject: [PATCH 06/10] regex should also ignore *** --- tests/ci/s3_helper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 09302e97337a..a7492a1c289d 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -20,7 +20,9 @@ S3_URL, ) -sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*(?!=clickhouse$)") +sensitive_var_pattern = re.compile( + r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)" +) sensitive_strings = {var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var)} From 09b31e68e963faf639476f1009c423fb2c27ae96 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Fri, 28 Mar 2025 14:27:14 -0400 Subject: [PATCH 07/10] make regression tests skippable --- .github/workflows/release_branches.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 1b4ce0995e15..e9b7e0c81a9b 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -538,8 +538,8 @@ jobs: ##################################### REGRESSION TESTS ###################################### ############################################################################################# RegressionTestsRelease: - needs: [BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} + needs: [RunConfig, BuilderDebRelease] + if: ${{ !failure() && !cancelled() && !contains(fromJson(needs.RunConfig.outputs.data).ci_settings.exclude_keywords, 'regression') }} uses: ./.github/workflows/regression.yml secrets: inherit with: @@ -549,8 +549,8 @@ jobs: build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 RegressionTestsAarch64: - needs: [BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} + needs: [RunConfig, BuilderDebAarch64] + if: ${{ !failure() && !cancelled() && !contains(fromJson(needs.RunConfig.outputs.data).ci_settings.exclude_keywords, 'regression') && !contains(fromJson(needs.RunConfig.outputs.data).ci_settings.exclude_keywords, 'aarch64')}} uses: ./.github/workflows/regression.yml secrets: inherit with: From 821275b1136036086dc93d0d92d7b37c1aa387cd Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Mon, 31 Mar 2025 09:05:58 -0400 Subject: [PATCH 08/10] print the entire offending log line --- tests/ci/s3_helper.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index a7492a1c289d..39e06d8d635c 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -21,23 +21,31 @@ ) sensitive_var_pattern = re.compile( - r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)" + r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)" ) -sensitive_strings = {var: value for var, value in os.environ.items() - if sensitive_var_pattern.match(var)} +sensitive_strings = { + var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var) +} + def scan_file_for_sensitive_data(file_content, file_name): """ Scan the content of a file for sensitive strings. Raises ValueError if any sensitive values are found. """ + + def clean_line(line): + for name, value in sensitive_strings.items(): + line = line.replace(value, f"SECRET[{name}]") + return line + matches = [] for line_number, line in enumerate(file_content.splitlines(), start=1): for match in sensitive_var_pattern.finditer(line): - matches.append((file_name, line_number, match.group(0))) + matches.append((file_name, line_number, clean_line(line))) for name, value in sensitive_strings.items(): if value in line: - matches.append((file_name, line_number, f"SECRET[{name}]")) + matches.append((file_name, line_number, clean_line(line))) if not matches: return @@ -48,6 +56,7 @@ def scan_file_for_sensitive_data(file_content, file_name): raise ValueError(f"Sensitive values found in {file_name}") + def _flatten_list(lst): result = [] for elem in lst: From 62915394df11a7aea61c056168ae981c5a306716 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Thu, 3 Apr 2025 11:32:19 -0400 Subject: [PATCH 09/10] Fixing more false positives --- tests/ci/s3_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 39e06d8d635c..973507d1d921 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -21,7 +21,7 @@ ) sensitive_var_pattern = re.compile( - r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)" + r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)(?! '\[HIDDEN\]')(?!%)" ) sensitive_strings = { var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var) From 758bda364746706c2b70667699a6ccc03222742f Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Thu, 3 Apr 2025 20:28:06 -0400 Subject: [PATCH 10/10] Fixing more false positives --- tests/ci/s3_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 973507d1d921..5d9d0758e5c3 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -21,7 +21,7 @@ ) sensitive_var_pattern = re.compile( - r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)(?! '\[HIDDEN\]')(?!%)" + r"\b[A-Z_]*(?