From ca283b59f52645f10d40c24c9ba1360299997d27 Mon Sep 17 00:00:00 2001 From: jinlong Date: Wed, 3 Sep 2025 11:49:45 +0800 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E6=95=8F?= =?UTF-8?q?=E6=84=9F=E4=BF=A1=E6=81=AF=E8=BF=87=E6=BB=A4=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 sanitizer 模块实现敏感信息过滤机制 - 添加配置选项支持自定义过滤规则 - 在生成 commit 和分支名前自动过滤敏感信息 - 添加 CLI 参数 --no-sanitize 临时禁用过滤功能 - 新增文档 sanitizer.md 说明使用方法 Signed-off-by: jinlong --- Cargo.lock | 1 + Cargo.toml | 1 + docs/sanitizer.md | 103 ++++++++++++++++++++++++++++++ src/cli.rs | 6 ++ src/config.rs | 20 ++++++ src/generate.rs | 36 ++++++++--- src/main.rs | 5 ++ src/sanitizer.rs | 157 ++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 319 insertions(+), 10 deletions(-) create mode 100644 docs/sanitizer.md create mode 100644 src/sanitizer.rs diff --git a/Cargo.lock b/Cargo.lock index e35fdc1..87d919a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -350,6 +350,7 @@ dependencies = [ "lazy_static", "log", "openai_api_rust", + "regex", "reqwest", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 24c3f81..67b24a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ env_logger = "0.11.6" lazy_static = "1.5.0" log = "0.4.26" openai_api_rust = { git = "https://github.com/fslongjin/openai-api", rev = "e2a3f6f" } +regex = "1.11.0" reqwest = { version = "0.12.9", features = ["json"] } serde = { version = "1.0.218", features = ["derive"] } serde_json = "1.0.134" diff --git a/docs/sanitizer.md b/docs/sanitizer.md new file mode 100644 index 0000000..90e959b --- /dev/null +++ b/docs/sanitizer.md @@ -0,0 +1,103 @@ +# Sanitizer Configuration Guide + +To prevent leaking sensitive information when sending diffs/user descriptions to model providers, fastcommit includes a built-in secret sanitization mechanism. 
This mechanism replaces matched sensitive content with placeholders before generating commit messages or branch names, for example: + +``` +AKIAIOSFODNN7EXAMPLE -> [REDACTED:AWS_ACCESS_KEY_ID#1] +-----BEGIN PRIVATE KEY----- ... -> [REDACTED:PRIVATE_KEY_BLOCK#2] +Bearer abcdef123456 .... -> [REDACTED:BEARER_TOKEN#3] +``` + +## 1. Basic Toggle + +Configuration file: `~/.fastcommit/config.toml` + +Field: +``` +sanitize_secrets = true +``` +Set to `false` to completely disable sanitization. + +## 2. Built-in Matching Rules +Current built-in rules (name -> regex description): + +| Name | Description | +|------|-------------| +| PRIVATE_KEY_BLOCK | Matches private key blocks from `-----BEGIN ... PRIVATE KEY-----` to `-----END ... PRIVATE KEY-----` | +| GITHUB_TOKEN | Matches tokens with prefixes like `ghp_` / `ghs_` / `gho_` / `ghr_` / `ghu_` + 36 alphanumeric characters | +| AWS_ACCESS_KEY_ID | Starts with `AKIA` + 16 uppercase alphanumeric characters | +| JWT | Typical 3-segment Base64URL JWT structure | +| BEARER_TOKEN | Bearer token headers (`Bearer xxx`) | +| GENERIC_API_KEY | Common field names: `api_key` / `apikey` / `apiKey` / `secret` / `token` / `authorization` followed by separator and value | + +Matched content is replaced with `[REDACTED:NAME#N]`, where `NAME` is the rule name and `N` is a sequence number. + +## 3. Custom Rules +You can add custom rules in the configuration file to capture team-specific sensitive string formats. + +Example: +``` +[[custom_sanitize_patterns]] +name = "INTERNAL_URL" +regex = "https://internal\\.corp\\.example\\.com/[A-Za-z0-9/_-]+" + +[[custom_sanitize_patterns]] +name = "UUID_TOKEN" +regex = "[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}" +``` + +Notes: +- `name`: Identifier shown in the placeholder; UPPER_SNAKE_CASE is recommended. +- `regex`: A pattern in Rust `regex` crate syntax (look-around and backreferences are not supported); inside a quoted TOML string every backslash must be doubled, e.g. `\\.`. +- All custom rules are executed after built-in rules. 
+- If a regex is invalid, it is skipped and a warning is written to the log. + +## 4. Viewing Sanitization Statistics +The current version outputs the following when running with `RUST_LOG=debug`: +``` +Sanitized N potential secrets from diff/prompt +``` +A `--show-redactions` flag that prints a more detailed table is planned but not yet implemented. + +## 5. Performance and Notes +- There may be minor performance overhead for very large diffs (multiple find-replace passes). If performance matters, reduce the number of custom rules. +- Keep custom regexes narrow: an overly broad pattern can match ordinary code and degrade the model's understanding of the diff. A pattern must also never match the empty string or the `[REDACTED:...]` placeholder text itself, because the text is re-scanned until no match remains and such a pattern would never finish. +- The model cannot see the original replaced content. If it needs a hint about what was removed, choose a descriptive `name`, for example `DB_PASSWORD` or `INTERNAL_ENDPOINT`. + +## 6. Common Custom Pattern Examples +``` +[[custom_sanitize_patterns]] +name = "SLACK_WEBHOOK" +regex = "https://hooks\\.slack\\.com/services/[A-Za-z0-9/_-]+" + +[[custom_sanitize_patterns]] +name = "DISCORD_WEBHOOK" +regex = "https://discord(?:app)?\\.com/api/webhooks/[0-9]+/[A-Za-z0-9_-]+" + +[[custom_sanitize_patterns]] +name = "GCP_SERVICE_ACCOUNT" +regex = "[0-9]{12}-compute@developer\\.gserviceaccount\\.com" + +[[custom_sanitize_patterns]] +name = "STRIPE_KEY" +regex = "sk_(live|test)_[A-Za-z0-9]{10,}" +``` + +## 7. Complete Example Configuration Snippet +``` +sanitize_secrets = true + +[[custom_sanitize_patterns]] +name = "INTERNAL_URL" +regex = "https://internal\\.corp\\.example\\.com/[A-Za-z0-9/_-]+" + +[[custom_sanitize_patterns]] +name = "STRIPE_KEY" +regex = "sk_(live|test)_[A-Za-z0-9]{10,}" +``` + +## 8. Future Plans +- Report mode: print a table of match categories and counts +- Allow listing redacted placeholder hints at the end of commit messages (configurable) + +To propose new built-in rules or other improvements, please open an issue or pull request. 
diff --git a/src/cli.rs b/src/cli.rs index 0dd9067..4e31ae5 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -53,4 +53,10 @@ pub struct Args { help = "Generate commit message (use with -b to output both)" )] pub generate_message: bool, + + #[clap( + long = "no-sanitize", + help = "Temporarily disable sensitive info sanitizer for this run" + )] + pub no_sanitize: bool, } diff --git a/src/config.rs b/src/config.rs index 1d6204e..979ac9a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -3,6 +3,18 @@ use std::{fmt::Display, fs}; use crate::constants::{DEFAULT_MAX_TOKENS, DEFAULT_OPENAI_API_BASE, DEFAULT_OPENAI_MODEL}; +fn default_true() -> bool { + true +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct CustomSanitizePattern { + /// A short name/identifier for the pattern. e.g. "INTERNAL_URL" + pub name: String, + /// The regex pattern string. It should be a valid Rust regex. + pub regex: String, +} + #[derive(Debug, Serialize, Deserialize)] pub struct Config { api_base: Option, @@ -16,6 +28,12 @@ pub struct Config { pub verbosity: Verbosity, /// Prefix for generated branch names (e.g. username in monorepo) pub branch_prefix: Option, + /// Enable sanitizing sensitive information (API keys, tokens, secrets) before sending diff to AI provider. + #[serde(default = "default_true")] + pub sanitize_secrets: bool, + /// User defined extra regex patterns for sanitizer. 
+ #[serde(default)] + pub custom_sanitize_patterns: Vec, } impl Config { @@ -104,6 +122,8 @@ impl Default for Config { language: CommitLanguage::default(), verbosity: Verbosity::default(), branch_prefix: None, + sanitize_secrets: true, + custom_sanitize_patterns: Vec::new(), } } } diff --git a/src/generate.rs b/src/generate.rs index 1f9329e..8c1758e 100644 --- a/src/generate.rs +++ b/src/generate.rs @@ -7,6 +7,7 @@ use crate::config::{self, Config}; use crate::constants::BRANCH_NAME_PROMPT; use crate::constants::{DEFAULT_MAX_TOKENS, DEFAULT_OPENAI_MODEL, DEFAULT_PROMPT_TEMPLATE}; +use crate::sanitizer::sanitize_with_config; use crate::template_engine::{render_template, TemplateContext}; async fn generate_commit_message( @@ -14,14 +15,23 @@ async fn generate_commit_message( config: &config::Config, user_description: Option<&str>, ) -> anyhow::Result { - let auth = Auth::new(config.api_key.as_str()); + // sanitize diff & user description first + let (sanitized_diff, sanitized_user_desc_opt, redactions) = + sanitize_with_config(diff, user_description, config); + if !redactions.is_empty() { + log::debug!( + "Sanitized {} potential secrets from diff/prompt", + redactions.len() + ); + } + let auth = Auth::new(config.api_key.as_str()); let openai = OpenAI::new(auth, &config.api_base()); - // Add "commit message: " prefix to user description if provided - let prefixed_user_description = user_description.map(|desc| { + // Add "commit message: " prefix to user description if provided (after sanitization) + let prefixed_user_description = sanitized_user_desc_opt.map(|desc| { if desc.trim().is_empty() { - desc.to_string() + desc } else { format!("commit message: {}", desc) } @@ -31,7 +41,7 @@ async fn generate_commit_message( config.conventional, config.language, config.verbosity, - diff, + &sanitized_diff, prefixed_user_description.as_deref(), ); @@ -72,7 +82,6 @@ async fn generate_commit_message( .as_ref() .ok_or(anyhow::anyhow!("No message in response"))? 
.content; - // Extract content between tags let commit_message = extract_aicommit_message(msg)?; Ok(commit_message) } @@ -164,11 +173,19 @@ async fn generate_branch_name_with_ai( prefix: Option<&str>, config: &Config, ) -> anyhow::Result { - let auth = Auth::new(config.api_key.as_str()); + // sanitize diff only (branch name uses only diff) + let (sanitized_diff, _, redactions) = sanitize_with_config(diff, None, config); + if !redactions.is_empty() { + log::debug!( + "Sanitized {} potential secrets from diff before branch generation", + redactions.len() + ); + } + let auth = Auth::new(config.api_key.as_str()); let openai = OpenAI::new(auth, &config.api_base()); - let prompt = BRANCH_NAME_PROMPT.replace("{{diff}}", diff); + let prompt = BRANCH_NAME_PROMPT.replace("{{diff}}", &sanitized_diff); let messages = vec![ Message { role: Role::System, @@ -191,7 +208,7 @@ async fn generate_branch_name_with_ai( top_p: None, n: None, stream: Some(false), - stop: None, // 移除 stop words 以避免思考过程中的干扰 + stop: None, max_tokens: Some(DEFAULT_MAX_TOKENS as i32), presence_penalty: None, frequency_penalty: None, @@ -215,7 +232,6 @@ async fn generate_branch_name_with_ai( let branch_name = extract_aicommit_message(&msg)?; - // Clean up the branch name let branch_name = if let Some(prefix) = prefix { format!("{}{}", prefix.trim(), branch_name.trim()) } else { diff --git a/src/main.rs b/src/main.rs index 307137d..1c4131b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,6 +5,7 @@ mod cli; mod config; mod constants; mod generate; +mod sanitizer; mod template_engine; mod update_checker; @@ -24,6 +25,10 @@ async fn main() -> anyhow::Result<()> { if let Some(v) = args.verbosity { config.verbosity = v; } + if args.no_sanitize { + // CLI override to disable sanitizer + config.sanitize_secrets = false; + } run_update_checker().await; diff --git a/src/sanitizer.rs b/src/sanitizer.rs new file mode 100644 index 0000000..d18048f --- /dev/null +++ b/src/sanitizer.rs @@ -0,0 +1,157 @@ +use 
crate::config::Config; +use regex::Regex; // added for sanitize_with_config + +/// Represents a redacted secret occurrence. +#[derive(Debug, Clone)] +pub struct Redaction { + pub _kind: &'static str, + pub _placeholder: String, +} + +impl Redaction { + pub fn new(kind: &'static str, placeholder: impl Into) -> Self { + Self { + _kind: kind, + _placeholder: placeholder.into(), + } + } +} + +/// Runtime redaction entry for custom user patterns. +#[derive(Debug, Clone)] +pub struct CustomRedactionMeta { + pub name: String, + pub regex: Regex, +} + +/// Sanitize text before sending to the model provider with builtin + custom patterns. +pub fn sanitize( + text: &str, + enabled: bool, + custom_patterns: &[CustomRedactionMeta], +) -> (String, Vec) { + if !enabled || text.is_empty() { + return (text.to_string(), Vec::new()); + } + + let mut redactions: Vec = Vec::new(); + let mut counter = 0usize; + + // Built-in patterns (ordered) – static lifetime kinds + let builtin: Vec<(&'static str, Regex)> = vec![ + ( + "PRIVATE_KEY_BLOCK", + Regex::new( + r"-----BEGIN [A-Z ]+PRIVATE KEY-----[\s\S]*?-----END [A-Z ]+PRIVATE KEY-----", + ) + .unwrap(), + ), + ( + "GITHUB_TOKEN", + Regex::new(r"\bgh[pousr]_[A-Za-z0-9]{36}\b").unwrap(), + ), + ( + "AWS_ACCESS_KEY_ID", + Regex::new(r"\bAKIA[0-9A-Z]{16}\b").unwrap(), + ), + ( + "JWT", + Regex::new(r"\b[A-Za-z0-9_-]{10,}\.([A-Za-z0-9_-]{10,})\.[A-Za-z0-9_-]{10,}\b") + .unwrap(), + ), + ( + "BEARER_TOKEN", + Regex::new(r"(?i)bearer\s+[A-Za-z0-9\-_.=]+\b").unwrap(), + ), + ( + "GENERIC_API_KEY", + Regex::new( + r#"(?i)(api_?key|secret|token|authorization)[\s:=\"']+([A-Za-z0-9_\-]{8,})"#, + ) + .unwrap(), + ), + ]; + + let mut sanitized = text.to_string(); + + // Apply builtin patterns + for (kind, re) in builtin.iter() { + loop { + if let Some(m) = re.find(&sanitized) { + counter += 1; + let placeholder = format!("[REDACTED:{}#{}]", kind, counter); + sanitized.replace_range(m.start()..m.end(), &placeholder); + 
redactions.push(Redaction::new(kind, placeholder)); + } else { + break; + } + } + } + + // Apply custom patterns – use their provided name (converted to static via leak for simplicity) + for meta in custom_patterns { + loop { + if let Some(m) = meta.regex.find(&sanitized) { + counter += 1; + let placeholder = format!("[REDACTED:{}#{}]", meta.name, counter); + sanitized.replace_range(m.start()..m.end(), &placeholder); + // We leak the string to get a 'static str; acceptable given tiny count and CLI nature + let leaked: &'static str = Box::leak(meta.name.clone().into_boxed_str()); + redactions.push(Redaction::new(leaked, placeholder)); + } else { + break; + } + } + } + + (sanitized, redactions) +} + +/// Convenience: sanitize multiple text components and return redaction info combined. +pub fn sanitize_for_model( + diff: &str, + user_prompt: Option<&str>, + enabled: bool, + custom_patterns: &[CustomRedactionMeta], +) -> (String, Option, Vec) { + let (sdiff, mut r1) = sanitize(diff, enabled, custom_patterns); + let (sprompt, r2) = match user_prompt { + Some(p) => { + let (s, rs) = sanitize(p, enabled, custom_patterns); + (Some(s), rs) + } + None => (None, Vec::new()), + }; + r1.extend(r2); + (sdiff, sprompt, r1) +} + +/// Compile custom patterns from config; invalid ones are logged and skipped. +fn compile_custom_patterns( + items: &[crate::config::CustomSanitizePattern], +) -> Vec { + // made private + let mut out = Vec::new(); + for item in items { + match Regex::new(&item.regex) { + Ok(re) => out.push(CustomRedactionMeta { + name: item.name.clone(), + regex: re, + }), + Err(e) => { + log::warn!("Skip invalid custom sanitize regex '{}': {}", item.regex, e); + } + } + } + out +} + +/// High-level helper: directly use full Config, hiding compilation logic from callers. 
+pub fn sanitize_with_config( + diff: &str, + user_prompt: Option<&str>, + config: &Config, +) -> (String, Option, Vec) { + let compiled = compile_custom_patterns(&config.custom_sanitize_patterns); + sanitize_for_model(diff, user_prompt, config.sanitize_secrets, &compiled) +} From 9065fd6d8861713a6fe70bbbae25afff1c511538 Mon Sep 17 00:00:00 2001 From: jinlong Date: Wed, 3 Sep 2025 11:50:49 +0800 Subject: [PATCH 2/2] =?UTF-8?q?chore:=20=E5=8D=87=E7=BA=A7=E7=89=88?= =?UTF-8?q?=E6=9C=AC=E5=8F=B7=E8=87=B30.3.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 更新Cargo.toml、Cargo.lock和README文档中的版本号,从0.2.3升级到0.3.0。 Signed-off-by: jinlong --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 2 +- README_CN.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 87d919a..4896fb0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -340,7 +340,7 @@ dependencies = [ [[package]] name = "fastcommit" -version = "0.2.3" +version = "0.3.0" dependencies = [ "anyhow", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 67b24a1..6a99740 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fastcommit" -version = "0.2.3" +version = "0.3.0" description = "AI-based command line tool to quickly generate standardized commit messages." 
edition = "2021" authors = ["longjin "] diff --git a/README.md b/README.md index b4452cb..ecd3dfe 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ You can install `fastcommit` using the following method: ```bash # Install using cargo -cargo install --git https://github.com/fslongjin/fastcommit --tag v0.2.3 +cargo install --git https://github.com/fslongjin/fastcommit --tag v0.3.0 ``` ## Usage diff --git a/README_CN.md b/README_CN.md index 85b7792..4d88c7c 100644 --- a/README_CN.md +++ b/README_CN.md @@ -8,7 +8,7 @@ ```bash # 使用 cargo 安装 -cargo install --git https://github.com/fslongjin/fastcommit --tag v0.2.3 +cargo install --git https://github.com/fslongjin/fastcommit --tag v0.3.0 ``` ## 使用