/// Check if a regex pattern only uses features supported by content-blocking syntax.
///
/// Content blocking syntax supports:
/// - Literal (ASCII) characters
/// - `.` (any character)
/// - `[...]` character classes (including ranges and negation)
/// - `?`, `+`, `*` quantifiers, when they follow something that can be repeated
/// - `(...)` capturing groups
/// - `^` as the very first character and `$` as the very last character
/// - Escaped special characters like `\.`, `\(`, etc.
///
/// Everything else is rejected: shorthand escapes such as `\d` or `\b`, `{n}`
/// repetition, `|` alternation, `(?...)` group modifiers, anchors in the middle
/// of the pattern, empty patterns, non-ASCII patterns, and unbalanced `[`/`(`.
///
/// Returns `true` only if the whole pattern passes these checks. The check is
/// deliberately conservative: a `false` result simply means the rule is not
/// converted, whereas a wrongly accepted pattern could be refused by the
/// content-blocker compiler.
fn is_cb_compatible_regex(pattern: &str) -> bool {
    // An empty `url-filter` and non-ASCII patterns are not accepted by the
    // content-blocker compiler, so refuse them before scanning.
    if pattern.is_empty() || !pattern.is_ascii() {
        return false;
    }

    let mut chars = pattern.chars().peekable();
    let mut in_char_class = false;
    let mut paren_depth = 0usize;
    // True only while looking at the first character of the pattern.
    let mut at_pattern_start = true;
    // True when the previous token is something a quantifier can apply to.
    let mut has_atom = false;

    while let Some(c) = chars.next() {
        let is_first = at_pattern_start;
        at_pattern_start = false;

        if in_char_class {
            match c {
                '\\' => {
                    // Any escape is valid inside a character class
                    chars.next();
                }
                ']' => {
                    in_char_class = false;
                    has_atom = true;
                }
                // Everything else (including `^`, `$`, `*`, ...) is a literal here.
                _ => {}
            }
            continue;
        }

        match c {
            '\\' => {
                // Only escapes of punctuation are allowed; a trailing lone
                // backslash (`None`) is rejected as well.
                let escaped_ok = matches!(
                    chars.next(),
                    Some(
                        '\\' | '.' | '?' | '+' | '*' | '(' | ')' | '[' | ']' | '^' | '$'
                            | '-' | '|' | '{' | '}' | '/'
                    )
                );
                if !escaped_ok {
                    return false;
                }
                has_atom = true;
            }
            '[' => {
                in_char_class = true;
                if chars.peek() == Some(&'^') {
                    chars.next();
                }
                if chars.peek() == Some(&']') {
                    chars.next(); // ] is literal when it's the first char
                }
            }
            '(' => {
                // `(?...)` covers non-capturing groups, lookarounds and flags.
                if chars.peek() == Some(&'?') {
                    return false;
                }
                paren_depth += 1;
                // Nothing inside the new group has been seen yet.
                has_atom = false;
            }
            ')' => {
                if paren_depth == 0 {
                    return false;
                }
                paren_depth -= 1;
                has_atom = true;
            }
            '|' | '{' => return false,
            '^' => {
                // A start anchor is only meaningful at the very beginning.
                if !is_first {
                    return false;
                }
                has_atom = false;
            }
            '$' => {
                // An end anchor is only meaningful at the very end.
                if chars.peek().is_some() {
                    return false;
                }
            }
            '?' | '+' | '*' => {
                // A quantifier needs a preceding atom (`*abc`, `(+a)`, `^*`
                // have nothing to repeat). Stacked quantifiers are left as-is.
                if !has_atom {
                    return false;
                }
            }
            _ => has_atom = true,
        }
    }

    !in_char_class && paren_depth == 0
}
Err(CbRuleCreationFailure::FullRegexUnsupported); - } if v.is_removeparam() { return Err(CbRuleCreationFailure::NetworkRemoveparamUnsupported); } @@ -345,86 +407,154 @@ impl TryFrom for CbRuleEquivalent { vec![] }; - let url_filter = match (v.filter, v.hostname) { - (crate::filters::network::FilterPart::AnyOf(_), _) => { - return Err(CbRuleCreationFailure::OptimizedRulesUnsupported) + let is_complete_regex = v.mask.contains(NetworkFilterMask::IS_COMPLETE_REGEX); + + let url_filter = if is_complete_regex { + match (&v.filter, &v.hostname) { + (crate::filters::network::FilterPart::Simple(part), Some(hostname)) => { + if let Some(inner) = + part.strip_prefix('/').and_then(|p| p.strip_suffix('/')) + { + if is_cb_compatible_regex(inner) { + let mut url_filter = format!( + "^[^:]+:(//)?([^/]+\\.)?{}", + SPECIAL_CHARS.replace_all(hostname, r##"\$1"##) + ); + if v.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) { + url_filter += ".*"; + } + url_filter += inner; + if v.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) { + url_filter += "$"; + } + url_filter + } else { + return Err(CbRuleCreationFailure::FullRegexUnsupported); + } + } else { + return Err(CbRuleCreationFailure::FullRegexUnsupported); + } + } + (crate::filters::network::FilterPart::Simple(part), None) => { + if let Some(inner) = + part.strip_prefix('/').and_then(|p| p.strip_suffix('/')) + { + if is_cb_compatible_regex(inner) { + let mut url_filter = + if v.mask.contains(NetworkFilterMask::IS_LEFT_ANCHOR) { + format!("^{inner}") + } else { + let scheme_part = if v.mask.contains( + NetworkFilterMask::FROM_HTTP + | NetworkFilterMask::FROM_HTTPS, + ) { + "" + } else if v.mask.contains(NetworkFilterMask::FROM_HTTP) { + "^http://.*" + } else if v.mask.contains(NetworkFilterMask::FROM_HTTPS) { + "^https://.*" + } else if v.mask.contains(NetworkFilterMask::FROM_WEBSOCKET) + { + "^wss?://.*" + } else { + unreachable!("Invalid scheme information"); + }; + format!("{scheme_part}{inner}") + }; + if 
v.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) { + url_filter += "$"; + } + url_filter + } else { + return Err(CbRuleCreationFailure::FullRegexUnsupported); + } + } else { + return Err(CbRuleCreationFailure::FullRegexUnsupported); + } + } + _ => return Err(CbRuleCreationFailure::FullRegexUnsupported), } - (crate::filters::network::FilterPart::Simple(part), Some(hostname)) => { - let without_trailing_separator = TRAILING_SEPARATOR.replace_all(&part, ""); - let escaped_special_chars = - SPECIAL_CHARS.replace_all(&without_trailing_separator, r##"\$1"##); - let with_fixed_wildcards = - REPLACE_WILDCARDS.replace_all(&escaped_special_chars, ".*"); - - let mut url_filter = format!( - "^[^:]+:(//)?([^/]+\\.)?{}", - SPECIAL_CHARS.replace_all(&hostname, r##"\$1"##) - ); - - if v.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) { - url_filter += ".*"; + } else { + match (&v.filter, &v.hostname) { + (crate::filters::network::FilterPart::AnyOf(_), _) => { + return Err(CbRuleCreationFailure::OptimizedRulesUnsupported) } + (crate::filters::network::FilterPart::Simple(part), Some(hostname)) => { + let without_trailing_separator = TRAILING_SEPARATOR.replace_all(part, ""); + let escaped_special_chars = + SPECIAL_CHARS.replace_all(&without_trailing_separator, r##"\$1"##); + let with_fixed_wildcards = + REPLACE_WILDCARDS.replace_all(&escaped_special_chars, ".*"); + + let mut url_filter = format!( + "^[^:]+:(//)?([^/]+\\.)?{}", + SPECIAL_CHARS.replace_all(hostname, r##"\$1"##) + ); + + if v.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) { + url_filter += ".*"; + } - url_filter += &with_fixed_wildcards; + url_filter += &with_fixed_wildcards; - if v.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) { - url_filter += "$"; - } + if v.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) { + url_filter += "$"; + } - url_filter - } - (crate::filters::network::FilterPart::Simple(part), None) => { - let without_trailing_separator = TRAILING_SEPARATOR.replace_all(&part, ""); - let 
escaped_special_chars = - SPECIAL_CHARS.replace_all(&without_trailing_separator, r##"\$1"##); - let with_fixed_wildcards = - REPLACE_WILDCARDS.replace_all(&escaped_special_chars, ".*"); - let mut url_filter = if v.mask.contains(NetworkFilterMask::IS_LEFT_ANCHOR) { - format!("^{with_fixed_wildcards}") - } else { - let scheme_part = if v - .mask - .contains(NetworkFilterMask::FROM_HTTP | NetworkFilterMask::FROM_HTTPS) - { - "" - } else if v.mask.contains(NetworkFilterMask::FROM_HTTP) { - "^http://.*" - } else if v.mask.contains(NetworkFilterMask::FROM_HTTPS) { - "^https://.*" - } else if v.mask.contains(NetworkFilterMask::FROM_WEBSOCKET) { - "^wss?://.*" + url_filter + } + (crate::filters::network::FilterPart::Simple(part), None) => { + let without_trailing_separator = TRAILING_SEPARATOR.replace_all(part, ""); + let escaped_special_chars = + SPECIAL_CHARS.replace_all(&without_trailing_separator, r##"\$1"##); + let with_fixed_wildcards = + REPLACE_WILDCARDS.replace_all(&escaped_special_chars, ".*"); + let mut url_filter = if v.mask.contains(NetworkFilterMask::IS_LEFT_ANCHOR) { + format!("^{with_fixed_wildcards}") } else { - unreachable!("Invalid scheme information"); + let scheme_part = if v.mask.contains( + NetworkFilterMask::FROM_HTTP | NetworkFilterMask::FROM_HTTPS, + ) { + "" + } else if v.mask.contains(NetworkFilterMask::FROM_HTTP) { + "^http://.*" + } else if v.mask.contains(NetworkFilterMask::FROM_HTTPS) { + "^https://.*" + } else if v.mask.contains(NetworkFilterMask::FROM_WEBSOCKET) { + "^wss?://.*" + } else { + unreachable!("Invalid scheme information"); + }; + + format!("{scheme_part}{with_fixed_wildcards}") }; - format!("{scheme_part}{with_fixed_wildcards}") - }; + if v.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) { + url_filter += "$"; + } - if v.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) { - url_filter += "$"; + url_filter } - - url_filter - } - (crate::filters::network::FilterPart::Empty, Some(hostname)) => { - let escaped_special_chars = 
SPECIAL_CHARS.replace_all(&hostname, r##"\$1"##); - format!("^[^:]+:(//)?([^/]+\\.)?{escaped_special_chars}") - } - (crate::filters::network::FilterPart::Empty, None) => if v - .mask - .contains(NetworkFilterMask::FROM_HTTP | NetworkFilterMask::FROM_HTTPS) - { - "^https?://" - } else if v.mask.contains(NetworkFilterMask::FROM_HTTP) { - "^http://" - } else if v.mask.contains(NetworkFilterMask::FROM_HTTPS) { - "^https://" - } else if v.mask.contains(NetworkFilterMask::FROM_WEBSOCKET) { - "^wss?://" - } else { - unreachable!("Invalid scheme information"); + (crate::filters::network::FilterPart::Empty, Some(hostname)) => { + let escaped_special_chars = SPECIAL_CHARS.replace_all(hostname, r##"\$1"##); + format!("^[^:]+:(//)?([^/]+\\.)?{escaped_special_chars}") + } + (crate::filters::network::FilterPart::Empty, None) => if v + .mask + .contains(NetworkFilterMask::FROM_HTTP | NetworkFilterMask::FROM_HTTPS) + { + "^https?://" + } else if v.mask.contains(NetworkFilterMask::FROM_HTTP) { + "^http://" + } else if v.mask.contains(NetworkFilterMask::FROM_HTTPS) { + "^https://" + } else if v.mask.contains(NetworkFilterMask::FROM_WEBSOCKET) { + "^wss?://" + } else { + unreachable!("Invalid scheme information"); + } + .to_string(), } - .to_string(), }; let (if_domain, unless_domain) = if v.opt_domains.is_some() diff --git a/tests/unit/content_blocking.rs b/tests/unit/content_blocking.rs index a95d048b..b570fcf5 100644 --- a/tests/unit/content_blocking.rs +++ b/tests/unit/content_blocking.rs @@ -665,6 +665,105 @@ mod ab2cb_tests { ); } + #[test] + fn full_regex_supported() { + // Simple full regex with supported features + test_from_abp( + "/banner.*\\.jpg/", + r####"[{ + "action": { + "type": "block" + }, + "trigger": { + "url-filter": "banner.*\\.jpg" + } + }]"####, + ); + + // Full regex with character class + test_from_abp( + "/ad[0-9]+\\.html/", + r####"[{ + "action": { + "type": "block" + }, + "trigger": { + "url-filter": "ad[0-9]+\\.html" + } + }]"####, + ); + + // Full regex 
with groups + test_from_abp( + "/track(me)?\\./", + r####"[{ + "action": { + "type": "block" + }, + "trigger": { + "url-filter": "track(me)?\\." + } + }]"####, + ); + + // Full regex with left anchor + test_from_abp( + "|/https?:\\/\\/ads\\./", + r####"[{ + "action": { + "type": "block" + }, + "trigger": { + "url-filter": "^https?:\\/\\/ads\\." + } + }]"####, + ); + + // Full regex with right anchor + test_from_abp( + "/banner\\.jpg/|", + r####"[{ + "action": { + "type": "block" + }, + "trigger": { + "url-filter": "banner\\.jpg$" + } + }]"####, + ); + } + + #[test] + fn full_regex_unsupported() { + fn test_unsupported(abp_rule: &str) { + let filter = crate::lists::parse_filter(abp_rule, true, Default::default()) + .expect("Rule under test could not be parsed"); + match CbRuleEquivalent::try_from(filter) { + Ok(_) => panic!("Expected conversion to fail"), + Err(CbRuleCreationFailure::FullRegexUnsupported) => {} + Err(_) => panic!("Expected FullRegexUnsupported"), + } + } + + // \\d is not supported + test_unsupported("/banner\\d+\\.jpg/"); + + // {3} quantified repetition is not supported + test_unsupported("/ad{3}\\.html/"); + + // | alternation is not supported + test_unsupported("/banner|ad/"); + + // (?:...) non-capturing group is not supported + test_unsupported("/(?:ab)c/"); + + // (?=...) 
lookahead is not supported + test_unsupported("/a(?=b)c/"); + + // \\b word boundary is not supported + test_unsupported("/word\\b/"); + } + #[test] fn badfilter_cancels_matching_rules() { // Test that BAD_FILTER rules cancel out matching rules @@ -790,6 +889,27 @@ mod filterset_tests { Ok(()) } + #[test] + fn full_regex_in_filterset() -> Result<(), ()> { + let list = [ + "/banner.*\\.jpg/", + "/ad[0-9]+\\.html/", + "/banner\\d+\\.jpg/", + "/ad{3}\\.html/", + ]; + let mut set = FilterSet::new(true); + set.add_filters(list, Default::default()); + + let (cb_rules, used_rules) = set.into_content_blocking()?; + + // Only the first two rules are supported; the last two use \\d and {3} + assert_eq!(used_rules, &list[0..2]); + // 2 rules plus `ignore_previous_fp_documents()` + assert_eq!(cb_rules.len(), 3); + + Ok(()) + } + #[test] fn punycode_if_domains() -> Result<(), ()> { let list = [