From 25c5d54c1132d20623b079351ab880021576337a Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 14 Jul 2025 22:35:06 +0100 Subject: [PATCH] Extract predicate helpers --- src/html.rs | 29 +++++++++++++++++------------ src/lib.rs | 7 ++++--- src/reflow.rs | 6 +++++- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/html.rs b/src/html.rs index 61a867ee..f86a7754 100644 --- a/src/html.rs +++ b/src/html.rs @@ -29,6 +29,14 @@ fn node_text(handle: &Handle) -> String { out.trim().to_string() } +fn is_ignored_tag(tag: &str) -> bool { + tag.eq_ignore_ascii_case("script") + || tag.eq_ignore_ascii_case("style") + || tag.eq_ignore_ascii_case("noscript") + || tag.eq_ignore_ascii_case("template") + || tag.eq_ignore_ascii_case("head") +} + /// Recursively appends text nodes from `handle` to `out`, tracking whether the /// previous output was whitespace. fn collect_text(handle: &Handle, out: &mut String, last_space: &mut bool) { @@ -47,13 +55,7 @@ fn collect_text(handle: &Handle, out: &mut String, last_space: &mut bool) { } } NodeData::Element { name, .. } => { - let tag = name.local.as_ref(); - if tag.eq_ignore_ascii_case("script") - || tag.eq_ignore_ascii_case("style") - || tag.eq_ignore_ascii_case("noscript") - || tag.eq_ignore_ascii_case("template") - || tag.eq_ignore_ascii_case("head") - { + if is_ignored_tag(name.local.as_ref()) { return; } for child in handle.children.borrow().iter() { @@ -101,13 +103,16 @@ fn collect_rows(handle: &Handle, rows: &mut Vec) { } } +fn is_bold_tag(tag: &str) -> bool { + tag.eq_ignore_ascii_case("strong") || tag.eq_ignore_ascii_case("b") +} + /// Returns `true` if `handle` contains a `<strong>` or `<b>` descendant. fn contains_strong(handle: &Handle) -> bool { - if let NodeData::Element { name, .. } = &handle.data { - let tag = name.local.as_ref(); - if tag.eq_ignore_ascii_case("strong") || tag.eq_ignore_ascii_case("b") { - return true; - } + if let NodeData::Element { name, .. 
} = &handle.data + && is_bold_tag(name.local.as_ref()) + { + return true; } let children = handle.children.borrow(); children.iter().any(contains_strong) diff --git a/src/lib.rs b/src/lib.rs index 5468c4ea..1963b7b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,6 +31,9 @@ use regex::Regex; /// let cells = split_cells(line); /// assert_eq!(cells, vec!["cell1", "cell2", "cell3"]); /// ``` +fn next_is_pipe(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> bool { + chars.peek() == Some(&'|') +} #[must_use] pub fn split_cells(line: &str) -> Vec<String> { let mut s = line.trim(); @@ -46,9 +49,7 @@ pub fn split_cells(line: &str) -> Vec<String> { let mut chars = s.chars().peekable(); while let Some(ch) = chars.next() { if ch == '\\' { - if let Some(&next) = chars.peek() - && next == '|' - { + if next_is_pipe(&mut chars) { // `\|` escapes the pipe so it becomes part of the cell chars.next(); current.push('|'); diff --git a/src/reflow.rs b/src/reflow.rs index f24e419e..b63010b7 100644 --- a/src/reflow.rs +++ b/src/reflow.rs @@ -114,7 +114,7 @@ pub(crate) fn detect_separator( let mut sep_row_idx: Option<usize> = None; let sep_invalid = invalid_separator(sep_cells.as_ref(), max_cols); - if sep_invalid && second_row_is_separator(rows) { + if should_use_second_row_as_separator(sep_invalid, rows) { sep_cells = Some(rows[1].clone()); sep_row_idx = Some(1); } @@ -129,6 +129,10 @@ fn invalid_separator(sep_cells: Option<&Vec<String>>, max_cols: usize) -> bool { } } +fn should_use_second_row_as_separator(sep_invalid: bool, rows: &[Vec<String>]) -> bool { + sep_invalid && second_row_is_separator(rows) +} + fn second_row_is_separator(rows: &[Vec<String>]) -> bool { rows.len() > 1 && rows[1].iter().all(|c| crate::SEP_RE.is_match(c)) }