From 4f2a0cd9e17288b3c76abe1e440d291f3cf009d3 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Mon, 27 Apr 2026 01:00:12 -0400 Subject: [PATCH] fix: add disable_bare_second flag (#22) + restore TN abbreviation matching (PR #25 review) Issue #22: bare "second" was always normalized to "2nd" by the ordinal tagger, breaking sentences like "Give me a second to check." Adds an opt-in `disable_bare_second` flag on NormalizeOptions that skips the ordinal tagger only for the single-token, case-insensitive word "second". Compound forms ("twenty second" -> "22nd") and date contexts are unaffected. Default is `false` so existing behavior is preserved. Plumbed through: NormalizeOptions builder, parse_span, normalize_sentence_inner, normalize_inner, FFI signatures (nemo_normalize_with_options, nemo_normalize_sentence_with_options), WASM bindings, Swift wrapper, and C headers. Also addresses Devin AI review on PR #25 (https://github.com/FluidInference/text-processing-rs/pull/25#pullrequestreview-4178192149): the new pretokenizer splits trailing punctuation off of words so ITN can match "twenty one," as "twenty one", but the shared sentence_loop re-joined pretokens with a literal space. That made the TN whitelist see "Dr ." instead of "Dr.", so abbreviation entries like "e.g.", "Prof.", "Inc.", "etc.", "vs." stopped matching, and both-form entries like "Dr"/"Dr." left an orphaned period ("I see Dr. Smith" -> "I see doctor. Smith"). sentence_loop now reconstructs each candidate span using the per-pretoken `sep`, preserving original adjacency. The longest-span-first iteration still tries shorter spans, so trailing punctuation cases ("twenty one,") continue to work. Tests: - tests/en_tests.rs: 3 issue #22 regression tests (default unchanged, sentence-mode opt-in, single-expression opt-in). - tests/en_tn_tests.rs: test_pr25_tn_abbreviation_regression covering Dr., vs., e.g., Inc., Co., Prof. - src/ffi.rs: updated existing FFI tests and added test_ffi_normalize_sentence_with_options_disable_bare_second. - All 1070 tests pass with --features ffi. --- src/ffi.rs | 51 +++++++++- src/lib.rs | 84 ++++++++++++---- src/options.rs | 13 +++ src/wasm.rs | 33 ++++++- .../include/nemo_text_processing.h | 9 +- swift-test/Sources/NemoTest/NemoTest.swift | 17 +++- swift/NemoTextProcessing.swift | 20 +++- swift/include/nemo_text_processing.h | 14 ++- tests/en_tests.rs | 95 +++++++++++++++++++ tests/en_tn_tests.rs | 31 ++++++ 10 files changed, 324 insertions(+), 43 deletions(-) diff --git a/src/ffi.rs b/src/ffi.rs index 3a4909b..7e33341 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -17,7 +17,14 @@ use crate::{ /// /// `max_span_tokens`: `0` means "use library default" (16); any positive /// value is a caller-specified max span. -fn options_from_ffi(concat_compound_numbers: u32, max_span_tokens: u32) -> NormalizeOptions { +/// +/// `disable_bare_second`: any non-zero value blocks the bare word +/// `"second"` from converting to `"2nd"` (issue #22). +fn options_from_ffi( + concat_compound_numbers: u32, + max_span_tokens: u32, + disable_bare_second: u32, +) -> NormalizeOptions { NormalizeOptions { concat_compound_numbers: concat_compound_numbers != 0, max_span_tokens: if max_span_tokens == 0 { @@ -25,6 +32,7 @@ fn options_from_ffi(concat_compound_numbers: u32, max_span_tokens: u32) -> Norma } else { Some(max_span_tokens as usize) }, + disable_bare_second: disable_bare_second != 0, } } @@ -86,6 +94,9 @@ pub unsafe extern "C" fn nemo_normalize_sentence(input: *const c_char) -> *mut c /// concatenate rather than add — e.g. `"thirty five sixty two"` → `"3562"`, /// `"seven eighty eight"` → `"788"`. /// +/// `disable_bare_second`: `0` keeps the default behavior, non-zero blocks +/// the bare word `"second"` from being rewritten to `"2nd"` (issue #22). +/// /// # Safety /// - `input` must be a valid null-terminated UTF-8 string /// - Returns a newly allocated string that must be freed with `nemo_free_string` @@ -93,6 +104,7 @@ pub unsafe extern "C" fn nemo_normalize_sentence(input: *const c_char) -> *mut c pub unsafe extern "C" fn nemo_normalize_with_options( input: *const c_char, concat_compound_numbers: u32, + disable_bare_second: u32, ) -> *mut c_char { if input.is_null() { return ptr::null_mut(); @@ -103,7 +115,7 @@ pub unsafe extern "C" fn nemo_normalize_with_options( Err(_) => return ptr::null_mut(), }; - let options = options_from_ffi(concat_compound_numbers, 0); + let options = options_from_ffi(concat_compound_numbers, 0, disable_bare_second); let result = normalize_with_options(c_str, options); match CString::new(result) { @@ -121,6 +133,9 @@ pub unsafe extern "C" fn nemo_normalize_with_options( /// - `0` — use library default (`16`). /// - `>0` — use the specified max span. /// +/// `disable_bare_second`: `0` keeps the default behavior, non-zero blocks +/// the bare word `"second"` from being rewritten to `"2nd"` (issue #22). +/// /// # Safety /// - `input` must be a valid null-terminated UTF-8 string /// - Returns a newly allocated string that must be freed with `nemo_free_string` @@ -129,6 +144,7 @@ pub unsafe extern "C" fn nemo_normalize_sentence_with_options( input: *const c_char, concat_compound_numbers: u32, max_span_tokens: u32, + disable_bare_second: u32, ) -> *mut c_char { if input.is_null() { return ptr::null_mut(); @@ -139,7 +155,11 @@ pub unsafe extern "C" fn nemo_normalize_sentence_with_options( Err(_) => return ptr::null_mut(), }; - let options = options_from_ffi(concat_compound_numbers, max_span_tokens); + let options = options_from_ffi( + concat_compound_numbers, + max_span_tokens, + disable_bare_second, + ); let result = normalize_sentence_with_options(c_str, options); match CString::new(result) { @@ -435,7 +455,7 @@ mod tests { fn test_ffi_normalize_with_options_concat_compound() { unsafe { let input = CString::new("seven eighty eight").unwrap(); - let result = nemo_normalize_with_options(input.as_ptr(), 1); + let result = nemo_normalize_with_options(input.as_ptr(), 1, 0); assert!(!result.is_null()); let result_str = CStr::from_ptr(result).to_str().unwrap(); assert_eq!(result_str, "788"); @@ -447,11 +467,32 @@ mod tests { fn test_ffi_normalize_sentence_with_options_concat_compound() { unsafe { let input = CString::new("United seven eighty eight").unwrap(); - let result = nemo_normalize_sentence_with_options(input.as_ptr(), 1, 0); + let result = nemo_normalize_sentence_with_options(input.as_ptr(), 1, 0, 0); assert!(!result.is_null()); let result_str = CStr::from_ptr(result).to_str().unwrap(); assert_eq!(result_str, "United 788"); nemo_free_string(result); } } + + #[test] + fn test_ffi_normalize_sentence_with_options_disable_bare_second() { + unsafe { + let input = CString::new("Give me a second to check.").unwrap(); + // Default flag (0) keeps today's behavior. + let baseline = nemo_normalize_sentence_with_options(input.as_ptr(), 0, 0, 0); + assert_eq!( + CStr::from_ptr(baseline).to_str().unwrap(), + "Give me a 2nd to check." + ); + nemo_free_string(baseline); + // Non-zero flag blocks bare second. + let opted_in = nemo_normalize_sentence_with_options(input.as_ptr(), 0, 0, 1); + assert_eq!( + CStr::from_ptr(opted_in).to_str().unwrap(), + "Give me a second to check." + ); + nemo_free_string(opted_in); + } + } } diff --git a/src/lib.rs b/src/lib.rs index 7aef03f..8d74965 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,6 +38,13 @@ use itn::en::{ /// Tries taggers in order of specificity (most specific first). /// Returns original text if no tagger matches. pub fn normalize(input: &str) -> String { + normalize_inner(input, false) +} + +/// Single-expression dispatch with the `disable_bare_second` flag plumbed in. +/// Issue #22: when the flag is set and the trimmed input is exactly the bare +/// word `"second"`, the ordinal tagger is skipped so it stays literal. +fn normalize_inner(input: &str, disable_bare_second: bool) -> String { let input = input.trim(); // Apply custom user rules first (highest priority) @@ -100,9 +107,12 @@ pub fn normalize(input: &str) -> String { return result; } - // Try ordinal numbers - if let Some(result) = ordinal::parse(input) { - return result; + // Try ordinal numbers (issue #22: skip the bare "second" case when opted out). + let skip_ordinal = disable_bare_second && input.eq_ignore_ascii_case("second"); + if !skip_ordinal { + if let Some(result) = ordinal::parse(input) { + return result; + } } // Try cardinal number @@ -136,7 +146,7 @@ pub fn normalize(input: &str) -> String { /// ``` pub fn normalize_with_options(input: &str, options: NormalizeOptions) -> String { if !options.concat_compound_numbers { - return normalize(input); + return normalize_inner(input, options.disable_bare_second); } let input = input.trim(); @@ -162,7 +172,7 @@ pub fn normalize_with_options(input: &str, options: NormalizeOptions) -> String // Fall back to the standard pipeline for anything not recognised // (money, measure, decimal, ordinal, telephone, etc.). - normalize(input) + normalize_inner(input, options.disable_bare_second) } /// Normalize with language selection. @@ -900,7 +910,11 @@ fn tn_parse_span_lang(span: &str, lang: &str) -> Option<(String, u8)> { /// `money`=95) and the regular cardinal fallback at 70 is skipped (the /// concat-compound reader already falls back to grammatical when the /// concat pattern does not apply). -fn parse_span(span: &str, concat_compound: bool) -> Option<(String, u8)> { +fn parse_span( + span: &str, + concat_compound: bool, + disable_bare_second: bool, +) -> Option<(String, u8)> { let token_count = span.split_whitespace().count(); if token_count == 0 { return None; @@ -945,8 +959,17 @@ fn parse_span(span: &str, concat_compound: bool) -> Option<(String, u8)> { if let Some(result) = decimal::parse(span) { return Some((result, 80)); } - if let Some(result) = ordinal::parse(span) { - return Some((result, 75)); + // Issue #22: when `disable_bare_second` is set, the bare standalone + // word `"second"` is *not* converted to `"2nd"` so phrases like + // `"give me a second"` stay literal. Compound ordinals + // (`"twenty second"`) still flow through this branch because they + // span 2+ tokens. + let skip_ordinal = + disable_bare_second && token_count == 1 && span.trim().eq_ignore_ascii_case("second"); + if !skip_ordinal { + if let Some(result) = ordinal::parse(span) { + return Some((result, 75)); + } } // Default cardinal fallback (priority 70). In concat-compound mode the @@ -973,7 +996,7 @@ fn parse_span(span: &str, concat_compound: bool) -> Option<(String, u8)> { /// assert_eq!(normalize_sentence("hello world"), "hello world"); /// ``` pub fn normalize_sentence(input: &str) -> String { - normalize_sentence_inner(input, DEFAULT_MAX_SPAN_TOKENS, false) + normalize_sentence_inner(input, DEFAULT_MAX_SPAN_TOKENS, false, false) } /// Unified sentence-mode entry point. @@ -1002,7 +1025,12 @@ pub fn normalize_sentence(input: &str) -> String { /// ``` pub fn normalize_sentence_with_options(input: &str, options: NormalizeOptions) -> String { let max_span = options.max_span_tokens.unwrap_or(DEFAULT_MAX_SPAN_TOKENS); - normalize_sentence_inner(input, max_span, options.concat_compound_numbers) + normalize_sentence_inner( + input, + max_span, + options.concat_compound_numbers, + options.disable_bare_second, + ) } /// Per-pretoken record: the token text plus the original separator that @@ -1110,13 +1138,23 @@ where let max_end = usize::min(pretokens.len(), i + max_span); let mut best: Option<(usize, String, u8)> = None; - // Longest-span-first search keeps replacements stable and non-overlapping. + // Longest-span-first search keeps replacements stable and + // non-overlapping. Reconstruct each span using the per-pretoken + // separator so adjacency is preserved: pretokens that came from a + // single original word (e.g. `"Dr."` -> `["Dr", "."]` with sep=`""` + // on the period) re-emerge as `"Dr."`, not `"Dr ."`. This is what + // the TN whitelist needs to match abbreviations like `"e.g."` / + // `"Prof."` (PR #25 review feedback). Trying smaller `end` values + // already handles "ignore trailing punctuation" cases like + // `"twenty one,"` -> match `"twenty one"`. for end in (i + 1..=max_end).rev() { - let span: String = pretokens[i..end] - .iter() - .map(|p| p.text.as_str()) - .collect::>() - .join(" "); + let mut span = String::new(); + for (idx, p) in pretokens[i..end].iter().enumerate() { + if idx > 0 { + span.push_str(p.sep); + } + span.push_str(&p.text); + } let Some((candidate, score)) = parser(&span) else { continue; }; @@ -1157,9 +1195,15 @@ where out } -/// Sentence-mode dispatch loop. The `concat_compound` flag is forwarded to -/// [`parse_span`] so each span sees the right tagger priorities. -fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound: bool) -> String { +/// Sentence-mode dispatch loop. The `concat_compound` and +/// `disable_bare_second` flags are forwarded to [`parse_span`] so each span +/// sees the right tagger priorities. +fn normalize_sentence_inner( + input: &str, + max_span_tokens: usize, + concat_compound: bool, + disable_bare_second: bool, +) -> String { let trimmed = input.trim(); if trimmed.is_empty() { return trimmed.to_string(); @@ -1167,7 +1211,7 @@ fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound let pretokens = pretokenize(trimmed); sentence_loop(&pretokens, max_span_tokens, |span| { - parse_span(span, concat_compound) + parse_span(span, concat_compound, disable_bare_second) }) } diff --git a/src/options.rs b/src/options.rs index 24b5c99..fc9ab85 100644 --- a/src/options.rs +++ b/src/options.rs @@ -19,6 +19,12 @@ pub struct NormalizeOptions { /// Sentence-mode sliding-window cap (in tokens). `None` uses /// [`DEFAULT_MAX_SPAN_TOKENS`]. Ignored in single-expression mode. pub max_span_tokens: Option, + + /// Skip the ordinal tagger for the bare word `"second"` so it is not + /// rewritten to `"2nd"` in phrases like `"give me a second"` (issue #22). + /// Compound ordinals (`"twenty second"` → `"22nd"`) and date contexts + /// (`"January second twenty twenty five"`) still convert. Default `false`. + pub disable_bare_second: bool, } impl NormalizeOptions { @@ -27,6 +33,7 @@ impl NormalizeOptions { Self { concat_compound_numbers: false, max_span_tokens: None, + disable_bare_second: false, } } @@ -41,4 +48,10 @@ impl NormalizeOptions { self.max_span_tokens = Some(max_span_tokens); self } + + /// Set [`Self::disable_bare_second`]. + pub const fn with_disable_bare_second(mut self, enabled: bool) -> Self { + self.disable_bare_second = enabled; + self + } } diff --git a/src/wasm.rs b/src/wasm.rs index e9de333..13f475b 100644 --- a/src/wasm.rs +++ b/src/wasm.rs @@ -13,7 +13,11 @@ use crate::{ /// /// `max_span_tokens == 0` is treated as "use library default" so JS callers /// can pass `0` rather than dealing with optional values across the boundary. -fn js_options(concat_compound_numbers: bool, max_span_tokens: u32) -> NormalizeOptions { +fn js_options( + concat_compound_numbers: bool, + max_span_tokens: u32, + disable_bare_second: bool, +) -> NormalizeOptions { NormalizeOptions { concat_compound_numbers, max_span_tokens: if max_span_tokens == 0 { @@ -21,6 +25,7 @@ fn js_options(concat_compound_numbers: bool, max_span_tokens: u32) -> NormalizeO } else { Some(max_span_tokens as usize) }, + disable_bare_second, } } @@ -48,21 +53,39 @@ pub fn normalize_sentence_js(input: &str) -> String { /// Unified single-expression normalize. `concatCompoundNumbers=true` reads /// consecutive number words as concatenation rather than addition, e.g. /// `"thirty five sixty two"` → `"3562"`, `"seven eighty eight"` → `"788"`. +/// `disableBareSecond=true` blocks the bare word `"second"` from converting +/// to `"2nd"` (issue #22). #[wasm_bindgen(js_name = normalizeWithOptions)] -pub fn normalize_with_options_js(input: &str, concat_compound_numbers: bool) -> String { - normalize_with_options(input, js_options(concat_compound_numbers, 0)) +pub fn normalize_with_options_js( + input: &str, + concat_compound_numbers: bool, + disable_bare_second: bool, +) -> String { + normalize_with_options( + input, + js_options(concat_compound_numbers, 0, disable_bare_second), + ) } /// Unified sentence normalize. `concatCompoundNumbers` mirrors the /// single-expression flag; `maxSpanTokens == 0` means "use library default" -/// (16). +/// (16). `disableBareSecond=true` keeps phrases like `"give me a second"` +/// literal (issue #22). #[wasm_bindgen(js_name = normalizeSentenceWithOptions)] pub fn normalize_sentence_with_options_js( input: &str, concat_compound_numbers: bool, max_span_tokens: u32, + disable_bare_second: bool, ) -> String { - normalize_sentence_with_options(input, js_options(concat_compound_numbers, max_span_tokens)) + normalize_sentence_with_options( + input, + js_options( + concat_compound_numbers, + max_span_tokens, + disable_bare_second, + ), + ) } #[wasm_bindgen(js_name = tnNormalize)] diff --git a/swift-test/Sources/CNemoTextProcessing/include/nemo_text_processing.h b/swift-test/Sources/CNemoTextProcessing/include/nemo_text_processing.h index 833fc41..98ce122 100644 --- a/swift-test/Sources/CNemoTextProcessing/include/nemo_text_processing.h +++ b/swift-test/Sources/CNemoTextProcessing/include/nemo_text_processing.h @@ -9,11 +9,16 @@ extern "C" { char* nemo_normalize(const char* input); char* nemo_normalize_sentence(const char* input); -char* nemo_normalize_with_options(const char* input, uint32_t concat_compound_numbers); +char* nemo_normalize_with_options( + const char* input, + uint32_t concat_compound_numbers, + uint32_t disable_bare_second +); char* nemo_normalize_sentence_with_options( const char* input, uint32_t concat_compound_numbers, - uint32_t max_span_tokens + uint32_t max_span_tokens, + uint32_t disable_bare_second ); void nemo_add_rule(const char* spoken, const char* written); int32_t nemo_remove_rule(const char* spoken); diff --git a/swift-test/Sources/NemoTest/NemoTest.swift b/swift-test/Sources/NemoTest/NemoTest.swift index 5d9add2..57f6e8c 100644 --- a/swift-test/Sources/NemoTest/NemoTest.swift +++ b/swift-test/Sources/NemoTest/NemoTest.swift @@ -19,19 +19,28 @@ enum NemoTextProcessing { static func normalizeSentence( _ input: String, concatCompoundNumbers: Bool = false, - maxSpanTokens: UInt32 = 0 + maxSpanTokens: UInt32 = 0, + disableBareSecond: Bool = false ) -> String { let concatFlag: UInt32 = concatCompoundNumbers ? 1 : 0 + let bareSecondFlag: UInt32 = disableBareSecond ? 1 : 0 guard let resultPtr = nemo_normalize_sentence_with_options( - input, concatFlag, maxSpanTokens + input, concatFlag, maxSpanTokens, bareSecondFlag ) else { return input } defer { nemo_free_string(resultPtr) } return String(cString: resultPtr) } - static func normalize(_ input: String, concatCompoundNumbers: Bool) -> String { + static func normalize( + _ input: String, + concatCompoundNumbers: Bool = false, + disableBareSecond: Bool = false + ) -> String { let concatFlag: UInt32 = concatCompoundNumbers ? 1 : 0 - guard let resultPtr = nemo_normalize_with_options(input, concatFlag) else { return input } + let bareSecondFlag: UInt32 = disableBareSecond ? 1 : 0 + guard let resultPtr = nemo_normalize_with_options( + input, concatFlag, bareSecondFlag + ) else { return input } defer { nemo_free_string(resultPtr) } return String(cString: resultPtr) } diff --git a/swift/NemoTextProcessing.swift b/swift/NemoTextProcessing.swift index d27d592..605d3b0 100644 --- a/swift/NemoTextProcessing.swift +++ b/swift/NemoTextProcessing.swift @@ -68,19 +68,24 @@ public enum NemoTextProcessing { /// `"seven eighty eight"` → `"788"`) instead of adding. /// - maxSpanTokens: Maximum consecutive tokens per normalizable span. /// Pass `0` to use the library default (16). + /// - disableBareSecond: When true, the bare word `"second"` is NOT + /// rewritten to `"2nd"` (issue #22). Compound ordinals like + /// `"twenty second"` → `"22nd"` still convert. /// - Returns: Sentence with spoken-form spans replaced public static func normalizeSentence( _ input: String, concatCompoundNumbers: Bool = false, - maxSpanTokens: UInt32 = 0 + maxSpanTokens: UInt32 = 0, + disableBareSecond: Bool = false ) -> String { guard let cString = input.cString(using: .utf8) else { return input } let concatFlag: UInt32 = concatCompoundNumbers ? 1 : 0 + let bareSecondFlag: UInt32 = disableBareSecond ? 1 : 0 guard let resultPtr = nemo_normalize_sentence_with_options( - cString, concatFlag, maxSpanTokens + cString, concatFlag, maxSpanTokens, bareSecondFlag ) else { return input } @@ -94,18 +99,23 @@ public enum NemoTextProcessing { /// /// - Parameters: /// - input: Spoken-form text - /// - concatCompoundNumbers: See `normalizeSentence(_:concatCompoundNumbers:maxSpanTokens:)`. + /// - concatCompoundNumbers: See `normalizeSentence(_:concatCompoundNumbers:maxSpanTokens:disableBareSecond:)`. + /// - disableBareSecond: See `normalizeSentence(_:concatCompoundNumbers:maxSpanTokens:disableBareSecond:)`. /// - Returns: Written-form text, or original if no normalization applies. public static func normalize( _ input: String, - concatCompoundNumbers: Bool + concatCompoundNumbers: Bool = false, + disableBareSecond: Bool = false ) -> String { guard let cString = input.cString(using: .utf8) else { return input } let concatFlag: UInt32 = concatCompoundNumbers ? 1 : 0 - guard let resultPtr = nemo_normalize_with_options(cString, concatFlag) else { + let bareSecondFlag: UInt32 = disableBareSecond ? 1 : 0 + guard let resultPtr = nemo_normalize_with_options( + cString, concatFlag, bareSecondFlag + ) else { return input } diff --git a/swift/include/nemo_text_processing.h b/swift/include/nemo_text_processing.h index 7a2abec..809b925 100644 --- a/swift/include/nemo_text_processing.h +++ b/swift/include/nemo_text_processing.h @@ -40,9 +40,17 @@ char* nemo_normalize_sentence(const char* input); * words concatenate (aviation flight-number style — e.g. * "thirty five sixty two" -> "3562", "seven eighty eight" -> "788") * instead of adding. + * @param disable_bare_second When non-zero, the bare word "second" is NOT + * rewritten to "2nd" (issue #22), so phrases like + * "give me a second" stay literal. Compound ordinals + * ("twenty second" -> "22nd") still convert. * @return Newly allocated string, must be freed with nemo_free_string(). */ -char* nemo_normalize_with_options(const char* input, uint32_t concat_compound_numbers); +char* nemo_normalize_with_options( + const char* input, + uint32_t concat_compound_numbers, + uint32_t disable_bare_second +); /** * Normalize a full sentence with caller-specified options. @@ -51,12 +59,14 @@ char* nemo_normalize_with_options(const char* input, uint32_t concat_compound_nu * @param concat_compound_numbers See nemo_normalize_with_options. * @param max_span_tokens Maximum consecutive tokens per span. Pass 0 to * use the library default (16). + * @param disable_bare_second See nemo_normalize_with_options. * @return Newly allocated string, must be freed with nemo_free_string(). */ char* nemo_normalize_sentence_with_options( const char* input, uint32_t concat_compound_numbers, - uint32_t max_span_tokens + uint32_t max_span_tokens, + uint32_t disable_bare_second ); /** diff --git a/tests/en_tests.rs b/tests/en_tests.rs index 53cf871..44aa263 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -1169,6 +1169,101 @@ fn test_issue_21_space_before_punct_preserved() { ); } +/// Issue #22: bare `"second"` is ambiguous (ordinal vs SI time unit vs +/// verb). The opt-in `disable_bare_second` flag blocks the ordinal tagger +/// for the standalone word so phrases like `"give me a second"` stay +/// literal, while compound ordinals (`"twenty second"` → `"22nd"`) and date +/// contexts still convert. +#[test] +fn test_issue_22_default_behavior_unchanged() { + // No flag, default options: today's behavior is preserved. + assert_eq!( + normalize_sentence("Give me a second to check."), + "Give me a 2nd to check." + ); + assert_eq!( + normalize_sentence_with_options("Give me a second to check.", NormalizeOptions::new()), + "Give me a 2nd to check." + ); +} + +#[test] +fn test_issue_22_sentence_disable_bare_second() { + let opts = NormalizeOptions::new().with_disable_bare_second(true); + + // Reported case from issue #22. + assert_eq!( + normalize_sentence_with_options("Give me a second to check.", opts), + "Give me a second to check." + ); + + // Other common false-positive contexts. + assert_eq!( + normalize_sentence_with_options("Wait a second, I need to think.", opts), + "Wait a second, I need to think." + ); + assert_eq!( + normalize_sentence_with_options("I'll second that motion.", opts), + "I'll second that motion." + ); + + // Compound ordinals must still convert. + assert_eq!( + normalize_sentence_with_options("He finished in the twenty second place", opts), + "He finished in the 22nd place" + ); + assert_eq!( + normalize_sentence_with_options("the one hundred second time", opts), + "the 102nd time" + ); + + // Date contexts still route through the date tagger. + assert_eq!( + normalize_sentence_with_options("January second twenty twenty five", opts), + "January 2 2025" + ); + + // The flag also drops bare-ordinal "second" used as an actual ordinal + // (e.g. "I came in second"); that's the documented trade-off for + // killing the false positives. + assert_eq!( + normalize_sentence_with_options("I came in second", opts), + "I came in second" + ); + + // Other ordinals are not affected by this flag. + assert_eq!( + normalize_sentence_with_options("the third time", opts), + "the 3rd time" + ); + assert_eq!( + normalize_sentence_with_options("the first time", opts), + "the 1st time" + ); +} + +#[test] +fn test_issue_22_single_expression_disable_bare_second() { + let opts = NormalizeOptions::new().with_disable_bare_second(true); + + // Bare "second" stays literal in single-expression mode when opted in. + assert_eq!(normalize_with_options("second", opts), "second"); + assert_eq!(normalize_with_options("Second", opts), "Second"); + + // Compound forms still convert in single-expression mode. + assert_eq!(normalize_with_options("twenty second", opts), "22nd"); + + // Other ordinals unaffected. + assert_eq!(normalize_with_options("third", opts), "3rd"); + assert_eq!(normalize_with_options("first", opts), "1st"); + + // Default options: bare "second" still converts (no behavior change). + assert_eq!( + normalize_with_options("second", NormalizeOptions::new()), + "2nd" + ); +} + /// Punctuation other than `,` and `.` is also split, including paired /// brackets and quotes, while contractions and hyphenated words remain /// intact. diff --git a/tests/en_tn_tests.rs b/tests/en_tn_tests.rs index c60ee8b..0a4c2ef 100644 --- a/tests/en_tn_tests.rs +++ b/tests/en_tn_tests.rs @@ -230,3 +230,34 @@ fn test_issue_16_tn_normalize_sentence_public_api() { normalized_text ); } + +/// Regression: PR #25 introduced a pretokenizer that splits trailing +/// punctuation off of words so ITN can match `"twenty one,"` as +/// `"twenty one"`. The shared sentence loop must rejoin pretokens using +/// their original separator (no inserted whitespace), otherwise the TN +/// whitelist sees `"Dr ."` instead of `"Dr."` and abbreviation entries +/// like `"e.g."` / `"Prof."` / `"Inc."` stop matching. Reported by Devin +/// AI on PR #25: +/// https://github.com/FluidInference/text-processing-rs/pull/25#pullrequestreview-4178192149 +#[test] +fn test_pr25_tn_abbreviation_regression() { + // Both-form whitelist entries (`Dr` / `Dr.`, `vs` / `vs.`): the period + // must be consumed by the abbreviation, not left orphaned. + assert_eq!( + tn_normalize_sentence("I see Dr. Smith today."), + "I see doctor Smith today." + ); + assert_eq!(tn_normalize_sentence("vs. them"), "versus them"); + + // Period-only whitelist entries: only match when the trailing period + // is preserved across the pretokenizer split. + assert_eq!( + tn_normalize_sentence("e.g. she is here"), + "for example she is here" + ); + assert_eq!( + tn_normalize_sentence("Inc. and Co."), + "incorporated and company" + ); + assert_eq!(tn_normalize_sentence("Prof. Jones"), "professor Jones"); +}