From f8abe9e957dbcf5d2c0ac9c8431f3a1766d246f0 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 23:32:02 -0400 Subject: [PATCH 1/4] feat: unified NormalizeOptions API + fix #23 compound concat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address two pieces of feedback from @hongbo-miao: - Issue #15 comment: instead of separate `normalize_sentence_aviation` variants, expose a unified entry point with an options struct. - Issue #23 comment: prefer a generic flag (not a `Domain` label) since other code-style speech contexts want the same "stop adding two numbers" behavior. API --- - New `NormalizeOptions { concat_compound_numbers, max_span_tokens }` with builder helpers. - New `normalize_with_options` and `normalize_sentence_with_options` unified entry points. - Existing `normalize_aviation`, `normalize_sentence_aviation*`, `normalize_sentence_with_max_span` stay as thin wrappers — no breaking change for current callers. - FFI: `nemo_normalize_with_options(input, concat)` and `nemo_normalize_sentence_with_options(input, concat, max_span)`. - WASM: `normalizeWithOptions` / `normalizeSentenceWithOptions`. Issue #23 fix ------------- `words_to_number_aviation` previously only handled digit-prefix + grammatical compound (`"seven eighty eight"` → `"788"`). It still added consecutive grammatical compounds together, so `"thirty five sixty two"` resolved to `"97"` (= 35 + 62). Replaced the digit-prefix path with a general `peel_compound_chunks` helper that greedily splits a phrase into 0-99 chunks and concatenates them when there are 2+. Single-chunk inputs (`"twenty one"`) still go through grammatical, and any phrase with a scale word (`"two thousand seventeen"`) keeps its addition semantics. Updated one stale test (`"twenty one forty two"` was locking in the buggy `63`; it now correctly produces `2142`). 
--- src/ffi.rs | 94 ++++++++++++++++++++++++++- src/itn/en/cardinal.rs | 74 ++++++++++++++++------ src/lib.rs | 140 +++++++++++++++++++++++++++++++++++++++-- src/wasm.rs | 42 ++++++++++++- tests/en_tests.rs | 131 +++++++++++++++++++++++++++++++++++++- 5 files changed, 450 insertions(+), 31 deletions(-) diff --git a/src/ffi.rs b/src/ffi.rs index 0086a6e..d5f50a8 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -5,11 +5,30 @@ use std::ptr; use crate::{ custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, - normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, tn_normalize, - tn_normalize_lang, tn_normalize_sentence, tn_normalize_sentence_lang, - tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, + normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, + normalize_sentence_with_options, normalize_with_options, tn_normalize, tn_normalize_lang, + tn_normalize_sentence, tn_normalize_sentence_lang, tn_normalize_sentence_with_max_span, + tn_normalize_sentence_with_max_span_lang, NormalizeOptions, }; +/// Build [`NormalizeOptions`] from FFI primitives. +/// +/// `concat_compound_numbers`: any non-zero value enables concat behavior +/// (`"thirty five sixty two"` → `"3562"`, `"seven eighty eight"` → `"788"`). +/// +/// `max_span_tokens`: `0` means "use library default" (16); any positive +/// value is a caller-specified max span. +fn options_from_ffi(concat_compound_numbers: u32, max_span_tokens: u32) -> NormalizeOptions { + NormalizeOptions { + concat_compound_numbers: concat_compound_numbers != 0, + max_span_tokens: if max_span_tokens == 0 { + None + } else { + Some(max_span_tokens as usize) + }, + } +} + /// Normalize spoken-form text to written form. /// /// # Safety @@ -174,6 +193,75 @@ pub unsafe extern "C" fn nemo_normalize_sentence_aviation_with_max_span( } } +/// Unified single-expression normalize with caller-specified options. 
+/// +/// `concat_compound_numbers`: `0` for standard ITN, non-zero for +/// concat-compound (aviation-style) reading where consecutive number words +/// concatenate rather than add — e.g. `"thirty five sixty two"` → `"3562"`, +/// `"seven eighty eight"` → `"788"`. +/// +/// # Safety +/// - `input` must be a valid null-terminated UTF-8 string +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_normalize_with_options( + input: *const c_char, + concat_compound_numbers: u32, +) -> *mut c_char { + if input.is_null() { + return ptr::null_mut(); + } + + let c_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let options = options_from_ffi(concat_compound_numbers, 0); + let result = normalize_with_options(c_str, options); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + +/// Unified sentence normalize with caller-specified options. +/// +/// `concat_compound_numbers`: `0` for standard ITN, non-zero for +/// concat-compound reading. +/// +/// `max_span_tokens`: +/// - `0` — use library default (`16`). +/// - `>0` — use the specified max span. 
+/// +/// # Safety +/// - `input` must be a valid null-terminated UTF-8 string +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_normalize_sentence_with_options( + input: *const c_char, + concat_compound_numbers: u32, + max_span_tokens: u32, +) -> *mut c_char { + if input.is_null() { + return ptr::null_mut(); + } + + let c_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let options = options_from_ffi(concat_compound_numbers, max_span_tokens); + let result = normalize_sentence_with_options(c_str, options); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + /// Free a string allocated by nemo_normalize or nemo_normalize_sentence. /// /// # Safety diff --git a/src/itn/en/cardinal.rs b/src/itn/en/cardinal.rs index 71d4e14..3bc33c7 100644 --- a/src/itn/en/cardinal.rs +++ b/src/itn/en/cardinal.rs @@ -199,10 +199,17 @@ pub fn words_to_number(input: &str) -> Option { /// Aviation / flight-number / call-sign reading of a number phrase. /// -/// Recognises a leading run of single-digit words concatenated with a trailing -/// grammatical compound, e.g. `"seven eighty eight"` → `788`, -/// `"two thirty five"` → `235`. Falls back to [`words_to_number`] when the -/// aviation pattern does not apply (no digit prefix, scale word present, etc.). +/// Recognises consecutive 0-99 compounds and concatenates them rather than +/// summing. Examples: +/// - `"seven eighty eight"` → `788` (digit + tens+ones compound) +/// - `"two thirty five"` → `235` +/// - `"thirty five sixty two"` → `3562` (two tens+ones compounds — fixes #23) +/// - `"twenty one"` → `21` (single chunk; identical to grammatical) +/// +/// Falls back to [`words_to_number`] (grammatical addition) when the chunk +/// pattern does not apply, including any phrase containing a scale word +/// (`hundred`, `thousand`, ...). 
This preserves `"two thousand seventeen"` +/// → `2017`. /// /// This is **opt-in**: callers reach for it explicitly from flight-number / /// call-sign contexts. Generic ITN/TN dispatch keeps using [`words_to_number`] @@ -230,22 +237,13 @@ pub fn words_to_number_aviation(input: &str) -> Option { .ok(); } - // Aviation flight-number style: digit prefix + grammatical compound. - // "seven eighty eight" → "7" ‖ 88 = 788. Skipped if a scale word appears, - // since "two thousand seventeen" must stay grammatical (= 2017, not 22017). + // Concatenated 0-99 compound chunks. Skipped if a scale word appears, + // since `"two thousand seventeen"` must stay grammatical (= 2017). let has_scale = words.iter().any(|w| SCALES.contains_key(*w)); if !has_scale { - let prefix_len = words - .iter() - .take_while(|w| single_digit_char(w).is_some()) - .count(); - if prefix_len >= 1 && prefix_len < words.len() { - if let Some(rest_num) = grammatical_words_to_number(&words[prefix_len..]) { - let prefix: String = words[..prefix_len] - .iter() - .map(|w| single_digit_char(w).unwrap()) - .collect(); - let combined = format!("{}{}", prefix, rest_num); + if let Some(chunks) = peel_compound_chunks(&words) { + if chunks.len() >= 2 { + let combined: String = chunks.iter().map(|n| n.to_string()).collect(); return combined.parse::().ok(); } } @@ -254,6 +252,46 @@ pub fn words_to_number_aviation(input: &str) -> Option { grammatical_words_to_number(&words) } +/// Greedily peel `words` into 0-99 number chunks. Each chunk is one of: +/// - A single ONES word (0-19), e.g. `"seven"` → 7, `"sixteen"` → 16 +/// - A single TENS word (20, 30, ... 90), e.g. `"twenty"` → 20 +/// - A TENS word followed by a ones word (1-9), e.g. `"twenty one"` → 21 +/// +/// Returns `None` if any token isn't a recognised number word, so this +/// function refuses to swallow non-number tokens. `"and"` / `"a"` filler +/// must already be removed by the caller. 
+fn peel_compound_chunks(words: &[&str]) -> Option<Vec<i128>> { + let mut chunks = Vec::new(); + let mut i = 0; + while i < words.len() { + if let Some(&tens) = TENS.get(words[i]) { + // Greedy: try TENS + ones (1-9) before falling back to standalone. + if i + 1 < words.len() { + if let Some(&ones) = ONES.get(words[i + 1]) { + if (1..=9).contains(&ones) { + chunks.push((tens + ones) as i128); + i += 2; + continue; + } + } + } + chunks.push(tens as i128); + i += 1; + } else if let Some(&ones) = ONES.get(words[i]) { + // 0-19 standalone (covers digit words, ten, and teens). + chunks.push(ones as i128); + i += 1; + } else { + return None; + } + } + if chunks.is_empty() { + None + } else { + Some(chunks) + } +} + /// Parse a grammatical English number with running-sum + scale multiplication. fn grammatical_words_to_number(words: &[&str]) -> Option { // "eleven hundred" = 1100, "twenty hundred" = 2000 diff --git a/src/lib.rs b/src/lib.rs index dbc0c46..df3c2f5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,6 +30,68 @@ use itn::en::{ whitelist, word, }; +/// Options for the unified [`normalize_with_options`] / +/// [`normalize_sentence_with_options`] entry points. +/// +/// Keeping options on a struct (rather than separate `*_aviation` / +/// `*_with_max_span` functions) lets new knobs land without exploding the +/// public API surface — see issues #15 and #23 for the motivating discussion. +/// +/// The flags are intentionally orthogonal and *not* tied to a particular +/// domain. Aviation, military codes, dispatch IDs, etc. all reuse the same +/// underlying behavior toggles. 
+/// +/// # Examples +/// +/// ``` +/// use text_processing_rs::{normalize_sentence_with_options, NormalizeOptions}; +/// +/// let opts = NormalizeOptions { +/// concat_compound_numbers: true, +/// max_span_tokens: Some(8), +/// }; +/// assert_eq!( +/// normalize_sentence_with_options("United seven eighty eight", opts), +/// "United 788" +/// ); +/// ``` +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct NormalizeOptions { + /// When `true`, sequences of spoken number words concatenate rather than + /// add. `"thirty five sixty two"` → `"3562"` (instead of `35 + 62 = 97`) + /// and `"seven eighty eight"` → `"788"`. Aviation, flight-numbers, + /// call-signs, and other code-style readings want this on. + /// + /// Scale-word grammar is preserved: `"two thousand seventeen"` still + /// resolves to `"2017"` regardless of this flag. + pub concat_compound_numbers: bool, + /// Maximum span size (tokens) considered in sentence mode. `None` means + /// use [`DEFAULT_MAX_SPAN_TOKENS`]. Ignored by [`normalize_with_options`]. + pub max_span_tokens: Option<usize>, +} + +impl NormalizeOptions { + /// Default options: standard ITN dispatch, default max span. + pub const fn new() -> Self { + Self { + concat_compound_numbers: false, + max_span_tokens: None, + } + } + + /// Enable / disable compound-number concatenation. + pub const fn with_concat_compound_numbers(mut self, enabled: bool) -> Self { + self.concat_compound_numbers = enabled; + self + } + + /// Set the sentence-mode max span (in tokens). + pub const fn with_max_span_tokens(mut self, max_span_tokens: usize) -> Self { + self.max_span_tokens = Some(max_span_tokens); + self + } +} + /// Normalize spoken-form text to written form. /// /// Tries taggers in order of specificity (most specific first). 
@@ -135,6 +197,36 @@ pub fn normalize(input: &str) -> String { /// assert_eq!(normalize_aviation("hello world"), "hello world"); /// ``` pub fn normalize_aviation(input: &str) -> String { + normalize_with_options( + input, + NormalizeOptions::new().with_concat_compound_numbers(true), + ) +} + +/// Unified single-expression normalize entry point. +/// +/// Switches between standard and concat-compound (aviation-style) dispatch +/// based on `options.concat_compound_numbers`. The `max_span_tokens` field on +/// [`NormalizeOptions`] is ignored here — it only applies to +/// [`normalize_sentence_with_options`]. +/// +/// ``` +/// use text_processing_rs::{normalize_with_options, NormalizeOptions}; +/// +/// let opts = NormalizeOptions::new().with_concat_compound_numbers(true); +/// assert_eq!(normalize_with_options("seven eighty eight", opts), "788"); +/// ``` +pub fn normalize_with_options(input: &str, options: NormalizeOptions) -> String { + if options.concat_compound_numbers { + normalize_aviation_inner(input) + } else { + normalize(input) + } +} + +/// Aviation single-expression dispatch. Kept private; callers go through +/// [`normalize_aviation`] or [`normalize_with_options`]. +fn normalize_aviation_inner(input: &str) -> String { let input = input.trim(); // High-confidence rules still win. @@ -152,7 +244,7 @@ pub fn normalize_aviation(input: &str) -> String { } // Aviation cardinal beats time/date here. This is the whole point of - // calling `normalize_aviation` instead of `normalize`. + // the aviation domain. if let Some(num) = cardinal::parse_aviation(input) { return num; } @@ -975,6 +1067,35 @@ pub fn normalize_sentence(input: &str) -> String { normalize_sentence_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) } +/// Unified sentence-mode entry point. +/// +/// Combines `concat_compound_numbers` and `max_span_tokens` configuration in +/// a single call. When `max_span_tokens` is `None`, [`DEFAULT_MAX_SPAN_TOKENS`] +/// (16) is used. 
+/// +/// ``` +/// use text_processing_rs::{normalize_sentence_with_options, NormalizeOptions}; +/// +/// // Default behavior, default span +/// assert_eq!( +/// normalize_sentence_with_options("I have twenty one apples", NormalizeOptions::new()), +/// "I have 21 apples" +/// ); +/// +/// // Concat-compound (aviation-style), custom span +/// let opts = NormalizeOptions::new() +/// .with_concat_compound_numbers(true) +/// .with_max_span_tokens(8); +/// assert_eq!( +/// normalize_sentence_with_options("United seven eighty eight", opts), +/// "United 788" +/// ); +/// ``` +pub fn normalize_sentence_with_options(input: &str, options: NormalizeOptions) -> String { + let max_span = options.max_span_tokens.unwrap_or(DEFAULT_MAX_SPAN_TOKENS); + normalize_sentence_inner(input, max_span, options.concat_compound_numbers) +} + /// Sentence-mode equivalent of [`normalize_aviation`]. Aviation cardinal /// runs at priority 89 (above `date`=88 / `time`=85, below `measure`=90 / /// `money`=95), so flight-number-style spans win over date/time while @@ -999,12 +1120,20 @@ pub fn normalize_sentence(input: &str) -> String { /// ); /// ``` pub fn normalize_sentence_aviation(input: &str) -> String { - normalize_sentence_aviation_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) + normalize_sentence_with_options( + input, + NormalizeOptions::new().with_concat_compound_numbers(true), + ) } /// [`normalize_sentence_aviation`] with a configurable max span size. pub fn normalize_sentence_aviation_with_max_span(input: &str, max_span_tokens: usize) -> String { - normalize_sentence_inner(input, max_span_tokens, true) + normalize_sentence_with_options( + input, + NormalizeOptions::new() + .with_concat_compound_numbers(true) + .with_max_span_tokens(max_span_tokens), + ) } /// Normalize a full sentence with a configurable max span size. 
@@ -1021,7 +1150,10 @@ pub fn normalize_sentence_aviation_with_max_span(input: &str, max_span_tokens: u /// assert_eq!(normalize_sentence_with_max_span("I have twenty one apples", 4), "I have 21 apples"); /// ``` pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> String { - normalize_sentence_inner(input, max_span_tokens, false) + normalize_sentence_with_options( + input, + NormalizeOptions::new().with_max_span_tokens(max_span_tokens), + ) } /// Sentence-mode dispatch loop. The `aviation` flag is forwarded to diff --git a/src/wasm.rs b/src/wasm.rs index 76b54da..5a3d375 100644 --- a/src/wasm.rs +++ b/src/wasm.rs @@ -5,11 +5,27 @@ use wasm_bindgen::prelude::*; use crate::{ custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, - normalize_with_lang, tn_normalize, tn_normalize_lang, tn_normalize_sentence, - tn_normalize_sentence_lang, tn_normalize_sentence_with_max_span, - tn_normalize_sentence_with_max_span_lang, + normalize_sentence_with_options, normalize_with_lang, normalize_with_options, tn_normalize, + tn_normalize_lang, tn_normalize_sentence, tn_normalize_sentence_lang, + tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, + NormalizeOptions, }; +/// Build [`NormalizeOptions`] from JS-friendly primitives. +/// +/// `max_span_tokens == 0` is treated as "use library default" so JS callers +/// can pass `0` rather than dealing with optional values across the boundary. +fn js_options(concat_compound_numbers: bool, max_span_tokens: u32) -> NormalizeOptions { + NormalizeOptions { + concat_compound_numbers, + max_span_tokens: if max_span_tokens == 0 { + None + } else { + Some(max_span_tokens as usize) + }, + } +} + /// Initialize panic hook for better error messages in browser devtools. 
#[wasm_bindgen] pub fn set_panic_hook() { @@ -51,6 +67,26 @@ pub fn normalize_sentence_aviation_with_max_span_js(input: &str, max_span_tokens normalize_sentence_aviation_with_max_span(input, max_span_tokens as usize) } +/// Unified single-expression normalize. `concatCompoundNumbers=true` reads +/// consecutive number words as concatenation rather than addition, e.g. +/// `"thirty five sixty two"` → `"3562"`, `"seven eighty eight"` → `"788"`. +#[wasm_bindgen(js_name = normalizeWithOptions)] +pub fn normalize_with_options_js(input: &str, concat_compound_numbers: bool) -> String { + normalize_with_options(input, js_options(concat_compound_numbers, 0)) +} + +/// Unified sentence normalize. `concatCompoundNumbers` mirrors the +/// single-expression flag; `maxSpanTokens == 0` means "use library default" +/// (16). +#[wasm_bindgen(js_name = normalizeSentenceWithOptions)] +pub fn normalize_sentence_with_options_js( + input: &str, + concat_compound_numbers: bool, + max_span_tokens: u32, +) -> String { + normalize_sentence_with_options(input, js_options(concat_compound_numbers, max_span_tokens)) +} + #[wasm_bindgen(js_name = tnNormalize)] pub fn tn_normalize_js(input: &str) -> String { tn_normalize(input) diff --git a/tests/en_tests.rs b/tests/en_tests.rs index 414144b..2a0bd62 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -8,7 +8,8 @@ mod common; use std::path::Path; use text_processing_rs::{ custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, - normalize_sentence_with_max_span, + normalize_sentence_with_max_span, normalize_sentence_with_options, normalize_with_options, + NormalizeOptions, }; fn print_failures(results: &common::TestResults) { @@ -928,8 +929,9 @@ fn test_issue_14_normalize_aviation() { assert_eq!(normalize_aviation("seven eighty eight"), "788"); // Beats time tagger. assert_eq!(normalize_aviation("two thirty five"), "235"); - // Beats date old-year reading. 
- assert_eq!(normalize_aviation("twenty one forty two"), "63"); + // Two consecutive 0-99 compounds concatenate (issue #23 fix). + // Was previously `"63"` (= 20+1+40+2) under the old grammatical fallback. + assert_eq!(normalize_aviation("twenty one forty two"), "2142"); // Non-number phrases fall through unchanged. assert_eq!(normalize_aviation("hello world"), "hello world"); // Money / measure / decimal / ordinal still work via fallback to @@ -974,3 +976,126 @@ fn test_issue_14_normalize_sentence_aviation() { "I have 21 apples" ); } + +// ── Unified options API (issues #15 and #23 follow-up) ──────────────── + +/// `NormalizeOptions::default()` should behave identically to `normalize`. +#[test] +fn test_options_default_matches_normalize() { + let opts = NormalizeOptions::default(); + assert_eq!(normalize_with_options("two hundred", opts), "200"); + assert_eq!(normalize_with_options("five dollars", opts), "$5"); + // Time tagger still wins by default. + assert_eq!(normalize_with_options("two thirty five", opts), "02:35"); +} + +/// `concat_compound_numbers: true` should make `normalize_with_options` +/// behave like `normalize_aviation`. +#[test] +fn test_options_concat_matches_aviation() { + let opts = NormalizeOptions::new().with_concat_compound_numbers(true); + assert_eq!(normalize_with_options("seven eighty eight", opts), "788"); + assert_eq!(normalize_with_options("two thirty five", opts), "235"); + assert_eq!(normalize_with_options("hello world", opts), "hello world"); + // Money / scale-word fallthrough still works. + assert_eq!(normalize_with_options("five dollars", opts), "$5"); + assert_eq!( + normalize_with_options("two thousand seventeen", opts), + "2017" + ); +} + +/// Sentence mode default options match `normalize_sentence`. 
+#[test] +fn test_sentence_options_default_matches_default() { + let opts = NormalizeOptions::default(); + assert_eq!( + normalize_sentence_with_options("I have twenty one apples", opts), + "I have 21 apples" + ); + assert_eq!( + normalize_sentence_with_options("hello world", opts), + "hello world" + ); +} + +/// Sentence mode with concat enabled matches `normalize_sentence_aviation`. +#[test] +fn test_sentence_options_concat_compound() { + let opts = NormalizeOptions::new().with_concat_compound_numbers(true); + assert_eq!( + normalize_sentence_with_options("United seven eighty eight", opts), + "United 788" + ); + assert_eq!( + normalize_sentence_with_options("flight two thirty five departs at gate four", opts), + "flight 235 departs at gate 4" + ); +} + +/// `max_span_tokens` on the options struct is honoured. +#[test] +fn test_sentence_options_max_span() { + let opts = NormalizeOptions::new().with_max_span_tokens(4); + assert_eq!( + normalize_sentence_with_options("I have twenty one apples", opts), + "I have 21 apples" + ); +} + +/// `None` for `max_span_tokens` should give the same default as +/// `normalize_sentence` (16). +#[test] +fn test_sentence_options_none_max_span_uses_default() { + let with_default = NormalizeOptions::new(); + let with_explicit = NormalizeOptions::new().with_max_span_tokens(16); + let input = "United seven eighty eight"; + assert_eq!( + normalize_sentence_with_options(input, with_default), + normalize_sentence_with_options(input, with_explicit), + ); +} + +/// Builder methods compose: concat flag + max span on one struct. +#[test] +fn test_sentence_options_builder_compose() { + let opts = NormalizeOptions::new() + .with_concat_compound_numbers(true) + .with_max_span_tokens(8); + assert_eq!( + normalize_sentence_with_options("United seven eighty eight", opts), + "United 788" + ); +} + +/// Issue #23: consecutive 0-99 compounds should concatenate, not add. +/// `"thirty five sixty two"` → `"3562"`, not `"97"` (= 35 + 62). 
+#[test] +fn test_issue_23_compound_concat() { + // Whole-input single-expression form. + assert_eq!(normalize_aviation("thirty five sixty two"), "3562"); + + // Sentence form — the original report. + assert_eq!( + normalize_sentence_aviation( + "Alright thirty five sixty two appreciate your help United seven eighty eight" + ), + "Alright 3562 appreciate your help United 788" + ); + + // Through the unified options API too. + let opts = NormalizeOptions::new().with_concat_compound_numbers(true); + assert_eq!( + normalize_sentence_with_options("thirty five sixty two", opts), + "3562" + ); + + // Mixed digit prefix + compounds: "two thirty five sixty two" → 23562. + assert_eq!(normalize_aviation("two thirty five sixty two"), "23562"); + + // Single chunks must NOT concatenate (preserves grammatical reading). + assert_eq!(normalize_aviation("twenty one"), "21"); + + // Scale words still anchor grammatical addition. + assert_eq!(normalize_aviation("two thousand seventeen"), "2017"); +} From b2d22685ccb702f70e1fb273849296a72114211f Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 23:49:35 -0400 Subject: [PATCH 2/4] refactor!: drop deprecated aviation/max-span wrappers Per issue #15 and #23 follow-up: remove all backwards-compat wrappers now that callers pass through the unified `NormalizeOptions` API. 
Removed Rust functions: - `normalize_aviation`, `normalize_sentence_aviation` - `normalize_sentence_aviation_with_max_span` - `normalize_sentence_with_max_span` Removed FFI bindings: - `nemo_normalize_aviation`, `nemo_normalize_sentence_aviation` - `nemo_normalize_sentence_with_max_span` - `nemo_normalize_sentence_aviation_with_max_span` Removed WASM bindings: - `normalizeAviation`, `normalizeSentenceAviation` - `normalizeSentenceWithMaxSpan`, `normalizeSentenceAviationWithMaxSpan` Callers should switch to: - Rust: `normalize_with_options` / `normalize_sentence_with_options` - FFI: `nemo_normalize_with_options` / `nemo_normalize_sentence_with_options` - WASM: `normalizeWithOptions` / `normalizeSentenceWithOptions` Swift wrapper and headers updated accordingly. All Rust tests (2050 across the workspace incl. doc tests) and FFI tests pass. --- src/ffi.rs | 128 +-------------- src/lib.rs | 155 ++++-------------- src/wasm.rs | 30 +--- .../include/nemo_text_processing.h | 7 +- swift-test/Sources/NemoTest/NemoTest.swift | 18 +- swift/NemoTextProcessing.swift | 43 ++++- swift/include/nemo_text_processing.h | 24 ++- tests/en_tests.rs | 106 +++++++----- tests/extensive_tests.rs | 9 +- 9 files changed, 198 insertions(+), 322 deletions(-) diff --git a/src/ffi.rs b/src/ffi.rs index d5f50a8..3a4909b 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -4,10 +4,9 @@ use std::ffi::{c_char, CStr, CString}; use std::ptr; use crate::{ - custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, - normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, - normalize_sentence_with_options, normalize_with_options, tn_normalize, tn_normalize_lang, - tn_normalize_sentence, tn_normalize_sentence_lang, tn_normalize_sentence_with_max_span, + custom_rules, normalize, normalize_sentence, normalize_sentence_with_options, + normalize_with_options, tn_normalize, tn_normalize_lang, tn_normalize_sentence, + tn_normalize_sentence_lang, 
tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, NormalizeOptions, }; @@ -80,119 +79,6 @@ pub unsafe extern "C" fn nemo_normalize_sentence(input: *const c_char) -> *mut c } } -/// Normalize a full sentence with a configurable max span size. -/// -/// `max_span_tokens` controls the maximum number of consecutive tokens -/// considered as a single normalizable expression (default is 16). -/// -/// # Safety -/// - `input` must be a valid null-terminated UTF-8 string -/// - Returns a newly allocated string that must be freed with `nemo_free_string` -#[no_mangle] -pub unsafe extern "C" fn nemo_normalize_sentence_with_max_span( - input: *const c_char, - max_span_tokens: u32, -) -> *mut c_char { - if input.is_null() { - return ptr::null_mut(); - } - - let c_str = match CStr::from_ptr(input).to_str() { - Ok(s) => s, - Err(_) => return ptr::null_mut(), - }; - - let result = normalize_sentence_with_max_span(c_str, max_span_tokens as usize); - - match CString::new(result) { - Ok(c_string) => c_string.into_raw(), - Err(_) => ptr::null_mut(), - } -} - -/// Aviation-flavoured single-input normalize. -/// -/// Layered on top of [`nemo_normalize`]: tries `cardinal::parse_aviation` -/// first so flight-number / call-sign phrases like `"seven eighty eight"` -/// resolve to `"788"`, then falls back to the regular dispatch. -/// -/// # Safety -/// - `input` must be a valid null-terminated UTF-8 string -/// - Returns a newly allocated string that must be freed with `nemo_free_string` -#[no_mangle] -pub unsafe extern "C" fn nemo_normalize_aviation(input: *const c_char) -> *mut c_char { - if input.is_null() { - return ptr::null_mut(); - } - - let c_str = match CStr::from_ptr(input).to_str() { - Ok(s) => s, - Err(_) => return ptr::null_mut(), - }; - - let result = normalize_aviation(c_str); - - match CString::new(result) { - Ok(c_string) => c_string.into_raw(), - Err(_) => ptr::null_mut(), - } -} - -/// Aviation-flavoured sentence normalize. 
-/// -/// Sentence-mode equivalent of [`nemo_normalize_aviation`]. Aviation cardinal -/// runs at priority 89 (above date / time, below money / measure), so -/// flight-number-style spans win without disturbing money / measure / decimal. -/// -/// # Safety -/// - `input` must be a valid null-terminated UTF-8 string -/// - Returns a newly allocated string that must be freed with `nemo_free_string` -#[no_mangle] -pub unsafe extern "C" fn nemo_normalize_sentence_aviation(input: *const c_char) -> *mut c_char { - if input.is_null() { - return ptr::null_mut(); - } - - let c_str = match CStr::from_ptr(input).to_str() { - Ok(s) => s, - Err(_) => return ptr::null_mut(), - }; - - let result = normalize_sentence_aviation(c_str); - - match CString::new(result) { - Ok(c_string) => c_string.into_raw(), - Err(_) => ptr::null_mut(), - } -} - -/// Aviation sentence normalize with a configurable max span size. -/// -/// # Safety -/// - `input` must be a valid null-terminated UTF-8 string -/// - Returns a newly allocated string that must be freed with `nemo_free_string` -#[no_mangle] -pub unsafe extern "C" fn nemo_normalize_sentence_aviation_with_max_span( - input: *const c_char, - max_span_tokens: u32, -) -> *mut c_char { - if input.is_null() { - return ptr::null_mut(); - } - - let c_str = match CStr::from_ptr(input).to_str() { - Ok(s) => s, - Err(_) => return ptr::null_mut(), - }; - - let result = normalize_sentence_aviation_with_max_span(c_str, max_span_tokens as usize); - - match CString::new(result) { - Ok(c_string) => c_string.into_raw(), - Err(_) => ptr::null_mut(), - } -} - /// Unified single-expression normalize with caller-specified options. 
/// /// `concat_compound_numbers`: `0` for standard ITN, non-zero for @@ -546,10 +432,10 @@ mod tests { } #[test] - fn test_ffi_normalize_aviation() { + fn test_ffi_normalize_with_options_concat_compound() { unsafe { let input = CString::new("seven eighty eight").unwrap(); - let result = nemo_normalize_aviation(input.as_ptr()); + let result = nemo_normalize_with_options(input.as_ptr(), 1); assert!(!result.is_null()); let result_str = CStr::from_ptr(result).to_str().unwrap(); assert_eq!(result_str, "788"); @@ -558,10 +444,10 @@ mod tests { } #[test] - fn test_ffi_normalize_sentence_aviation() { + fn test_ffi_normalize_sentence_with_options_concat_compound() { unsafe { let input = CString::new("United seven eighty eight").unwrap(); - let result = nemo_normalize_sentence_aviation(input.as_ptr()); + let result = nemo_normalize_sentence_with_options(input.as_ptr(), 1, 0); assert!(!result.is_null()); let result_str = CStr::from_ptr(result).to_str().unwrap(); assert_eq!(result_str, "United 788"); diff --git a/src/lib.rs b/src/lib.rs index df3c2f5..7920f81 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -173,60 +173,31 @@ pub fn normalize(input: &str) -> String { input.to_string() } -/// Normalize a single input with **aviation flight-number reading -/// prioritized**. +/// Single-expression normalize with caller-specified [`NormalizeOptions`]. /// -/// Same dispatch as [`normalize`], with one twist: `cardinal::parse_aviation` +/// When `options.concat_compound_numbers` is `true`, `cardinal::parse_aviation` /// is tried *after* the high-confidence taggers (`custom_rules`, `whitelist`, /// `punctuation`, `word`) but *before* `time` and `date`. The result: when /// the whole input is a number-only phrase like `"two thirty five"` or -/// `"seven eighty eight"`, the aviation reading wins (`"235"`, `"788"`) -/// instead of being eaten as a time (`"02:35"`) or as an old-year via the -/// date tagger. -/// -/// Use this from flight-number / call-sign / aviation-radio contexts. 
Phrases -/// that aren't pure number words still flow through the rest of the -/// pipeline normally (`"five dollars"` → `"$5"` via the money tagger). -/// -/// ``` -/// use text_processing_rs::normalize_aviation; -/// -/// assert_eq!(normalize_aviation("seven eighty eight"), "788"); -/// assert_eq!(normalize_aviation("two thirty five"), "235"); -/// // Non-number phrases are unaffected. -/// assert_eq!(normalize_aviation("hello world"), "hello world"); -/// ``` -pub fn normalize_aviation(input: &str) -> String { - normalize_with_options( - input, - NormalizeOptions::new().with_concat_compound_numbers(true), - ) -} - -/// Unified single-expression normalize entry point. -/// -/// Switches between standard and concat-compound (aviation-style) dispatch -/// based on `options.concat_compound_numbers`. The `max_span_tokens` field on -/// [`NormalizeOptions`] is ignored here — it only applies to -/// [`normalize_sentence_with_options`]. +/// `"seven eighty eight"`, concat-compound reading wins (`"235"`, `"788"`) +/// instead of being eaten as a time (`"02:35"`) or an old-year via the date +/// tagger. Non-number phrases still flow through the rest of the pipeline +/// (`"five dollars"` → `"$5"` via the money tagger). /// /// ``` /// use text_processing_rs::{normalize_with_options, NormalizeOptions}; /// /// let opts = NormalizeOptions::new().with_concat_compound_numbers(true); /// assert_eq!(normalize_with_options("seven eighty eight", opts), "788"); +/// assert_eq!(normalize_with_options("two thirty five", opts), "235"); +/// // Non-number phrases are unaffected. +/// assert_eq!(normalize_with_options("hello world", opts), "hello world"); /// ``` pub fn normalize_with_options(input: &str, options: NormalizeOptions) -> String { - if options.concat_compound_numbers { - normalize_aviation_inner(input) - } else { - normalize(input) + if !options.concat_compound_numbers { + return normalize(input); } -} -/// Aviation single-expression dispatch. 
Kept private; callers go through -/// [`normalize_aviation`] or [`normalize_with_options`]. -fn normalize_aviation_inner(input: &str) -> String { let input = input.trim(); // High-confidence rules still win. @@ -243,14 +214,13 @@ fn normalize_aviation_inner(input: &str) -> String { return result; } - // Aviation cardinal beats time/date here. This is the whole point of - // the aviation domain. + // Concat-compound cardinal beats time/date. if let Some(num) = cardinal::parse_aviation(input) { return num; } - // Fall back to the standard pipeline for anything aviation cardinal - // didn't recognise (money, measure, decimal, ordinal, telephone, etc.). + // Fall back to the standard pipeline for anything not recognised + // (money, measure, decimal, ordinal, telephone, etc.). normalize(input) } @@ -987,11 +957,12 @@ const DEFAULT_MAX_SPAN_TOKENS: usize = 16; /// /// Excluded in sentence mode: `word` and `telephone` (over-fire on natural language). /// -/// `aviation`: when `true`, `cardinal::parse_aviation` is tried at priority 89 -/// (above `date`=88 and `time`=85, below `measure`=90 / `money`=95) and the -/// regular cardinal fallback at 70 is skipped (the aviation reader already -/// falls back to grammatical when no digit prefix is present). -fn parse_span(span: &str, aviation: bool) -> Option<(String, u8)> { +/// `concat_compound`: when `true`, `cardinal::parse_aviation` is tried at +/// priority 89 (above `date`=88 and `time`=85, below `measure`=90 / +/// `money`=95) and the regular cardinal fallback at 70 is skipped (the +/// concat-compound reader already falls back to grammatical when the +/// concat pattern does not apply). 
+fn parse_span(span: &str, concat_compound: bool) -> Option<(String, u8)> { let token_count = span.split_whitespace().count(); if token_count == 0 { return None; @@ -1013,12 +984,12 @@ fn parse_span(span: &str, aviation: bool) -> Option<(String, u8)> { return Some((result, 90)); } - // Aviation cardinal opt-in: priority 89, beats date/time. No short-span - // gate — aviation mode is opt-in, so the caller has accepted aggressive - // matching across longer spans like "one thousand two hundred thirty - // four". `parse_aviation` falls back to grammatical when the digit-prefix - // pattern does not apply, so non-aviation phrases still resolve. - if aviation { + // Concat-compound cardinal opt-in: priority 89, beats date/time. No + // short-span gate — this is opt-in, so the caller has accepted + // aggressive matching across longer spans like "one thousand two + // hundred thirty four". `parse_aviation` falls back to grammatical when + // the concat pattern does not apply, so non-concat phrases still resolve. + if concat_compound { if let Some(result) = cardinal::parse_aviation(span) { return Some((result, 89)); } @@ -1040,9 +1011,9 @@ fn parse_span(span: &str, aviation: bool) -> Option<(String, u8)> { return Some((result, 75)); } - // Default cardinal fallback (priority 70). In aviation mode the cardinal - // path is already covered by the priority-89 branch above. - if !aviation && token_count <= 4 { + // Default cardinal fallback (priority 70). In concat-compound mode the + // cardinal path is already covered by the priority-89 branch above. 
+ if !concat_compound && token_count <= 4 { if let Some(result) = cardinal::parse(span) { return Some((result, 70)); } @@ -1064,7 +1035,7 @@ fn parse_span(span: &str, aviation: bool) -> Option<(String, u8)> { /// assert_eq!(normalize_sentence("hello world"), "hello world"); /// ``` pub fn normalize_sentence(input: &str) -> String { - normalize_sentence_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) + normalize_sentence_inner(input, DEFAULT_MAX_SPAN_TOKENS, false) } /// Unified sentence-mode entry point. @@ -1096,69 +1067,9 @@ pub fn normalize_sentence_with_options(input: &str, options: NormalizeOptions) - normalize_sentence_inner(input, max_span, options.concat_compound_numbers) } -/// Sentence-mode equivalent of [`normalize_aviation`]. Aviation cardinal -/// runs at priority 89 (above `date`=88 / `time`=85, below `measure`=90 / -/// `money`=95), so flight-number-style spans win over date/time while -/// measure / money phrases keep their existing semantics. -/// -/// ``` -/// use text_processing_rs::normalize_sentence_aviation; -/// -/// // Aviation cardinal beats time/date for pure-number spans. -/// assert_eq!( -/// normalize_sentence_aviation("United seven eighty eight"), -/// "United 788" -/// ); -/// assert_eq!( -/// normalize_sentence_aviation("flight two thirty five departs at gate four"), -/// "flight 235 departs at gate 4" -/// ); -/// // Non-aviation spans flow through normally. -/// assert_eq!( -/// normalize_sentence_aviation("I have twenty one apples"), -/// "I have 21 apples" -/// ); -/// ``` -pub fn normalize_sentence_aviation(input: &str) -> String { - normalize_sentence_with_options( - input, - NormalizeOptions::new().with_concat_compound_numbers(true), - ) -} - -/// [`normalize_sentence_aviation`] with a configurable max span size. 
-pub fn normalize_sentence_aviation_with_max_span(input: &str, max_span_tokens: usize) -> String { - normalize_sentence_with_options( - input, - NormalizeOptions::new() - .with_concat_compound_numbers(true) - .with_max_span_tokens(max_span_tokens), - ) -} - -/// Normalize a full sentence with a configurable max span size. -/// -/// `max_span_tokens` controls the maximum number of consecutive tokens -/// that will be considered as a single normalizable expression. -/// Smaller values are faster but may miss multi-word expressions. -/// Larger values catch more patterns but do more work per token. -/// -/// ``` -/// use text_processing_rs::normalize_sentence_with_max_span; -/// -/// // Short span: only catches small expressions -/// assert_eq!(normalize_sentence_with_max_span("I have twenty one apples", 4), "I have 21 apples"); -/// ``` -pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> String { - normalize_sentence_with_options( - input, - NormalizeOptions::new().with_max_span_tokens(max_span_tokens), - ) -} - -/// Sentence-mode dispatch loop. The `aviation` flag is forwarded to +/// Sentence-mode dispatch loop. The `concat_compound` flag is forwarded to /// [`parse_span`] so each span sees the right tagger priorities. -fn normalize_sentence_inner(input: &str, max_span_tokens: usize, aviation: bool) -> String { +fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound: bool) -> String { let trimmed = input.trim(); if trimmed.is_empty() { return trimmed.to_string(); @@ -1180,7 +1091,7 @@ fn normalize_sentence_inner(input: &str, max_span_tokens: usize, aviation: bool) // Longest-span-first search keeps replacements stable and non-overlapping. 
for end in (i + 1..=max_end).rev() { let span = tokens[i..end].join(" "); - let Some((candidate, score)) = parse_span(&span, aviation) else { + let Some((candidate, score)) = parse_span(&span, concat_compound) else { continue; }; diff --git a/src/wasm.rs b/src/wasm.rs index 5a3d375..e9de333 100644 --- a/src/wasm.rs +++ b/src/wasm.rs @@ -3,12 +3,10 @@ use wasm_bindgen::prelude::*; use crate::{ - custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, - normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, - normalize_sentence_with_options, normalize_with_lang, normalize_with_options, tn_normalize, - tn_normalize_lang, tn_normalize_sentence, tn_normalize_sentence_lang, - tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, - NormalizeOptions, + custom_rules, normalize, normalize_sentence, normalize_sentence_with_options, + normalize_with_lang, normalize_with_options, tn_normalize, tn_normalize_lang, + tn_normalize_sentence, tn_normalize_sentence_lang, tn_normalize_sentence_with_max_span, + tn_normalize_sentence_with_max_span_lang, NormalizeOptions, }; /// Build [`NormalizeOptions`] from JS-friendly primitives. 
@@ -47,26 +45,6 @@ pub fn normalize_sentence_js(input: &str) -> String { normalize_sentence(input) } -#[wasm_bindgen(js_name = normalizeSentenceWithMaxSpan)] -pub fn normalize_sentence_with_max_span_js(input: &str, max_span_tokens: u32) -> String { - normalize_sentence_with_max_span(input, max_span_tokens as usize) -} - -#[wasm_bindgen(js_name = normalizeAviation)] -pub fn normalize_aviation_js(input: &str) -> String { - normalize_aviation(input) -} - -#[wasm_bindgen(js_name = normalizeSentenceAviation)] -pub fn normalize_sentence_aviation_js(input: &str) -> String { - normalize_sentence_aviation(input) -} - -#[wasm_bindgen(js_name = normalizeSentenceAviationWithMaxSpan)] -pub fn normalize_sentence_aviation_with_max_span_js(input: &str, max_span_tokens: u32) -> String { - normalize_sentence_aviation_with_max_span(input, max_span_tokens as usize) -} - /// Unified single-expression normalize. `concatCompoundNumbers=true` reads /// consecutive number words as concatenation rather than addition, e.g. /// `"thirty five sixty two"` → `"3562"`, `"seven eighty eight"` → `"788"`. 
diff --git a/swift-test/Sources/CNemoTextProcessing/include/nemo_text_processing.h b/swift-test/Sources/CNemoTextProcessing/include/nemo_text_processing.h index 9199417..833fc41 100644 --- a/swift-test/Sources/CNemoTextProcessing/include/nemo_text_processing.h +++ b/swift-test/Sources/CNemoTextProcessing/include/nemo_text_processing.h @@ -9,7 +9,12 @@ extern "C" { char* nemo_normalize(const char* input); char* nemo_normalize_sentence(const char* input); -char* nemo_normalize_sentence_with_max_span(const char* input, uint32_t max_span_tokens); +char* nemo_normalize_with_options(const char* input, uint32_t concat_compound_numbers); +char* nemo_normalize_sentence_with_options( + const char* input, + uint32_t concat_compound_numbers, + uint32_t max_span_tokens +); void nemo_add_rule(const char* spoken, const char* written); int32_t nemo_remove_rule(const char* spoken); void nemo_clear_rules(void); diff --git a/swift-test/Sources/NemoTest/NemoTest.swift b/swift-test/Sources/NemoTest/NemoTest.swift index e69f421..5d9add2 100644 --- a/swift-test/Sources/NemoTest/NemoTest.swift +++ b/swift-test/Sources/NemoTest/NemoTest.swift @@ -16,8 +16,22 @@ enum NemoTextProcessing { return String(cString: resultPtr) } - static func normalizeSentence(_ input: String, maxSpanTokens: UInt32) -> String { - guard let resultPtr = nemo_normalize_sentence_with_max_span(input, maxSpanTokens) else { return input } + static func normalizeSentence( + _ input: String, + concatCompoundNumbers: Bool = false, + maxSpanTokens: UInt32 = 0 + ) -> String { + let concatFlag: UInt32 = concatCompoundNumbers ? 1 : 0 + guard let resultPtr = nemo_normalize_sentence_with_options( + input, concatFlag, maxSpanTokens + ) else { return input } + defer { nemo_free_string(resultPtr) } + return String(cString: resultPtr) + } + + static func normalize(_ input: String, concatCompoundNumbers: Bool) -> String { + let concatFlag: UInt32 = concatCompoundNumbers ? 
1 : 0 + guard let resultPtr = nemo_normalize_with_options(input, concatFlag) else { return input } defer { nemo_free_string(resultPtr) } return String(cString: resultPtr) } diff --git a/swift/NemoTextProcessing.swift b/swift/NemoTextProcessing.swift index 12a501c..d27d592 100644 --- a/swift/NemoTextProcessing.swift +++ b/swift/NemoTextProcessing.swift @@ -59,18 +59,53 @@ public enum NemoTextProcessing { return String(cString: resultPtr) } - /// Normalize a full sentence with a configurable max span size. + /// Normalize a full sentence with caller-specified options. /// /// - Parameters: /// - input: Sentence containing spoken-form spans - /// - maxSpanTokens: Maximum consecutive tokens per normalizable span (default 16) + /// - concatCompoundNumbers: When true, consecutive 0-99 number words + /// concatenate (e.g. `"thirty five sixty two"` → `"3562"`, + /// `"seven eighty eight"` → `"788"`) instead of adding. + /// - maxSpanTokens: Maximum consecutive tokens per normalizable span. + /// Pass `0` to use the library default (16). /// - Returns: Sentence with spoken-form spans replaced - public static func normalizeSentence(_ input: String, maxSpanTokens: UInt32) -> String { + public static func normalizeSentence( + _ input: String, + concatCompoundNumbers: Bool = false, + maxSpanTokens: UInt32 = 0 + ) -> String { + guard let cString = input.cString(using: .utf8) else { + return input + } + + let concatFlag: UInt32 = concatCompoundNumbers ? 1 : 0 + guard let resultPtr = nemo_normalize_sentence_with_options( + cString, concatFlag, maxSpanTokens + ) else { + return input + } + + defer { nemo_free_string(resultPtr) } + + return String(cString: resultPtr) + } + + /// Normalize a single spoken-form expression with caller-specified options. + /// + /// - Parameters: + /// - input: Spoken-form text + /// - concatCompoundNumbers: See `normalizeSentence(_:concatCompoundNumbers:maxSpanTokens:)`. + /// - Returns: Written-form text, or original if no normalization applies. 
+ public static func normalize( + _ input: String, + concatCompoundNumbers: Bool + ) -> String { guard let cString = input.cString(using: .utf8) else { return input } - guard let resultPtr = nemo_normalize_sentence_with_max_span(cString, maxSpanTokens) else { + let concatFlag: UInt32 = concatCompoundNumbers ? 1 : 0 + guard let resultPtr = nemo_normalize_with_options(cString, concatFlag) else { return input } diff --git a/swift/include/nemo_text_processing.h b/swift/include/nemo_text_processing.h index b8ec478..7a2abec 100644 --- a/swift/include/nemo_text_processing.h +++ b/swift/include/nemo_text_processing.h @@ -33,13 +33,31 @@ char* nemo_normalize(const char* input); char* nemo_normalize_sentence(const char* input); /** - * Normalize a full sentence with a configurable max span size. + * Normalize a single spoken-form expression with caller-specified options. * * @param input Null-terminated UTF-8 string - * @param max_span_tokens Maximum number of consecutive tokens per span (default 16) + * @param concat_compound_numbers When non-zero, consecutive 0-99 number + * words concatenate (aviation flight-number style — e.g. + * "thirty five sixty two" -> "3562", "seven eighty eight" -> "788") + * instead of adding. + * @return Newly allocated string, must be freed with nemo_free_string(). + */ +char* nemo_normalize_with_options(const char* input, uint32_t concat_compound_numbers); + +/** + * Normalize a full sentence with caller-specified options. + * + * @param input Null-terminated UTF-8 string + * @param concat_compound_numbers See nemo_normalize_with_options. + * @param max_span_tokens Maximum consecutive tokens per span. Pass 0 to + * use the library default (16). * @return Newly allocated string, must be freed with nemo_free_string(). 
*/ -char* nemo_normalize_sentence_with_max_span(const char* input, uint32_t max_span_tokens); +char* nemo_normalize_sentence_with_options( + const char* input, + uint32_t concat_compound_numbers, + uint32_t max_span_tokens +); /** * Add a custom spoken-to-written normalization rule. diff --git a/tests/en_tests.rs b/tests/en_tests.rs index 2a0bd62..b149270 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -7,11 +7,15 @@ mod common; use std::path::Path; use text_processing_rs::{ - custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, - normalize_sentence_with_max_span, normalize_sentence_with_options, normalize_with_options, - NormalizeOptions, + custom_rules, normalize, normalize_sentence, normalize_sentence_with_options, + normalize_with_options, NormalizeOptions, }; +/// Test helper: shorthand for the concat-compound (aviation-style) options. +fn concat_opts() -> NormalizeOptions { + NormalizeOptions::new().with_concat_compound_numbers(true) +} + fn print_failures(results: &common::TestResults) { for f in &results.failures { println!( @@ -554,7 +558,10 @@ fn test_max_span_tokens() { // Span of 2 is too short to catch "five dollars and fifty cents" (5 tokens) // but can still catch "five dollars" (2 tokens) → "$5" - let result = normalize_sentence_with_max_span("five dollars and fifty cents for lunch", 2); + let result = normalize_sentence_with_options( + "five dollars and fifty cents for lunch", + NormalizeOptions::new().with_max_span_tokens(2), + ); // With max_span=2, it can only see 2 tokens at a time // "five dollars" → "$5", "and" → pass, "fifty cents" → "$0.50" // The exact behavior depends on money tagger matching "fifty cents" alone @@ -563,7 +570,10 @@ fn test_max_span_tokens() { assert_ne!(result, "$5.50 for lunch"); // Span of 1 should basically only catch single-word tokens - let result_1 = normalize_sentence_with_max_span("I have twenty one apples", 1); + let result_1 = normalize_sentence_with_options( + 
"I have twenty one apples", + NormalizeOptions::new().with_max_span_tokens(1), + ); // "twenty" alone isn't meaningful as a cardinal in most taggers, // but "one" alone → "1" println!("max_span=1: {}", result_1); @@ -900,12 +910,12 @@ fn test_spelled_digit_cardinal_does_not_break_normal_cardinals() { assert_eq!(normalize("one thousand two hundred thirty four"), "1234"); } -/// Issue #14: aviation flight-number reading is exposed as an **opt-in** -/// pipeline. Generic dispatch keeps upstream NeMo semantics (date wins for -/// `"twenty one forty two"`, time wins for `"two thirty five"`); callers -/// who know they're in aviation context reach for the `*_aviation` -/// variants, which run aviation cardinal at priority 89 (above date 88 -/// and time 85). +/// Issue #14: concat-compound (aviation-style) reading is exposed as an +/// **opt-in** option. Default dispatch keeps upstream NeMo semantics +/// (date wins for `"twenty one forty two"`, time wins for `"two thirty +/// five"`); callers who know they're in aviation context pass +/// `concat_compound_numbers: true`, which runs the concat cardinal at +/// priority 89 (above date 88 and time 85). #[test] fn test_issue_14_default_dispatch_unchanged() { // Scale-word grammar is preserved everywhere. @@ -922,57 +932,61 @@ fn test_issue_14_default_dispatch_unchanged() { assert_eq!(normalize("seven eighty eight"), "788"); } -/// Aviation pipeline `normalize_aviation` (single-input). Aviation cardinal -/// runs early enough to beat time/date. +/// Single-input concat-compound mode. Aviation cardinal runs early enough +/// to beat time/date. #[test] fn test_issue_14_normalize_aviation() { - assert_eq!(normalize_aviation("seven eighty eight"), "788"); + let opts = concat_opts(); + assert_eq!(normalize_with_options("seven eighty eight", opts), "788"); // Beats time tagger. 
- assert_eq!(normalize_aviation("two thirty five"), "235"); + assert_eq!(normalize_with_options("two thirty five", opts), "235"); // Two consecutive 0-99 compounds concatenate (issue #23 fix). // Was previously `"63"` (= 20+1+40+2) under the old grammatical fallback. - assert_eq!(normalize_aviation("twenty one forty two"), "2142"); + assert_eq!(normalize_with_options("twenty one forty two", opts), "2142"); // Non-number phrases fall through unchanged. - assert_eq!(normalize_aviation("hello world"), "hello world"); + assert_eq!(normalize_with_options("hello world", opts), "hello world"); // Money / measure / decimal / ordinal still work via fallback to // standard `normalize`. - assert_eq!(normalize_aviation("five dollars"), "$5"); - assert_eq!(normalize_aviation("five point two"), "5.2"); - assert_eq!(normalize_aviation("twenty first"), "21st"); + assert_eq!(normalize_with_options("five dollars", opts), "$5"); + assert_eq!(normalize_with_options("five point two", opts), "5.2"); + assert_eq!(normalize_with_options("twenty first", opts), "21st"); // Scale-word grammar still wins (no digit prefix → grammatical). - assert_eq!(normalize_aviation("two thousand seventeen"), "2017"); + assert_eq!( + normalize_with_options("two thousand seventeen", opts), + "2017" + ); } -/// Aviation pipeline `normalize_sentence_aviation` (sentence mode). The -/// cardinal-aviation priority bump (89) makes flight-number spans win -/// over date/time in real sentences. +/// Sentence-mode concat-compound. The cardinal-aviation priority bump +/// (89) makes flight-number spans win over date/time in real sentences. #[test] fn test_issue_14_normalize_sentence_aviation() { + let opts = concat_opts(); // The original bug from issue #14. 
assert_eq!( - normalize_sentence_aviation("United seven eighty eight"), + normalize_sentence_with_options("United seven eighty eight", opts), "United 788" ); assert_eq!( - normalize_sentence_aviation("flight two thirty five departs at gate four"), + normalize_sentence_with_options("flight two thirty five departs at gate four", opts), "flight 235 departs at gate 4" ); // Scale-word grammar is preserved. assert_eq!( - normalize_sentence_aviation("two thousand seventeen"), + normalize_sentence_with_options("two thousand seventeen", opts), "2017" ); // Money / measure stay above aviation (priority 95 / 90 > 89). assert_eq!( - normalize_sentence_aviation("I owe five dollars"), + normalize_sentence_with_options("I owe five dollars", opts), "I owe $5" ); // Plain natural language is untouched. assert_eq!( - normalize_sentence_aviation("I have twenty one apples"), + normalize_sentence_with_options("I have twenty one apples", opts), "I have 21 apples" ); } @@ -989,8 +1003,8 @@ fn test_options_default_matches_normalize() { assert_eq!(normalize_with_options("two thirty five", opts), "02:35"); } -/// `concat_compound_numbers: true` should make `normalize_with_options` -/// behave like `normalize_aviation`. +/// `concat_compound_numbers: true` enables aviation-style concat-compound +/// reading on the unified single-expression path. #[test] fn test_options_concat_matches_aviation() { let opts = NormalizeOptions::new().with_concat_compound_numbers(true); @@ -1019,7 +1033,8 @@ fn test_sentence_options_default_matches_default() { ); } -/// Sentence mode with concat enabled matches `normalize_sentence_aviation`. +/// Sentence mode with concat enabled produces aviation-style flight-number +/// spans. #[test] fn test_sentence_options_concat_compound() { let opts = NormalizeOptions::new().with_concat_compound_numbers(true); @@ -1072,30 +1087,41 @@ fn test_sentence_options_builder_compose() { /// `"thirty five sixty two"` → `"3562"`, not `"97"` (= 35 + 62). 
#[test] fn test_issue_23_compound_concat() { + let opts = concat_opts(); + // Whole-input single-expression form. - assert_eq!(normalize_aviation("thirty five sixty two"), "3562"); + assert_eq!( + normalize_with_options("thirty five sixty two", opts), + "3562" + ); // Sentence form — the original report. assert_eq!( - normalize_sentence_aviation( - "Alright thirty five sixty two appreciate your help United seven eighty eight" + normalize_sentence_with_options( + "Alright thirty five sixty two appreciate your help United seven eighty eight", + opts, ), "Alright 3562 appreciate your help United 788" ); - // Through the unified options API too. - let opts = NormalizeOptions::new().with_concat_compound_numbers(true); + // Through the sentence options API too. assert_eq!( normalize_sentence_with_options("thirty five sixty two", opts), "3562" ); // Mixed digit prefix + compounds: "two thirty five sixty two" → 23562. - assert_eq!(normalize_aviation("two thirty five sixty two"), "23562"); + assert_eq!( + normalize_with_options("two thirty five sixty two", opts), + "23562" + ); // Single chunks must NOT concatenate (preserves grammatical reading). - assert_eq!(normalize_aviation("twenty one"), "21"); + assert_eq!(normalize_with_options("twenty one", opts), "21"); // Scale words still anchor grammatical addition. - assert_eq!(normalize_aviation("two thousand seventeen"), "2017"); + assert_eq!( + normalize_with_options("two thousand seventeen", opts), + "2017" + ); } diff --git a/tests/extensive_tests.rs b/tests/extensive_tests.rs index c369cfc..f850c1a 100644 --- a/tests/extensive_tests.rs +++ b/tests/extensive_tests.rs @@ -5,8 +5,8 @@ //! boundary conditions, roundtrip consistency, and cross-tagger interference. 
use text_processing_rs::{ - normalize, normalize_sentence, normalize_sentence_with_max_span, tn_normalize, - tn_normalize_sentence, + normalize, normalize_sentence, normalize_sentence_with_options, tn_normalize, + tn_normalize_sentence, NormalizeOptions, }; // ════════════════════════════════════════════════════════════════════════ @@ -953,7 +953,10 @@ fn test_sentence_itn_single_word_number() { #[test] fn test_sentence_itn_max_span_tokens() { // With max_span=1, multi-word expressions shouldn't be matched - let result = normalize_sentence_with_max_span("twenty one", 1); + let result = normalize_sentence_with_options( + "twenty one", + NormalizeOptions::new().with_max_span_tokens(1), + ); // With span=1, "twenty" alone and "one" alone are both single cardinals // This tests the sliding window behavior assert_eq!(result, "20 1"); From c3c79e2b39b8ac01182e0c41edf96f90a0ca562e Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Mon, 27 Apr 2026 00:09:25 -0400 Subject: [PATCH 3/4] refactor: extract NormalizeOptions into dedicated `options` module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moves `NormalizeOptions`, its builder methods, and `DEFAULT_MAX_SPAN_TOKENS` out of `src/lib.rs` and into a new `src/options.rs`. The struct is the extension point for caller-tunable normalization behavior, and giving it its own module makes room for richer per-field documentation and future flags without further bloating the crate root. Each field now carries: - a "Default" line stating the no-op behavior - a bulleted list of concrete input → output examples - the originating issue number for the behavior - guidance on which use cases want it on/off - explicit interaction notes (which other taggers still win) Also documents the `with_*` builder convention as the preferred construction path so new fields can land without breaking existing call sites that use struct literals. 
`pub use options::{NormalizeOptions, DEFAULT_MAX_SPAN_TOKENS};` keeps the public path stable — no changes required in FFI, WASM, Swift, or test code. All Rust + FFI tests pass; WASM check clean. --- src/lib.rs | 68 ++----------------------- src/options.rs | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 65 deletions(-) create mode 100644 src/options.rs diff --git a/src/lib.rs b/src/lib.rs index 7920f81..cd58e2e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,6 +18,7 @@ pub mod custom_rules; pub mod itn; +pub mod options; pub mod tn; #[cfg(feature = "ffi")] @@ -25,73 +26,13 @@ pub mod ffi; #[cfg(all(target_arch = "wasm32", feature = "wasm"))] pub mod wasm; +pub use options::{NormalizeOptions, DEFAULT_MAX_SPAN_TOKENS}; + use itn::en::{ cardinal, date, decimal, electronic, measure, money, ordinal, punctuation, telephone, time, whitelist, word, }; -/// Options for the unified [`normalize_with_options`] / -/// [`normalize_sentence_with_options`] entry points. -/// -/// Keeping options on a struct (rather than separate `*_aviation` / -/// `*_with_max_span` functions) lets new knobs land without exploding the -/// public API surface — see issues #15 and #23 for the motivating discussion. -/// -/// The flags are intentionally orthogonal and *not* tied to a particular -/// domain. Aviation, military codes, dispatch IDs, etc. all reuse the same -/// underlying behavior toggles. -/// -/// # Examples -/// -/// ``` -/// use text_processing_rs::{normalize_sentence_with_options, NormalizeOptions}; -/// -/// let opts = NormalizeOptions { -/// concat_compound_numbers: true, -/// max_span_tokens: Some(8), -/// }; -/// assert_eq!( -/// normalize_sentence_with_options("United seven eighty eight", opts), -/// "United 788" -/// ); -/// ``` -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] -pub struct NormalizeOptions { - /// When `true`, sequences of spoken number words concatenate rather than - /// add. 
`"thirty five sixty two"` → `"3562"` (instead of `35 + 62 = 97`) - /// and `"seven eighty eight"` → `"788"`. Aviation, flight-numbers, - /// call-signs, and other code-style readings want this on. - /// - /// Scale-word grammar is preserved: `"two thousand seventeen"` still - /// resolves to `"2017"` regardless of this flag. - pub concat_compound_numbers: bool, - /// Maximum span size (tokens) considered in sentence mode. `None` means - /// use [`DEFAULT_MAX_SPAN_TOKENS`]. Ignored by [`normalize_with_options`]. - pub max_span_tokens: Option<usize>, -} - -impl NormalizeOptions { - /// Default options: standard ITN dispatch, default max span. - pub const fn new() -> Self { - Self { - concat_compound_numbers: false, - max_span_tokens: None, - } - } - - /// Enable / disable compound-number concatenation. - pub const fn with_concat_compound_numbers(mut self, enabled: bool) -> Self { - self.concat_compound_numbers = enabled; - self - } - - /// Set the sentence-mode max span (in tokens). - pub const fn with_max_span_tokens(mut self, max_span_tokens: usize) -> Self { - self.max_span_tokens = Some(max_span_tokens); - self - } -} - /// Normalize spoken-form text to written form. /// /// Tries taggers in order of specificity (most specific first). @@ -946,9 +887,6 @@ fn tn_parse_span_lang(span: &str, lang: &str) -> Option<(String, u8)> { None } -/// Default maximum token span to consider when scanning a sentence. -const DEFAULT_MAX_SPAN_TOKENS: usize = 16; - /// Try to parse a span of text using sentence-safe taggers. /// /// Returns `(replacement, priority_score)` if a tagger matches. diff --git a/src/options.rs b/src/options.rs new file mode 100644 index 0000000..78a76f3 --- /dev/null +++ b/src/options.rs @@ -0,0 +1,132 @@ +//! Public configuration for the unified `*_with_options` entry points. +//! +//! [`NormalizeOptions`] is the single extension point for caller-tunable +//! normalization behavior. Each field is an **orthogonal behavior flag**, not +//! 
a domain label — aviation flight numbers, sports scores, dispatch IDs and +//! similar code-style readings all reuse the same toggles, and new knobs are +//! added as additional fields rather than as new enum variants or new +//! function names. +//! +//! See issues +//! [#14](https://github.com/FluidInference/text-processing-rs/issues/14), +//! [#15](https://github.com/FluidInference/text-processing-rs/issues/15) and +//! [#23](https://github.com/FluidInference/text-processing-rs/issues/23) for +//! the motivating discussion on why this is a struct rather than a `Domain` +//! enum. +//! +//! # Stability +//! +//! New fields may be added in minor releases. Always construct +//! [`NormalizeOptions`] via [`NormalizeOptions::new`] (or `default()`) and +//! the chainable `with_*` methods — direct struct literals will break when +//! new fields are introduced. +//! +//! # Examples +//! +//! ``` +//! use text_processing_rs::{normalize_sentence_with_options, NormalizeOptions}; +//! +//! // Aviation / flight-number style: consecutive 0-99 chunks concatenate. +//! let opts = NormalizeOptions::new() +//! .with_concat_compound_numbers(true) +//! .with_max_span_tokens(8); +//! +//! assert_eq!( +//! normalize_sentence_with_options("United seven eighty eight", opts), +//! "United 788" +//! ); +//! ``` + +/// Default maximum token span to consider when scanning a sentence. +/// +/// Used by [`crate::normalize_sentence`] and by +/// [`crate::normalize_sentence_with_options`] when +/// [`NormalizeOptions::max_span_tokens`] is `None`. +pub const DEFAULT_MAX_SPAN_TOKENS: usize = 16; + +/// Caller-tunable knobs for the unified +/// [`crate::normalize_with_options`] / +/// [`crate::normalize_sentence_with_options`] entry points. +/// +/// Construct via [`NormalizeOptions::new`] or [`Default::default`] and +/// configure with the chainable `with_*` methods so future fields don't +/// break existing call sites. 
+/// +/// All fields default to behavior matching plain +/// [`crate::normalize`] / [`crate::normalize_sentence`] — opt-in is the +/// only way to change semantics. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct NormalizeOptions { + /// Read consecutive small-number compounds as concatenated digit groups + /// instead of summing them. + /// + /// **Default:** `false` (preserve upstream NeMo grammatical reading). + /// + /// When `true`, the priority-89 aviation cardinal pass runs ahead of the + /// time/date taggers and uses [`peel_compound_chunks`] semantics: + /// - `"seven eighty eight"` → `"788"` (was `"95"` = 7 + 88) — issue + /// [#14](https://github.com/FluidInference/text-processing-rs/issues/14) + /// - `"thirty five sixty two"` → `"3562"` (was `"97"` = 35 + 62) — + /// issue + /// [#23](https://github.com/FluidInference/text-processing-rs/issues/23) + /// - `"two thirty five sixty two"` → `"23562"` + /// - `"two thousand seventeen"` → `"2017"` (scale words still anchor + /// grammatical addition) + /// - `"twenty one"` → `"21"` (single chunks never concatenate) + /// + /// Use cases: aviation flight numbers / call-signs, sports scores, + /// jersey/room numbers, dispatch IDs, any code-style reading where + /// consecutive small numbers should remain distinct. + /// + /// Money, measure, decimal and ordinal taggers retain their normal + /// priorities and continue to win where they apply (e.g. + /// `"five dollars"` → `"$5"` regardless of this flag). + /// + /// [`peel_compound_chunks`]: ../itn/en/cardinal/fn.peel_compound_chunks.html + pub concat_compound_numbers: bool, + + /// Maximum span size (in whitespace-separated tokens) considered by the + /// sliding-window sentence scanner. + /// + /// **Default:** `None`, which resolves to [`DEFAULT_MAX_SPAN_TOKENS`] + /// (currently `16`). 
+ /// + /// Lower values trade recall for speed and false-positive resistance — + /// a span of `2` will catch `"twenty one"` → `"21"` but not the + /// 5-token `"five dollars and fifty cents"` → `"$5.50"`. A span of `1` + /// disables multi-token matching entirely. + /// + /// Ignored by [`crate::normalize_with_options`] — single-expression + /// mode does not slide. + pub max_span_tokens: Option<usize>, +} + +impl NormalizeOptions { + /// Construct an options bag with all fields at their library defaults. + /// + /// Equivalent to [`Default::default`] but `const`, so it can be used + /// in `const` contexts. + pub const fn new() -> Self { + Self { + concat_compound_numbers: false, + max_span_tokens: None, + } + } + + /// Toggle [`Self::concat_compound_numbers`] (concatenate consecutive + /// small-number chunks instead of summing them). + pub const fn with_concat_compound_numbers(mut self, enabled: bool) -> Self { + self.concat_compound_numbers = enabled; + self + } + + /// Set [`Self::max_span_tokens`] (sentence-mode sliding-window cap). + /// + /// Pass [`DEFAULT_MAX_SPAN_TOKENS`] explicitly to lock in the current + /// default; pass `0` for single-token-only matching (rarely useful + /// outside tests). + pub const fn with_max_span_tokens(mut self, max_span_tokens: usize) -> Self { + self.max_span_tokens = Some(max_span_tokens); + self + } +} From a4321d895cc1d249d14e80681fd0c4e13f3de1cb Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Mon, 27 Apr 2026 00:14:37 -0400 Subject: [PATCH 4/4] docs: trim NormalizeOptions doc comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compresses the per-field and module-level docs in `src/options.rs` down to the essentials. The two flags are the user-facing priority — readers need the one-line behavior, the issue ref, and the default; everything else (use-case lists, interaction tables, exhaustive examples) belongs in the README and the integration tests, not in rustdoc bullet walls. 
--- src/options.rs | 118 +++++++------------------------------------------ 1 file changed, 15 insertions(+), 103 deletions(-) diff --git a/src/options.rs b/src/options.rs index 78a76f3..24b5c99 100644 --- a/src/options.rs +++ b/src/options.rs @@ -1,111 +1,28 @@ -//! Public configuration for the unified `*_with_options` entry points. +//! Caller-tunable options for the unified `*_with_options` entry points. //! -//! [`NormalizeOptions`] is the single extension point for caller-tunable -//! normalization behavior. Each field is an **orthogonal behavior flag**, not -//! a domain label — aviation flight numbers, sports scores, dispatch IDs and -//! similar code-style readings all reuse the same toggles, and new knobs are -//! added as additional fields rather than as new enum variants or new -//! function names. -//! -//! See issues -//! [#14](https://github.com/FluidInference/text-processing-rs/issues/14), -//! [#15](https://github.com/FluidInference/text-processing-rs/issues/15) and -//! [#23](https://github.com/FluidInference/text-processing-rs/issues/23) for -//! the motivating discussion on why this is a struct rather than a `Domain` -//! enum. -//! -//! # Stability -//! -//! New fields may be added in minor releases. Always construct -//! [`NormalizeOptions`] via [`NormalizeOptions::new`] (or `default()`) and -//! the chainable `with_*` methods — direct struct literals will break when -//! new fields are introduced. -//! -//! # Examples -//! -//! ``` -//! use text_processing_rs::{normalize_sentence_with_options, NormalizeOptions}; -//! -//! // Aviation / flight-number style: consecutive 0-99 chunks concatenate. -//! let opts = NormalizeOptions::new() -//! .with_concat_compound_numbers(true) -//! .with_max_span_tokens(8); -//! -//! assert_eq!( -//! normalize_sentence_with_options("United seven eighty eight", opts), -//! "United 788" -//! ); -//! ``` +//! Construct via [`NormalizeOptions::new`] + chainable `with_*` methods so +//! 
new fields can land without breaking existing call sites. -/// Default maximum token span to consider when scanning a sentence. -/// -/// Used by [`crate::normalize_sentence`] and by -/// [`crate::normalize_sentence_with_options`] when -/// [`NormalizeOptions::max_span_tokens`] is `None`. +/// Default sentence-mode sliding-window cap. pub const DEFAULT_MAX_SPAN_TOKENS: usize = 16; -/// Caller-tunable knobs for the unified -/// [`crate::normalize_with_options`] / -/// [`crate::normalize_sentence_with_options`] entry points. -/// -/// Construct via [`NormalizeOptions::new`] or [`Default::default`] and -/// configure with the chainable `with_*` methods so future fields don't -/// break existing call sites. -/// -/// All fields default to behavior matching plain -/// [`crate::normalize`] / [`crate::normalize_sentence`] — opt-in is the -/// only way to change semantics. +/// Options for [`crate::normalize_with_options`] and +/// [`crate::normalize_sentence_with_options`]. Defaults match plain +/// [`crate::normalize`] / [`crate::normalize_sentence`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] pub struct NormalizeOptions { - /// Read consecutive small-number compounds as concatenated digit groups - /// instead of summing them. - /// - /// **Default:** `false` (preserve upstream NeMo grammatical reading). 
- /// - /// When `true`, the priority-89 aviation cardinal pass runs ahead of the - /// time/date taggers and uses [`peel_compound_chunks`] semantics: - /// - `"seven eighty eight"` → `"788"` (was `"95"` = 7 + 88) — issue - /// [#14](https://github.com/FluidInference/text-processing-rs/issues/14) - /// - `"thirty five sixty two"` → `"3562"` (was `"97"` = 35 + 62) — - /// issue - /// [#23](https://github.com/FluidInference/text-processing-rs/issues/23) - /// - `"two thirty five sixty two"` → `"23562"` - /// - `"two thousand seventeen"` → `"2017"` (scale words still anchor - /// grammatical addition) - /// - `"twenty one"` → `"21"` (single chunks never concatenate) - /// - /// Use cases: aviation flight numbers / call-signs, sports scores, - /// jersey/room numbers, dispatch IDs, any code-style reading where - /// consecutive small numbers should remain distinct. - /// - /// Money, measure, decimal and ordinal taggers retain their normal - /// priorities and continue to win where they apply (e.g. - /// `"five dollars"` → `"$5"` regardless of this flag). - /// - /// [`peel_compound_chunks`]: ../itn/en/cardinal/fn.peel_compound_chunks.html + /// Concatenate consecutive small-number chunks instead of summing them. + /// `"seven eighty eight"` → `"788"` (issue #14), `"thirty five sixty + /// two"` → `"3562"` (issue #23). Default `false`. pub concat_compound_numbers: bool, - /// Maximum span size (in whitespace-separated tokens) considered by the - /// sliding-window sentence scanner. - /// - /// **Default:** `None`, which resolves to [`DEFAULT_MAX_SPAN_TOKENS`] - /// (currently `16`). - /// - /// Lower values trade recall for speed and false-positive resistance — - /// a span of `2` will catch `"twenty one"` → `"21"` but not the - /// 5-token `"five dollars and fifty cents"` → `"$5.50"`. A span of `1` - /// disables multi-token matching entirely. - /// - /// Ignored by [`crate::normalize_with_options`] — single-expression - /// mode does not slide. 
+ /// Sentence-mode sliding-window cap (in tokens). `None` uses + /// [`DEFAULT_MAX_SPAN_TOKENS`]. Ignored in single-expression mode. pub max_span_tokens: Option<usize>, } impl NormalizeOptions { - /// Construct an options bag with all fields at their library defaults. - /// - /// Equivalent to [`Default::default`] but `const`, so it can be used - /// in `const` contexts. + /// `const` constructor with library defaults. pub const fn new() -> Self { Self { concat_compound_numbers: false, @@ -113,18 +30,13 @@ impl NormalizeOptions { } } - /// Toggle [`Self::concat_compound_numbers`] (concatenate consecutive - /// small-number chunks instead of summing them). + /// Set [`Self::concat_compound_numbers`]. pub const fn with_concat_compound_numbers(mut self, enabled: bool) -> Self { self.concat_compound_numbers = enabled; self } - /// Set [`Self::max_span_tokens`] (sentence-mode sliding-window cap). - /// - /// Pass [`DEFAULT_MAX_SPAN_TOKENS`] explicitly to lock in the current - /// default; pass `0` for single-token-only matching (rarely useful - /// outside tests). + /// Set [`Self::max_span_tokens`]. pub const fn with_max_span_tokens(mut self, max_span_tokens: usize) -> Self { self.max_span_tokens = Some(max_span_tokens); self