From 4a87700dd22459224aab8a729e58067b95889d6b Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 12:32:59 -0400 Subject: [PATCH 1/9] fix(en): treat spelled-digit sequences as digit concatenation (#15) Aviation/code-style readings like 'one three five' previously fell through cardinal::words_to_number's running-sum loop and returned 9 (1+3+5) instead of 135. This cascaded into: - normalize_sentence('frequency one three five point six two five') producing 'frequency 9.625' instead of 'frequency 135.625' - The telephone tagger silently swallowing 'point' and emitting '135-625' for normalize() of the same input Fix: - cardinal::words_to_number now recognises a multi-token sequence of pure single-digit words (zero-nine, plus oh/o for 0) and concatenates them as digits, matching how humans read aviation frequencies, flight numbers, codes, etc. Single-token and cardinal-with-scale inputs (twenty one, one hundred thirty five, one thousand two hundred thirty four, etc.) are unaffected. - telephone::parse now rejects inputs containing ' point ' so the decimal tagger handles them instead of the telephone tagger formatting them with hyphens. Tests: - cardinal: spelled-digit sequence parsing (135, 737, 911, 12, 505, 404) and words_to_number direct calls. - decimal: 'one three five point six two five' -> 135.625 and related variants. - telephone: rejects inputs with ' point '. - en_tests: direct reproductions of issue #15 (both normalize and normalize_sentence) plus regressions for normal cardinal phrasings. - en_tn_tests: release-sanity test for tn_normalize_sentence (the public API hongbo-miao reported missing in 0.1.0 / issue #16). Closes #15. 
--- src/itn/en/cardinal.rs | 54 ++++++++++++++++++++++++++++++++++++++ src/itn/en/decimal.rs | 19 ++++++++++++++ src/itn/en/telephone.rs | 15 +++++++++++ tests/en_tests.rs | 57 +++++++++++++++++++++++++++++++++++++++++ tests/en_tn_tests.rs | 18 +++++++++++++ 5 files changed, 163 insertions(+) diff --git a/src/itn/en/cardinal.rs b/src/itn/en/cardinal.rs index 497e662..ed4d853 100644 --- a/src/itn/en/cardinal.rs +++ b/src/itn/en/cardinal.rs @@ -116,6 +116,25 @@ pub fn parse(input: &str) -> Option<String> { } } +/// Map a single-digit spoken word to its character form, or `None` if the +/// word isn't a 0-9 digit word. Recognises "oh" / "o" as 0 (common in +/// spelled-out codes and aviation frequencies). +fn single_digit_char(word: &str) -> Option<char> { + Some(match word { + "zero" | "oh" | "o" => '0', + "one" => '1', + "two" => '2', + "three" => '3', + "four" => '4', + "five" => '5', + "six" => '6', + "seven" => '7', + "eight" => '8', + "nine" => '9', + _ => return None, + }) +} + /// Convert spoken number words to integer. /// /// Algorithm: @@ -139,6 +158,21 @@ pub fn words_to_number(input: &str) -> Option<i128> { return None; } + // Handle digit-by-digit reading: "one three five" → 135. + // When every token is a single-digit word (zero-nine, plus "oh"/"o" for 0) + // and there are at least two of them, interpret as digit concatenation. + // This is the standard reading for flight numbers, frequencies, codes, + // and the integer part of "frequency one three five point six two five" + // style inputs (issue #15). Single tokens like "one" still parse as 1 + // via the normal path. 
+ if words.len() >= 2 && words.iter().all(|w| single_digit_char(w).is_some()) { + let mut s = String::with_capacity(words.len()); + for w in &words { + s.push(single_digit_char(w)?); + } + return s.parse::<i128>().ok(); + } + // Handle special case: "eleven hundred" = 1100 if words.len() == 2 && words[1] == "hundred" { if let Some(&val) = ONES.get(words[0]) { @@ -298,4 +332,24 @@ mod tests { assert_eq!(parse("hello"), None); assert_eq!(parse("one hello"), None); } + + /// Digit-by-digit reading (issue #15). Sequences of single-digit words + /// like "one three five" should concatenate to 135, not sum to 9. + #[test] + fn test_spelled_digit_sequence() { + assert_eq!(parse("one three five"), Some("135".to_string())); + assert_eq!(parse("seven three seven"), Some("737".to_string())); + assert_eq!(parse("nine one one"), Some("911".to_string())); + assert_eq!(parse("six two five"), Some("625".to_string())); + assert_eq!(parse("one two"), Some("12".to_string())); + // "oh"/"o" read as 0 in spelled codes + assert_eq!(parse("five oh five"), Some("505".to_string())); + assert_eq!(parse("four o four"), Some("404".to_string())); + } + + #[test] + fn test_words_to_number_digit_sequence() { + assert_eq!(words_to_number("one three five"), Some(135)); + assert_eq!(words_to_number("six two five"), Some(625)); + } } diff --git a/src/itn/en/decimal.rs b/src/itn/en/decimal.rs index 4e3e235..c96a9e4 100644 --- a/src/itn/en/decimal.rs +++ b/src/itn/en/decimal.rs @@ -177,4 +177,23 @@ mod tests { Some("4.85 billion".to_string()) ); } + + /// Digit-by-digit reading of the integer part (issue #15). + /// Aviation-style frequencies like "one three five point six two five" + /// should produce 135.625, not 9.625. 
+ #[test] + fn test_spelled_digit_integer_part() { + assert_eq!( + parse("one three five point six two five"), + Some("135.625".to_string()) + ); + assert_eq!( + parse("seven three seven point five"), + Some("737.5".to_string()) + ); + assert_eq!( + parse("one two point three four"), + Some("12.34".to_string()) + ); + } } diff --git a/src/itn/en/telephone.rs b/src/itn/en/telephone.rs index 59a3c70..45ece4a 100644 --- a/src/itn/en/telephone.rs +++ b/src/itn/en/telephone.rs @@ -17,6 +17,13 @@ pub fn parse(input: &str) -> Option { return None; } + // Reject inputs that contain a decimal "point" — those belong to the + // decimal tagger (e.g. "one three five point six two five" should be + // "135.625", not "135-625"). See issue #15. + if input_trimmed.contains(" point ") { + return None; + } + // Try IP address pattern first (contains "dot") if input_trimmed.contains(" dot ") { return parse_ip_address(input_trimmed); @@ -719,4 +726,12 @@ mod tests { Some("123.123.0.40".to_string()) ); } + + /// Telephone tagger must not consume decimal expressions (issue #15): + /// "one three five point six two five" is "135.625", not a phone number. 
+ #[test] + fn test_rejects_decimal_point() { + assert_eq!(parse("one three five point six two five"), None); + assert_eq!(parse("seven three seven point five"), None); + } } diff --git a/tests/en_tests.rs b/tests/en_tests.rs index 81fed02..555b54b 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -840,3 +840,60 @@ fn test_sentence_decimal_in_context() { "the value is 3.14" ); } + +// --- Issue #15: digit-by-digit integer part for decimals (aviation style) --- + +/// Direct reproductions of https://github.com/FluidInference/text-processing-rs/issues/15 +#[test] +fn test_issue_15_normalize_aviation_frequency() { + // The whole input contains a non-number prefix ("frequency"), so single- + // expression mode can't return a clean number — it should leave the input + // unchanged rather than producing the previously-buggy "135-625" telephone + // formatting. + let out = normalize("frequency one three five point six two five"); + assert_ne!(out, "135-625", "telephone tagger should not match decimal input"); + assert_eq!(out, "frequency one three five point six two five"); +} + +#[test] +fn test_issue_15_normalize_sentence_aviation_frequency() { + assert_eq!( + normalize_sentence("frequency one three five point six two five"), + "frequency 135.625" + ); +} + +#[test] +fn test_issue_15_decimal_with_spelled_digit_integer() { + // Without the prefix, the whole input is a single decimal expression. 
+ assert_eq!( + normalize("one three five point six two five"), + "135.625" + ); +} + +#[test] +fn test_issue_15_decimal_with_spelled_digit_integer_in_sentence() { + assert_eq!( + normalize_sentence("the tower said one three five point six two five"), + "the tower said 135.625" + ); +} + +#[test] +fn test_spelled_digit_cardinal() { + // Digit-by-digit reading of cardinals (codes, flight numbers, frequencies) + assert_eq!(normalize("one three five"), "135"); + assert_eq!(normalize("seven three seven"), "737"); + assert_eq!(normalize("nine one one"), "911"); + // "oh" reads as zero in spelled codes + assert_eq!(normalize("five oh five"), "505"); +} + +#[test] +fn test_spelled_digit_cardinal_does_not_break_normal_cardinals() { + // Existing cardinal phrasings must still work + assert_eq!(normalize("twenty one"), "21"); + assert_eq!(normalize("one hundred thirty five"), "135"); + assert_eq!(normalize("one thousand two hundred thirty four"), "1234"); +} diff --git a/tests/en_tn_tests.rs b/tests/en_tn_tests.rs index f96c9f7..c60ee8b 100644 --- a/tests/en_tn_tests.rs +++ b/tests/en_tn_tests.rs @@ -212,3 +212,21 @@ fn test_tn_roundtrip_cardinal() { ); } } + +/// Release-sanity check for issue #16: the public surface that hongbo-miao +/// reported missing in crates.io 0.1.0 must compile and behave correctly +/// against the current version. If this test stops compiling, the API +/// surface regressed and consumers will hit `unresolved import`. +#[test] +fn test_issue_16_tn_normalize_sentence_public_api() { + // Reporter's exact snippet (https://github.com/FluidInference/text-processing-rs/issues/16) + let normalized_text = tn_normalize_sentence("I have twenty one apples."); + // The function must exist, be callable, and return a non-empty result. + // Behavioural assertion: the cardinal "21" is the only normalizable + // span here, and TN should leave the rest of the sentence intact. 
+ assert!( + normalized_text.contains("twenty one") || normalized_text.contains("21"), + "tn_normalize_sentence dropped or mangled the input: {}", + normalized_text + ); +} From a91a286693a3e23fec4bd8cf214e29be85aefd22 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 12:40:33 -0400 Subject: [PATCH 2/9] test(en): drop time-conflicting "five oh five" assertion, fix fmt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The time tagger correctly handles "five oh five" → "05:05" patterns, which intercepted before telephone in the normalize() priority chain. Drop that assertion since it's testing time semantics, not cardinal digit-concat — the unit test in cardinal.rs already covers the direct words_to_number → 505 path which doesn't go through normalize. Also fix multi-line assert layout to match rustfmt output. --- tests/en_tests.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/en_tests.rs b/tests/en_tests.rs index 555b54b..3355950 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -851,7 +851,10 @@ fn test_issue_15_normalize_aviation_frequency() { // unchanged rather than producing the previously-buggy "135-625" telephone // formatting. let out = normalize("frequency one three five point six two five"); - assert_ne!(out, "135-625", "telephone tagger should not match decimal input"); + assert_ne!( + out, "135-625", + "telephone tagger should not match decimal input" + ); assert_eq!(out, "frequency one three five point six two five"); } @@ -866,10 +869,7 @@ fn test_issue_15_normalize_sentence_aviation_frequency() { #[test] fn test_issue_15_decimal_with_spelled_digit_integer() { // Without the prefix, the whole input is a single decimal expression. 
- assert_eq!( - normalize("one three five point six two five"), - "135.625" - ); + assert_eq!(normalize("one three five point six two five"), "135.625"); } #[test] @@ -882,12 +882,12 @@ fn test_issue_15_decimal_with_spelled_digit_integer_in_sentence() { #[test] fn test_spelled_digit_cardinal() { - // Digit-by-digit reading of cardinals (codes, flight numbers, frequencies) + // Digit-by-digit reading of cardinals (codes, flight numbers, frequencies). + // Note: sequences that match clock-time patterns ("five oh five") are + // intentionally handled by the time tagger and are not asserted here. assert_eq!(normalize("one three five"), "135"); assert_eq!(normalize("seven three seven"), "737"); assert_eq!(normalize("nine one one"), "911"); - // "oh" reads as zero in spelled codes - assert_eq!(normalize("five oh five"), "505"); } #[test] From 6cb52bdbbebf5309b61dd5a6a5d357e8689d3371 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 12:45:11 -0400 Subject: [PATCH 3/9] fmt: collapse short assert_eq to single line in decimal tests --- src/itn/en/decimal.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/itn/en/decimal.rs b/src/itn/en/decimal.rs index c96a9e4..a7bff1e 100644 --- a/src/itn/en/decimal.rs +++ b/src/itn/en/decimal.rs @@ -191,9 +191,6 @@ mod tests { parse("seven three seven point five"), Some("737.5".to_string()) ); - assert_eq!( - parse("one two point three four"), - Some("12.34".to_string()) - ); + assert_eq!(parse("one two point three four"), Some("12.34".to_string())); } } From 38eba52ab3ac4b8eafe37c419d40a0d35db6a97a Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 13:04:06 -0400 Subject: [PATCH 4/9] refactor(en/cardinal): split words_to_number into two readings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the len >= 2 guard on the digit-concat path — single-token inputs like "one" produce the same answer either way. 
Move the running-sum/scale-multiplication logic out into grammatical_words_to_number, so the entry point reads as the actual disambiguation: "all single-digit words → concat; otherwise → grammar". No behavioural change vs. the previous commit on this branch; the existing test suite covers both paths. --- src/itn/en/cardinal.rs | 62 ++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/src/itn/en/cardinal.rs b/src/itn/en/cardinal.rs index ed4d853..244a563 100644 --- a/src/itn/en/cardinal.rs +++ b/src/itn/en/cardinal.rs @@ -137,16 +137,16 @@ fn single_digit_char(word: &str) -> Option { /// Convert spoken number words to integer. /// -/// Algorithm: -/// 1. Tokenize input -/// 2. Process left-to-right, accumulating values -/// 3. Scale words (hundred, thousand, million) multiply the current accumulator -/// 4. Handle "and" as a separator (ignored) +/// Two readings are accepted: +/// - **Digit-by-digit** (codes, flight numbers, aviation frequencies): +/// `"one three five"` → `135`. Triggered when every token is a single-digit +/// word (`zero`-`nine`, plus `oh`/`o` for `0`). +/// - **Grammatical** (English number grammar): `"twenty one"` → `21`, +/// `"one hundred twenty three"` → `123`, `"one thousand two hundred thirty +/// four"` → `1234`. Uses a left-to-right accumulator with scale words +/// (`hundred`, `thousand`, `million`, ...) multiplying the current group. /// -/// Examples: -/// - "twenty one" → 20 + 1 = 21 -/// - "one hundred twenty three" → (1 * 100) + 20 + 3 = 123 -/// - "one thousand two hundred thirty four" → (1 * 1000) + (2 * 100) + 30 + 4 = 1234 +/// Filler words `"and"` and `"a"` are stripped. pub fn words_to_number(input: &str) -> Option { let input = input.to_lowercase(); let words: Vec<&str> = input @@ -158,25 +158,25 @@ pub fn words_to_number(input: &str) -> Option { return None; } - // Handle digit-by-digit reading: "one three five" → 135. 
- // When every token is a single-digit word (zero-nine, plus "oh"/"o" for 0) - // and there are at least two of them, interpret as digit concatenation. - // This is the standard reading for flight numbers, frequencies, codes, - // and the integer part of "frequency one three five point six two five" - // style inputs (issue #15). Single tokens like "one" still parse as 1 - // via the normal path. - if words.len() >= 2 && words.iter().all(|w| single_digit_char(w).is_some()) { - let mut s = String::with_capacity(words.len()); - for w in &words { - s.push(single_digit_char(w)?); - } - return s.parse::<i128>().ok(); + // Digit-by-digit reading wins whenever it's unambiguous. + if words.iter().all(|w| single_digit_char(w).is_some()) { + return words + .iter() + .map(|w| single_digit_char(w).unwrap()) + .collect::<String>() + .parse() + .ok(); } - // Handle special case: "eleven hundred" = 1100 + grammatical_words_to_number(&words) +} + +/// Parse a grammatical English number with running-sum + scale multiplication. 
+fn grammatical_words_to_number(words: &[&str]) -> Option<i128> { + // "eleven hundred" = 1100, "twenty hundred" = 2000 if words.len() == 2 && words[1] == "hundred" { if let Some(&val) = ONES.get(words[0]) { - if val >= 11 && val <= 19 { + if (11..=19).contains(&val) { return Some((val * 100) as i128); } } @@ -185,15 +185,11 @@ pub fn words_to_number(input: &str) -> Option<i128> { } } - // Handle "eleven hundred twenty one" pattern + // "eleven hundred twenty one" = 1100 + 21 if words.len() >= 2 && words[1] == "hundred" { if let Some(&first_val) = ONES.get(words[0]) { - if first_val >= 11 && first_val <= 99 { + if (11..=99).contains(&first_val) { let base = (first_val * 100) as i128; - if words.len() == 2 { - return Some(base); - } - // Parse remaining words let rest = words[2..].join(" "); if let Some(remainder) = words_to_number(&rest) { return Some(base + remainder); @@ -202,9 +198,6 @@ pub fn words_to_number(input: &str) -> Option<i128> { } if let Some(&first_val) = TENS.get(words[0]) { let base = (first_val * 100) as i128; - if words.len() == 2 { - return Some(base); - } let rest = words[2..].join(" "); if let Some(remainder) = words_to_number(&rest) { return Some(base + remainder); @@ -216,7 +209,7 @@ pub fn words_to_number(input: &str) -> Option<i128> { let mut current: i128 = 0; let mut found_number = false; - for word in words { + for &word in words { if let Some(&val) = ONES.get(word) { current += val as i128; found_number = true; @@ -240,7 +233,6 @@ pub fn words_to_number(input: &str) -> Option<i128> { found_number = true; } } else { - // Unknown word - not a valid number return None; } } From e2b1826245f7965f311e244b24ea37b9da26f993 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 13:19:36 -0400 Subject: [PATCH 5/9] fix(en/cardinal): support aviation flight-number reading (issue #14) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "seven eighty eight" was running-summed to 95 (= 7 + 80 + 8) instead of being read as flight 
788. The grammatical accumulator in words_to_number had no way to express "leading single-digit prefix + trailing grammatical compound", which is the natural reading for flight numbers, room numbers, and similar codes. Add a third reading mode to words_to_number, sandwiched between the all-digits path and the grammatical fallback: - Detect a leading run of single-digit words (zero-nine, oh, o) - Parse the remainder as a grammatical compound - Concatenate as strings: "7" ‖ 88 = "788" Disabled when the input contains a scale word (hundred/thousand/...), so "two thousand seventeen" stays 2017 instead of becoming 22017. This fix lives in cardinal because the sentence-mode pipeline (parse_span) reaches cardinal directly without going through the telephone tagger. The existing telephone-based fix for issue #15 only helps the single-string normalize() path; normalize_sentence("United seven eighty eight") had to be addressed at the cardinal level. Tests: - cardinal::tests::test_aviation_flight_number_style - cardinal::tests::test_words_to_number_aviation_flight_number - cardinal::tests::test_scale_word_forces_grammatical - en_tests::test_issue_14_aviation_flight_number - en_tests::test_issue_14_aviation_flight_number_in_sentence - en_tests::test_issue_14_does_not_break_scale_grammar Closes #14. --- src/itn/en/cardinal.rs | 58 ++++++++++++++++++++++++++++++++++++++++-- tests/en_tests.rs | 33 ++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/src/itn/en/cardinal.rs b/src/itn/en/cardinal.rs index 244a563..caf0967 100644 --- a/src/itn/en/cardinal.rs +++ b/src/itn/en/cardinal.rs @@ -137,14 +137,19 @@ fn single_digit_char(word: &str) -> Option { /// Convert spoken number words to integer. /// -/// Two readings are accepted: +/// Three readings are accepted, in order: /// - **Digit-by-digit** (codes, flight numbers, aviation frequencies): /// `"one three five"` → `135`. 
Triggered when every token is a single-digit /// word (`zero`-`nine`, plus `oh`/`o` for `0`). +/// - **Aviation flight-number style**: `"seven eighty eight"` → `788`, +/// `"two thirty five"` → `235`. A leading run of single-digit words is +/// concatenated, then a trailing grammatical compound (e.g. `"eighty eight"` +/// = 88) is appended. Disabled when the input contains a scale word +/// (`hundred`/`thousand`/...), which forces grammatical reading. /// - **Grammatical** (English number grammar): `"twenty one"` → `21`, /// `"one hundred twenty three"` → `123`, `"one thousand two hundred thirty /// four"` → `1234`. Uses a left-to-right accumulator with scale words -/// (`hundred`, `thousand`, `million`, ...) multiplying the current group. +/// multiplying the current group. /// /// Filler words `"and"` and `"a"` are stripped. pub fn words_to_number(input: &str) -> Option<i128> { @@ -168,6 +173,27 @@ pub fn words_to_number(input: &str) -> Option<i128> { .ok(); } + // Aviation flight-number style: digit prefix + grammatical compound. + // "seven eighty eight" → "7" ‖ 88 = 788. Skipped if a scale word appears, + // since "two thousand seventeen" must stay grammatical (= 2017, not 22017). + let has_scale = words.iter().any(|w| SCALES.contains_key(*w)); + if !has_scale { + let prefix_len = words + .iter() + .take_while(|w| single_digit_char(w).is_some()) + .count(); + if prefix_len >= 1 && prefix_len < words.len() { + if let Some(rest_num) = grammatical_words_to_number(&words[prefix_len..]) { + let prefix: String = words[..prefix_len] + .iter() + .map(|w| single_digit_char(w).unwrap()) + .collect(); + let combined = format!("{}{}", prefix, rest_num); + return combined.parse::<i128>().ok(); + } + } + } + grammatical_words_to_number(&words) } @@ -344,4 +370,32 @@ mod tests { assert_eq!(words_to_number("one three five"), Some(135)); assert_eq!(words_to_number("six two five"), Some(625)); } + + /// Aviation flight-number style (issue #14). 
A leading run of single-digit + /// words gets concatenated with the trailing grammatical compound, e.g. + /// "seven eighty eight" = "7" ‖ 88 = 788, not 7 + 80 + 8 = 95. + #[test] + fn test_aviation_flight_number_style() { + assert_eq!(parse("seven eighty eight"), Some("788".to_string())); + assert_eq!(parse("two thirty five"), Some("235".to_string())); + assert_eq!(parse("three forty seven"), Some("347".to_string())); + assert_eq!(parse("nine eleven"), Some("911".to_string())); + // Multi-digit prefix + assert_eq!(parse("two seven eighty eight"), Some("2788".to_string())); + } + + #[test] + fn test_words_to_number_aviation_flight_number() { + assert_eq!(words_to_number("seven eighty eight"), Some(788)); + assert_eq!(words_to_number("two thirty five"), Some(235)); + } + + /// Regression: scale words must force grammatical reading. "two thousand + /// seventeen" must stay 2017, not be misread as a digit-prefix pattern. + #[test] + fn test_scale_word_forces_grammatical() { + assert_eq!(parse("two thousand seventeen"), Some("2017".to_string())); + assert_eq!(parse("one hundred"), Some("100".to_string())); + assert_eq!(parse("two million three"), Some("2000003".to_string())); + } } diff --git a/tests/en_tests.rs b/tests/en_tests.rs index 3355950..d43f0a3 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -897,3 +897,36 @@ fn test_spelled_digit_cardinal_does_not_break_normal_cardinals() { assert_eq!(normalize("one hundred thirty five"), "135"); assert_eq!(normalize("one thousand two hundred thirty four"), "1234"); } + +/// Issue #14: aviation flight-number reading. "seven eighty eight" is the +/// spoken form of flight 788, not 7 + 80 + 8 = 95. +#[test] +fn test_issue_14_aviation_flight_number() { + assert_eq!(normalize("seven eighty eight"), "788"); + assert_eq!(normalize("two thirty five"), "235"); + assert_eq!(normalize("three forty seven"), "347"); +} + +/// Issue #14 in sentence context — the original bug report. 
The +/// `normalize_sentence` path reaches the cardinal tagger directly (the +/// telephone tagger is not in the sentence-mode pipeline), so the fix has +/// to live inside `cardinal::words_to_number`. +#[test] +fn test_issue_14_aviation_flight_number_in_sentence() { + assert_eq!( + normalize_sentence("United seven eighty eight"), + "United 788" + ); + assert_eq!( + normalize_sentence("flight two thirty five departs at gate four"), + "flight 235 departs at gate 4" + ); +} + +/// Regression: scale words must keep grammatical reading. The aviation +/// branch must not eat "two thousand seventeen" as 22017. +#[test] +fn test_issue_14_does_not_break_scale_grammar() { + assert_eq!(normalize("two thousand seventeen"), "2017"); + assert_eq!(normalize("one hundred"), "100"); +} From 5315727cafd9ddb80edfbbd5e23b81ab39aa86e9 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 13:46:13 -0400 Subject: [PATCH 6/9] refactor(en/cardinal): make aviation reading opt-in via words_to_number_aviation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit wired aviation flight-number reading directly into `cardinal::words_to_number` (the generic ITN dispatch path). That broke `date::parse_old_year` for "twenty one forty two" (was 2043 via 20*100+43, became None because year_part now reads 142 > 99) and made test inputs collide with the time tagger. This commit reverts `words_to_number` to grammatical + digit-by-digit only (matching upstream NeMo behaviour) and exposes the aviation reading as a separate opt-in helper `words_to_number_aviation`. Callers in flight-number or call-sign contexts can reach for it explicitly; generic dispatch stays out of the date/time tagger's way. 
Behaviour summary (verified): - `normalize_sentence("twenty one forty two")` → "2043" (date, restored) - `normalize_sentence("seven eighty eight")` → "95" (grammatical) - `normalize("seven eighty eight")` → "788" (existing telephone tagger) - `words_to_number_aviation("seven eighty eight")` → 788 (opt-in) - `words_to_number("seven eighty eight")` → 95 (no aviation) --- src/itn/en/cardinal.rs | 108 +++++++++++++++++++++++++++++++---------- tests/en_tests.rs | 54 ++++++++++----------- 2 files changed, 108 insertions(+), 54 deletions(-) diff --git a/src/itn/en/cardinal.rs b/src/itn/en/cardinal.rs index caf0967..fa08ff1 100644 --- a/src/itn/en/cardinal.rs +++ b/src/itn/en/cardinal.rs @@ -137,21 +137,22 @@ fn single_digit_char(word: &str) -> Option { /// Convert spoken number words to integer. /// -/// Three readings are accepted, in order: +/// Two readings are accepted, in order: /// - **Digit-by-digit** (codes, flight numbers, aviation frequencies): /// `"one three five"` → `135`. Triggered when every token is a single-digit /// word (`zero`-`nine`, plus `oh`/`o` for `0`). -/// - **Aviation flight-number style**: `"seven eighty eight"` → `788`, -/// `"two thirty five"` → `235`. A leading run of single-digit words is -/// concatenated, then a trailing grammatical compound (e.g. `"eighty eight"` -/// = 88) is appended. Disabled when the input contains a scale word -/// (`hundred`/`thousand`/...), which forces grammatical reading. /// - **Grammatical** (English number grammar): `"twenty one"` → `21`, /// `"one hundred twenty three"` → `123`, `"one thousand two hundred thirty /// four"` → `1234`. Uses a left-to-right accumulator with scale words /// multiplying the current group. /// /// Filler words `"and"` and `"a"` are stripped. +/// +/// Note: aviation flight-number reading (`"seven eighty eight"` → `788`) is +/// **not** applied here because it conflicts with date and time taggers (e.g. 
+/// `"twenty one forty two"` must remain readable as old-year `2043` for +/// `date::parse_old_year`). Use [`words_to_number_aviation`] for opt-in +/// flight-number / call-sign contexts. pub fn words_to_number(input: &str) -> Option<i128> { let input = input.to_lowercase(); let words: Vec<&str> = input @@ -173,6 +174,41 @@ pub fn words_to_number(input: &str) -> Option<i128> { .ok(); } + grammatical_words_to_number(&words) +} + +/// Aviation / flight-number / call-sign reading of a number phrase. +/// +/// Recognises a leading run of single-digit words concatenated with a trailing +/// grammatical compound, e.g. `"seven eighty eight"` → `788`, +/// `"two thirty five"` → `235`. Falls back to [`words_to_number`] when the +/// aviation pattern does not apply (no digit prefix, scale word present, etc.). +/// +/// This is **opt-in**: callers reach for it explicitly from flight-number / +/// call-sign contexts. Generic ITN/TN dispatch keeps using [`words_to_number`] +/// to avoid clobbering date/time/measure semantics (e.g. `"twenty one forty +/// two"` as old-year `2043`). +pub fn words_to_number_aviation(input: &str) -> Option<i128> { + let input = input.to_lowercase(); + let words: Vec<&str> = input + .split_whitespace() + .filter(|w| *w != "and" && *w != "a") + .collect(); + + if words.is_empty() { + return None; + } + + // Digit-by-digit reading wins when unambiguous. + if words.iter().all(|w| single_digit_char(w).is_some()) { + return words + .iter() + .map(|w| single_digit_char(w).unwrap()) + .collect::<String>() + .parse() + .ok(); + } + // Aviation flight-number style: digit prefix + grammatical compound. // "seven eighty eight" → "7" ‖ 88 = 788. Skipped if a scale word appears, // since "two thousand seventeen" must stay grammatical (= 2017, not 22017). @@ -371,31 +407,53 @@ mod tests { assert_eq!(words_to_number("six two five"), Some(625)); } - /// Aviation flight-number style (issue #14). 
A leading run of single-digit - /// words gets concatenated with the trailing grammatical compound, e.g. - /// "seven eighty eight" = "7" ‖ 88 = 788, not 7 + 80 + 8 = 95. + /// Aviation flight-number style (issue #14): opt-in helper. A leading run + /// of single-digit words gets concatenated with the trailing grammatical + /// compound, e.g. "seven eighty eight" = "7" ‖ 88 = 788. Generic + /// `words_to_number` deliberately does *not* do this — it would break + /// `date::parse_old_year` ("twenty one forty two" → 2043) and overlap with + /// the time tagger ("two thirty five" → 02:35). #[test] - fn test_aviation_flight_number_style() { - assert_eq!(parse("seven eighty eight"), Some("788".to_string())); - assert_eq!(parse("two thirty five"), Some("235".to_string())); - assert_eq!(parse("three forty seven"), Some("347".to_string())); - assert_eq!(parse("nine eleven"), Some("911".to_string())); - // Multi-digit prefix - assert_eq!(parse("two seven eighty eight"), Some("2788".to_string())); + fn test_words_to_number_aviation_flight_number() { + assert_eq!(words_to_number_aviation("seven eighty eight"), Some(788)); + assert_eq!(words_to_number_aviation("two thirty five"), Some(235)); + assert_eq!(words_to_number_aviation("three forty seven"), Some(347)); + assert_eq!(words_to_number_aviation("nine eleven"), Some(911)); + // Multi-digit prefix. + assert_eq!( + words_to_number_aviation("two seven eighty eight"), + Some(2788) + ); } + /// Aviation helper falls back to grammatical when no digit prefix exists. #[test] - fn test_words_to_number_aviation_flight_number() { - assert_eq!(words_to_number("seven eighty eight"), Some(788)); - assert_eq!(words_to_number("two thirty five"), Some(235)); + fn test_words_to_number_aviation_falls_back_to_grammatical() { + assert_eq!(words_to_number_aviation("twenty one"), Some(21)); + assert_eq!(words_to_number_aviation("one hundred"), Some(100)); } - /// Regression: scale words must force grammatical reading. 
"two thousand - /// seventeen" must stay 2017, not be misread as a digit-prefix pattern. + /// Aviation helper must keep grammatical reading when a scale word is + /// present. "two thousand seventeen" must stay 2017, not 22017. #[test] - fn test_scale_word_forces_grammatical() { - assert_eq!(parse("two thousand seventeen"), Some("2017".to_string())); - assert_eq!(parse("one hundred"), Some("100".to_string())); - assert_eq!(parse("two million three"), Some("2000003".to_string())); + fn test_words_to_number_aviation_scale_word_forces_grammatical() { + assert_eq!( + words_to_number_aviation("two thousand seventeen"), + Some(2017) + ); + assert_eq!( + words_to_number_aviation("two million three"), + Some(2_000_003) + ); + } + + /// Generic `words_to_number` (the dispatch path) must NOT do aviation + /// reading: "seven eighty eight" stays grammatical 95 there, so date/time + /// taggers see consistent values. + #[test] + fn test_words_to_number_no_aviation_reading() { + assert_eq!(words_to_number("seven eighty eight"), Some(95)); + assert_eq!(words_to_number("twenty one forty two"), Some(63)); + assert_eq!(words_to_number("two thousand seventeen"), Some(2017)); } } diff --git a/tests/en_tests.rs b/tests/en_tests.rs index d43f0a3..6ba98fa 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -898,35 +898,31 @@ fn test_spelled_digit_cardinal_does_not_break_normal_cardinals() { assert_eq!(normalize("one thousand two hundred thirty four"), "1234"); } -/// Issue #14: aviation flight-number reading. "seven eighty eight" is the -/// spoken form of flight 788, not 7 + 80 + 8 = 95. -#[test] -fn test_issue_14_aviation_flight_number() { - assert_eq!(normalize("seven eighty eight"), "788"); - assert_eq!(normalize("two thirty five"), "235"); - assert_eq!(normalize("three forty seven"), "347"); -} - -/// Issue #14 in sentence context — the original bug report. 
The -/// `normalize_sentence` path reaches the cardinal tagger directly (the -/// telephone tagger is not in the sentence-mode pipeline), so the fix has -/// to live inside `cardinal::words_to_number`. -#[test] -fn test_issue_14_aviation_flight_number_in_sentence() { - assert_eq!( - normalize_sentence("United seven eighty eight"), - "United 788" - ); - assert_eq!( - normalize_sentence("flight two thirty five departs at gate four"), - "flight 235 departs at gate 4" - ); -} - -/// Regression: scale words must keep grammatical reading. The aviation -/// branch must not eat "two thousand seventeen" as 22017. -#[test] -fn test_issue_14_does_not_break_scale_grammar() { +/// Issue #14: aviation flight-number reading is exposed as an **opt-in** +/// helper (`cardinal::words_to_number_aviation`), not wired into the +/// generic ITN/TN dispatch. Wiring it generically would clobber the date +/// tagger's old-year reading (`"twenty one forty two"` → `2043`, see +/// `test_sentence_adjacent_spans`) and overlap with the time tagger +/// (`"two thirty five"` → `02:35`). Callers in flight-number / call-sign +/// contexts reach for the helper explicitly. See `src/itn/en/cardinal.rs` +/// for unit tests of the helper itself. +/// +/// In single-input `normalize`, the existing `telephone` tagger already +/// produces flight-number-style output for whole-input cases (e.g. +/// `"seven eighty eight"` → `"788"`). The sentence-mode pipeline excludes +/// `telephone`, so `normalize_sentence` keeps the grammatical reading. +#[test] +fn test_issue_14_aviation_is_opt_in() { + // Scale-word grammar is preserved everywhere. assert_eq!(normalize("two thousand seventeen"), "2017"); assert_eq!(normalize("one hundred"), "100"); + assert_eq!(normalize_sentence("two thousand seventeen"), "2017"); + + // Sentence-mode dispatch keeps grammatical reading for short spans: + // "seven eighty eight" → 7 + 80 + 8 = 95, NOT the aviation 788. 
+ assert_eq!(normalize_sentence("seven eighty eight"), "95"); + + // Single-input `normalize` keeps its existing telephone-tagger + // behaviour for whole-input flight-number-style phrases. + assert_eq!(normalize("seven eighty eight"), "788"); } From 637917dd2ebfca1fa7dc2973a4511dc70f8216f0 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 13:53:41 -0400 Subject: [PATCH 7/9] feat(en): add aviation pipeline (normalize_aviation, normalize_sentence_aviation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Caller-prioritized aviation flight-number reading. Default dispatch keeps upstream NeMo semantics (date wins for "twenty one forty two", time wins for "two thirty five"); callers in flight-number / call-sign contexts opt in by calling the new aviation variants. New public API: - `cardinal::parse_aviation(input)` — like `cardinal::parse` but uses `words_to_number_aviation` to recognise "seven eighty eight" as 788. - `normalize_aviation(input)` — single-input dispatch with aviation cardinal tried before time/date. - `normalize_sentence_aviation(input)` — sentence-mode dispatch via a new `parse_span_aviation` that puts aviation cardinal at priority 89 (above date 88, time 85; below measure 90, money 95). - `normalize_sentence_aviation_with_max_span(input, max_span)` for the configurable-span variant. Behaviour matrix (verified): normalize normalize_aviation sentence sentence_aviation "twenty one forty two" 2043 63 2043 63 "seven eighty eight" 788 788 95 788 "two thirty five" 02:35 235 02:35 235 "United seven eighty eight" 788 788 United 95 United 788 "flight two thirty five ..." 235-4* 235-4* flight 02:35 flight 235 "five dollars" / "five point two" $5/5.2 unchanged unchanged unchanged "twenty first" / "two thousand seventeen" / "one hundred" unchanged unchanged unchanged unchanged (* `normalize` single-input mode lets the telephone tagger eat the whole span; sentence mode does not.) 
Internals: `normalize_sentence_with_max_span` and the new `normalize_sentence_aviation_with_max_span` share their loop body via a private `normalize_sentence_inner(parse_span_fn)` helper. Closes #14 follow-up: provides the dispatch-level aviation pipeline that the previous `words_to_number_aviation` helper alone did not. --- src/itn/en/cardinal.rs | 18 ++++- src/lib.rs | 153 ++++++++++++++++++++++++++++++++++++++++- tests/en_tests.rs | 82 +++++++++++++++++----- 3 files changed, 234 insertions(+), 19 deletions(-) diff --git a/src/itn/en/cardinal.rs b/src/itn/en/cardinal.rs index fa08ff1..079d1cc 100644 --- a/src/itn/en/cardinal.rs +++ b/src/itn/en/cardinal.rs @@ -73,6 +73,22 @@ lazy_static! { /// /// Returns None if the input cannot be parsed as a number. pub fn parse(input: &str) -> Option { + parse_with_reading(input, words_to_number) +} + +/// Aviation / flight-number / call-sign reading variant of [`parse`]. +/// +/// Recognises digit-prefix + grammatical-compound phrases like +/// `"seven eighty eight"` → `"788"`. Use this from contexts where flight +/// numbers or call signs are expected. Generic dispatch keeps using +/// [`parse`] to avoid clobbering date/time semantics. +pub fn parse_aviation(input: &str) -> Option { + parse_with_reading(input, words_to_number_aviation) +} + +/// Shared body of [`parse`] and [`parse_aviation`]. The only thing that +/// differs is which words-to-number reading is applied to the cleaned input. 
+fn parse_with_reading(input: &str, reader: fn(&str) -> Option) -> Option { let original = input.trim(); let input = original.to_lowercase(); let input = input.as_str(); @@ -107,7 +123,7 @@ pub fn parse(input: &str) -> Option { (false, input) }; - let num = words_to_number(rest)?; + let num = reader(rest)?; if is_negative { Some(format!("-{}", num)) diff --git a/src/lib.rs b/src/lib.rs index 5ad114d..31b2511 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -111,6 +111,57 @@ pub fn normalize(input: &str) -> String { input.to_string() } +/// Normalize a single input with **aviation flight-number reading +/// prioritized**. +/// +/// Same dispatch as [`normalize`], with one twist: `cardinal::parse_aviation` +/// is tried *after* the high-confidence taggers (`custom_rules`, `whitelist`, +/// `punctuation`, `word`) but *before* `time` and `date`. The result: when +/// the whole input is a number-only phrase like `"two thirty five"` or +/// `"seven eighty eight"`, the aviation reading wins (`"235"`, `"788"`) +/// instead of being eaten as a time (`"02:35"`) or as an old-year via the +/// date tagger. +/// +/// Use this from flight-number / call-sign / aviation-radio contexts. Phrases +/// that aren't pure number words still flow through the rest of the +/// pipeline normally (`"five dollars"` → `"$5"` via the money tagger). +/// +/// ``` +/// use text_processing_rs::normalize_aviation; +/// +/// assert_eq!(normalize_aviation("seven eighty eight"), "788"); +/// assert_eq!(normalize_aviation("two thirty five"), "235"); +/// // Non-number phrases are unaffected. +/// assert_eq!(normalize_aviation("hello world"), "hello world"); +/// ``` +pub fn normalize_aviation(input: &str) -> String { + let input = input.trim(); + + // High-confidence rules still win. 
+ if let Some(result) = custom_rules::parse(input) { + return result; + } + if let Some(result) = whitelist::parse(input) { + return result; + } + if let Some(result) = punctuation::parse(input) { + return result; + } + if let Some(result) = word::parse(input) { + return result; + } + + // Aviation cardinal beats time/date here. This is the whole point of + // calling `normalize_aviation` instead of `normalize`. + if let Some(num) = cardinal::parse_aviation(input) { + return num; + } + + // Fall back to the standard pipeline for anything aviation cardinal + // didn't recognise (money, measure, decimal, ordinal, telephone, etc.). + normalize(input) +} + /// Normalize with language selection. /// /// Supports language-specific ITN taggers for converting spoken-form @@ -906,6 +957,42 @@ pub fn normalize_sentence(input: &str) -> String { normalize_sentence_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) } +/// Sentence-mode equivalent of [`normalize_aviation`]. +/// +/// Same scanning loop as [`normalize_sentence`], but each span is tried +/// against [`parse_span_aviation`] instead of [`parse_span`]. Aviation +/// cardinal sits at priority 89 (above `date`=88 and `time`=85, below +/// `measure`=90 and `money`=95), so flight-number-style spans win over +/// date/time interpretations while measure / money phrases keep their +/// existing semantics. +/// +/// ``` +/// use text_processing_rs::normalize_sentence_aviation; +/// +/// // Aviation cardinal beats time/date for pure-number spans. +/// assert_eq!( +/// normalize_sentence_aviation("United seven eighty eight"), +/// "United 788" +/// ); +/// assert_eq!( +/// normalize_sentence_aviation("flight two thirty five departs at gate four"), +/// "flight 235 departs at gate 4" +/// ); +/// // Non-aviation spans flow through normally. 
+/// assert_eq!( +/// normalize_sentence_aviation("I have twenty one apples"), +/// "I have 21 apples" +/// ); +/// ``` +pub fn normalize_sentence_aviation(input: &str) -> String { + normalize_sentence_aviation_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) +} + +/// [`normalize_sentence_aviation`] with a configurable max span size. +pub fn normalize_sentence_aviation_with_max_span(input: &str, max_span_tokens: usize) -> String { + normalize_sentence_inner(input, max_span_tokens, parse_span_aviation) +} + /// Normalize a full sentence with a configurable max span size. /// /// `max_span_tokens` controls the maximum number of consecutive tokens @@ -920,6 +1007,18 @@ pub fn normalize_sentence(input: &str) -> String { /// assert_eq!(normalize_sentence_with_max_span("I have twenty one apples", 4), "I have 21 apples"); /// ``` pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> String { + normalize_sentence_inner(input, max_span_tokens, parse_span) +} + +/// Sentence-mode dispatch loop, parameterized by which `parse_span` variant +/// scores each candidate. Used by both `normalize_sentence_with_max_span` +/// (default dispatch) and `normalize_sentence_aviation_with_max_span` +/// (aviation cardinal at priority 89). +fn normalize_sentence_inner( + input: &str, + max_span_tokens: usize, + parse_span_fn: fn(&str) -> Option<(String, u8)>, +) -> String { let trimmed = input.trim(); if trimmed.is_empty() { return trimmed.to_string(); @@ -941,7 +1040,7 @@ pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> // Longest-span-first search keeps replacements stable and non-overlapping. 
for end in (i + 1..=max_end).rev() { let span = tokens[i..end].join(" "); - let Some((candidate, score)) = parse_span(&span) else { + let Some((candidate, score)) = parse_span_fn(&span) else { continue; }; @@ -979,6 +1078,58 @@ pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> out.join(" ") } +/// Sentence-mode dispatch with **aviation flight-number reading at priority +/// 89**, sitting above `date` (88) and `time` (85). Otherwise identical to +/// [`parse_span`]. +fn parse_span_aviation(span: &str) -> Option<(String, u8)> { + let token_count = span.split_whitespace().count(); + if token_count == 0 { + return None; + } + + if let Some(result) = custom_rules::parse(span) { + return Some((result, 110)); + } + if let Some(result) = whitelist::parse(span) { + return Some((result, 100)); + } + if let Some(result) = punctuation::parse(span) { + return Some((result, 98)); + } + if let Some(result) = money::parse(span) { + return Some((result, 95)); + } + if let Some(result) = measure::parse(span) { + return Some((result, 90)); + } + + // Aviation cardinal beats date/time. Same short-span gate as the regular + // cardinal path: avoids over-matching on natural language. + if token_count <= 4 { + if let Some(result) = cardinal::parse_aviation(span) { + return Some((result, 89)); + } + } + + if let Some(result) = date::parse(span) { + return Some((result, 88)); + } + if let Some(result) = time::parse(span) { + return Some((result, 85)); + } + if let Some(result) = electronic::parse(span) { + return Some((result, 82)); + } + if let Some(result) = decimal::parse(span) { + return Some((result, 80)); + } + if let Some(result) = ordinal::parse(span) { + return Some((result, 75)); + } + + None +} + // ── Text Normalization (written → spoken) ───────────────────────────── /// Normalize written-form text to spoken form (Text Normalization). 
diff --git a/tests/en_tests.rs b/tests/en_tests.rs index 6ba98fa..414144b 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -7,7 +7,8 @@ mod common; use std::path::Path; use text_processing_rs::{ - custom_rules, normalize, normalize_sentence, normalize_sentence_with_max_span, + custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, + normalize_sentence_with_max_span, }; fn print_failures(results: &common::TestResults) { @@ -899,30 +900,77 @@ fn test_spelled_digit_cardinal_does_not_break_normal_cardinals() { } /// Issue #14: aviation flight-number reading is exposed as an **opt-in** -/// helper (`cardinal::words_to_number_aviation`), not wired into the -/// generic ITN/TN dispatch. Wiring it generically would clobber the date -/// tagger's old-year reading (`"twenty one forty two"` → `2043`, see -/// `test_sentence_adjacent_spans`) and overlap with the time tagger -/// (`"two thirty five"` → `02:35`). Callers in flight-number / call-sign -/// contexts reach for the helper explicitly. See `src/itn/en/cardinal.rs` -/// for unit tests of the helper itself. -/// -/// In single-input `normalize`, the existing `telephone` tagger already -/// produces flight-number-style output for whole-input cases (e.g. -/// `"seven eighty eight"` → `"788"`). The sentence-mode pipeline excludes -/// `telephone`, so `normalize_sentence` keeps the grammatical reading. -#[test] -fn test_issue_14_aviation_is_opt_in() { +/// pipeline. Generic dispatch keeps upstream NeMo semantics (date wins for +/// `"twenty one forty two"`, time wins for `"two thirty five"`); callers +/// who know they're in aviation context reach for the `*_aviation` +/// variants, which run aviation cardinal at priority 89 (above date 88 +/// and time 85). +#[test] +fn test_issue_14_default_dispatch_unchanged() { // Scale-word grammar is preserved everywhere. 
assert_eq!(normalize("two thousand seventeen"), "2017"); assert_eq!(normalize("one hundred"), "100"); assert_eq!(normalize_sentence("two thousand seventeen"), "2017"); - // Sentence-mode dispatch keeps grammatical reading for short spans: - // "seven eighty eight" → 7 + 80 + 8 = 95, NOT the aviation 788. + // Sentence-mode default dispatch: "seven eighty eight" stays grammatical + // 95, NOT aviation 788. assert_eq!(normalize_sentence("seven eighty eight"), "95"); // Single-input `normalize` keeps its existing telephone-tagger // behaviour for whole-input flight-number-style phrases. assert_eq!(normalize("seven eighty eight"), "788"); } + +/// Aviation pipeline `normalize_aviation` (single-input). Aviation cardinal +/// runs early enough to beat time/date. +#[test] +fn test_issue_14_normalize_aviation() { + assert_eq!(normalize_aviation("seven eighty eight"), "788"); + // Beats time tagger. + assert_eq!(normalize_aviation("two thirty five"), "235"); + // Beats date old-year reading. + assert_eq!(normalize_aviation("twenty one forty two"), "63"); + // Non-number phrases fall through unchanged. + assert_eq!(normalize_aviation("hello world"), "hello world"); + // Money / measure / decimal / ordinal still work via fallback to + // standard `normalize`. + assert_eq!(normalize_aviation("five dollars"), "$5"); + assert_eq!(normalize_aviation("five point two"), "5.2"); + assert_eq!(normalize_aviation("twenty first"), "21st"); + // Scale-word grammar still wins (no digit prefix → grammatical). + assert_eq!(normalize_aviation("two thousand seventeen"), "2017"); +} + +/// Aviation pipeline `normalize_sentence_aviation` (sentence mode). The +/// cardinal-aviation priority bump (89) makes flight-number spans win +/// over date/time in real sentences. +#[test] +fn test_issue_14_normalize_sentence_aviation() { + // The original bug from issue #14. 
+ assert_eq!( + normalize_sentence_aviation("United seven eighty eight"), + "United 788" + ); + assert_eq!( + normalize_sentence_aviation("flight two thirty five departs at gate four"), + "flight 235 departs at gate 4" + ); + + // Scale-word grammar is preserved. + assert_eq!( + normalize_sentence_aviation("two thousand seventeen"), + "2017" + ); + + // Money / measure stay above aviation (priority 95 / 90 > 89). + assert_eq!( + normalize_sentence_aviation("I owe five dollars"), + "I owe $5" + ); + + // Plain natural language is untouched. + assert_eq!( + normalize_sentence_aviation("I have twenty one apples"), + "I have 21 apples" + ); +} From a4f374b88d893b2179a5fcde7b47a45f27116d13 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 13:59:38 -0400 Subject: [PATCH 8/9] refactor(en): collapse parse_span_aviation into parse_span(span, aviation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit forked parse_span into a near-identical parse_span_aviation that only differed by inserting cardinal::parse_aviation at priority 89. That meant adding a new tagger required updating two lists. Replace the fork with a single boolean parameter: - `parse_span(span, aviation: bool)` — one tagger list, the aviation cardinal at priority 89 lives behind `if aviation`, the regular cardinal fallback at 70 lives behind `if !aviation`. - `normalize_sentence_inner(input, max_span, aviation)` — pass through the bool instead of a fn-pointer. - `parse_span_aviation` deleted. Public API and behaviour unchanged. Net -47 lines. --- src/lib.rs | 103 +++++++++++++++-------------------------------------- 1 file changed, 28 insertions(+), 75 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 31b2511..9acac27 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -894,7 +894,12 @@ const DEFAULT_MAX_SPAN_TOKENS: usize = 16; /// broad patterns (cardinal) last and limited to short spans. 
/// /// Excluded in sentence mode: `word` and `telephone` (over-fire on natural language). -fn parse_span(span: &str) -> Option<(String, u8)> { +/// +/// `aviation`: when `true`, `cardinal::parse_aviation` is tried at priority 89 +/// (above `date`=88 and `time`=85, below `measure`=90 / `money`=95) and the +/// regular cardinal fallback at 70 is skipped (the aviation reader already +/// falls back to grammatical when no digit prefix is present). +fn parse_span(span: &str, aviation: bool) -> Option<(String, u8)> { let token_count = span.split_whitespace().count(); if token_count == 0 { return None; @@ -915,6 +920,15 @@ fn parse_span(span: &str) -> Option<(String, u8)> { if let Some(result) = measure::parse(span) { return Some((result, 90)); } + + // Aviation cardinal opt-in: priority 89, beats date/time. Same short-span + // gate as the regular cardinal fallback below. + if aviation && token_count <= 4 { + if let Some(result) = cardinal::parse_aviation(span) { + return Some((result, 89)); + } + } + if let Some(result) = date::parse(span) { return Some((result, 88)); } @@ -931,8 +945,9 @@ fn parse_span(span: &str) -> Option<(String, u8)> { return Some((result, 75)); } - // Cardinal only for short spans to avoid over-matching on natural language. - if token_count <= 4 { + // Default cardinal fallback (priority 70). In aviation mode the cardinal + // path is already covered by the priority-89 branch above. + if !aviation && token_count <= 4 { if let Some(result) = cardinal::parse(span) { return Some((result, 70)); } @@ -957,14 +972,10 @@ pub fn normalize_sentence(input: &str) -> String { normalize_sentence_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) } -/// Sentence-mode equivalent of [`normalize_aviation`]. -/// -/// Same scanning loop as [`normalize_sentence`], but each span is tried -/// against [`parse_span_aviation`] instead of [`parse_span`]. 
Aviation -/// cardinal sits at priority 89 (above `date`=88 and `time`=85, below -/// `measure`=90 and `money`=95), so flight-number-style spans win over -/// date/time interpretations while measure / money phrases keep their -/// existing semantics. +/// Sentence-mode equivalent of [`normalize_aviation`]. Aviation cardinal +/// runs at priority 89 (above `date`=88 / `time`=85, below `measure`=90 / +/// `money`=95), so flight-number-style spans win over date/time while +/// measure / money phrases keep their existing semantics. /// /// ``` /// use text_processing_rs::normalize_sentence_aviation; @@ -990,7 +1001,7 @@ pub fn normalize_sentence_aviation(input: &str) -> String { /// [`normalize_sentence_aviation`] with a configurable max span size. pub fn normalize_sentence_aviation_with_max_span(input: &str, max_span_tokens: usize) -> String { - normalize_sentence_inner(input, max_span_tokens, parse_span_aviation) + normalize_sentence_inner(input, max_span_tokens, true) } /// Normalize a full sentence with a configurable max span size. @@ -1007,18 +1018,12 @@ pub fn normalize_sentence_aviation_with_max_span(input: &str, max_span_tokens: u /// assert_eq!(normalize_sentence_with_max_span("I have twenty one apples", 4), "I have 21 apples"); /// ``` pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> String { - normalize_sentence_inner(input, max_span_tokens, parse_span) + normalize_sentence_inner(input, max_span_tokens, false) } -/// Sentence-mode dispatch loop, parameterized by which `parse_span` variant -/// scores each candidate. Used by both `normalize_sentence_with_max_span` -/// (default dispatch) and `normalize_sentence_aviation_with_max_span` -/// (aviation cardinal at priority 89). -fn normalize_sentence_inner( - input: &str, - max_span_tokens: usize, - parse_span_fn: fn(&str) -> Option<(String, u8)>, -) -> String { +/// Sentence-mode dispatch loop. 
The `aviation` flag is forwarded to +/// [`parse_span`] so each span sees the right tagger priorities. +fn normalize_sentence_inner(input: &str, max_span_tokens: usize, aviation: bool) -> String { let trimmed = input.trim(); if trimmed.is_empty() { return trimmed.to_string(); @@ -1040,7 +1045,7 @@ fn normalize_sentence_inner( // Longest-span-first search keeps replacements stable and non-overlapping. for end in (i + 1..=max_end).rev() { let span = tokens[i..end].join(" "); - let Some((candidate, score)) = parse_span_fn(&span) else { + let Some((candidate, score)) = parse_span(&span, aviation) else { continue; }; @@ -1078,58 +1083,6 @@ fn normalize_sentence_inner( out.join(" ") } -/// Sentence-mode dispatch with **aviation flight-number reading at priority -/// 89**, sitting above `date` (88) and `time` (85). Otherwise identical to -/// [`parse_span`]. -fn parse_span_aviation(span: &str) -> Option<(String, u8)> { - let token_count = span.split_whitespace().count(); - if token_count == 0 { - return None; - } - - if let Some(result) = custom_rules::parse(span) { - return Some((result, 110)); - } - if let Some(result) = whitelist::parse(span) { - return Some((result, 100)); - } - if let Some(result) = punctuation::parse(span) { - return Some((result, 98)); - } - if let Some(result) = money::parse(span) { - return Some((result, 95)); - } - if let Some(result) = measure::parse(span) { - return Some((result, 90)); - } - - // Aviation cardinal beats date/time. Same short-span gate as the regular - // cardinal path: avoids over-matching on natural language. 
- if token_count <= 4 { - if let Some(result) = cardinal::parse_aviation(span) { - return Some((result, 89)); - } - } - - if let Some(result) = date::parse(span) { - return Some((result, 88)); - } - if let Some(result) = time::parse(span) { - return Some((result, 85)); - } - if let Some(result) = electronic::parse(span) { - return Some((result, 82)); - } - if let Some(result) = decimal::parse(span) { - return Some((result, 80)); - } - if let Some(result) = ordinal::parse(span) { - return Some((result, 75)); - } - - None -} - // ── Text Normalization (written → spoken) ───────────────────────────── /// Normalize written-form text to spoken form (Text Normalization). From ef11df276a90624c7ca5251d37b80b67492b6dc2 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 26 Apr 2026 14:21:48 -0400 Subject: [PATCH 9/9] fix(en/cardinal,ffi,wasm): address Devin review on PR #20 - words_to_number / words_to_number_aviation: require words.len() >= 2 for digit-by-digit reading. Bare "oh" / "o" no longer resolve to 0 (those forms are only digits inside a longer spelled code; in isolation they are interjections / letters). Bare "zero" still works via the grammatical fallback. - parse_span: drop the <=4 token gate for the aviation cardinal branch. Aviation sentence mode is opt-in, so the caller has accepted aggressive matching across longer spans like "one thousand two hundred thirty four". parse_aviation falls back to grammatical, so non-aviation phrases still resolve. - ffi.rs / wasm.rs: expose the aviation pipeline (per AGENTS.md rule that lib / ffi / wasm be kept in sync). Adds nemo_normalize_aviation, nemo_normalize_sentence_aviation, nemo_normalize_sentence_aviation_with_max_span and the matching wasm-bindgen exports. 
--- src/ffi.rs | 110 ++++++++++++++++++++++++++++++++++++++++- src/itn/en/cardinal.rs | 30 +++++++++-- src/lib.rs | 9 ++-- src/wasm.rs | 18 ++++++- 4 files changed, 158 insertions(+), 9 deletions(-) diff --git a/src/ffi.rs b/src/ffi.rs index cf65abe..0086a6e 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -4,7 +4,8 @@ use std::ffi::{c_char, CStr, CString}; use std::ptr; use crate::{ - custom_rules, normalize, normalize_sentence, normalize_sentence_with_max_span, tn_normalize, + custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, + normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, tn_normalize, tn_normalize_lang, tn_normalize_sentence, tn_normalize_sentence_lang, tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, }; @@ -90,6 +91,89 @@ pub unsafe extern "C" fn nemo_normalize_sentence_with_max_span( } } +/// Aviation-flavoured single-input normalize. +/// +/// Layered on top of [`nemo_normalize`]: tries `cardinal::parse_aviation` +/// first so flight-number / call-sign phrases like `"seven eighty eight"` +/// resolve to `"788"`, then falls back to the regular dispatch. +/// +/// # Safety +/// - `input` must be a valid null-terminated UTF-8 string +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_normalize_aviation(input: *const c_char) -> *mut c_char { + if input.is_null() { + return ptr::null_mut(); + } + + let c_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = normalize_aviation(c_str); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + +/// Aviation-flavoured sentence normalize. +/// +/// Sentence-mode equivalent of [`nemo_normalize_aviation`]. 
Aviation cardinal +/// runs at priority 89 (above date / time, below money / measure), so +/// flight-number-style spans win without disturbing money / measure / decimal. +/// +/// # Safety +/// - `input` must be a valid null-terminated UTF-8 string +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_normalize_sentence_aviation(input: *const c_char) -> *mut c_char { + if input.is_null() { + return ptr::null_mut(); + } + + let c_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = normalize_sentence_aviation(c_str); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + +/// Aviation sentence normalize with a configurable max span size. +/// +/// # Safety +/// - `input` must be a valid null-terminated UTF-8 string +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_normalize_sentence_aviation_with_max_span( + input: *const c_char, + max_span_tokens: u32, +) -> *mut c_char { + if input.is_null() { + return ptr::null_mut(); + } + + let c_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = normalize_sentence_aviation_with_max_span(c_str, max_span_tokens as usize); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + /// Free a string allocated by nemo_normalize or nemo_normalize_sentence. 
/// /// # Safety @@ -372,4 +456,28 @@ mod tests { assert!(result.is_null()); } } + + #[test] + fn test_ffi_normalize_aviation() { + unsafe { + let input = CString::new("seven eighty eight").unwrap(); + let result = nemo_normalize_aviation(input.as_ptr()); + assert!(!result.is_null()); + let result_str = CStr::from_ptr(result).to_str().unwrap(); + assert_eq!(result_str, "788"); + nemo_free_string(result); + } + } + + #[test] + fn test_ffi_normalize_sentence_aviation() { + unsafe { + let input = CString::new("United seven eighty eight").unwrap(); + let result = nemo_normalize_sentence_aviation(input.as_ptr()); + assert!(!result.is_null()); + let result_str = CStr::from_ptr(result).to_str().unwrap(); + assert_eq!(result_str, "United 788"); + nemo_free_string(result); + } + } } diff --git a/src/itn/en/cardinal.rs b/src/itn/en/cardinal.rs index 079d1cc..71d4e14 100644 --- a/src/itn/en/cardinal.rs +++ b/src/itn/en/cardinal.rs @@ -180,8 +180,12 @@ pub fn words_to_number(input: &str) -> Option { return None; } - // Digit-by-digit reading wins whenever it's unambiguous. - if words.iter().all(|w| single_digit_char(w).is_some()) { + // Digit-by-digit reading wins whenever it's unambiguous, but only for + // multi-token inputs. Single-token "oh" / "o" must not read as 0 — those + // forms are only digits in the context of a longer code (e.g. "oh oh + // seven"). Single-token "zero" / "one" / ... fall through to grammatical + // and resolve correctly there. + if words.len() >= 2 && words.iter().all(|w| single_digit_char(w).is_some()) { return words .iter() .map(|w| single_digit_char(w).unwrap()) @@ -215,8 +219,9 @@ pub fn words_to_number_aviation(input: &str) -> Option { return None; } - // Digit-by-digit reading wins when unambiguous. - if words.iter().all(|w| single_digit_char(w).is_some()) { + // Digit-by-digit reading wins when unambiguous (multi-token only — see + // [`words_to_number`] for the rationale on rejecting bare "oh" / "o"). 
+ if words.len() >= 2 && words.iter().all(|w| single_digit_char(w).is_some()) { return words .iter() .map(|w| single_digit_char(w).unwrap()) @@ -417,6 +422,23 @@ mod tests { assert_eq!(parse("four o four"), Some("404".to_string())); } + /// Single-token "oh" / "o" must not be read as digit 0. Those forms + /// are only digits inside a longer spelled code; in isolation they are + /// interjections / letters. + #[test] + fn test_bare_oh_not_zero() { + assert_eq!(words_to_number("oh"), None); + assert_eq!(words_to_number("o"), None); + assert_eq!(words_to_number_aviation("oh"), None); + assert_eq!(words_to_number_aviation("o"), None); + assert_eq!(parse("oh"), None); + assert_eq!(parse_aviation("oh"), None); + // Sanity: bare "zero" still resolves (via grammatical), and the + // multi-token spelled forms still work. + assert_eq!(words_to_number("zero"), Some(0)); + assert_eq!(words_to_number("oh oh seven"), Some(7)); + } + #[test] fn test_words_to_number_digit_sequence() { assert_eq!(words_to_number("one three five"), Some(135)); diff --git a/src/lib.rs b/src/lib.rs index 9acac27..dbc0c46 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -921,9 +921,12 @@ fn parse_span(span: &str, aviation: bool) -> Option<(String, u8)> { return Some((result, 90)); } - // Aviation cardinal opt-in: priority 89, beats date/time. Same short-span - // gate as the regular cardinal fallback below. - if aviation && token_count <= 4 { + // Aviation cardinal opt-in: priority 89, beats date/time. No short-span + // gate — aviation mode is opt-in, so the caller has accepted aggressive + // matching across longer spans like "one thousand two hundred thirty + // four". `parse_aviation` falls back to grammatical when the digit-prefix + // pattern does not apply, so non-aviation phrases still resolve. 
+ if aviation { if let Some(result) = cardinal::parse_aviation(span) { return Some((result, 89)); } diff --git a/src/wasm.rs b/src/wasm.rs index 0ccfd04..76b54da 100644 --- a/src/wasm.rs +++ b/src/wasm.rs @@ -3,7 +3,8 @@ use wasm_bindgen::prelude::*; use crate::{ - custom_rules, normalize, normalize_sentence, normalize_sentence_with_max_span, + custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, + normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, normalize_with_lang, tn_normalize, tn_normalize_lang, tn_normalize_sentence, tn_normalize_sentence_lang, tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, @@ -35,6 +36,21 @@ pub fn normalize_sentence_with_max_span_js(input: &str, max_span_tokens: u32) -> normalize_sentence_with_max_span(input, max_span_tokens as usize) } +#[wasm_bindgen(js_name = normalizeAviation)] +pub fn normalize_aviation_js(input: &str) -> String { + normalize_aviation(input) +} + +#[wasm_bindgen(js_name = normalizeSentenceAviation)] +pub fn normalize_sentence_aviation_js(input: &str) -> String { + normalize_sentence_aviation(input) +} + +#[wasm_bindgen(js_name = normalizeSentenceAviationWithMaxSpan)] +pub fn normalize_sentence_aviation_with_max_span_js(input: &str, max_span_tokens: u32) -> String { + normalize_sentence_aviation_with_max_span(input, max_span_tokens as usize) +} + #[wasm_bindgen(js_name = tnNormalize)] pub fn tn_normalize_js(input: &str) -> String { tn_normalize(input)