diff --git a/src/ffi.rs b/src/ffi.rs index cf65abe..0086a6e 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -4,7 +4,8 @@ use std::ffi::{c_char, CStr, CString}; use std::ptr; use crate::{ - custom_rules, normalize, normalize_sentence, normalize_sentence_with_max_span, tn_normalize, + custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, + normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, tn_normalize, tn_normalize_lang, tn_normalize_sentence, tn_normalize_sentence_lang, tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, }; @@ -90,6 +91,89 @@ pub unsafe extern "C" fn nemo_normalize_sentence_with_max_span( } } +/// Aviation-flavoured single-input normalize. +/// +/// Layered on top of [`nemo_normalize`]: after the high-confidence taggers +/// (custom rules, whitelist, punctuation, word), `cardinal::parse_aviation` is tried so that phrases like `"seven eighty eight"` +/// resolve to `"788"`; anything it rejects falls back to the regular dispatch. +/// +/// # Safety +/// - `input` must be a valid null-terminated UTF-8 string +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_normalize_aviation(input: *const c_char) -> *mut c_char { + if input.is_null() { + return ptr::null_mut(); + } + + let c_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = normalize_aviation(c_str); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + +/// Aviation-flavoured sentence normalize. +/// +/// Sentence-mode equivalent of [`nemo_normalize_aviation`]. Aviation cardinal +/// runs at priority 89 (above date / time, below money / measure), so +/// flight-number-style spans win without disturbing money / measure / decimal.
+/// +/// # Safety +/// - `input` must be a valid null-terminated UTF-8 string +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_normalize_sentence_aviation(input: *const c_char) -> *mut c_char { + if input.is_null() { + return ptr::null_mut(); + } + + let c_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = normalize_sentence_aviation(c_str); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + +/// Aviation sentence normalize with a configurable max span size. +/// +/// # Safety +/// - `input` must be a valid null-terminated UTF-8 string +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_normalize_sentence_aviation_with_max_span( + input: *const c_char, + max_span_tokens: u32, +) -> *mut c_char { + if input.is_null() { + return ptr::null_mut(); + } + + let c_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = normalize_sentence_aviation_with_max_span(c_str, max_span_tokens as usize); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + /// Free a string allocated by nemo_normalize or nemo_normalize_sentence. 
/// /// # Safety @@ -372,4 +456,28 @@ mod tests { assert!(result.is_null()); } } + + #[test] + fn test_ffi_normalize_aviation() { + unsafe { + let input = CString::new("seven eighty eight").unwrap(); + let result = nemo_normalize_aviation(input.as_ptr()); + assert!(!result.is_null()); + let result_str = CStr::from_ptr(result).to_str().unwrap(); + assert_eq!(result_str, "788"); + nemo_free_string(result); + } + } + + #[test] + fn test_ffi_normalize_sentence_aviation() { + unsafe { + let input = CString::new("United seven eighty eight").unwrap(); + let result = nemo_normalize_sentence_aviation(input.as_ptr()); + assert!(!result.is_null()); + let result_str = CStr::from_ptr(result).to_str().unwrap(); + assert_eq!(result_str, "United 788"); + nemo_free_string(result); + } + } } diff --git a/src/itn/en/cardinal.rs b/src/itn/en/cardinal.rs index 497e662..71d4e14 100644 --- a/src/itn/en/cardinal.rs +++ b/src/itn/en/cardinal.rs @@ -73,6 +73,22 @@ lazy_static! { /// /// Returns None if the input cannot be parsed as a number. pub fn parse(input: &str) -> Option { + parse_with_reading(input, words_to_number) +} + +/// Aviation / flight-number / call-sign reading variant of [`parse`]. +/// +/// Recognises digit-prefix + grammatical-compound phrases like +/// `"seven eighty eight"` → `"788"`. Use this from contexts where flight +/// numbers or call signs are expected. Generic dispatch keeps using +/// [`parse`] to avoid clobbering date/time semantics. +pub fn parse_aviation(input: &str) -> Option { + parse_with_reading(input, words_to_number_aviation) +} + +/// Shared body of [`parse`] and [`parse_aviation`]. The only thing that +/// differs is which words-to-number reading is applied to the cleaned input. 
+fn parse_with_reading(input: &str, reader: fn(&str) -> Option) -> Option { let original = input.trim(); let input = original.to_lowercase(); let input = input.as_str(); @@ -107,7 +123,7 @@ pub fn parse(input: &str) -> Option { (false, input) }; - let num = words_to_number(rest)?; + let num = reader(rest)?; if is_negative { Some(format!("-{}", num)) @@ -116,18 +132,43 @@ pub fn parse(input: &str) -> Option { } } +/// Map a single-digit spoken word to its character form, or `None` if the +/// word isn't a 0-9 digit word. Recognises "oh" / "o" as 0 (common in +/// spelled-out codes and aviation frequencies). +fn single_digit_char(word: &str) -> Option { + Some(match word { + "zero" | "oh" | "o" => '0', + "one" => '1', + "two" => '2', + "three" => '3', + "four" => '4', + "five" => '5', + "six" => '6', + "seven" => '7', + "eight" => '8', + "nine" => '9', + _ => return None, + }) +} + /// Convert spoken number words to integer. /// -/// Algorithm: -/// 1. Tokenize input -/// 2. Process left-to-right, accumulating values -/// 3. Scale words (hundred, thousand, million) multiply the current accumulator -/// 4. Handle "and" as a separator (ignored) +/// Two readings are accepted, in order: +/// - **Digit-by-digit** (codes, flight numbers, aviation frequencies): +/// `"one three five"` → `135`. Triggered when the input has at least two tokens and every token is a single-digit +/// word (`zero`-`nine`, plus `oh`/`o` for `0`); the result is an integer, so leading zeros are dropped (`"oh oh seven"` → `7`). +/// - **Grammatical** (English number grammar): `"twenty one"` → `21`, +/// `"one hundred twenty three"` → `123`, `"one thousand two hundred thirty +/// four"` → `1234`. Uses a left-to-right accumulator with scale words +/// multiplying the current group. +/// +/// Filler words `"and"` and `"a"` are stripped.
/// -/// Examples: -/// - "twenty one" → 20 + 1 = 21 -/// - "one hundred twenty three" → (1 * 100) + 20 + 3 = 123 -/// - "one thousand two hundred thirty four" → (1 * 1000) + (2 * 100) + 30 + 4 = 1234 +/// Note: aviation flight-number reading (`"seven eighty eight"` → `788`) is +/// **not** applied here because it conflicts with date and time taggers (e.g. +/// `"twenty one forty two"` must remain readable as old-year `2042` for +/// `date::parse_old_year`). Use [`words_to_number_aviation`] for opt-in +/// flight-number / call-sign contexts. pub fn words_to_number(input: &str) -> Option { let input = input.to_lowercase(); let words: Vec<&str> = input @@ -139,10 +180,86 @@ pub fn words_to_number(input: &str) -> Option { return None; } - // Handle special case: "eleven hundred" = 1100 + // Digit-by-digit reading wins whenever it's unambiguous, but only for + // multi-token inputs. Single-token "oh" / "o" must not read as 0 — those + // forms are only digits in the context of a longer code (e.g. "oh oh + // seven"). Single-token "zero" / "one" / ... fall through to grammatical + // and resolve correctly there. + if words.len() >= 2 && words.iter().all(|w| single_digit_char(w).is_some()) { + return words + .iter() + .map(|w| single_digit_char(w).unwrap()) + .collect::() + .parse() + .ok(); + } + + grammatical_words_to_number(&words) +} + +/// Aviation / flight-number / call-sign reading of a number phrase. +/// +/// Recognises a leading run of single-digit words concatenated with a trailing +/// grammatical compound, e.g. `"seven eighty eight"` → `788`, +/// `"two thirty five"` → `235`. Falls back to [`words_to_number`] when the +/// aviation pattern does not apply (no digit prefix, scale word present, etc.). +/// +/// This is **opt-in**: callers reach for it explicitly from flight-number / +/// call-sign contexts. Generic ITN/TN dispatch keeps using [`words_to_number`] +/// to avoid clobbering date/time/measure semantics (e.g. 
`"twenty one forty +/// two"` as old-year `2042`). +pub fn words_to_number_aviation(input: &str) -> Option { + let input = input.to_lowercase(); + let words: Vec<&str> = input + .split_whitespace() + .filter(|w| *w != "and" && *w != "a") + .collect(); + + if words.is_empty() { + return None; + } + + // Digit-by-digit reading wins when unambiguous (multi-token only — see + // [`words_to_number`] for the rationale on rejecting bare "oh" / "o"). + if words.len() >= 2 && words.iter().all(|w| single_digit_char(w).is_some()) { + return words + .iter() + .map(|w| single_digit_char(w).unwrap()) + .collect::() + .parse() + .ok(); + } + + // Aviation flight-number style: digit prefix + grammatical compound. + // "seven eighty eight" → "7" ‖ 88 = 788. Skipped if a scale word appears, + // since "two thousand seventeen" must stay grammatical (= 2017, not 22017). + let has_scale = words.iter().any(|w| SCALES.contains_key(*w)); + if !has_scale { + let prefix_len = words + .iter() + .take_while(|w| single_digit_char(w).is_some()) + .count(); + if prefix_len >= 1 && prefix_len < words.len() { + if let Some(rest_num) = grammatical_words_to_number(&words[prefix_len..]) { + let prefix: String = words[..prefix_len] + .iter() + .map(|w| single_digit_char(w).unwrap()) + .collect(); + let combined = format!("{}{}", prefix, rest_num); + return combined.parse::().ok(); + } + } + } + + grammatical_words_to_number(&words) +} + +/// Parse a grammatical English number with running-sum + scale multiplication. 
+fn grammatical_words_to_number(words: &[&str]) -> Option { + // "eleven hundred" = 1100, "twenty hundred" = 2000 if words.len() == 2 && words[1] == "hundred" { if let Some(&val) = ONES.get(words[0]) { - if val >= 11 && val <= 19 { + if (11..=19).contains(&val) { return Some((val * 100) as i128); } } @@ -151,15 +268,11 @@ pub fn words_to_number(input: &str) -> Option { } } - // Handle "eleven hundred twenty one" pattern + // "eleven hundred twenty one" = 1100 + 21 if words.len() >= 2 && words[1] == "hundred" { if let Some(&first_val) = ONES.get(words[0]) { - if first_val >= 11 && first_val <= 99 { + if (11..=99).contains(&first_val) { let base = (first_val * 100) as i128; - if words.len() == 2 { - return Some(base); - } - // Parse remaining words let rest = words[2..].join(" "); if let Some(remainder) = words_to_number(&rest) { return Some(base + remainder); @@ -168,9 +281,6 @@ pub fn words_to_number(input: &str) -> Option { } if let Some(&first_val) = TENS.get(words[0]) { let base = (first_val * 100) as i128; - if words.len() == 2 { - return Some(base); - } let rest = words[2..].join(" "); if let Some(remainder) = words_to_number(&rest) { return Some(base + remainder); @@ -182,7 +292,7 @@ pub fn words_to_number(input: &str) -> Option { let mut current: i128 = 0; let mut found_number = false; - for word in words { + for &word in words { if let Some(&val) = ONES.get(word) { current += val as i128; found_number = true; @@ -206,7 +316,6 @@ pub fn words_to_number(input: &str) -> Option { found_number = true; } } else { - // Unknown word - not a valid number return None; } } @@ -298,4 +407,91 @@ mod tests { assert_eq!(parse("hello"), None); assert_eq!(parse("one hello"), None); } + + /// Digit-by-digit reading (issue #15). Sequences of single-digit words + /// like "one three five" should concatenate to 135, not sum to 9. 
+ #[test] + fn test_spelled_digit_sequence() { + assert_eq!(parse("one three five"), Some("135".to_string())); + assert_eq!(parse("seven three seven"), Some("737".to_string())); + assert_eq!(parse("nine one one"), Some("911".to_string())); + assert_eq!(parse("six two five"), Some("625".to_string())); + assert_eq!(parse("one two"), Some("12".to_string())); + // "oh"/"o" read as 0 in spelled codes + assert_eq!(parse("five oh five"), Some("505".to_string())); + assert_eq!(parse("four o four"), Some("404".to_string())); + } + + /// Single-token "oh" / "o" must not be read as digit 0. Those forms + /// are only digits inside a longer spelled code; in isolation they are + /// interjections / letters. + #[test] + fn test_bare_oh_not_zero() { + assert_eq!(words_to_number("oh"), None); + assert_eq!(words_to_number("o"), None); + assert_eq!(words_to_number_aviation("oh"), None); + assert_eq!(words_to_number_aviation("o"), None); + assert_eq!(parse("oh"), None); + assert_eq!(parse_aviation("oh"), None); + // Sanity: bare "zero" still resolves (via grammatical), and the + // multi-token spelled forms still work. + assert_eq!(words_to_number("zero"), Some(0)); + assert_eq!(words_to_number("oh oh seven"), Some(7)); + } + + #[test] + fn test_words_to_number_digit_sequence() { + assert_eq!(words_to_number("one three five"), Some(135)); + assert_eq!(words_to_number("six two five"), Some(625)); + } + + /// Aviation flight-number style (issue #14): opt-in helper. A leading run + /// of single-digit words gets concatenated with the trailing grammatical + /// compound, e.g. "seven eighty eight" = "7" ‖ 88 = 788. Generic + /// `words_to_number` deliberately does *not* do this — it would break + /// `date::parse_old_year` ("twenty one forty two" → 2042) and overlap with + /// the time tagger ("two thirty five" → 02:35). 
+ #[test] + fn test_words_to_number_aviation_flight_number() { + assert_eq!(words_to_number_aviation("seven eighty eight"), Some(788)); + assert_eq!(words_to_number_aviation("two thirty five"), Some(235)); + assert_eq!(words_to_number_aviation("three forty seven"), Some(347)); + assert_eq!(words_to_number_aviation("nine eleven"), Some(911)); + // Multi-digit prefix. + assert_eq!( + words_to_number_aviation("two seven eighty eight"), + Some(2788) + ); + } + + /// Aviation helper falls back to grammatical when no digit prefix exists. + #[test] + fn test_words_to_number_aviation_falls_back_to_grammatical() { + assert_eq!(words_to_number_aviation("twenty one"), Some(21)); + assert_eq!(words_to_number_aviation("one hundred"), Some(100)); + } + + /// Aviation helper must keep grammatical reading when a scale word is + /// present. "two thousand seventeen" must stay 2017, not 22017. + #[test] + fn test_words_to_number_aviation_scale_word_forces_grammatical() { + assert_eq!( + words_to_number_aviation("two thousand seventeen"), + Some(2017) + ); + assert_eq!( + words_to_number_aviation("two million three"), + Some(2_000_003) + ); + } + + /// Generic `words_to_number` (the dispatch path) must NOT do aviation + /// reading: "seven eighty eight" stays grammatical 95 there, so date/time + /// taggers see consistent values. + #[test] + fn test_words_to_number_no_aviation_reading() { + assert_eq!(words_to_number("seven eighty eight"), Some(95)); + assert_eq!(words_to_number("twenty one forty two"), Some(63)); + assert_eq!(words_to_number("two thousand seventeen"), Some(2017)); + } } diff --git a/src/itn/en/decimal.rs b/src/itn/en/decimal.rs index 4e3e235..a7bff1e 100644 --- a/src/itn/en/decimal.rs +++ b/src/itn/en/decimal.rs @@ -177,4 +177,20 @@ mod tests { Some("4.85 billion".to_string()) ); } + + /// Digit-by-digit reading of the integer part (issue #15). + /// Aviation-style frequencies like "one three five point six two five" + /// should produce 135.625, not 9.625. 
+ #[test] + fn test_spelled_digit_integer_part() { + assert_eq!( + parse("one three five point six two five"), + Some("135.625".to_string()) + ); + assert_eq!( + parse("seven three seven point five"), + Some("737.5".to_string()) + ); + assert_eq!(parse("one two point three four"), Some("12.34".to_string())); + } } diff --git a/src/itn/en/telephone.rs b/src/itn/en/telephone.rs index 59a3c70..45ece4a 100644 --- a/src/itn/en/telephone.rs +++ b/src/itn/en/telephone.rs @@ -17,6 +17,13 @@ pub fn parse(input: &str) -> Option { return None; } + // Reject inputs that contain a decimal "point" — those belong to the + // decimal tagger (e.g. "one three five point six two five" should be + // "135.625", not "135-625"). See issue #15. + if input_trimmed.contains(" point ") { + return None; + } + // Try IP address pattern first (contains "dot") if input_trimmed.contains(" dot ") { return parse_ip_address(input_trimmed); @@ -719,4 +726,12 @@ mod tests { Some("123.123.0.40".to_string()) ); } + + /// Telephone tagger must not consume decimal expressions (issue #15): + /// "one three five point six two five" is "135.625", not a phone number. + #[test] + fn test_rejects_decimal_point() { + assert_eq!(parse("one three five point six two five"), None); + assert_eq!(parse("seven three seven point five"), None); + } } diff --git a/src/lib.rs b/src/lib.rs index 5ad114d..dbc0c46 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -111,6 +111,57 @@ pub fn normalize(input: &str) -> String { input.to_string() } +/// Normalize a single input with **aviation flight-number reading +/// prioritized**. +/// +/// Same dispatch as [`normalize`], with one twist: `cardinal::parse_aviation` +/// is tried *after* the high-confidence taggers (`custom_rules`, `whitelist`, +/// `punctuation`, `word`) but *before* `time` and `date`. 
The result: when +/// the whole input is a number-only phrase like `"two thirty five"` or +/// `"seven eighty eight"`, the aviation reading wins (`"235"`, `"788"`) +/// instead of being eaten as a time (`"02:35"`) or as an old-year via the +/// date tagger. +/// +/// Use this from flight-number / call-sign / aviation-radio contexts. Phrases +/// that aren't pure number words still flow through the rest of the +/// pipeline normally (`"five dollars"` → `"$5"` via the money tagger). +/// +/// ``` +/// use text_processing_rs::normalize_aviation; +/// +/// assert_eq!(normalize_aviation("seven eighty eight"), "788"); +/// assert_eq!(normalize_aviation("two thirty five"), "235"); +/// // Non-number phrases are unaffected. +/// assert_eq!(normalize_aviation("hello world"), "hello world"); +/// ``` +pub fn normalize_aviation(input: &str) -> String { + let input = input.trim(); + + // High-confidence rules still win. + if let Some(result) = custom_rules::parse(input) { + return result; + } + if let Some(result) = whitelist::parse(input) { + return result; + } + if let Some(result) = punctuation::parse(input) { + return result; + } + if let Some(result) = word::parse(input) { + return result; + } + + // Aviation cardinal beats time/date here. This is the whole point of + // calling `normalize_aviation` instead of `normalize`. + if let Some(num) = cardinal::parse_aviation(input) { + return num; + } + + // Fall back to the standard pipeline for anything aviation cardinal + // didn't recognise (money, measure, decimal, ordinal, telephone, etc.). + normalize(input) +} + /// Normalize with language selection. /// /// Supports language-specific ITN taggers for converting spoken-form @@ -843,7 +894,12 @@ const DEFAULT_MAX_SPAN_TOKENS: usize = 16; /// broad patterns (cardinal) last and limited to short spans. /// /// Excluded in sentence mode: `word` and `telephone` (over-fire on natural language). 
-fn parse_span(span: &str) -> Option<(String, u8)> { +/// +/// `aviation`: when `true`, `cardinal::parse_aviation` is tried at priority 89 +/// (above `date`=88 and `time`=85, below `measure`=90 / `money`=95) and the +/// regular cardinal fallback at 70 is skipped (the aviation reader already +/// falls back to grammatical when no digit prefix is present). +fn parse_span(span: &str, aviation: bool) -> Option<(String, u8)> { let token_count = span.split_whitespace().count(); if token_count == 0 { return None; @@ -864,6 +920,18 @@ fn parse_span(span: &str) -> Option<(String, u8)> { if let Some(result) = measure::parse(span) { return Some((result, 90)); } + + // Aviation cardinal opt-in: priority 89, beats date/time. No short-span + // gate — aviation mode is opt-in, so the caller has accepted aggressive + // matching across longer spans like "one thousand two hundred thirty + // four". `parse_aviation` falls back to grammatical when the digit-prefix + // pattern does not apply, so non-aviation phrases still resolve. + if aviation { + if let Some(result) = cardinal::parse_aviation(span) { + return Some((result, 89)); + } + } + if let Some(result) = date::parse(span) { return Some((result, 88)); } @@ -880,8 +948,9 @@ fn parse_span(span: &str) -> Option<(String, u8)> { return Some((result, 75)); } - // Cardinal only for short spans to avoid over-matching on natural language. - if token_count <= 4 { + // Default cardinal fallback (priority 70). In aviation mode the cardinal + // path is already covered by the priority-89 branch above. + if !aviation && token_count <= 4 { if let Some(result) = cardinal::parse(span) { return Some((result, 70)); } @@ -906,6 +975,38 @@ pub fn normalize_sentence(input: &str) -> String { normalize_sentence_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) } +/// Sentence-mode equivalent of [`normalize_aviation`]. 
Aviation cardinal +/// runs at priority 89 (above `date`=88 / `time`=85, below `measure`=90 / +/// `money`=95), so flight-number-style spans win over date/time while +/// measure / money phrases keep their existing semantics. +/// +/// ``` +/// use text_processing_rs::normalize_sentence_aviation; +/// +/// // Aviation cardinal beats time/date for pure-number spans. +/// assert_eq!( +/// normalize_sentence_aviation("United seven eighty eight"), +/// "United 788" +/// ); +/// assert_eq!( +/// normalize_sentence_aviation("flight two thirty five departs at gate four"), +/// "flight 235 departs at gate 4" +/// ); +/// // Non-aviation spans flow through normally. +/// assert_eq!( +/// normalize_sentence_aviation("I have twenty one apples"), +/// "I have 21 apples" +/// ); +/// ``` +pub fn normalize_sentence_aviation(input: &str) -> String { + normalize_sentence_aviation_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) +} + +/// [`normalize_sentence_aviation`] with a configurable max span size. +pub fn normalize_sentence_aviation_with_max_span(input: &str, max_span_tokens: usize) -> String { + normalize_sentence_inner(input, max_span_tokens, true) +} + /// Normalize a full sentence with a configurable max span size. /// /// `max_span_tokens` controls the maximum number of consecutive tokens @@ -920,6 +1021,12 @@ pub fn normalize_sentence(input: &str) -> String { /// assert_eq!(normalize_sentence_with_max_span("I have twenty one apples", 4), "I have 21 apples"); /// ``` pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> String { + normalize_sentence_inner(input, max_span_tokens, false) +} + +/// Sentence-mode dispatch loop. The `aviation` flag is forwarded to +/// [`parse_span`] so each span sees the right tagger priorities. 
+fn normalize_sentence_inner(input: &str, max_span_tokens: usize, aviation: bool) -> String { let trimmed = input.trim(); if trimmed.is_empty() { return trimmed.to_string(); @@ -941,7 +1048,7 @@ pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> // Longest-span-first search keeps replacements stable and non-overlapping. for end in (i + 1..=max_end).rev() { let span = tokens[i..end].join(" "); - let Some((candidate, score)) = parse_span(&span) else { + let Some((candidate, score)) = parse_span(&span, aviation) else { continue; }; diff --git a/src/wasm.rs b/src/wasm.rs index 0ccfd04..76b54da 100644 --- a/src/wasm.rs +++ b/src/wasm.rs @@ -3,7 +3,8 @@ use wasm_bindgen::prelude::*; use crate::{ - custom_rules, normalize, normalize_sentence, normalize_sentence_with_max_span, + custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, + normalize_sentence_aviation_with_max_span, normalize_sentence_with_max_span, normalize_with_lang, tn_normalize, tn_normalize_lang, tn_normalize_sentence, tn_normalize_sentence_lang, tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, @@ -35,6 +36,21 @@ pub fn normalize_sentence_with_max_span_js(input: &str, max_span_tokens: u32) -> normalize_sentence_with_max_span(input, max_span_tokens as usize) } +#[wasm_bindgen(js_name = normalizeAviation)] +pub fn normalize_aviation_js(input: &str) -> String { + normalize_aviation(input) +} + +#[wasm_bindgen(js_name = normalizeSentenceAviation)] +pub fn normalize_sentence_aviation_js(input: &str) -> String { + normalize_sentence_aviation(input) +} + +#[wasm_bindgen(js_name = normalizeSentenceAviationWithMaxSpan)] +pub fn normalize_sentence_aviation_with_max_span_js(input: &str, max_span_tokens: u32) -> String { + normalize_sentence_aviation_with_max_span(input, max_span_tokens as usize) +} + #[wasm_bindgen(js_name = tnNormalize)] pub fn tn_normalize_js(input: &str) -> String { tn_normalize(input) diff --git 
a/tests/en_tests.rs b/tests/en_tests.rs index 81fed02..414144b 100644 --- a/tests/en_tests.rs +++ b/tests/en_tests.rs @@ -7,7 +7,8 @@ mod common; use std::path::Path; use text_processing_rs::{ - custom_rules, normalize, normalize_sentence, normalize_sentence_with_max_span, + custom_rules, normalize, normalize_aviation, normalize_sentence, normalize_sentence_aviation, + normalize_sentence_with_max_span, }; fn print_failures(results: &common::TestResults) { @@ -840,3 +841,136 @@ fn test_sentence_decimal_in_context() { "the value is 3.14" ); } + +// --- Issue #15: digit-by-digit integer part for decimals (aviation style) --- + +/// Direct reproductions of https://github.com/FluidInference/text-processing-rs/issues/15 +#[test] +fn test_issue_15_normalize_aviation_frequency() { + // The whole input contains a non-number prefix ("frequency"), so single- + // expression mode can't return a clean number — it should leave the input + // unchanged rather than producing the previously-buggy "135-625" telephone + // formatting. + let out = normalize("frequency one three five point six two five"); + assert_ne!( + out, "135-625", + "telephone tagger should not match decimal input" + ); + assert_eq!(out, "frequency one three five point six two five"); +} + +#[test] +fn test_issue_15_normalize_sentence_aviation_frequency() { + assert_eq!( + normalize_sentence("frequency one three five point six two five"), + "frequency 135.625" + ); +} + +#[test] +fn test_issue_15_decimal_with_spelled_digit_integer() { + // Without the prefix, the whole input is a single decimal expression. + assert_eq!(normalize("one three five point six two five"), "135.625"); +} + +#[test] +fn test_issue_15_decimal_with_spelled_digit_integer_in_sentence() { + assert_eq!( + normalize_sentence("the tower said one three five point six two five"), + "the tower said 135.625" + ); +} + +#[test] +fn test_spelled_digit_cardinal() { + // Digit-by-digit reading of cardinals (codes, flight numbers, frequencies). 
+ // Note: sequences that match clock-time patterns ("five oh five") are + // intentionally handled by the time tagger and are not asserted here. + assert_eq!(normalize("one three five"), "135"); + assert_eq!(normalize("seven three seven"), "737"); + assert_eq!(normalize("nine one one"), "911"); +} + +#[test] +fn test_spelled_digit_cardinal_does_not_break_normal_cardinals() { + // Existing cardinal phrasings must still work + assert_eq!(normalize("twenty one"), "21"); + assert_eq!(normalize("one hundred thirty five"), "135"); + assert_eq!(normalize("one thousand two hundred thirty four"), "1234"); +} + +/// Issue #14: aviation flight-number reading is exposed as an **opt-in** +/// pipeline. Generic dispatch keeps upstream NeMo semantics (date wins for +/// `"twenty one forty two"`, time wins for `"two thirty five"`); callers +/// who know they're in aviation context reach for the `*_aviation` +/// variants, which run aviation cardinal at priority 89 (above date 88 +/// and time 85). +#[test] +fn test_issue_14_default_dispatch_unchanged() { + // Scale-word grammar is preserved everywhere. + assert_eq!(normalize("two thousand seventeen"), "2017"); + assert_eq!(normalize("one hundred"), "100"); + assert_eq!(normalize_sentence("two thousand seventeen"), "2017"); + + // Sentence-mode default dispatch: "seven eighty eight" stays grammatical + // 95, NOT aviation 788. + assert_eq!(normalize_sentence("seven eighty eight"), "95"); + + // Single-input `normalize` keeps its existing telephone-tagger + // behaviour for whole-input flight-number-style phrases. + assert_eq!(normalize("seven eighty eight"), "788"); +} + +/// Aviation pipeline `normalize_aviation` (single-input). Aviation cardinal +/// runs early enough to beat time/date. +#[test] +fn test_issue_14_normalize_aviation() { + assert_eq!(normalize_aviation("seven eighty eight"), "788"); + // Beats time tagger. + assert_eq!(normalize_aviation("two thirty five"), "235"); + // Beats date old-year reading. 
+ assert_eq!(normalize_aviation("twenty one forty two"), "63"); + // Non-number phrases fall through unchanged. + assert_eq!(normalize_aviation("hello world"), "hello world"); + // Money / measure / decimal / ordinal still work via fallback to + // standard `normalize`. + assert_eq!(normalize_aviation("five dollars"), "$5"); + assert_eq!(normalize_aviation("five point two"), "5.2"); + assert_eq!(normalize_aviation("twenty first"), "21st"); + // Scale-word grammar still wins (a scale word disables the digit-prefix reading). + assert_eq!(normalize_aviation("two thousand seventeen"), "2017"); +} + +/// Aviation pipeline `normalize_sentence_aviation` (sentence mode). The +/// cardinal-aviation priority bump (89) makes flight-number spans win +/// over date/time in real sentences. +#[test] +fn test_issue_14_normalize_sentence_aviation() { + // The original bug from issue #14. + assert_eq!( + normalize_sentence_aviation("United seven eighty eight"), + "United 788" + ); + assert_eq!( + normalize_sentence_aviation("flight two thirty five departs at gate four"), + "flight 235 departs at gate 4" + ); + + // Scale-word grammar is preserved. + assert_eq!( + normalize_sentence_aviation("two thousand seventeen"), + "2017" + ); + + // Money / measure stay above aviation (priority 95 / 90 > 89). + assert_eq!( + normalize_sentence_aviation("I owe five dollars"), + "I owe $5" + ); + + // Ordinary cardinals inside natural language still normalize; the other words are untouched. + assert_eq!( + normalize_sentence_aviation("I have twenty one apples"), + "I have 21 apples" + ); +} diff --git a/tests/en_tn_tests.rs b/tests/en_tn_tests.rs index f96c9f7..c60ee8b 100644 --- a/tests/en_tn_tests.rs +++ b/tests/en_tn_tests.rs @@ -212,3 +212,21 @@ fn test_tn_roundtrip_cardinal() { ); } } + +/// Release-sanity check for issue #16: the public surface that hongbo-miao +/// reported missing in crates.io 0.1.0 must compile and behave correctly +/// against the current version.
If this test stops compiling, the API +/// surface regressed and consumers will hit `unresolved import`. +#[test] +fn test_issue_16_tn_normalize_sentence_public_api() { + // Reporter's exact snippet (https://github.com/FluidInference/text-processing-rs/issues/16) + let normalized_text = tn_normalize_sentence("I have twenty one apples."); + // The function must exist, be callable, and return a non-empty result. + // Behavioural assertion: the cardinal "21" is the only normalizable + // span here, and TN should leave the rest of the sentence intact. + assert!( + normalized_text.contains("twenty one") || normalized_text.contains("21"), + "tn_normalize_sentence dropped or mangled the input: {}", + normalized_text + ); +}