Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 46 additions & 5 deletions src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,22 @@ use crate::{
///
/// `max_span_tokens`: `0` means "use library default" (16); any positive
/// value is a caller-specified max span.
fn options_from_ffi(concat_compound_numbers: u32, max_span_tokens: u32) -> NormalizeOptions {
///
/// `disable_bare_second`: any non-zero value blocks the bare word
/// `"second"` from converting to `"2nd"` (issue #22).
fn options_from_ffi(
    concat_compound_numbers: u32,
    max_span_tokens: u32,
    disable_bare_second: u32,
) -> NormalizeOptions {
    // The C side passes plain integers for the boolean switches: zero is
    // "off", every other value is "on".
    let concat = concat_compound_numbers != 0;
    let block_bare_second = disable_bare_second != 0;

    // A span cap of zero is the sentinel for "no explicit cap"; it becomes
    // `None` so the library default applies downstream.
    let span_cap = match max_span_tokens {
        0 => None,
        explicit => Some(explicit as usize),
    };

    NormalizeOptions {
        concat_compound_numbers: concat,
        max_span_tokens: span_cap,
        disable_bare_second: block_bare_second,
    }
}

Expand Down Expand Up @@ -86,13 +94,17 @@ pub unsafe extern "C" fn nemo_normalize_sentence(input: *const c_char) -> *mut c
/// concatenate rather than add — e.g. `"thirty five sixty two"` → `"3562"`,
/// `"seven eighty eight"` → `"788"`.
///
/// `disable_bare_second`: `0` keeps the default behavior, non-zero blocks
/// the bare word `"second"` from being rewritten to `"2nd"` (issue #22).
///
/// # Safety
/// - `input` must be a valid null-terminated UTF-8 string
/// - Returns a newly allocated string that must be freed with `nemo_free_string`
#[no_mangle]
pub unsafe extern "C" fn nemo_normalize_with_options(
input: *const c_char,
concat_compound_numbers: u32,
disable_bare_second: u32,
) -> *mut c_char {
if input.is_null() {
return ptr::null_mut();
Expand All @@ -103,7 +115,7 @@ pub unsafe extern "C" fn nemo_normalize_with_options(
Err(_) => return ptr::null_mut(),
};

let options = options_from_ffi(concat_compound_numbers, 0);
let options = options_from_ffi(concat_compound_numbers, 0, disable_bare_second);
let result = normalize_with_options(c_str, options);

match CString::new(result) {
Expand All @@ -121,6 +133,9 @@ pub unsafe extern "C" fn nemo_normalize_with_options(
/// - `0` — use library default (`16`).
/// - `>0` — use the specified max span.
///
/// `disable_bare_second`: `0` keeps the default behavior, non-zero blocks
/// the bare word `"second"` from being rewritten to `"2nd"` (issue #22).
///
/// # Safety
/// - `input` must be a valid null-terminated UTF-8 string
/// - Returns a newly allocated string that must be freed with `nemo_free_string`
Expand All @@ -129,6 +144,7 @@ pub unsafe extern "C" fn nemo_normalize_sentence_with_options(
input: *const c_char,
concat_compound_numbers: u32,
max_span_tokens: u32,
disable_bare_second: u32,
) -> *mut c_char {
if input.is_null() {
return ptr::null_mut();
Expand All @@ -139,7 +155,11 @@ pub unsafe extern "C" fn nemo_normalize_sentence_with_options(
Err(_) => return ptr::null_mut(),
};

let options = options_from_ffi(concat_compound_numbers, max_span_tokens);
let options = options_from_ffi(
concat_compound_numbers,
max_span_tokens,
disable_bare_second,
);
let result = normalize_sentence_with_options(c_str, options);

match CString::new(result) {
Expand Down Expand Up @@ -435,7 +455,7 @@ mod tests {
fn test_ffi_normalize_with_options_concat_compound() {
unsafe {
let input = CString::new("seven eighty eight").unwrap();
let result = nemo_normalize_with_options(input.as_ptr(), 1);
let result = nemo_normalize_with_options(input.as_ptr(), 1, 0);
assert!(!result.is_null());
let result_str = CStr::from_ptr(result).to_str().unwrap();
assert_eq!(result_str, "788");
Expand All @@ -447,11 +467,32 @@ mod tests {
fn test_ffi_normalize_sentence_with_options_concat_compound() {
unsafe {
let input = CString::new("United seven eighty eight").unwrap();
let result = nemo_normalize_sentence_with_options(input.as_ptr(), 1, 0);
let result = nemo_normalize_sentence_with_options(input.as_ptr(), 1, 0, 0);
assert!(!result.is_null());
let result_str = CStr::from_ptr(result).to_str().unwrap();
assert_eq!(result_str, "United 788");
nemo_free_string(result);
}
}

#[test]
fn test_ffi_normalize_sentence_with_options_disable_bare_second() {
    // Checks that the fourth FFI argument (`disable_bare_second`) toggles the
    // rewrite of the bare word "second" in sentence mode.
    //
    // SAFETY: `input` is a live, NUL-terminated `CString` for every call, and
    // each returned pointer is asserted non-null before it is read and is
    // released exactly once with `nemo_free_string`.
    unsafe {
        let input = CString::new("Give me a second to check.").unwrap();

        // Flag left at 0: the default rewrite to "2nd" still happens.
        let baseline = nemo_normalize_sentence_with_options(input.as_ptr(), 0, 0, 0);
        // The FFI entry point signals failure with a null pointer; fail the
        // test here instead of handing null to `CStr::from_ptr`.
        assert!(!baseline.is_null());
        assert_eq!(
            CStr::from_ptr(baseline).to_str().unwrap(),
            "Give me a 2nd to check."
        );
        nemo_free_string(baseline);

        // Non-zero flag: the bare word "second" stays literal.
        let opted_in = nemo_normalize_sentence_with_options(input.as_ptr(), 0, 0, 1);
        assert!(!opted_in.is_null());
        assert_eq!(
            CStr::from_ptr(opted_in).to_str().unwrap(),
            "Give me a second to check."
        );
        nemo_free_string(opted_in);
    }
}
}
84 changes: 64 additions & 20 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ use itn::en::{
/// Tries taggers in order of specificity (most specific first).
/// Returns original text if no tagger matches.
pub fn normalize(input: &str) -> String {
    // The plain entry point never opts out of the bare-"second" ordinal
    // rewrite, so the flag is pinned to `false` here.
    let disable_bare_second = false;
    normalize_inner(input, disable_bare_second)
}

/// Single-expression dispatch with the `disable_bare_second` flag plumbed in.
/// Issue #22: when the flag is set and the trimmed input is exactly the bare
/// word `"second"`, the ordinal tagger is skipped so it stays literal.
fn normalize_inner(input: &str, disable_bare_second: bool) -> String {
let input = input.trim();

// Apply custom user rules first (highest priority)
Expand Down Expand Up @@ -100,9 +107,12 @@ pub fn normalize(input: &str) -> String {
return result;
}

// Try ordinal numbers
if let Some(result) = ordinal::parse(input) {
return result;
// Try ordinal numbers (issue #22: skip the bare "second" case when opted out).
let skip_ordinal = disable_bare_second && input.eq_ignore_ascii_case("second");
if !skip_ordinal {
if let Some(result) = ordinal::parse(input) {
return result;
}
}

// Try cardinal number
Expand Down Expand Up @@ -136,7 +146,7 @@ pub fn normalize(input: &str) -> String {
/// ```
pub fn normalize_with_options(input: &str, options: NormalizeOptions) -> String {
if !options.concat_compound_numbers {
return normalize(input);
return normalize_inner(input, options.disable_bare_second);
}

let input = input.trim();
Expand All @@ -162,7 +172,7 @@ pub fn normalize_with_options(input: &str, options: NormalizeOptions) -> String

// Fall back to the standard pipeline for anything not recognised
// (money, measure, decimal, ordinal, telephone, etc.).
normalize(input)
normalize_inner(input, options.disable_bare_second)
}

/// Normalize with language selection.
Expand Down Expand Up @@ -900,7 +910,11 @@ fn tn_parse_span_lang(span: &str, lang: &str) -> Option<(String, u8)> {
/// `money`=95) and the regular cardinal fallback at 70 is skipped (the
/// concat-compound reader already falls back to grammatical when the
/// concat pattern does not apply).
fn parse_span(span: &str, concat_compound: bool) -> Option<(String, u8)> {
fn parse_span(
span: &str,
concat_compound: bool,
disable_bare_second: bool,
) -> Option<(String, u8)> {
let token_count = span.split_whitespace().count();
if token_count == 0 {
return None;
Expand Down Expand Up @@ -945,8 +959,17 @@ fn parse_span(span: &str, concat_compound: bool) -> Option<(String, u8)> {
if let Some(result) = decimal::parse(span) {
return Some((result, 80));
}
if let Some(result) = ordinal::parse(span) {
return Some((result, 75));
// Issue #22: when `disable_bare_second` is set, the bare standalone
// word `"second"` is *not* converted to `"2nd"` so phrases like
// `"give me a second"` stay literal. Compound ordinals
// (`"twenty second"`) still flow through this branch because they
// span 2+ tokens.
let skip_ordinal =
disable_bare_second && token_count == 1 && span.trim().eq_ignore_ascii_case("second");
if !skip_ordinal {
if let Some(result) = ordinal::parse(span) {
return Some((result, 75));
}
}

// Default cardinal fallback (priority 70). In concat-compound mode the
Expand All @@ -973,7 +996,7 @@ fn parse_span(span: &str, concat_compound: bool) -> Option<(String, u8)> {
/// assert_eq!(normalize_sentence("hello world"), "hello world");
/// ```
pub fn normalize_sentence(input: &str) -> String {
normalize_sentence_inner(input, DEFAULT_MAX_SPAN_TOKENS, false)
normalize_sentence_inner(input, DEFAULT_MAX_SPAN_TOKENS, false, false)
}

/// Unified sentence-mode entry point.
Expand Down Expand Up @@ -1002,7 +1025,12 @@ pub fn normalize_sentence(input: &str) -> String {
/// ```
pub fn normalize_sentence_with_options(input: &str, options: NormalizeOptions) -> String {
let max_span = options.max_span_tokens.unwrap_or(DEFAULT_MAX_SPAN_TOKENS);
normalize_sentence_inner(input, max_span, options.concat_compound_numbers)
normalize_sentence_inner(
input,
max_span,
options.concat_compound_numbers,
options.disable_bare_second,
)
}

/// Per-pretoken record: the token text plus the original separator that
Expand Down Expand Up @@ -1110,13 +1138,23 @@ where
let max_end = usize::min(pretokens.len(), i + max_span);
let mut best: Option<(usize, String, u8)> = None;

// Longest-span-first search keeps replacements stable and non-overlapping.
// Longest-span-first search keeps replacements stable and
// non-overlapping. Reconstruct each span using the per-pretoken
// separator so adjacency is preserved: pretokens that came from a
// single original word (e.g. `"Dr."` -> `["Dr", "."]` with sep=`""`
// on the period) re-emerge as `"Dr."`, not `"Dr ."`. This is what
// the TN whitelist needs to match abbreviations like `"e.g."` /
// `"Prof."` (PR #25 review feedback). Trying smaller `end` values
// already handles "ignore trailing punctuation" cases like
// `"twenty one,"` -> match `"twenty one"`.
for end in (i + 1..=max_end).rev() {
let span: String = pretokens[i..end]
.iter()
.map(|p| p.text.as_str())
.collect::<Vec<_>>()
.join(" ");
let mut span = String::new();
for (idx, p) in pretokens[i..end].iter().enumerate() {
if idx > 0 {
span.push_str(p.sep);
}
span.push_str(&p.text);
}
let Some((candidate, score)) = parser(&span) else {
continue;
};
Expand Down Expand Up @@ -1157,17 +1195,23 @@ where
out
}

/// Sentence-mode dispatch loop. The `concat_compound` flag is forwarded to
/// [`parse_span`] so each span sees the right tagger priorities.
fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound: bool) -> String {
/// Sentence-mode dispatch loop. The `concat_compound` and
/// `disable_bare_second` flags are forwarded to [`parse_span`] so each span
/// sees the right tagger priorities.
fn normalize_sentence_inner(
input: &str,
max_span_tokens: usize,
concat_compound: bool,
disable_bare_second: bool,
) -> String {
let trimmed = input.trim();
if trimmed.is_empty() {
return trimmed.to_string();
}

let pretokens = pretokenize(trimmed);
sentence_loop(&pretokens, max_span_tokens, |span| {
parse_span(span, concat_compound)
parse_span(span, concat_compound, disable_bare_second)
})
}

Expand Down
13 changes: 13 additions & 0 deletions src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ pub struct NormalizeOptions {
/// Sentence-mode sliding-window cap (in tokens). `None` uses
/// [`DEFAULT_MAX_SPAN_TOKENS`]. Ignored in single-expression mode.
pub max_span_tokens: Option<usize>,

/// Skip the ordinal tagger for the bare word `"second"` so it is not
/// rewritten to `"2nd"` in phrases like `"give me a second"` (issue #22).
/// Compound ordinals (`"twenty second"` → `"22nd"`) and date contexts
/// (`"January second twenty twenty five"`) still convert. Default `false`.
pub disable_bare_second: bool,
}

impl NormalizeOptions {
Expand All @@ -27,6 +33,7 @@ impl NormalizeOptions {
Self {
concat_compound_numbers: false,
max_span_tokens: None,
disable_bare_second: false,
}
}

Expand All @@ -41,4 +48,10 @@ impl NormalizeOptions {
self.max_span_tokens = Some(max_span_tokens);
self
}

/// Set [`Self::disable_bare_second`].
pub const fn with_disable_bare_second(mut self, enabled: bool) -> Self {
self.disable_bare_second = enabled;
self
}
}
33 changes: 28 additions & 5 deletions src/wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,19 @@ use crate::{
///
/// `max_span_tokens == 0` is treated as "use library default" so JS callers
/// can pass `0` rather than dealing with optional values across the boundary.
fn js_options(concat_compound_numbers: bool, max_span_tokens: u32) -> NormalizeOptions {
fn js_options(
    concat_compound_numbers: bool,
    max_span_tokens: u32,
    disable_bare_second: bool,
) -> NormalizeOptions {
    // JS passes `0` for "no explicit cap"; drop it to `None` so the library
    // default is used, and keep any positive value as the cap.
    let max_span_tokens = Some(max_span_tokens as usize).filter(|&cap| cap != 0);

    // The two boolean switches cross the boundary unchanged.
    NormalizeOptions {
        concat_compound_numbers,
        max_span_tokens,
        disable_bare_second,
    }
}

Expand Down Expand Up @@ -48,21 +53,39 @@ pub fn normalize_sentence_js(input: &str) -> String {
/// Unified single-expression normalize. `concatCompoundNumbers=true` reads
/// consecutive number words as concatenation rather than addition, e.g.
/// `"thirty five sixty two"` → `"3562"`, `"seven eighty eight"` → `"788"`.
/// `disableBareSecond=true` blocks the bare word `"second"` from converting
/// to `"2nd"` (issue #22).
#[wasm_bindgen(js_name = normalizeWithOptions)]
pub fn normalize_with_options_js(input: &str, concat_compound_numbers: bool) -> String {
normalize_with_options(input, js_options(concat_compound_numbers, 0))
pub fn normalize_with_options_js(
    input: &str,
    concat_compound_numbers: bool,
    disable_bare_second: bool,
) -> String {
    // This entry point exposes no span cap to JS, so the `0` sentinel
    // ("use library default") is always passed for it.
    let options = js_options(concat_compound_numbers, 0, disable_bare_second);
    normalize_with_options(input, options)
}

/// Unified sentence normalize. `concatCompoundNumbers` mirrors the
/// single-expression flag; `maxSpanTokens == 0` means "use library default"
/// (16).
/// (16). `disableBareSecond=true` keeps phrases like `"give me a second"`
/// literal (issue #22).
#[wasm_bindgen(js_name = normalizeSentenceWithOptions)]
pub fn normalize_sentence_with_options_js(
input: &str,
concat_compound_numbers: bool,
max_span_tokens: u32,
disable_bare_second: bool,
) -> String {
normalize_sentence_with_options(input, js_options(concat_compound_numbers, max_span_tokens))
normalize_sentence_with_options(
input,
js_options(
concat_compound_numbers,
max_span_tokens,
disable_bare_second,
),
)
}

#[wasm_bindgen(js_name = tnNormalize)]
Expand Down
Loading
Loading