Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
240 changes: 125 additions & 115 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1005,31 +1005,119 @@ pub fn normalize_sentence_with_options(input: &str, options: NormalizeOptions) -
normalize_sentence_inner(input, max_span, options.concat_compound_numbers)
}

/// Sentence-mode dispatch loop. The `concat_compound` flag is forwarded to
/// [`parse_span`] so each span sees the right tagger priorities.
fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound: bool) -> String {
let trimmed = input.trim();
if trimmed.is_empty() {
return trimmed.to_string();
/// Per-pretoken record: the token text plus the original separator that
/// preceded it (`" "` for whitespace, `""` if the token came from a
/// punctuation split or starts the input). Keeping the separator lets the
/// sentence loop reconstruct the input's exact spacing on output.
struct Pretoken {
    // Token text: either a word core or a single split-off punctuation char.
    text: String,
    // Separator that preceded this token in the original input; only ever
    // `" "` or `""`, hence the `'static` lifetime.
    sep: &'static str,
}

/// ASCII punctuation characters that are split off the leading or trailing
/// edge of a whitespace-separated word so taggers see clean number phrases
/// (issue #21). Apostrophes and hyphens are intentionally excluded so
/// contractions ("don't") and hyphenated words ("twenty-one") stay intact.
fn is_split_punct(c: char) -> bool {
    // Kept as an explicit table so the excluded characters ('\'' and '-')
    // are easy to audit.
    const SPLIT_PUNCT: &[char] = &[
        ',', '.', ';', ':', '!', '?', '(', ')', '[', ']', '{', '}', '"',
    ];
    SPLIT_PUNCT.contains(&c)
}

/// Split each whitespace-separated word into leading punctuation, core, and
/// trailing punctuation pretokens, preserving the original leading separator
/// on the first piece of each word so output reconstruction matches input
/// spacing.
fn pretokenize(input: &str) -> Vec<Pretoken> {
let mut out: Vec<Pretoken> = Vec::new();
let mut first_word = true;

for word in input.split_whitespace() {
let word_lead: &'static str = if first_word { "" } else { " " };
first_word = false;

// char_indices keeps multi-byte chars intact when slicing.
let chars: Vec<(usize, char)> = word.char_indices().collect();
if chars.is_empty() {
continue;
}

let mut start = 0usize;
while start < chars.len() && is_split_punct(chars[start].1) {
start += 1;
}
let mut end = chars.len();
while end > start && is_split_punct(chars[end - 1].1) {
end -= 1;
}

let mut next_sep = word_lead;

// Leading punctuation: each character becomes its own pretoken.
for &(_, ch) in &chars[..start] {
out.push(Pretoken {
text: ch.to_string(),
sep: next_sep,
});
next_sep = "";
}

// Core text (may be empty if the whole word was punctuation).
if start < end {
let core_start = chars[start].0;
let core_end = if end < chars.len() {
chars[end].0
} else {
word.len()
};
out.push(Pretoken {
text: word[core_start..core_end].to_string(),
sep: next_sep,
});
next_sep = "";
}

// Trailing punctuation.
for &(_, ch) in &chars[end..] {
out.push(Pretoken {
text: ch.to_string(),
sep: next_sep,
});
next_sep = "";
}
}

out
}

/// Shared sliding-window match loop used by the three sentence-mode entry
/// points. `parser` returns `Some((replacement, score))` if the joined span
/// can be normalized.
fn sentence_loop<F>(pretokens: &[Pretoken], max_span_tokens: usize, parser: F) -> String
where
F: Fn(&str) -> Option<(String, u8)>,
{
let max_span = if max_span_tokens == 0 {
1
} else {
max_span_tokens
};
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
let mut out: Vec<String> = Vec::with_capacity(tokens.len());

let mut out = String::new();
let mut i = 0usize;

while i < tokens.len() {
let max_end = usize::min(tokens.len(), i + max_span);
while i < pretokens.len() {
let max_end = usize::min(pretokens.len(), i + max_span);
let mut best: Option<(usize, String, u8)> = None;

// Longest-span-first search keeps replacements stable and non-overlapping.
for end in (i + 1..=max_end).rev() {
let span = tokens[i..end].join(" ");
let Some((candidate, score)) = parse_span(&span, concat_compound) else {
let span: String = pretokens[i..end]
.iter()
.map(|p| p.text.as_str())
.collect::<Vec<_>>()
.join(" ");
Comment on lines +1115 to +1119
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Pretokenizer breaks TN whitelist abbreviations ending with '.' in sentence mode

The pretokenize function splits trailing periods from whitespace tokens (e.g., "Dr."["Dr", "."]). In sentence_loop, pretokens are then joined with spaces to form spans for the parser (.join(" ")), so the parser sees "Dr ." instead of "Dr.". This causes two classes of TN sentence-mode regressions:

  1. Entries with only the period form (e.g., "e.g.", "etc.", "Prof.", "Inc.", "approx." — ~25 entries in the English TN whitelist alone): the abbreviation is never matched. "e.g." → pretokens ["e.g", "."] → single-pretoken span "e.g" has no whitelist hit, two-pretoken span "e.g ." also has no hit → abbreviation passes through unnormalized.

  2. Entries with both forms (e.g., "Dr"/"Dr.", "vs"/"vs."): the bare form matches, but the period remains as a separate pretoken, producing a spurious period: "Dr. Smith""doctor. Smith" instead of "doctor Smith".

This affects all 7 TN language whitelists (en, fr, es, de, zh, hi, ja) in tn_normalize_sentence, tn_normalize_sentence_with_max_span, and tn_normalize_sentence_with_max_span_lang.

Trace for "e.g." through pretokenize + sentence_loop

word = "e.g." → chars [('e',0), ('.',1), ('g',2), ('.',3)]
start scan: 'e' not punct → start=0
end scan: chars[3]='.' is punct → end=3; chars[2]='g' not punct → stop
Core: word[0..3] = "e.g", trailing: "."
Pretokens: ["e.g", "."]

In sentence_loop:
Span [0..2]: "e.g" + " " + "." = "e.g ." → WHITELIST.get("e.g .") → None
Span [0..1]: "e.g" → WHITELIST.get("e.g") → None (only "e.g." is in the whitelist)
Fallback: output "e.g" then ".", no normalization occurs

Prompt for agents
The root cause is that sentence_loop joins all pretokens with spaces when constructing spans for the parser. This destroys the original adjacency of split-off punctuation: pretokens ["Dr", "."] from the word "Dr." become span "Dr ." instead of "Dr.", breaking TN whitelist matching.

The fix needs to reconstruct the original text when building spans, not unconditionally insert spaces. The Pretoken.sep field already records the original separator ("" for adjacent/split pretokens, " " for whitespace-separated). The span construction at src/lib.rs:1115-1119 could use sep values for pretokens after the first:

  pretokens[i..end].iter().enumerate().fold(String::new(), |mut s, (j, p)| {
      if j > 0 { s.push_str(p.sep); }
      s.push_str(&p.text);
      s
  })

However, this would make the span include the glued punctuation (e.g., "twenty one,"), which was the original problem the pretokenizer aimed to fix. A more nuanced approach is needed, such as:

1. Reconstruct spans using sep (preserving adjacency) so parsers see original text like "Dr." or "e.g.", and ALSO try the sub-span without trailing punctuation pretokens so "twenty one" can still be tried without the comma.

2. Alternatively, make the pretokenizer TN-aware: skip splitting when the full token matches a whitelist entry.

3. Or exclude '.' from is_split_punct and handle trailing periods differently (e.g., the parser strips trailing periods before matching, or the sentence_loop retries without the last character on failure).

All TN language whitelists in src/tn/{en,fr,es,de,zh,hi,ja}/whitelist.rs are affected. Affected public APIs: tn_normalize_sentence, tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang.
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

let Some((candidate, score)) = parser(&span) else {
continue;
};

Expand All @@ -1056,15 +1144,31 @@ fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound
}

if let Some((end, replacement, _)) = best {
out.push(replacement);
out.push_str(pretokens[i].sep);
out.push_str(&replacement);
i = end;
} else {
out.push(tokens[i].to_string());
out.push_str(pretokens[i].sep);
out.push_str(&pretokens[i].text);
i += 1;
}
}

out.join(" ")
out
}

/// Sentence-mode dispatch loop. The `concat_compound` flag is forwarded to
/// [`parse_span`] so each span sees the right tagger priorities.
fn normalize_sentence_inner(input: &str, max_span_tokens: usize, concat_compound: bool) -> String {
    let trimmed = input.trim();
    if trimmed.is_empty() {
        // Nothing to normalize; an empty trimmed input yields an empty string.
        return String::new();
    }

    sentence_loop(&pretokenize(trimmed), max_span_tokens, |span| {
        parse_span(span, concat_compound)
    })
}

// ── Text Normalization (written → spoken) ─────────────────────────────
Expand Down Expand Up @@ -1210,56 +1314,10 @@ pub fn tn_normalize_sentence_with_max_span_lang(
return trimmed.to_string();
}

let max_span = if max_span_tokens == 0 {
1
} else {
max_span_tokens
};
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
let mut out: Vec<String> = Vec::with_capacity(tokens.len());
let mut i = 0usize;

while i < tokens.len() {
let max_end = usize::min(tokens.len(), i + max_span);
let mut best: Option<(usize, String, u8)> = None;

for end in (i + 1..=max_end).rev() {
let span = tokens[i..end].join(" ");
let Some((candidate, score)) = tn_parse_span_lang(&span, lang) else {
continue;
};

let candidate_trimmed = candidate.trim();
if candidate_trimmed.is_empty() || candidate_trimmed == span {
continue;
}

let candidate_len = end - i;
match &best {
None => {
best = Some((end, candidate, score));
}
Some((best_end, _, best_score)) => {
let best_len = *best_end - i;
if candidate_len > best_len
|| (candidate_len == best_len && score > *best_score)
{
best = Some((end, candidate, score));
}
}
}
}

if let Some((end, replacement, _)) = best {
out.push(replacement);
i = end;
} else {
out.push(tokens[i].to_string());
i += 1;
}
}

out.join(" ")
let pretokens = pretokenize(trimmed);
sentence_loop(&pretokens, max_span_tokens, |span| {
tn_parse_span_lang(span, lang)
})
}
}
}
Expand All @@ -1271,56 +1329,8 @@ pub fn tn_normalize_sentence_with_max_span(input: &str, max_span_tokens: usize)
return trimmed.to_string();
}

let max_span = if max_span_tokens == 0 {
1
} else {
max_span_tokens
};
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
let mut out: Vec<String> = Vec::with_capacity(tokens.len());
let mut i = 0usize;

while i < tokens.len() {
let max_end = usize::min(tokens.len(), i + max_span);
let mut best: Option<(usize, String, u8)> = None;

for end in (i + 1..=max_end).rev() {
let span = tokens[i..end].join(" ");
let Some((candidate, score)) = tn_parse_span(&span) else {
continue;
};

let candidate_trimmed = candidate.trim();
if candidate_trimmed.is_empty() || candidate_trimmed == span {
continue;
}

let candidate_len = end - i;
match &best {
None => {
best = Some((end, candidate, score));
}
Some((best_end, _, best_score)) => {
let best_len = *best_end - i;
if candidate_len > best_len
|| (candidate_len == best_len && score > *best_score)
{
best = Some((end, candidate, score));
}
}
}
}

if let Some((end, replacement, _)) = best {
out.push(replacement);
i = end;
} else {
out.push(tokens[i].to_string());
i += 1;
}
}

out.join(" ")
let pretokens = pretokenize(trimmed);
sentence_loop(&pretokens, max_span_tokens, tn_parse_span)
}

#[cfg(test)]
Expand Down
65 changes: 65 additions & 0 deletions tests/en_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1125,3 +1125,68 @@ fn test_issue_23_compound_concat() {
"2017"
);
}

/// Issue #21: trailing punctuation glued to the last word of a number phrase
/// (e.g. `"eight,"`) used to block the cardinal tagger because
/// `split_whitespace` left punctuation attached. The pretokenizer splits
/// leading/trailing ASCII punctuation off each whitespace token while
/// preserving the original spacing on output.
#[test]
fn test_issue_21_trailing_comma() {
    let opts = concat_opts();
    let input = "United seven eighty eight, please come up on frequency one three five point six two five, thanks.";
    let expected = "United 788, please come up on frequency 135.625, thanks.";
    assert_eq!(normalize_sentence_with_options(input, opts), expected);
}

/// Same as the trailing-comma case but with periods glued to the phrase
/// boundaries; the split punctuation must reattach without extra spaces.
#[test]
fn test_issue_21_trailing_period() {
    let opts = concat_opts();
    let input = "United seven eighty eight. please come up on frequency one three five point six two five. thanks.";
    let expected = "United 788. please come up on frequency 135.625. thanks.";
    assert_eq!(normalize_sentence_with_options(input, opts), expected);
}

/// The pre-existing space-before-punctuation case must continue to work,
/// preserving the explicit space in output.
#[test]
fn test_issue_21_space_before_punct_preserved() {
    let opts = concat_opts();
    let input = "United seven eighty eight , please come up on frequency one three five point six two five , thanks .";
    let expected = "United 788 , please come up on frequency 135.625 , thanks .";
    assert_eq!(normalize_sentence_with_options(input, opts), expected);
}

/// Punctuation other than `,` and `.` is also split, including paired
/// brackets and quotes, while contractions and hyphenated words remain
/// intact.
#[test]
fn test_issue_21_other_punctuation() {
    // Default options (no concat) still benefits from the fix. The final
    // case checks that the apostrophe inside a contraction is NOT split.
    let cases = [
        ("I have twenty one apples.", "I have 21 apples."),
        ("I have (twenty one) apples!", "I have (21) apples!"),
        ("\"twenty one\" apples", "\"21\" apples"),
        ("don't eat twenty one apples", "don't eat 21 apples"),
    ];
    for (input, expected) in cases {
        assert_eq!(normalize_sentence(input), expected, "input: {input}");
    }
}
Loading