From 93434da9ca7668b711591eb2405c98b9386ef1f9 Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Sat, 2 Aug 2025 11:41:54 +0700 Subject: [PATCH 01/12] Add Vietnamese measure text normalization support - Added measure tagger and verbalizer for Vietnamese TN - Updated money tagger and verbalizer to handle per-unit measurements - Added test cases for measure normalization - Updated fraction handling for better integration - Added data files for measurements, prefixes, and per-unit bases Signed-off-by: folivoramanh --- .../text_normalization/normalize.py | 6 +- .../vi/data/measure/__init__.py | 13 ++ .../vi/data/measure/base_units.tsv | 17 ++ .../vi/data/measure/measurements_minimal.tsv | 25 +++ .../vi/data/measure/prefixes.tsv | 17 ++ .../vi/data/money/per_unit_bases.tsv | 8 + .../{per_unit.tsv => per_unit_non_metric.tsv} | 21 +-- .../vi/data/money/per_unit_prefixes.tsv | 6 + .../text_normalization/vi/graph_utils.py | 11 ++ .../text_normalization/vi/taggers/fraction.py | 5 + .../text_normalization/vi/taggers/measure.py | 125 +++++++++++++++ .../text_normalization/vi/taggers/money.py | 24 ++- .../text_normalization/vi/taggers/range.py | 5 +- .../vi/taggers/tokenize_and_classify.py | 15 +- .../vi/verbalizers/fraction.py | 4 +- .../vi/verbalizers/measure.py | 61 ++++++++ .../vi/verbalizers/money.py | 20 ++- .../vi/verbalizers/post_processing.py | 148 ++++++++++++++++++ .../vi/verbalizers/verbalize.py | 5 + .../test_cases_measure.txt | 57 +++++++ .../test_cases_money.txt | 3 +- .../test_cases_range.txt | 4 +- tests/nemo_text_processing/vi/test_measure.py | 47 ++++-- .../vi/test_sparrowhawk_normalization.sh | 8 +- .../pynini_export.py | 3 + 25 files changed, 609 insertions(+), 49 deletions(-) create mode 100644 nemo_text_processing/text_normalization/vi/data/measure/__init__.py create mode 100644 nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv create mode 100644 nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv create mode 100644 nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv create mode 100644 nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv rename nemo_text_processing/text_normalization/vi/data/money/{per_unit.tsv => per_unit_non_metric.tsv} (56%) create mode 100644 nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv create mode 100644 nemo_text_processing/text_normalization/vi/taggers/measure.py create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/measure.py create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py create mode 100644 tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 329b28338..6abc7f3a0 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -177,6 +177,10 @@ def __init__( elif lang == 'vi': from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.vi.verbalizers.verbalize_final import VerbalizeFinalFst + from nemo_text_processing.text_normalization.vi.verbalizers.post_processing import PostProcessingFst + + if post_process: + self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) else: raise NotImplementedError(f"Language {lang} has not been supported yet.") @@ -377,7 +381,7 @@ def normalize( return text output = SPACE_DUP.sub(' ', output[1:]) - if self.lang == "en" and hasattr(self, 'post_processor'): + if self.lang in ["en", "vi"] and hasattr(self, 'post_processor'): output = self.post_process(output) if punct_post_process: diff --git a/nemo_text_processing/text_normalization/vi/data/measure/__init__.py b/nemo_text_processing/text_normalization/vi/data/measure/__init__.py new file mode 100644 index 000000000..b2de1dca7 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/measure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv b/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv new file mode 100644 index 000000000..fa6c32658 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv @@ -0,0 +1,17 @@ +m mét +m2 mét vuông +m3 mét khối +m² mét vuông +m³ mét khối +g gam +l lít +s giây +v vôn +w oát +hz hẹc +A am pe +b bai +B byte +pa pascal +ω ohm +Ω ôm \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv b/nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv new file mode 100644 index 000000000..403360057 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv @@ -0,0 +1,25 @@ +°f độ f +°c độ c +°k độ k +ha héc ta +mi mile +ft foot +inch inch +yd yard +% phần trăm +hp mã lực +rad radian +kwh ki lô oát giờ +kbps kilobit trên giây +mbps megabit trên giây +ghz gi ga hẹc +mhz mê ga hẹc +tw tê ra oát +kcal ki lô calo +gb gi ga bai +mb mê ga bai +mV mi li vôn +MV mê ga vôn +tb terabyte +pb petabyte +g gam \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv b/nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv new file mode 100644 index 000000000..649ce73a7 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv @@ -0,0 +1,17 @@ +k ki lô +M mê ga +G gi ga +T tê ra +P pê ta +E ex xa +h hếc tô +da đề ca +d đề xi +c xăng ti +m mi li +µ mi crô +μ mi cờ rô +n na nô +p pi cô +f fem tô +a át tô \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv b/nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv new file mode 100644 index 000000000..feb1808d6 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv @@ -0,0 +1,8 @@ +g gam +m mét +m² mét vuông +m2 mét vuông +m³ mét khối +m3 mét khối +l lít +B bai \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/money/per_unit.tsv b/nemo_text_processing/text_normalization/vi/data/money/per_unit_non_metric.tsv similarity index 56% rename from nemo_text_processing/text_normalization/vi/data/money/per_unit.tsv rename to nemo_text_processing/text_normalization/vi/data/money/per_unit_non_metric.tsv index 66030c5c5..c1ccbdf69 100644 --- a/nemo_text_processing/text_normalization/vi/data/money/per_unit.tsv +++ b/nemo_text_processing/text_normalization/vi/data/money/per_unit_non_metric.tsv @@ -1,5 +1,4 @@ /giờ trên giờ -/g trên giờ /h trên giờ /ngày trên ngày /d trên ngày @@ -13,33 +12,17 @@ /lần một lần /cái một cái /chiếc một chiếc -/kg một ki lô gam -/g một gam -/cm một xăng ti mét -/m một mét -/km một ki lô mét -/cm² một xăng ti mét vuông -/m² một mét vuông -/m2 một mét vuông -/m³ một mét khối -/m3 một mét khối -/l một lít -/ml một mi li lít /người một người /chỗ một chỗ /bài một bài /trang một trang /từ một từ /đồng một đồng -/KB một kilobyte -/GB một gigabyte -/MB một megabyte -/TB một terabyte -/tấn một tấn /đêm một đêm /buổi một buổi /ca một ca /dự án một dự án /lớp một lớp /khóa một khóa -/suất một suất \ No newline at end of file +/suất một suất +/tấn một tấn \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv b/nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv new file mode 100644 index 000000000..154aa7306 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv @@ -0,0 +1,6 @@ +k ki lô +M mê ga +G gi ga +c xăng ti +m mi li +T tê ra \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/graph_utils.py b/nemo_text_processing/text_normalization/vi/graph_utils.py index 4acb76439..8c45bcef5 100644 --- a/nemo_text_processing/text_normalization/vi/graph_utils.py +++ b/nemo_text_processing/text_normalization/vi/graph_utils.py @@ -62,6 +62,17 @@ def extract_field(field_name): return pynutil.delete(f"{field_name}:") + delete_space + pynutil.delete("\"") + quoted_text + pynutil.delete("\"") +def extract_wrapper_content(wrapper_type: str, content_graph): + """Helper to extract content from wrapper like 'decimal { ... }'""" + return ( + pynutil.delete(f"{wrapper_type} {{") + + delete_space + + content_graph + + delete_space + + pynutil.delete("}") + ) + + def convert_space(fst) -> "pynini.FstLike": """ Converts space to nonbreaking space. diff --git a/nemo_text_processing/text_normalization/vi/taggers/fraction.py b/nemo_text_processing/text_normalization/vi/taggers/fraction.py index ed3394120..86fd6de12 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/taggers/fraction.py @@ -62,6 +62,11 @@ def __init__(self, cardinal: CardinalFst, deterministic: bool = True): simple_fraction = numerator + denominator mixed_fraction = integer_part + pynutil.delete(" ") + numerator + denominator + + # Create graph without negative for reuse in other FSTs (like measure) + fraction_wo_negative = simple_fraction | mixed_fraction + self.final_graph_wo_negative = fraction_wo_negative.optimize() + optional_graph_negative = (pynutil.insert("negative: ") + pynini.cross("-", "\"true\" ")).ques self.fst = self.add_tokens(optional_graph_negative + (simple_fraction | mixed_fraction)).optimize() diff --git a/nemo_text_processing/text_normalization/vi/taggers/measure.py b/nemo_text_processing/text_normalization/vi/taggers/measure.py new file mode 100644 index 000000000..9b97f0a53 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/taggers/measure.py @@ -0,0 +1,125 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import ( + NEMO_DIGIT, + NEMO_COMMA, + NEMO_SPACE, + GraphFst, +) +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure for Vietnamese, e.g. + 12kg -> measure { cardinal { integer: "mười hai" } units: "ki lô gam" } + 1kg -> measure { cardinal { integer: "một" } units: "ki lô gam" } + 0.5kg -> measure { decimal { fractional_part: "năm" } units: "ki lô gam" } + -12kg -> measure { negative: "true" cardinal { integer: "mười hai" } units: "ki lô gam" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + fraction: FractionFst + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def _create_measure_subgraph(self, measure_type: str, number_graph, optional_negative, graph_unit): + """Helper to create measure subgraph pattern - reduces duplication""" + return ( + optional_negative + + pynutil.insert(f"{measure_type} {{ ") + + number_graph + + pynutil.insert(" } units: \"") + + graph_unit + + pynutil.insert('"') + ) + + def __init__( + self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, + ): + super().__init__(name="measure", kind="classify", deterministic=deterministic) + + cardinal_graph = cardinal.graph + + # Load minimal measurement files (massive redundancy removed via subfst) + measurements_path = get_abs_path("data/measure/measurements_minimal.tsv") + prefixes_path = get_abs_path("data/measure/prefixes.tsv") + base_units_path = get_abs_path("data/measure/base_units.tsv") + + # Create subfst for metric units: prefix + space + base_unit + graph_prefixes = pynini.string_file(prefixes_path) + graph_base_units = pynini.string_file(base_units_path) + space = pynutil.insert(NEMO_SPACE) + graph_metric_units = graph_prefixes + space + graph_base_units + + # Load non-metric and special units + graph_special_units = pynini.string_file(measurements_path) + + # Also allow base units without prefixes (e.g., 'g' not just 'kg') + graph_standalone_units = graph_base_units + + # Combine all unit mappings + graph_unit = graph_metric_units | graph_special_units | graph_standalone_units + + # Create unit symbol pattern using FST operations (no loops needed) + prefix_symbols = pynini.project(graph_prefixes, "input") # Extract prefix symbols + base_symbols = pynini.project(graph_base_units, "input") # Extract base symbols + special_symbols = pynini.project(graph_special_units, "input") # Extract special symbols + + # Build unit pattern: metric combinations | standalone bases | special units + metric_pattern = prefix_symbols + base_symbols # All prefix+base combinations + unit_pattern = metric_pattern | base_symbols | special_symbols + + number = pynini.closure(NEMO_DIGIT, 1) + decimal_number = number + NEMO_COMMA + pynini.closure(NEMO_DIGIT, 1) + + # Optional negative sign handling for Vietnamese + optional_graph_negative = pynini.closure( + pynini.cross(pynini.union("âm", "trừ"), "negative: \"true\" "), + 0, + 1, + ) + + # Domain restriction patterns - only match core number+unit patterns + # Remove punctuation handling to let punctuation tagger handle it separately + integer_measure_domain = number + unit_pattern + decimal_measure_domain = decimal_number + unit_pattern + fraction_measure_domain = number + "/" + number + unit_pattern + + cardinal_number_graph = pynutil.insert('integer: "') + (number @ cardinal_graph) + pynutil.insert('"') + + subgraph_cardinal = self._create_measure_subgraph("cardinal", cardinal_number_graph, optional_graph_negative, graph_unit) + subgraph_decimal = self._create_measure_subgraph("decimal", decimal.final_graph_wo_negative, optional_graph_negative, graph_unit) + subgraph_fraction = self._create_measure_subgraph("fraction", fraction.final_graph_wo_negative, optional_graph_negative, graph_unit) + + # Apply domain restrictions to ensure we only match complete number+unit patterns + subgraph_cardinal = pynini.compose(integer_measure_domain, subgraph_cardinal) + subgraph_decimal = pynini.compose(decimal_measure_domain, subgraph_decimal) + subgraph_fraction = pynini.compose(fraction_measure_domain, subgraph_fraction) + + # Final graph combining main patterns + final_graph = ( + subgraph_cardinal + | subgraph_decimal + | subgraph_fraction + ) + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/money.py b/nemo_text_processing/text_normalization/vi/taggers/money.py index eed524d73..30b384006 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/money.py +++ b/nemo_text_processing/text_normalization/vi/taggers/money.py @@ -47,7 +47,29 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = currency_major_labels = load_labels(get_abs_path("data/money/currency.tsv")) currency_minor_labels = load_labels(get_abs_path("data/money/currency_minor.tsv")) quantity_graph = pynini.string_file(get_abs_path("data/numbers/quantity_abbr.tsv")) - per_unit_graph = pynini.string_file(get_abs_path("data/money/per_unit.tsv")) + + # Load optimized per_unit files using subfst approach + per_unit_non_metric_path = get_abs_path("data/money/per_unit_non_metric.tsv") + per_unit_prefixes_path = get_abs_path("data/money/per_unit_prefixes.tsv") + per_unit_bases_path = get_abs_path("data/money/per_unit_bases.tsv") + + # Create subfst for metric per_unit patterns + graph_prefixes = pynini.string_file(per_unit_prefixes_path) + graph_bases = pynini.string_file(per_unit_bases_path) + + # Build metric combinations: "/kg" -> "một ki lô gam" + slash = pynutil.delete("/") + one_space = pynutil.insert("một ") + space = pynutil.insert(NEMO_SPACE) + + graph_metric_per_units = slash + one_space + graph_prefixes + space + graph_bases + graph_standalone_per_units = slash + one_space + graph_bases + + # Load non-metric per_unit entries + graph_non_metric_per_units = pynini.string_file(per_unit_non_metric_path) + + # Combine all per_unit mappings + per_unit_graph = graph_metric_per_units | graph_standalone_per_units | graph_non_metric_per_units # Basic components cardinal_graph = cardinal.graph diff --git a/nemo_text_processing/text_normalization/vi/taggers/range.py b/nemo_text_processing/text_normalization/vi/taggers/range.py index 8f8f0d23f..f52341d9d 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/range.py +++ b/nemo_text_processing/text_normalization/vi/taggers/range.py @@ -41,6 +41,7 @@ def __init__( date: GraphFst, decimal: GraphFst, money: GraphFst, + measure: GraphFst, deterministic: bool = True, ): super().__init__(name="range", kind="classify", deterministic=deterministic) @@ -50,11 +51,11 @@ def __init__( # Pattern: X-Y -> X đến Y # This will handle time ranges, date ranges, decimal ranges, and money ranges with dash range_pattern = ( - (time | date | decimal | money) + (time | date | decimal | money | measure) + delete_space + pynini.cross("-", " đến ") + delete_space - + (time | date | decimal | money) + + (time | date | decimal | money | measure) ) self.graph = range_pattern diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 9e97ac940..2339690c0 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -18,6 +18,7 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.text_normalization.vi.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.vi.graph_utils import ( GraphFst, delete_extra_space, @@ -39,6 +40,8 @@ from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst as VCardinalFst from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst as VDateFst from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst as VDecimalFst +from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst as VFractionFst +from nemo_text_processing.text_normalization.vi.verbalizers.measure import MeasureFst as VMeasureFst from nemo_text_processing.text_normalization.vi.verbalizers.money import MoneyFst as VMoneyFst from nemo_text_processing.text_normalization.vi.verbalizers.time import TimeFst as VTimeFst from nemo_text_processing.utils.logging import logger @@ -122,6 +125,11 @@ def __init__( money_graph = money.fst logger.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") + start_time = time.time() + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) + measure_graph = measure.fst + logger.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") + # Create composed verbalizers for range processing start_time = time.time() v_cardinal = VCardinalFst(deterministic=deterministic) @@ -137,9 +145,13 @@ def __init__( v_money = VMoneyFst(deterministic=deterministic) money_final = pynini.compose(money_graph, v_money.fst) + v_fraction = VFractionFst(deterministic=deterministic) + v_measure = VMeasureFst(decimal=v_decimal, cardinal=v_cardinal, fraction=v_fraction, deterministic=deterministic) + measure_final = pynini.compose(measure_graph, v_measure.fst) + # Create range graph range_fst = RangeFst( - time=time_final, date=date_final, decimal=decimal_final, money=money_final, deterministic=deterministic + time=time_final, date=date_final, decimal=decimal_final, money=money_final, measure=measure_final, deterministic=deterministic ) range_graph = range_fst.fst logger.debug(f"range: {time.time() - start_time: .2f}s -- {range_graph.num_states()} nodes") @@ -155,6 +167,7 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.1) | pynutil.add_weight(time_graph, 1.1) + | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) punct = ( diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py index 328bbcded..fcb3608e9 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py @@ -50,4 +50,6 @@ def __init__(self, deterministic: bool = True): simple_fraction = fraction_part mixed_fraction = integer_tagged + delete_space + pynutil.insert(" và ") + fraction_part - self.fst = self.delete_tokens(optional_sign + (simple_fraction | mixed_fraction)).optimize() + self.numbers = optional_sign + (simple_fraction | mixed_fraction) + + self.fst = self.delete_tokens(self.numbers).optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/measure.py b/nemo_text_processing/text_normalization/vi/verbalizers/measure.py new file mode 100644 index 000000000..805942d1b --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/verbalizers/measure.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.vi.graph_utils import ( + GraphFst, + delete_space, + delete_preserve_order, + extract_wrapper_content, + insert_space, + extract_field, +) + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing measure for Vietnamese, e.g. + measure { negative: "true" cardinal { integer: "mười hai" } units: "ki lô gam" } -> âm mười hai ki lô gam + measure { decimal { integer_part: "mười hai" fractional_part: "năm" } units: "ki lô gam" } -> mười hai phẩy năm ki lô gam + measure { cardinal { integer: "một" } units: "ki lô gam" } -> một ki lô gam + + Args: + decimal: DecimalFst verbalizer + cardinal: CardinalFst verbalizer + fraction: FractionFst verbalizer + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True): + super().__init__(name="measure", kind="verbalize", deterministic=deterministic) + + # Extract components + unit = extract_field("units") + + # Combine all number types into single graph + number_graph = ( + extract_wrapper_content("decimal", decimal.numbers) + | extract_wrapper_content("cardinal", cardinal.numbers) + | extract_wrapper_content("fraction", fraction.numbers) + ) + + # Main pattern: number + space + unit (most common case) + graph = number_graph + delete_space + insert_space + unit + + # Handle preserve_order: unit + space + number + graph |= ( + unit + delete_space + insert_space + number_graph + delete_preserve_order + ) + + self.fst = self.delete_tokens(graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/money.py b/nemo_text_processing/text_normalization/vi/verbalizers/money.py index 035b4acd1..94d8927fa 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/money.py @@ -18,6 +18,7 @@ from nemo_text_processing.text_normalization.vi.graph_utils import ( NEMO_COMMA_VI, NEMO_NOT_QUOTE, + NEMO_SPACE, GraphFst, delete_preserve_order, delete_space, @@ -96,8 +97,23 @@ def __init__(self, deterministic: bool = True): | graph_integer # Handle simple cases (most common, lowest priority) ) - # Add per-unit support (following English pattern) - per_units = pynini.string_file(get_abs_path("data/money/per_unit.tsv")) + per_units_non_metric = pynini.string_file(get_abs_path("data/money/per_unit_non_metric.tsv")) + + per_unit_prefixes = pynini.string_file(get_abs_path("data/money/per_unit_prefixes.tsv")) + per_unit_bases = pynini.string_file(get_abs_path("data/money/per_unit_bases.tsv")) + + prefixes_vn = pynini.project(per_unit_prefixes, "output") + bases_vn = pynini.project(per_unit_bases, "output") + + space_accept = pynini.accep(NEMO_SPACE) + one_space = pynini.accep("một ") + + # Accept metric combinations: "một ki lô gam" + metric_per_units = one_space + prefixes_vn + space_accept + bases_vn + standalone_per_units = one_space + bases_vn + + # Combine all per_unit recognitions + per_units = per_units_non_metric | metric_per_units | standalone_per_units per_units_normalized = pynini.project(per_units, "output") per_unit_pattern = ( pynutil.delete(' morphosyntactic_features: "') + insert_space + per_units_normalized + pynutil.delete('"') diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py new file mode 100644 index 000000000..a42fdcef3 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py @@ -0,0 +1,148 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict, List + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import ( + NEMO_SIGMA, + generator_main, +) +from nemo_text_processing.utils.logging import logger + + +class PostProcessingFst: + """ + Finite state transducer that post-processes an entire Vietnamese sentence after verbalization is complete, e.g. + removes extra spaces around punctuation marks " ( một trăm hai mươi ba ) " -> "(một trăm hai mươi ba)" + + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, "vi_tn_post_processing.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["post_process_graph"] + logger.info(f'Post processing graph was restored from {far_file}.') + else: + self.set_punct_dict() + self.fst = self.get_punct_postprocess_graph() + + if far_file: + generator_main(far_file, {"post_process_graph": self.fst}) + + def get_vietnamese_punct_config(self) -> Dict[str, List[str]]: + """ + Returns Vietnamese-specific punctuation configuration. + This method can be easily modified or extended for different Vietnamese punctuation rules. + """ + return { + # Punctuation that should not have space before them + 'no_space_before': [",", ".", "!", "?", ":", ";", ")", r"\]", "}", "\""], + + # Punctuation that should not have space after them + 'no_space_after': ["(", r"\[", "{"], + + # Punctuation that can have space before them (exceptions) + 'allow_space_before': ["&", "-", "—", "–", "(", r"\[", "{", "\"", "'", "«", "»"], + + # Special Vietnamese punctuation handling + 'vietnamese_special': { + # Vietnamese quotation marks + 'quotes': ["\"", "'", "«", "»", """, """, "'", "'"], + # Vietnamese dashes and separators + 'dashes': ["-", "—", "–"], + # Vietnamese brackets + 'brackets': ["(", ")", r"\[", r"\]", "{", "}"], + } + } + + def set_punct_dict(self): + # Vietnamese punctuation marks that might need special handling + self.punct_marks = { + "'": [ + "'", + '´', + 'ʹ', + 'ʻ', + 'ʼ', + 'ʽ', + 'ʾ', + 'ˈ', + 'ˊ', + 'ˋ', + '˴', + 'ʹ', + '΄', + '`', + '´', + '’', + '‛', + '′', + '‵', + 'ꞌ', + ''', + '`', + ], + } + + def get_punct_postprocess_graph(self): + """ + Returns graph to post process punctuation marks for Vietnamese. + + Uses dynamic configuration for flexible punctuation handling. + Vietnamese punctuation spacing rules are defined in get_vietnamese_punct_config(). + """ + # Get dynamic punctuation configuration + punct_config = self.get_vietnamese_punct_config() + + # Extract configuration + no_space_before_punct = punct_config['no_space_before'] + no_space_after_punct = punct_config['no_space_after'] + + # Create FSTs for punctuation rules + no_space_before_punct_fst = pynini.union(*no_space_before_punct) + no_space_after_punct_fst = pynini.union(*no_space_after_punct) + + delete_space = pynutil.delete(" ") + + # Rule 1: Remove space before punctuation (primary rule) + remove_space_before = pynini.cdrewrite( + delete_space + no_space_before_punct_fst, # " ," -> "," + "", # any context before + "", # any context after + NEMO_SIGMA + ).optimize() + + # Rule 2: Remove space after opening brackets + remove_space_after = pynini.cdrewrite( + no_space_after_punct_fst + delete_space, # "( " -> "(" + "", + "", + NEMO_SIGMA + ).optimize() + + # Combine the two main rules + graph = pynini.compose(remove_space_before, remove_space_after) + + return graph.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index d241301ff..551d539c4 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo_text_processing.text_normalization.vi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst @@ -59,6 +60,9 @@ def __init__(self, deterministic: bool = True): money = MoneyFst(deterministic=deterministic) money_graph = money.fst + measure = MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic) + measure_graph = measure.fst + graph = ( cardinal_graph | whitelist_graph @@ -70,6 +74,7 @@ def __init__(self, deterministic: bool = True): | roman_graph | time_graph | money_graph + | measure_graph ) self.fst = graph diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..356d228cc --- /dev/null +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt @@ -0,0 +1,57 @@ +204m~hai trăm linh bốn mét +12kg~mười hai ki lô gam +1kg~một ki lô gam +100g~một trăm gam +500mg~năm trăm mi li gam +175cm~một trăm bảy mươi lăm xăng ti mét +2m~hai mét +100mm~một trăm mi li mét +5km~năm ki lô mét +1inch~một inch +500ml~năm trăm mi li lít +2l~hai lít +1m³~một mét khối +100cm³~một trăm xăng ti mét khối +2gb~hai gi ga bai +1tb~một terabyte +512Mb~năm trăm mười hai mê ga bai +64kb~sáu mươi tư ki lô bai +25°c~hai mươi lăm độ c +100°f~một trăm độ f +273°k~hai trăm bảy mươi ba độ k +50%~năm mươi phần trăm +100%~một trăm phần trăm +25%~hai mươi lăm phần trăm +220v~hai trăm hai mươi vôn +1kw~một ki lô oát +500mV~năm trăm mi li vôn +1000mA~một nghìn mi li am pe +50hz~năm mươi hẹc +2ghz~hai gi ga hẹc +100Mhz~một trăm mê ga hẹc +1000kw~một nghìn ki lô oát +5hp~năm mã lực +1tw~một tê ra oát +100m²~một trăm mét vuông +5km²~năm ki lô mét vuông +1km2~một ki lô mét vuông +8,5m2~tám phẩy năm mét vuông +1ha~một héc ta +1/2kg~một phần hai ki lô gam +3/4m~ba phần tư mét +1/3l~một phần ba lít +Tôi có 12kg gạo~Tôi có mười hai ki lô gam gạo +Chiều cao 175cm~Chiều cao một trăm bảy mươi lăm xăng ti mét +Dung lượng 2gb~Dung lượng hai gi ga bai +Nhiệt độ 25°c~Nhiệt độ hai mươi lăm độ c +Cân nặng 1/2kg~Cân nặng một phần hai ki lô gam +Điện áp 220v~Điện áp hai trăm hai mươi vôn +Tỷ lệ 50%~Tỷ lệ năm mươi phần trăm +Bộ nhớ 1tb~Bộ nhớ một terabyte +Thể tích 500ml~Thể tích năm trăm mi li lít +1234kg~một nghìn hai trăm ba mươi tư ki lô gam +2500m~hai nghìn năm trăm mét +10000gb~mười nghìn gi ga bai +Kích thước 100cm x 50cm~Kích thước một trăm xăng ti mét x năm mươi xăng ti mét +1,5m2~một phẩy năm mét vuông +1,5m~một phẩy năm mét \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_money.txt index b5ef741ac..755a1030a 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_money.txt @@ -26,4 +26,5 @@ 0,01$~một xu 2,50€~hai ơ rô năm mươi xu 1000,50 VND~một nghìn phẩy năm không đồng -5,99$~năm đô la chín mươi chín xu \ No newline at end of file +5,99$~năm đô la chín mươi chín xu +30đ/TB~ba mươi đồng một tê ra bai \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt index 64263db2d..8f776fd17 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt @@ -5,4 +5,6 @@ 1t-2t~một tỷ đến hai tỷ 10:00-11:00~mười giờ đến mười một giờ 10$-20$~mười đô la đến hai mươi đô la -50.000đ-100.000đ~năm mươi nghìn đồng đến một trăm nghìn đồng \ No newline at end of file +50.000đ-100.000đ~năm mươi nghìn đồng đến một trăm nghìn đồng +3kg-6kg~ba ki lô gam đến sáu ki lô gam +15cm-25cm~mười lăm xăng ti mét đến hai mươi lăm xăng ti mét \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/test_measure.py b/tests/nemo_text_processing/vi/test_measure.py index 991cbc487..2b060b8fc 100644 --- a/tests/nemo_text_processing/vi/test_measure.py +++ b/tests/nemo_text_processing/vi/test_measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,32 +12,47 @@ # See the License for the specific language governing permissions and # limitations under the License. - +# pytest tests/nemo_text_processing/vi/test_measure.py --cpu --cache-clear import pytest from parameterized import parameterized -from ..utils import CACHE_DIR, parse_test_case_file - -try: - from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio - PYNINI_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - PYNINI_AVAILABLE = False +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestMeasure: - inverse_normalizer = ( - InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None - ) + inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_measure.txt')) - @pytest.mark.skipif( - not PYNINI_AVAILABLE, - reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh", - ) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True) + + normalizer_with_audio = ( + NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + if CACHE_DIR and RUN_AUDIO_BASED_TESTS + else None + ) + + @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) + assert pred == expected, f"input: {test_input}" + + if self.normalizer_with_audio: + pred_non_deterministic = self.normalizer_with_audio.normalize( + test_input, + n_tagged=30, + punct_post_process=False, + ) + assert expected in pred_non_deterministic, f"input: {test_input}" \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh index 4ab27882a..c11d66ef0 100644 --- a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh @@ -58,10 +58,10 @@ testTNTime() { runtest $input } -# testTNMeasure() { -# input=$PROJECT_DIR/vi/data_text_normalization/test_cases_measure.txt -# runtest $input -# } +testTNMeasure() { + input=$PROJECT_DIR/vi/data_text_normalization/test_cases_measure.txt + runtest $input +} testTNMoney() { input=$PROJECT_DIR/vi/data_text_normalization/test_cases_money.txt diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index bc19f428d..445c71c98 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -243,6 +243,9 @@ def parse_args(): from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) + from nemo_text_processing.text_normalization.vi.verbalizers.post_processing import ( + PostProcessingFst as TNPostProcessingFst, + ) from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst elif args.language == 'zh': from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ( From 5d7d3e8a53a69645aca5a55b0930e3945b22eabb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 2 Aug 2025 05:15:16 +0000 Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: folivoramanh --- .../text_normalization/normalize.py | 4 +- .../text_normalization/vi/graph_utils.py | 8 +-- .../text_normalization/vi/taggers/fraction.py | 4 +- .../text_normalization/vi/taggers/measure.py | 63 ++++++++++--------- .../text_normalization/vi/taggers/money.py | 14 ++--- .../vi/taggers/tokenize_and_classify.py | 15 +++-- .../vi/verbalizers/fraction.py | 4 +- .../vi/verbalizers/measure.py | 16 +++-- .../vi/verbalizers/money.py | 10 +-- .../vi/verbalizers/post_processing.py | 33 ++++------ .../vi/verbalizers/verbalize.py | 4 +- tests/nemo_text_processing/vi/test_measure.py | 12 ++-- 12 files changed, 90 insertions(+), 97 deletions(-) diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 6abc7f3a0..4ce71fa2b 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -176,9 +176,9 @@ def __init__( from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst elif lang == 'vi': from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ClassifyFst - from nemo_text_processing.text_normalization.vi.verbalizers.verbalize_final import VerbalizeFinalFst from nemo_text_processing.text_normalization.vi.verbalizers.post_processing import PostProcessingFst - + from nemo_text_processing.text_normalization.vi.verbalizers.verbalize_final import VerbalizeFinalFst + if post_process: self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) else: diff --git a/nemo_text_processing/text_normalization/vi/graph_utils.py b/nemo_text_processing/text_normalization/vi/graph_utils.py index 8c45bcef5..1c0c1a0ab 100644 --- a/nemo_text_processing/text_normalization/vi/graph_utils.py +++ b/nemo_text_processing/text_normalization/vi/graph_utils.py @@ -64,13 +64,7 @@ def extract_field(field_name): def extract_wrapper_content(wrapper_type: str, content_graph): """Helper to extract content from wrapper like 'decimal { ... }'""" - return ( - pynutil.delete(f"{wrapper_type} {{") - + delete_space - + content_graph - + delete_space - + pynutil.delete("}") - ) + return pynutil.delete(f"{wrapper_type} {{") + delete_space + content_graph + delete_space + pynutil.delete("}") def convert_space(fst) -> "pynini.FstLike": diff --git a/nemo_text_processing/text_normalization/vi/taggers/fraction.py b/nemo_text_processing/text_normalization/vi/taggers/fraction.py index 86fd6de12..ca1d11ebf 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/taggers/fraction.py @@ -62,11 +62,11 @@ def __init__(self, cardinal: CardinalFst, deterministic: bool = True): simple_fraction = numerator + denominator mixed_fraction = integer_part + pynutil.delete(" ") + numerator + denominator - + # Create graph without negative for reuse in other FSTs (like measure) fraction_wo_negative = simple_fraction | mixed_fraction self.final_graph_wo_negative = fraction_wo_negative.optimize() - + optional_graph_negative = (pynutil.insert("negative: ") + pynini.cross("-", "\"true\" ")).ques self.fst = self.add_tokens(optional_graph_negative + (simple_fraction | mixed_fraction)).optimize() diff --git a/nemo_text_processing/text_normalization/vi/taggers/measure.py b/nemo_text_processing/text_normalization/vi/taggers/measure.py index 9b97f0a53..3e597e9c9 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/vi/taggers/measure.py @@ -15,12 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.graph_utils import ( - NEMO_DIGIT, - NEMO_COMMA, - NEMO_SPACE, - GraphFst, -) +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_COMMA, NEMO_DIGIT, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels @@ -46,50 +41,54 @@ def _create_measure_subgraph(self, measure_type: str, number_graph, optional_neg optional_negative + pynutil.insert(f"{measure_type} {{ ") + number_graph - + pynutil.insert(" } units: \"") - + graph_unit + + pynutil.insert(" } units: \"") + + graph_unit + pynutil.insert('"') ) def __init__( - self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, + self, + cardinal: GraphFst, + decimal: GraphFst, + fraction: GraphFst, + deterministic: bool = True, ): super().__init__(name="measure", kind="classify", deterministic=deterministic) - + cardinal_graph = cardinal.graph # Load minimal measurement files (massive redundancy removed via subfst) measurements_path = get_abs_path("data/measure/measurements_minimal.tsv") prefixes_path = get_abs_path("data/measure/prefixes.tsv") base_units_path = get_abs_path("data/measure/base_units.tsv") - + # Create subfst for metric units: prefix + space + base_unit graph_prefixes = pynini.string_file(prefixes_path) graph_base_units = pynini.string_file(base_units_path) space = pynutil.insert(NEMO_SPACE) graph_metric_units = graph_prefixes + space + graph_base_units - + # Load non-metric and special units graph_special_units = pynini.string_file(measurements_path) - + # Also allow base units without prefixes (e.g., 'g' not just 'kg') graph_standalone_units = graph_base_units - + # Combine all unit mappings graph_unit = graph_metric_units | graph_special_units | graph_standalone_units - + # Create unit symbol pattern using FST operations (no loops needed) prefix_symbols = pynini.project(graph_prefixes, "input") # Extract prefix symbols - base_symbols = pynini.project(graph_base_units, "input") # Extract base symbols + base_symbols = pynini.project(graph_base_units, "input") # Extract base symbols special_symbols = pynini.project(graph_special_units, "input") # Extract special symbols - + # Build unit pattern: metric combinations | standalone bases | special units metric_pattern = prefix_symbols + base_symbols # All prefix+base combinations unit_pattern = metric_pattern | base_symbols | special_symbols - + number = pynini.closure(NEMO_DIGIT, 1) decimal_number = number + NEMO_COMMA + pynini.closure(NEMO_DIGIT, 1) - + # Optional negative sign handling for Vietnamese optional_graph_negative = pynini.closure( pynini.cross(pynini.union("âm", "trừ"), "negative: \"true\" "), @@ -100,26 +99,28 @@ def __init__( # Domain restriction patterns - only match core number+unit patterns # Remove punctuation handling to let punctuation tagger handle it separately integer_measure_domain = number + unit_pattern - decimal_measure_domain = decimal_number + unit_pattern + decimal_measure_domain = decimal_number + unit_pattern fraction_measure_domain = number + "/" + number + unit_pattern cardinal_number_graph = pynutil.insert('integer: "') + (number @ cardinal_graph) + pynutil.insert('"') - - subgraph_cardinal = self._create_measure_subgraph("cardinal", cardinal_number_graph, optional_graph_negative, graph_unit) - subgraph_decimal = self._create_measure_subgraph("decimal", decimal.final_graph_wo_negative, optional_graph_negative, graph_unit) - subgraph_fraction = self._create_measure_subgraph("fraction", fraction.final_graph_wo_negative, optional_graph_negative, graph_unit) + + subgraph_cardinal = self._create_measure_subgraph( + "cardinal", cardinal_number_graph, optional_graph_negative, graph_unit + ) + subgraph_decimal = self._create_measure_subgraph( + "decimal", decimal.final_graph_wo_negative, optional_graph_negative, graph_unit + ) + subgraph_fraction = self._create_measure_subgraph( + "fraction", fraction.final_graph_wo_negative, optional_graph_negative, graph_unit + ) # Apply domain restrictions to ensure we only match complete number+unit patterns subgraph_cardinal = pynini.compose(integer_measure_domain, subgraph_cardinal) - subgraph_decimal = pynini.compose(decimal_measure_domain, subgraph_decimal) + subgraph_decimal = pynini.compose(decimal_measure_domain, subgraph_decimal) subgraph_fraction = pynini.compose(fraction_measure_domain, subgraph_fraction) # Final graph combining main patterns - final_graph = ( - subgraph_cardinal - | subgraph_decimal - | subgraph_fraction - ) + final_graph = subgraph_cardinal | subgraph_decimal | subgraph_fraction final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/vi/taggers/money.py b/nemo_text_processing/text_normalization/vi/taggers/money.py index 30b384006..540094591 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/money.py +++ b/nemo_text_processing/text_normalization/vi/taggers/money.py @@ -47,27 +47,27 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = currency_major_labels = load_labels(get_abs_path("data/money/currency.tsv")) currency_minor_labels = load_labels(get_abs_path("data/money/currency_minor.tsv")) quantity_graph = pynini.string_file(get_abs_path("data/numbers/quantity_abbr.tsv")) - + # Load optimized per_unit files using subfst approach per_unit_non_metric_path = get_abs_path("data/money/per_unit_non_metric.tsv") per_unit_prefixes_path = get_abs_path("data/money/per_unit_prefixes.tsv") per_unit_bases_path = get_abs_path("data/money/per_unit_bases.tsv") - + # Create subfst for metric per_unit patterns graph_prefixes = pynini.string_file(per_unit_prefixes_path) graph_bases = pynini.string_file(per_unit_bases_path) - - # Build metric combinations: "/kg" -> "một ki lô gam" + + # Build metric combinations: "/kg" -> "một ki lô gam" slash = pynutil.delete("/") one_space = pynutil.insert("một ") space = pynutil.insert(NEMO_SPACE) - + graph_metric_per_units = slash + one_space + graph_prefixes + space + graph_bases graph_standalone_per_units = slash + one_space + graph_bases - + # Load non-metric per_unit entries graph_non_metric_per_units = pynini.string_file(per_unit_non_metric_path) - + # Combine all per_unit mappings per_unit_graph = graph_metric_per_units | graph_standalone_per_units | graph_non_metric_per_units diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 2339690c0..4588ebe75 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -18,7 +18,6 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.vi.graph_utils import ( GraphFst, delete_extra_space, @@ -29,6 +28,7 @@ from nemo_text_processing.text_normalization.vi.taggers.date import DateFst from nemo_text_processing.text_normalization.vi.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.taggers.fraction import FractionFst +from nemo_text_processing.text_normalization.vi.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.vi.taggers.money import MoneyFst from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst @@ -129,7 +129,7 @@ def __init__( measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) measure_graph = measure.fst logger.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") - + # Create composed verbalizers for range processing start_time = time.time() v_cardinal = VCardinalFst(deterministic=deterministic) @@ -146,12 +146,19 @@ def __init__( money_final = pynini.compose(money_graph, v_money.fst) v_fraction = VFractionFst(deterministic=deterministic) - v_measure = VMeasureFst(decimal=v_decimal, cardinal=v_cardinal, fraction=v_fraction, deterministic=deterministic) + v_measure = VMeasureFst( + decimal=v_decimal, cardinal=v_cardinal, fraction=v_fraction, deterministic=deterministic + ) measure_final = pynini.compose(measure_graph, v_measure.fst) # Create range graph range_fst = RangeFst( - time=time_final, date=date_final, decimal=decimal_final, money=money_final, measure=measure_final, deterministic=deterministic + time=time_final, + date=date_final, + decimal=decimal_final, + money=money_final, + measure=measure_final, + deterministic=deterministic, ) range_graph = range_fst.fst logger.debug(f"range: {time.time() - start_time: .2f}s -- {range_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py index fcb3608e9..675d959df 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py @@ -51,5 +51,5 @@ def __init__(self, deterministic: bool = True): mixed_fraction = integer_tagged + delete_space + pynutil.insert(" và ") + fraction_part self.numbers = optional_sign + (simple_fraction | mixed_fraction) - - self.fst = self.delete_tokens(self.numbers).optimize() \ No newline at end of file + + self.fst = self.delete_tokens(self.numbers).optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/measure.py b/nemo_text_processing/text_normalization/vi/verbalizers/measure.py index 805942d1b..aa5ae2ca4 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/measure.py @@ -14,11 +14,11 @@ from nemo_text_processing.text_normalization.vi.graph_utils import ( GraphFst, - delete_space, delete_preserve_order, + delete_space, + extract_field, extract_wrapper_content, insert_space, - extract_field, ) @@ -31,7 +31,7 @@ class MeasureFst(GraphFst): Args: decimal: DecimalFst verbalizer - cardinal: CardinalFst verbalizer + cardinal: CardinalFst verbalizer fraction: FractionFst verbalizer deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) @@ -39,10 +39,10 @@ class MeasureFst(GraphFst): def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True): super().__init__(name="measure", kind="verbalize", deterministic=deterministic) - + # Extract components unit = extract_field("units") - + # Combine all number types into single graph number_graph = ( extract_wrapper_content("decimal", decimal.numbers) @@ -54,8 +54,6 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, de graph = number_graph + delete_space + insert_space + unit # Handle preserve_order: unit + space + number - graph |= ( - unit + delete_space + insert_space + number_graph + delete_preserve_order - ) + graph |= unit + delete_space + insert_space + number_graph + delete_preserve_order - self.fst = self.delete_tokens(graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/money.py b/nemo_text_processing/text_normalization/vi/verbalizers/money.py index 94d8927fa..391cce1b3 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/money.py @@ -98,20 +98,20 @@ def __init__(self, deterministic: bool = True): ) per_units_non_metric = pynini.string_file(get_abs_path("data/money/per_unit_non_metric.tsv")) - + per_unit_prefixes = pynini.string_file(get_abs_path("data/money/per_unit_prefixes.tsv")) per_unit_bases = pynini.string_file(get_abs_path("data/money/per_unit_bases.tsv")) - + prefixes_vn = pynini.project(per_unit_prefixes, "output") bases_vn = pynini.project(per_unit_bases, "output") - + space_accept = pynini.accep(NEMO_SPACE) one_space = pynini.accep("một ") - + # Accept metric combinations: "một ki lô gam" metric_per_units = one_space + prefixes_vn + space_accept + bases_vn standalone_per_units = one_space + bases_vn - + # Combine all per_unit recognitions per_units = per_units_non_metric | metric_per_units | standalone_per_units per_units_normalized = pynini.project(per_units, "output") diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py index a42fdcef3..499251e33 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py @@ -18,10 +18,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.graph_utils import ( - NEMO_SIGMA, - generator_main, -) +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_SIGMA, generator_main from nemo_text_processing.utils.logging import logger @@ -59,13 +56,10 @@ def get_vietnamese_punct_config(self) -> Dict[str, List[str]]: return { # Punctuation that should not have space before them 'no_space_before': [",", ".", "!", "?", ":", ";", ")", r"\]", "}", "\""], - - # Punctuation that should not have space after them + # Punctuation that should not have space after them 'no_space_after': ["(", r"\[", "{"], - # Punctuation that can have space before them (exceptions) 'allow_space_before': ["&", "-", "—", "–", "(", r"\[", "{", "\"", "'", "«", "»"], - # Special Vietnamese punctuation handling 'vietnamese_special': { # Vietnamese quotation marks @@ -74,7 +68,7 @@ def get_vietnamese_punct_config(self) -> Dict[str, List[str]]: 'dashes': ["-", "—", "–"], # Vietnamese brackets 'brackets': ["(", ")", r"\[", r"\]", "{", "}"], - } + }, } def set_punct_dict(self): @@ -109,40 +103,37 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ Returns graph to post process punctuation marks for Vietnamese. - + Uses dynamic configuration for flexible punctuation handling. Vietnamese punctuation spacing rules are defined in get_vietnamese_punct_config(). """ # Get dynamic punctuation configuration punct_config = self.get_vietnamese_punct_config() - + # Extract configuration no_space_before_punct = punct_config['no_space_before'] no_space_after_punct = punct_config['no_space_after'] - + # Create FSTs for punctuation rules no_space_before_punct_fst = pynini.union(*no_space_before_punct) no_space_after_punct_fst = pynini.union(*no_space_after_punct) - + delete_space = pynutil.delete(" ") # Rule 1: Remove space before punctuation (primary rule) remove_space_before = pynini.cdrewrite( delete_space + no_space_before_punct_fst, # " ," -> "," "", # any context before - "", # any context after - NEMO_SIGMA + "", # any context after + NEMO_SIGMA, ).optimize() - # Rule 2: Remove space after opening brackets + # Rule 2: Remove space after opening brackets remove_space_after = pynini.cdrewrite( - no_space_after_punct_fst + delete_space, # "( " -> "(" - "", - "", - NEMO_SIGMA + no_space_after_punct_fst + delete_space, "", "", NEMO_SIGMA # "( " -> "(" ).optimize() # Combine the two main rules graph = pynini.compose(remove_space_before, remove_space_after) - + return graph.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index 551d539c4..851cd35a9 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.vi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst +from nemo_text_processing.text_normalization.vi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.vi.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.vi.verbalizers.roman import RomanFst @@ -62,7 +62,7 @@ def __init__(self, deterministic: bool = True): measure = MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic) measure_graph = measure.fst - + graph = ( cardinal_graph | whitelist_graph diff --git a/tests/nemo_text_processing/vi/test_measure.py b/tests/nemo_text_processing/vi/test_measure.py index 2b060b8fc..21e50ef11 100644 --- a/tests/nemo_text_processing/vi/test_measure.py +++ b/tests/nemo_text_processing/vi/test_measure.py @@ -25,8 +25,8 @@ class TestMeasure: - inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) - + inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_measure.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -34,8 +34,10 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True) - + normalizer = Normalizer( + input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + normalizer_with_audio = ( NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if CACHE_DIR and RUN_AUDIO_BASED_TESTS @@ -55,4 +57,4 @@ def test_norm(self, test_input, expected): n_tagged=30, punct_post_process=False, ) - assert expected in pred_non_deterministic, f"input: {test_input}" \ No newline at end of file + assert expected in pred_non_deterministic, f"input: {test_input}" From 3ed2bd338162d1fa56284a930c1f89220eeffdc2 Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Sat, 2 Aug 2025 12:19:50 +0700 Subject: [PATCH 03/12] add test case for range measure Signed-off-by: folivoramanh --- .../vi/data_text_normalization/test_cases_range.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt index 8f776fd17..ea858dc4f 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt @@ -7,4 +7,5 @@ 10$-20$~mười đô la đến hai mươi đô la 50.000đ-100.000đ~năm mươi nghìn đồng đến một trăm nghìn đồng 3kg-6kg~ba ki lô gam đến sáu ki lô gam -15cm-25cm~mười lăm xăng ti mét đến hai mươi lăm xăng ti mét \ No newline at end of file +15cm-25cm~mười lăm xăng ti mét đến hai mươi lăm xăng ti mét +31Mhz-44Mhz~ba mươi mốt mê ga hẹc đến bốn mươi tư mê ga hẹc \ No newline at end of file From 06778f7ef140e42979539421d6f6b8fb2123c636 Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Tue, 5 Aug 2025 03:06:32 +0700 Subject: [PATCH 04/12] additional support for cardinal and remove duplicate test case Signed-off-by: folivoramanh --- .../text_normalization/run_evaluate.py | 2 +- .../vi/data/numbers/magnitudes.tsv | 3 + .../text_normalization/vi/taggers/cardinal.py | 168 ++++++++++-------- .../test_cases_cardinal.txt | 1 - 4 files changed, 94 insertions(+), 80 deletions(-) diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 0438579a7..3b1696c0e 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'vi'], default="en", type=str, ) diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv index c8a08083c..da60cb686 100644 --- a/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv +++ b/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv @@ -1,5 +1,8 @@ thousand nghìn million triệu billion tỷ +trillion nghìn tỷ +quadrillion triệu tỷ +quintillion tỷ tỷ hundred trăm linh linh \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index 8d12c9e72..5e77f2c2f 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -16,7 +16,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst, insert_space -from nemo_text_processing.text_normalization.vi.utils import get_abs_path +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels class CardinalFst(GraphFst): @@ -31,15 +31,11 @@ def __init__(self, deterministic: bool = True): } self.zero, self.digit, self.teen, self.ties = resources.values() - with open(get_abs_path("data/numbers/magnitudes.tsv"), 'r', encoding='utf-8') as f: - self.magnitudes = {parts[0]: parts[1] for line in f if len(parts := line.strip().split('\t')) == 2} + magnitudes_labels = load_labels(get_abs_path("data/numbers/magnitudes.tsv")) + self.magnitudes = {parts[0]: parts[1] for parts in magnitudes_labels if len(parts) == 2} - with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f: - special = { - parts[0]: {'std': parts[1], 'alt': parts[2]} - for line in f - if len(parts := line.strip().split('\t')) >= 3 - } + digit_special_labels = load_labels(get_abs_path("data/numbers/digit_special.tsv")) + special = {parts[0]: {'std': parts[1], 'alt': parts[2]} for parts in digit_special_labels if len(parts) >= 3} self.special_digits = pynini.union( *[pynini.cross(k, v["alt"]) for k, v in special.items() if k in ["1", "4", "5"]] @@ -73,64 +69,43 @@ def __init__(self, deterministic: bool = True): + self.linh_digits, # XYZ: một trăm hai mười ba, etc. self.single_digit + insert_space + pynutil.insert(hundred_word) + insert_space + self.two_digit, + # 0YZ: Handle numbers starting with 0 (e.g., 087 -> tám mươi bảy) + pynutil.delete("0") + self.two_digit, + # 00Z: Handle numbers starting with 00 (e.g., 008 -> tám) + pynutil.delete("00") + self.single_digit, ) self.hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ self.hundreds_pattern - # Build magnitude patterns (thousands, millions, billions) - self.thousand = self._build_magnitude_pattern("thousand", 4, 6, 3) - self.million = self._build_magnitude_pattern("million", 7, 9, 6, self.thousand) - self.billion = self._build_magnitude_pattern("billion", 10, 12, 9, self.million) + # Build magnitude patterns dynamically + self.magnitude_patterns = self._build_all_magnitude_patterns() # Handle dot-separated numbers: 1.000, 1.000.000, etc. delete_dot = pynutil.delete(".") dot_patterns = [] - # Thousand with dots: 1.000 - dot_patterns.append( - pynini.compose( - (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 2) + delete_dot + NEMO_DIGIT**3, self.thousand - ) - ) - - # Million with dots: 1.000.000 - dot_patterns.append( - pynini.compose( - (NEMO_DIGIT - "0") - + pynini.closure(NEMO_DIGIT, 0, 2) - + delete_dot - + NEMO_DIGIT**3 - + delete_dot - + NEMO_DIGIT**3, - self.million, - ) - ) - - # Billion with dots: 1.000.000.000 - dot_patterns.append( - pynini.compose( - (NEMO_DIGIT - "0") - + pynini.closure(NEMO_DIGIT, 0, 2) - + delete_dot - + NEMO_DIGIT**3 - + delete_dot - + NEMO_DIGIT**3 - + delete_dot - + NEMO_DIGIT**3, - self.billion, - ) - ) - - self.graph = pynini.union( - self.billion, - self.million, - self.thousand, + # Build dot patterns automatically for all available magnitudes + for i, magnitude_name in enumerate( + ["thousand", "million", "billion", "trillion", "quadrillion", "quintillion"], 1 + ): + if magnitude_name in self.magnitude_patterns: + # Build pattern: (non-zero digit) + up to 2 digits + (dot + 3 digits) repeated i times + pattern = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 2) + for _ in range(i): # i = number of dot groups for this magnitude + pattern += delete_dot + NEMO_DIGIT ** 3 + + dot_patterns.append(pynini.compose(pattern, self.magnitude_patterns[magnitude_name])) + + # Build final graph with all magnitude patterns + all_patterns = [ + *self.magnitude_patterns.values(), # All magnitude patterns (trillion, billion, million, thousand) self.hundreds, self.two_digit, self.single_digit, self.zero, *dot_patterns, - ).optimize() + ] + self.graph = pynini.union(*all_patterns).optimize() self.single_digits_graph = self.single_digit | self.zero self.graph_with_and = self.graph @@ -172,40 +147,23 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre trailing_prefix = prefix + pynutil.delete("0" * trailing_zeros) if remaining_digits == 1: - digit_patterns.append( + # Prefer "linh" pattern with better weight + linh_pattern = ( trailing_prefix + insert_space + pynutil.insert(linh_word) + insert_space + self.linh_digits ) + digit_patterns.append(pynutil.add_weight(linh_pattern, -0.1)) elif remaining_digits == 2: digit_patterns.append(trailing_prefix + insert_space + self.two_digit) elif remaining_digits == 3: digit_patterns.append(trailing_prefix + insert_space + self.hundreds_pattern) - if name == "million" and digits == 7: - # Handle patterns like 1001001 -> một triệu một nghìn linh một + # Handle special cross-magnitude patterns (e.g., 1001001 -> một triệu một nghìn linh một) + if name == "million" and digits == 7 and "thousand" in self.magnitudes: + # Use helper method to build linh patterns consistently digit_patterns.extend( [ - prefix - + pynutil.delete("00") - + insert_space - + self.single_digit - + insert_space - + pynutil.insert(self.magnitudes["thousand"]) - + pynutil.delete("00") - + insert_space - + pynutil.insert(linh_word) - + insert_space - + self.linh_digits, - prefix - + pynutil.delete("0") - + insert_space - + self.two_digit - + insert_space - + pynutil.insert(self.magnitudes["thousand"]) - + pynutil.delete("00") - + insert_space - + pynutil.insert(linh_word) - + insert_space - + self.linh_digits, + self._build_linh_pattern(prefix, 2, self.magnitudes["thousand"], linh_word, self.single_digit), + self._build_linh_pattern(prefix, 1, self.magnitudes["thousand"], linh_word, self.two_digit), ] ) elif name == "billion" and digits == 10: @@ -229,3 +187,57 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre patterns.append(pynini.closure(NEMO_DIGIT, digits, digits) @ pynini.union(*digit_patterns)) return pynini.union(*patterns) + + def _build_linh_pattern(self, prefix, zeros_to_delete, magnitude_word, linh_word, digit_pattern): + """ + Helper method to build linh patterns consistently + Args: + prefix: base prefix pattern + zeros_to_delete: number of zeros to delete (0, 00, etc.) + magnitude_word: magnitude word to insert + linh_word: linh word to insert + digit_pattern: pattern for the digits (single_digit or two_digit) + """ + pattern = ( + prefix + + pynutil.delete("0" * zeros_to_delete) + + insert_space + + digit_pattern + + insert_space + + pynutil.insert(magnitude_word) + + pynutil.delete("00") + + insert_space + + pynutil.insert(linh_word) + + insert_space + + self.linh_digits + ) + return pynutil.add_weight(pattern, -0.1) + + def _build_all_magnitude_patterns(self): + """ + Dynamically build all magnitude patterns + Returns: dict mapping magnitude names to their FST patterns + """ + # Define magnitude hierarchy (name, min_digits, max_digits, zero_count) + magnitude_config = [ + ("thousand", 4, 6, 3), + ("million", 7, 9, 6), + ("billion", 10, 12, 9), + ("trillion", 13, 15, 12), + ("quadrillion", 16, 18, 15), + ("quintillion", 19, 21, 18), + ] + + patterns = {} + prev_pattern = None + + for name, min_digits, max_digits, zero_count in magnitude_config: + # Only build pattern if the magnitude word exists in magnitudes.tsv + if name in self.magnitudes: + patterns[name] = self._build_magnitude_pattern(name, min_digits, max_digits, zero_count, prev_pattern) + prev_pattern = patterns[name] + else: + # Stop building patterns if magnitude word doesn't exist + break + + return patterns diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt index aad7ae8c1..7eace4dea 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt @@ -54,7 +54,6 @@ -1000~âm một nghìn 0~không 1000~một nghìn -1001~một nghìn linh một 101~một trăm linh một 104~một trăm linh bốn 105~một trăm linh năm From 23535cb4ad3c9f41e177983e260fd9383ef6a904 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Aug 2025 20:07:13 +0000 Subject: [PATCH 05/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/vi/taggers/cardinal.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index 5e77f2c2f..c6d01bb80 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -92,7 +92,7 @@ def __init__(self, deterministic: bool = True): # Build pattern: (non-zero digit) + up to 2 digits + (dot + 3 digits) repeated i times pattern = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 2) for _ in range(i): # i = number of dot groups for this magnitude - pattern += delete_dot + NEMO_DIGIT ** 3 + pattern += delete_dot + NEMO_DIGIT**3 dot_patterns.append(pynini.compose(pattern, self.magnitude_patterns[magnitude_name])) @@ -195,7 +195,7 @@ def _build_linh_pattern(self, prefix, zeros_to_delete, magnitude_word, linh_word prefix: base prefix pattern zeros_to_delete: number of zeros to delete (0, 00, etc.) magnitude_word: magnitude word to insert - linh_word: linh word to insert + linh_word: linh word to insert digit_pattern: pattern for the digits (single_digit or two_digit) """ pattern = ( @@ -215,7 +215,7 @@ def _build_linh_pattern(self, prefix, zeros_to_delete, magnitude_word, linh_word def _build_all_magnitude_patterns(self): """ - Dynamically build all magnitude patterns + Dynamically build all magnitude patterns Returns: dict mapping magnitude names to their FST patterns """ # Define magnitude hierarchy (name, min_digits, max_digits, zero_count) From 60c96a808a4d0557db1cc33ccea69395f22d952f Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Tue, 5 Aug 2025 15:07:59 +0700 Subject: [PATCH 06/12] refractor cardinal and add test cases Signed-off-by: folivoramanh --- .../text_normalization/data_loader_utils.py | 6 +- .../text_normalization/vi/taggers/cardinal.py | 184 +++++++----------- .../test_cases_cardinal.txt | 13 +- 3 files changed, 82 insertions(+), 121 deletions(-) diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 5e7fa1892..cc8800821 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -140,9 +140,9 @@ def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = N acc = acc + 1 else: if input: - print(f"inpu: {json.dumps(input[i])}") - print(f"gold: {json.dumps(label_norm)}") - print(f"pred: {json.dumps(pred_norm)}") + print(f"input: {json.dumps(input[i], ensure_ascii=False)}") + print(f"gold: {json.dumps(label_norm, ensure_ascii=False)}") + print(f"pred: {json.dumps(pred_norm, ensure_ascii=False)}") return acc / nums diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index c6d01bb80..e7ba0b075 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -77,40 +77,19 @@ def __init__(self, deterministic: bool = True): self.hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ self.hundreds_pattern - # Build magnitude patterns dynamically self.magnitude_patterns = self._build_all_magnitude_patterns() - - # Handle dot-separated numbers: 1.000, 1.000.000, etc. - delete_dot = pynutil.delete(".") - dot_patterns = [] - - # Build dot patterns automatically for all available magnitudes - for i, magnitude_name in enumerate( - ["thousand", "million", "billion", "trillion", "quadrillion", "quintillion"], 1 - ): - if magnitude_name in self.magnitude_patterns: - # Build pattern: (non-zero digit) + up to 2 digits + (dot + 3 digits) repeated i times - pattern = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 2) - for _ in range(i): # i = number of dot groups for this magnitude - pattern += delete_dot + NEMO_DIGIT**3 - - dot_patterns.append(pynini.compose(pattern, self.magnitude_patterns[magnitude_name])) - - # Build final graph with all magnitude patterns + custom_patterns = self._build_all_patterns() + all_patterns = [ - *self.magnitude_patterns.values(), # All magnitude patterns (trillion, billion, million, thousand) - self.hundreds, - self.two_digit, - self.single_digit, - self.zero, - *dot_patterns, + *custom_patterns, + *self.magnitude_patterns.values(), + self.hundreds, self.two_digit, self.single_digit, self.zero, ] self.graph = pynini.union(*all_patterns).optimize() self.single_digits_graph = self.single_digit | self.zero self.graph_with_and = self.graph - # Build final FST with optional negative and integer wrapper negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) final_graph = negative + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") self.fst = self.add_tokens(final_graph).optimize() @@ -118,126 +97,105 @@ def __init__(self, deterministic: bool = True): def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, prev_pattern=None): magnitude_word = self.magnitudes[name] linh_word = self.magnitudes["linh"] - patterns = [] + for digits in range(min_digits, max_digits + 1): leading_digits = digits - zero_count - - # Choose leading pattern based on digit count - if leading_digits == 1: - leading_fst = self.single_digit - elif leading_digits == 2: - leading_fst = self.two_digit - else: # 3 digits - leading_fst = self.hundreds_pattern + if leading_digits == 1: leading_fst = self.single_digit + elif leading_digits == 2: leading_fst = self.two_digit + else: leading_fst = self.hundreds_pattern prefix = leading_fst + insert_space + pynutil.insert(magnitude_word) - digit_patterns = [] - - # Case 1: All trailing zeros (e.g., 1000 -> một nghìn) - digit_patterns.append(prefix + pynutil.delete("0" * zero_count)) + digit_patterns = [prefix + pynutil.delete("0" * zero_count)] - # Case 2: Has lower magnitude (e.g., 1001000 -> một triệu một nghìn) - if prev_pattern: + if prev_pattern and name not in ["quadrillion", "quintillion"]: digit_patterns.append(prefix + insert_space + prev_pattern) - # Case 3: Trailing patterns with linh (e.g., 1001 -> một nghìn linh một) for trailing_zeros in range(zero_count): remaining_digits = zero_count - trailing_zeros trailing_prefix = prefix + pynutil.delete("0" * trailing_zeros) if remaining_digits == 1: - # Prefer "linh" pattern with better weight - linh_pattern = ( - trailing_prefix + insert_space + pynutil.insert(linh_word) + insert_space + self.linh_digits - ) + linh_pattern = trailing_prefix + insert_space + pynutil.insert(linh_word) + insert_space + self.linh_digits digit_patterns.append(pynutil.add_weight(linh_pattern, -0.1)) elif remaining_digits == 2: digit_patterns.append(trailing_prefix + insert_space + self.two_digit) elif remaining_digits == 3: digit_patterns.append(trailing_prefix + insert_space + self.hundreds_pattern) - # Handle special cross-magnitude patterns (e.g., 1001001 -> một triệu một nghìn linh một) - if name == "million" and digits == 7 and "thousand" in self.magnitudes: - # Use helper method to build linh patterns consistently - digit_patterns.extend( - [ - self._build_linh_pattern(prefix, 2, self.magnitudes["thousand"], linh_word, self.single_digit), - self._build_linh_pattern(prefix, 1, self.magnitudes["thousand"], linh_word, self.two_digit), - ] - ) - elif name == "billion" and digits == 10: - # Handle patterns like 1001001001 - digit_patterns.append( - prefix - + pynutil.delete("00") - + insert_space - + self.single_digit - + insert_space - + pynutil.insert(self.magnitudes["million"]) - + pynutil.delete("00") - + insert_space - + self.single_digit - + insert_space - + pynutil.insert(self.magnitudes["thousand"]) - + insert_space - + self.hundreds_pattern - ) - patterns.append(pynini.closure(NEMO_DIGIT, digits, digits) @ pynini.union(*digit_patterns)) return pynini.union(*patterns) - def _build_linh_pattern(self, prefix, zeros_to_delete, magnitude_word, linh_word, digit_pattern): - """ - Helper method to build linh patterns consistently - Args: - prefix: base prefix pattern - zeros_to_delete: number of zeros to delete (0, 00, etc.) - magnitude_word: magnitude word to insert - linh_word: linh word to insert - digit_pattern: pattern for the digits (single_digit or two_digit) - """ - pattern = ( - prefix - + pynutil.delete("0" * zeros_to_delete) - + insert_space - + digit_pattern - + insert_space - + pynutil.insert(magnitude_word) - + pynutil.delete("00") - + insert_space - + pynutil.insert(linh_word) - + insert_space - + self.linh_digits - ) - return pynutil.add_weight(pattern, -0.1) - def _build_all_magnitude_patterns(self): - """ - Dynamically build all magnitude patterns - Returns: dict mapping magnitude names to their FST patterns - """ - # Define magnitude hierarchy (name, min_digits, max_digits, zero_count) magnitude_config = [ - ("thousand", 4, 6, 3), - ("million", 7, 9, 6), - ("billion", 10, 12, 9), - ("trillion", 13, 15, 12), - ("quadrillion", 16, 18, 15), - ("quintillion", 19, 21, 18), + ("thousand", 4, 6, 3), ("million", 7, 9, 6), ("billion", 10, 12, 9), + ("trillion", 13, 15, 12), ("quadrillion", 16, 18, 15), ("quintillion", 19, 21, 18), ] - patterns = {} prev_pattern = None - for name, min_digits, max_digits, zero_count in magnitude_config: - # Only build pattern if the magnitude word exists in magnitudes.tsv if name in self.magnitudes: patterns[name] = self._build_magnitude_pattern(name, min_digits, max_digits, zero_count, prev_pattern) prev_pattern = patterns[name] else: - # Stop building patterns if magnitude word doesn't exist break + return patterns + def _get_zero_or_magnitude_pattern(self, digits, magnitude_key): + """Create pattern that handles all-zeros or normal magnitude processing""" + all_zeros = "0" * digits + return pynini.union(pynini.cross(all_zeros, ""), NEMO_DIGIT**digits @ self.magnitude_patterns[magnitude_key]) + + def _build_all_patterns(self): + patterns = [] + delete_dot = pynutil.delete(".") + + # Large number split patterns (>12 digits): front + "tỷ" + back(9 digits) + if "billion" in self.magnitudes: + billion_word = self.magnitudes["billion"] + back_digits = 9 + + for total_digits in range(13, 22): + front_digits = total_digits - back_digits + front_pattern = self._get_pattern_for_digits(front_digits) + if front_pattern: + back_pattern = self._get_zero_or_magnitude_pattern(back_digits, "million") + split_pattern = front_pattern + insert_space + pynutil.insert(billion_word) + insert_space + back_pattern + patterns.append(NEMO_DIGIT**total_digits @ pynutil.add_weight(split_pattern, -0.5)) + + # Dot patterns + dot_configs = [(6, None), (5, None), (4, None), (3, "billion"), (2, "million"), (1, "thousand")] + for dots, magnitude in dot_configs: + pattern = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 2) + for _ in range(dots): + pattern += delete_dot + NEMO_DIGIT**3 + + if magnitude and magnitude in self.magnitude_patterns: + patterns.append(pynini.compose(pynutil.add_weight(pattern, -0.3), self.magnitude_patterns[magnitude])) + elif not magnitude: + if dots == 4: digit_range = [13, 14, 15] + elif dots == 5: digit_range = [16, 17, 18] + elif dots == 6: digit_range = [19, 20, 21] + else: digit_range = [] + + for digit_count in digit_range: + if 13 <= digit_count <= 21: + front_digits = digit_count - back_digits + front_pattern = self._get_pattern_for_digits(front_digits) + if front_pattern: + back_pattern = self._get_zero_or_magnitude_pattern(back_digits, "million") + split = (NEMO_DIGIT**front_digits @ front_pattern) + insert_space + pynutil.insert(self.magnitudes["billion"]) + insert_space + back_pattern + patterns.append(pynini.compose(pattern, NEMO_DIGIT**digit_count @ pynutil.add_weight(split, -1.0))) + return patterns + + def _get_pattern_for_digits(self, digit_count): + if digit_count <= 0: return None + elif digit_count == 1: return self.single_digit + elif digit_count == 2: return self.two_digit + elif digit_count == 3: return self.hundreds_pattern + elif digit_count <= 6: return self.magnitude_patterns.get("thousand") + elif digit_count <= 9: return self.magnitude_patterns.get("million") + elif digit_count <= 12: return self.magnitude_patterns.get("billion") + else: return None diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt index 7eace4dea..74d2b7e98 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt @@ -53,10 +53,6 @@ -100~âm một trăm -1000~âm một nghìn 0~không -1000~một nghìn -101~một trăm linh một -104~một trăm linh bốn -105~một trăm linh năm 24~hai mươi tư 35~ba mươi lăm 41~bốn mươi mốt @@ -103,4 +99,11 @@ 1000101~một triệu một trăm linh một 1010001~một triệu mười nghìn linh một 10000000000~mười tỷ -150~một trăm năm mươi \ No newline at end of file +150~một trăm năm mươi +1000000000000~một nghìn tỷ +1234567890123~một nghìn hai trăm ba mươi tư tỷ năm trăm sáu mươi bảy triệu tám trăm chín mươi nghìn một trăm hai mươi ba +9876543210987~chín nghìn tám trăm bảy mươi sáu tỷ năm trăm bốn mươi ba triệu hai trăm mười nghìn chín trăm tám mươi bảy +1000000000000000~một triệu tỷ +1111111111111111~một triệu một trăm mười một nghìn một trăm mười một tỷ một trăm mười một triệu một trăm mười một nghìn một trăm mười một +5432109876543210~năm triệu bốn trăm ba mươi hai nghìn một trăm linh chín tỷ tám trăm bảy mươi sáu triệu năm trăm bốn mươi ba nghìn hai trăm mười +1000000000000000000~một tỷ tỷ \ No newline at end of file From c5ff5655100c0b17be13c9e97893a57d259a4072 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Aug 2025 08:09:28 +0000 Subject: [PATCH 07/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/vi/taggers/cardinal.py | 98 +++++++++++++------ 1 file changed, 66 insertions(+), 32 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index e7ba0b075..59bb86d26 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -79,11 +79,14 @@ def __init__(self, deterministic: bool = True): self.magnitude_patterns = self._build_all_magnitude_patterns() custom_patterns = self._build_all_patterns() - + all_patterns = [ *custom_patterns, *self.magnitude_patterns.values(), - self.hundreds, self.two_digit, self.single_digit, self.zero, + self.hundreds, + self.two_digit, + self.single_digit, + self.zero, ] self.graph = pynini.union(*all_patterns).optimize() @@ -98,12 +101,15 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre magnitude_word = self.magnitudes[name] linh_word = self.magnitudes["linh"] patterns = [] - + for digits in range(min_digits, max_digits + 1): leading_digits = digits - zero_count - if leading_digits == 1: leading_fst = self.single_digit - elif leading_digits == 2: leading_fst = self.two_digit - else: leading_fst = self.hundreds_pattern + if leading_digits == 1: + leading_fst = self.single_digit + elif leading_digits == 2: + leading_fst = self.two_digit + else: + leading_fst = self.hundreds_pattern prefix = leading_fst + insert_space + pynutil.insert(magnitude_word) digit_patterns = [prefix + pynutil.delete("0" * zero_count)] @@ -116,7 +122,9 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre trailing_prefix = prefix + pynutil.delete("0" * trailing_zeros) if remaining_digits == 1: - linh_pattern = trailing_prefix + insert_space + pynutil.insert(linh_word) + insert_space + self.linh_digits + linh_pattern = ( + trailing_prefix + insert_space + pynutil.insert(linh_word) + insert_space + self.linh_digits + ) digit_patterns.append(pynutil.add_weight(linh_pattern, -0.1)) elif remaining_digits == 2: digit_patterns.append(trailing_prefix + insert_space + self.two_digit) @@ -129,8 +137,12 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre def _build_all_magnitude_patterns(self): magnitude_config = [ - ("thousand", 4, 6, 3), ("million", 7, 9, 6), ("billion", 10, 12, 9), - ("trillion", 13, 15, 12), ("quadrillion", 16, 18, 15), ("quintillion", 19, 21, 18), + ("thousand", 4, 6, 3), + ("million", 7, 9, 6), + ("billion", 10, 12, 9), + ("trillion", 13, 15, 12), + ("quadrillion", 16, 18, 15), + ("quintillion", 19, 21, 18), ] patterns = {} prev_pattern = None @@ -146,56 +158,78 @@ def _get_zero_or_magnitude_pattern(self, digits, magnitude_key): """Create pattern that handles all-zeros or normal magnitude processing""" all_zeros = "0" * digits return pynini.union(pynini.cross(all_zeros, ""), NEMO_DIGIT**digits @ self.magnitude_patterns[magnitude_key]) - + def _build_all_patterns(self): patterns = [] delete_dot = pynutil.delete(".") - + # Large number split patterns (>12 digits): front + "tỷ" + back(9 digits) if "billion" in self.magnitudes: billion_word = self.magnitudes["billion"] back_digits = 9 - + for total_digits in range(13, 22): front_digits = total_digits - back_digits front_pattern = self._get_pattern_for_digits(front_digits) if front_pattern: back_pattern = self._get_zero_or_magnitude_pattern(back_digits, "million") - split_pattern = front_pattern + insert_space + pynutil.insert(billion_word) + insert_space + back_pattern + split_pattern = ( + front_pattern + insert_space + pynutil.insert(billion_word) + insert_space + back_pattern + ) patterns.append(NEMO_DIGIT**total_digits @ pynutil.add_weight(split_pattern, -0.5)) - + # Dot patterns dot_configs = [(6, None), (5, None), (4, None), (3, "billion"), (2, "million"), (1, "thousand")] for dots, magnitude in dot_configs: pattern = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 2) for _ in range(dots): pattern += delete_dot + NEMO_DIGIT**3 - + if magnitude and magnitude in self.magnitude_patterns: patterns.append(pynini.compose(pynutil.add_weight(pattern, -0.3), self.magnitude_patterns[magnitude])) elif not magnitude: - if dots == 4: digit_range = [13, 14, 15] - elif dots == 5: digit_range = [16, 17, 18] - elif dots == 6: digit_range = [19, 20, 21] - else: digit_range = [] - + if dots == 4: + digit_range = [13, 14, 15] + elif dots == 5: + digit_range = [16, 17, 18] + elif dots == 6: + digit_range = [19, 20, 21] + else: + digit_range = [] + for digit_count in digit_range: if 13 <= digit_count <= 21: front_digits = digit_count - back_digits front_pattern = self._get_pattern_for_digits(front_digits) if front_pattern: back_pattern = self._get_zero_or_magnitude_pattern(back_digits, "million") - split = (NEMO_DIGIT**front_digits @ front_pattern) + insert_space + pynutil.insert(self.magnitudes["billion"]) + insert_space + back_pattern - patterns.append(pynini.compose(pattern, NEMO_DIGIT**digit_count @ pynutil.add_weight(split, -1.0))) - + split = ( + (NEMO_DIGIT**front_digits @ front_pattern) + + insert_space + + pynutil.insert(self.magnitudes["billion"]) + + insert_space + + back_pattern + ) + patterns.append( + pynini.compose(pattern, NEMO_DIGIT**digit_count @ pynutil.add_weight(split, -1.0)) + ) + return patterns - + def _get_pattern_for_digits(self, digit_count): - if digit_count <= 0: return None - elif digit_count == 1: return self.single_digit - elif digit_count == 2: return self.two_digit - elif digit_count == 3: return self.hundreds_pattern - elif digit_count <= 6: return self.magnitude_patterns.get("thousand") - elif digit_count <= 9: return self.magnitude_patterns.get("million") - elif digit_count <= 12: return self.magnitude_patterns.get("billion") - else: return None + if digit_count <= 0: + return None + elif digit_count == 1: + return self.single_digit + elif digit_count == 2: + return self.two_digit + elif digit_count == 3: + return self.hundreds_pattern + elif digit_count <= 6: + return self.magnitude_patterns.get("thousand") + elif digit_count <= 9: + return self.magnitude_patterns.get("million") + elif digit_count <= 12: + return self.magnitude_patterns.get("billion") + else: + return None From 90b64416705efcccdfc6e3ceb2f084caa716bb98 Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Tue, 5 Aug 2025 16:20:05 +0700 Subject: [PATCH 08/12] remove duplicate lines in run_eval file Signed-off-by: folivoramanh --- nemo_text_processing/text_normalization/run_evaluate.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 3b1696c0e..ae3160f78 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -104,8 +104,6 @@ def parse_args(): print("- Accuracy: " + str(sum(token_weighted_accuracy) / sum(token_count_per_type.values()))) print(" - Total: " + str(sum(token_count_per_type.values())), '\n') - print(" - Total: " + str(sum(token_count_per_type.values())), '\n') - for token_type in token_accuracy: if token_type not in known_types: raise ValueError("Unexpected token type: " + token_type) From 373a89f0a26ac9580fdad943e0832fe8d508c7f6 Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Fri, 8 Aug 2025 01:31:11 +0700 Subject: [PATCH 09/12] refractor minor code Signed-off-by: folivoramanh --- .../text_normalization/data_loader_utils.py | 6 +-- .../vi/data/numbers/digit_special.tsv | 2 +- .../text_normalization/vi/taggers/measure.py | 2 +- .../vi/verbalizers/money.py | 7 ++- tests/nemo_text_processing/vi/test_measure.py | 46 +++++++++---------- 5 files changed, 31 insertions(+), 32 deletions(-) diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index cc8800821..f861a4336 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -140,9 +140,9 @@ def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = N acc = acc + 1 else: if input: - print(f"input: {json.dumps(input[i], ensure_ascii=False)}") - print(f"gold: {json.dumps(label_norm, ensure_ascii=False)}") - print(f"pred: {json.dumps(pred_norm, ensure_ascii=False)}") + print(f"input: {json.dumps(input[i], ensure_ascii=True)}") + print(f"gold: {json.dumps(label_norm, ensure_ascii=True)}") + print(f"pred: {json.dumps(pred_norm, ensure_ascii=True)}") return acc / nums diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv index 919baaf6e..3c3421528 100644 --- a/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv +++ b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv @@ -1,3 +1,3 @@ 1 một mốt 4 bốn tư -5 năm lăm \ No newline at end of file +5 năm lăm \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/measure.py b/nemo_text_processing/text_normalization/vi/taggers/measure.py index 3e597e9c9..450ca96c4 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/vi/taggers/measure.py @@ -16,7 +16,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_COMMA, NEMO_DIGIT, NEMO_SPACE, GraphFst -from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels +from nemo_text_processing.text_normalization.vi.utils import get_abs_path class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/money.py b/nemo_text_processing/text_normalization/vi/verbalizers/money.py index 391cce1b3..028fd0a35 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/money.py @@ -105,12 +105,11 @@ def __init__(self, deterministic: bool = True): prefixes_vn = pynini.project(per_unit_prefixes, "output") bases_vn = pynini.project(per_unit_bases, "output") - space_accept = pynini.accep(NEMO_SPACE) - one_space = pynini.accep("một ") + one = pynini.accep("một") # Accept metric combinations: "một ki lô gam" - metric_per_units = one_space + prefixes_vn + space_accept + bases_vn - standalone_per_units = one_space + bases_vn + metric_per_units = one + insert_space + prefixes_vn + insert_space + bases_vn + standalone_per_units = one + insert_space + bases_vn # Combine all per_unit recognitions per_units = per_units_non_metric | metric_per_units | standalone_per_units diff --git a/tests/nemo_text_processing/vi/test_measure.py b/tests/nemo_text_processing/vi/test_measure.py index 21e50ef11..18daec547 100644 --- a/tests/nemo_text_processing/vi/test_measure.py +++ b/tests/nemo_text_processing/vi/test_measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,49 +12,49 @@ # See the License for the specific language governing permissions and # limitations under the License. -# pytest tests/nemo_text_processing/vi/test_measure.py --cpu --cache-clear + import pytest from parameterized import parameterized -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from ..utils import CACHE_DIR, parse_test_case_file -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +try: + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from nemo_text_processing.text_normalization.normalize import Normalizer + PYNINI_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + PYNINI_AVAILABLE = False -class TestMeasure: - inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) +class TestMeasure: + inverse_normalizer = ( + InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None + ) @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.skipif( + not PYNINI_AVAILABLE, + reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh", + ) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + normalizer = Normalizer( input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True ) - normalizer_with_audio = ( - NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) - if CACHE_DIR and RUN_AUDIO_BASED_TESTS - else None - ) - @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_measure.txt')) + @pytest.mark.skipif( + not PYNINI_AVAILABLE, + reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh", + ) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) - assert pred == expected, f"input: {test_input}" - - if self.normalizer_with_audio: - pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, - n_tagged=30, - punct_post_process=False, - ) - assert expected in pred_non_deterministic, f"input: {test_input}" + assert pred == expected, f"input: {test_input}" \ No newline at end of file From 2b8d86b19490dc8fadaf9c74dd0612dd62a525b1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Aug 2025 18:34:40 +0000 Subject: [PATCH 10/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/nemo_text_processing/vi/test_measure.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/nemo_text_processing/vi/test_measure.py b/tests/nemo_text_processing/vi/test_measure.py index 18daec547..4cb89cf80 100644 --- a/tests/nemo_text_processing/vi/test_measure.py +++ b/tests/nemo_text_processing/vi/test_measure.py @@ -43,7 +43,6 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer( input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True ) @@ -57,4 +56,4 @@ def test_denorm(self, test_input, expected): @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) - assert pred == expected, f"input: {test_input}" \ No newline at end of file + assert pred == expected, f"input: {test_input}" From d8259670cb054175f1319801cba962b54e1e717e Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Mon, 11 Aug 2025 16:54:00 +0700 Subject: [PATCH 11/12] add measure support for unit per unit cases Signed-off-by: folivoramanh --- .../vi/data/measure/base_units.tsv | 5 ++- .../text_normalization/vi/taggers/measure.py | 33 +++++++++++++++---- .../test_cases_measure.txt | 8 ++++- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv b/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv index fa6c32658..eb9faf2f5 100644 --- a/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv +++ b/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv @@ -14,4 +14,7 @@ b bai B byte pa pascal ω ohm -Ω ôm \ No newline at end of file +Ω ôm +h giờ +min phút +hr giờ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/measure.py b/nemo_text_processing/text_normalization/vi/taggers/measure.py index 450ca96c4..38485f3be 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/vi/taggers/measure.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_COMMA, NEMO_DIGIT, NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_COMMA, NEMO_DIGIT, NEMO_SPACE, GraphFst, delete_space from nemo_text_processing.text_normalization.vi.utils import get_abs_path @@ -41,7 +41,9 @@ def _create_measure_subgraph(self, measure_type: str, number_graph, optional_neg optional_negative + pynutil.insert(f"{measure_type} {{ ") + number_graph - + pynutil.insert(" } units: \"") + + pynutil.insert(" }") + + delete_space + + pynutil.insert(" units: \"") + graph_unit + pynutil.insert('"') ) @@ -77,6 +79,20 @@ def __init__( # Combine all unit mappings graph_unit = graph_metric_units | graph_special_units | graph_standalone_units + # Add compound unit support (unit/unit patterns like km/h) + graph_unit_compound = ( + pynini.cross("/", " trên ") + pynutil.insert(NEMO_SPACE) + graph_unit + ) + + optional_graph_unit_compound = pynini.closure( + pynutil.insert(NEMO_SPACE) + graph_unit_compound, + 0, + 1, + ) + + # Update unit graph to include compound units + graph_unit = graph_unit + optional_graph_unit_compound | graph_unit_compound + # Create unit symbol pattern using FST operations (no loops needed) prefix_symbols = pynini.project(graph_prefixes, "input") # Extract prefix symbols base_symbols = pynini.project(graph_base_units, "input") # Extract base symbols @@ -84,7 +100,11 @@ def __init__( # Build unit pattern: metric combinations | standalone bases | special units metric_pattern = prefix_symbols + base_symbols # All prefix+base combinations - unit_pattern = metric_pattern | base_symbols | special_symbols + simple_unit_pattern = metric_pattern | base_symbols | special_symbols + + # Add compound unit patterns to recognition + compound_pattern = simple_unit_pattern + "/" + simple_unit_pattern + unit_pattern = simple_unit_pattern | compound_pattern number = pynini.closure(NEMO_DIGIT, 1) decimal_number = number + NEMO_COMMA + pynini.closure(NEMO_DIGIT, 1) @@ -98,9 +118,10 @@ def __init__( # Domain restriction patterns - only match core number+unit patterns # Remove punctuation handling to let punctuation tagger handle it separately - integer_measure_domain = number + unit_pattern - decimal_measure_domain = decimal_number + unit_pattern - fraction_measure_domain = number + "/" + number + unit_pattern + optional_space = pynini.closure(NEMO_SPACE, 0, 1) + integer_measure_domain = number + optional_space + unit_pattern + decimal_measure_domain = decimal_number + optional_space + unit_pattern + fraction_measure_domain = number + "/" + number + optional_space + unit_pattern cardinal_number_graph = pynutil.insert('integer: "') + (number @ cardinal_graph) + pynutil.insert('"') diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt index 356d228cc..d3a7adeaa 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt @@ -54,4 +54,10 @@ Thể tích 500ml~Thể tích năm trăm mi li lít 10000gb~mười nghìn gi ga bai Kích thước 100cm x 50cm~Kích thước một trăm xăng ti mét x năm mươi xăng ti mét 1,5m2~một phẩy năm mét vuông -1,5m~một phẩy năm mét \ No newline at end of file +1,5m~một phẩy năm mét +120km/h~một trăm hai mươi ki lô mét trên giờ +100 km/h~một trăm ki lô mét trên giờ +50m/s~năm mươi mét trên giây +30 m/min~ba mươi mét trên phút +5cm/s~năm xăng ti mét trên giây +200mg/ml~hai trăm mi li gam trên mi li lít \ No newline at end of file From e9a68dd47fd752b0e9c58ec07b23d2b224aa798f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Aug 2025 09:55:16 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/vi/taggers/measure.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/taggers/measure.py b/nemo_text_processing/text_normalization/vi/taggers/measure.py index 38485f3be..19650510c 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/vi/taggers/measure.py @@ -15,7 +15,13 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_COMMA, NEMO_DIGIT, NEMO_SPACE, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import ( + NEMO_COMMA, + NEMO_DIGIT, + NEMO_SPACE, + GraphFst, + delete_space, +) from nemo_text_processing.text_normalization.vi.utils import get_abs_path @@ -80,10 +86,8 @@ def __init__( graph_unit = graph_metric_units | graph_special_units | graph_standalone_units # Add compound unit support (unit/unit patterns like km/h) - graph_unit_compound = ( - pynini.cross("/", " trên ") + pynutil.insert(NEMO_SPACE) + graph_unit - ) - + graph_unit_compound = pynini.cross("/", " trên ") + pynutil.insert(NEMO_SPACE) + graph_unit + optional_graph_unit_compound = pynini.closure( pynutil.insert(NEMO_SPACE) + graph_unit_compound, 0, @@ -101,8 +105,8 @@ def __init__( # Build unit pattern: metric combinations | standalone bases | special units metric_pattern = prefix_symbols + base_symbols # All prefix+base combinations simple_unit_pattern = metric_pattern | base_symbols | special_symbols - - # Add compound unit patterns to recognition + + # Add compound unit patterns to recognition compound_pattern = simple_unit_pattern + "/" + simple_unit_pattern unit_pattern = simple_unit_pattern | compound_pattern