diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 5e7fa1892..f861a4336 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -140,9 +140,9 @@ def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = N acc = acc + 1 else: if input: - print(f"inpu: {json.dumps(input[i])}") - print(f"gold: {json.dumps(label_norm)}") - print(f"pred: {json.dumps(pred_norm)}") + print(f"input: {json.dumps(input[i], ensure_ascii=True)}") + print(f"gold: {json.dumps(label_norm, ensure_ascii=True)}") + print(f"pred: {json.dumps(pred_norm, ensure_ascii=True)}") return acc / nums diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 329b28338..4ce71fa2b 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -176,7 +176,11 @@ def __init__( from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst elif lang == 'vi': from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.vi.verbalizers.post_processing import PostProcessingFst from nemo_text_processing.text_normalization.vi.verbalizers.verbalize_final import VerbalizeFinalFst + + if post_process: + self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) else: raise NotImplementedError(f"Language {lang} has not been supported yet.") @@ -377,7 +381,7 @@ def normalize( return text output = SPACE_DUP.sub(' ', output[1:]) - if self.lang == "en" and hasattr(self, 'post_processor'): + if self.lang in ["en", "vi"] and hasattr(self, 'post_processor'): output = self.post_process(output) if punct_post_process: diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 0438579a7..ae3160f78 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'vi'], default="en", type=str, ) @@ -104,8 +104,6 @@ def parse_args(): print("- Accuracy: " + str(sum(token_weighted_accuracy) / sum(token_count_per_type.values()))) print(" - Total: " + str(sum(token_count_per_type.values())), '\n') - print(" - Total: " + str(sum(token_count_per_type.values())), '\n') - for token_type in token_accuracy: if token_type not in known_types: raise ValueError("Unexpected token type: " + token_type) diff --git a/nemo_text_processing/text_normalization/vi/data/measure/__init__.py b/nemo_text_processing/text_normalization/vi/data/measure/__init__.py new file mode 100644 index 000000000..b2de1dca7 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/measure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv b/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv new file mode 100644 index 000000000..eb9faf2f5 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv @@ -0,0 +1,20 @@ +m mét +m2 mét vuông +m3 mét khối +m² mét vuông +m³ mét khối +g gam +l lít +s giây +v vôn +w oát +hz hẹc +A am pe +b bai +B byte +pa pascal +ω ohm +Ω ôm +h giờ +min phút +hr giờ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv b/nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv new file mode 100644 index 000000000..403360057 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv @@ -0,0 +1,25 @@ +°f độ f +°c độ c +°k độ k +ha héc ta +mi mile +ft foot +inch inch +yd yard +% phần trăm +hp mã lực +rad radian +kwh ki lô oát giờ +kbps kilobit trên giây +mbps megabit trên giây +ghz gi ga hẹc +mhz mê ga hẹc +tw tê ra oát +kcal ki lô calo +gb gi ga bai +mb mê ga bai +mV mi li vôn +MV mê ga vôn +tb terabyte +pb petabyte +g gam \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv b/nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv new file mode 100644 index 000000000..649ce73a7 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv @@ -0,0 +1,17 @@ +k ki lô +M mê ga +G gi ga +T tê ra +P pê ta +E ex xa +h hếc tô +da đề ca +d đề xi +c xăng ti +m mi li +µ mi crô +μ mi cờ rô +n na nô +p pi cô +f fem tô +a át tô \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv b/nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv new file mode 100644 index 000000000..feb1808d6 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv @@ -0,0 +1,8 @@ +g gam +m mét +m² mét vuông +m2 mét vuông +m³ mét khối +m3 mét khối +l lít +B bai \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/money/per_unit.tsv b/nemo_text_processing/text_normalization/vi/data/money/per_unit_non_metric.tsv similarity index 56% rename from nemo_text_processing/text_normalization/vi/data/money/per_unit.tsv rename to nemo_text_processing/text_normalization/vi/data/money/per_unit_non_metric.tsv index 66030c5c5..c1ccbdf69 100644 --- a/nemo_text_processing/text_normalization/vi/data/money/per_unit.tsv +++ b/nemo_text_processing/text_normalization/vi/data/money/per_unit_non_metric.tsv @@ -1,5 +1,4 @@ /giờ trên giờ -/g trên giờ /h trên giờ /ngày trên ngày /d trên ngày @@ -13,33 +12,17 @@ /lần một lần /cái một cái /chiếc một chiếc -/kg một ki lô gam -/g một gam -/cm một xăng ti mét -/m một mét -/km một ki lô mét -/cm² một xăng ti mét vuông -/m² một mét vuông -/m2 một mét vuông -/m³ một mét khối -/m3 một mét khối -/l một lít -/ml một mi li lít /người một người /chỗ một chỗ /bài một bài /trang một trang /từ một từ /đồng một đồng -/KB một kilobyte -/GB một gigabyte -/MB một megabyte -/TB một terabyte -/tấn một tấn /đêm một đêm /buổi một buổi /ca một ca /dự án một dự án /lớp một lớp /khóa một khóa -/suất một suất \ No newline at end of file +/suất một suất +/tấn một tấn \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv b/nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv new file mode 100644 index 000000000..154aa7306 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv @@ -0,0 +1,6 @@ +k ki lô +M mê ga +G gi ga +c xăng ti +m mi li +T tê ra \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv index 919baaf6e..3c3421528 100644 --- a/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv +++ b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv @@ -1,3 +1,3 @@ 1 một mốt 4 bốn tư -5 năm lăm \ No newline at end of file +5 năm lăm \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv index c8a08083c..da60cb686 100644 --- a/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv +++ b/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv @@ -1,5 +1,8 @@ thousand nghìn million triệu billion tỷ +trillion nghìn tỷ +quadrillion triệu tỷ +quintillion tỷ tỷ hundred trăm linh linh \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/graph_utils.py b/nemo_text_processing/text_normalization/vi/graph_utils.py index 4acb76439..1c0c1a0ab 100644 --- a/nemo_text_processing/text_normalization/vi/graph_utils.py +++ b/nemo_text_processing/text_normalization/vi/graph_utils.py @@ -62,6 +62,11 @@ def extract_field(field_name): return pynutil.delete(f"{field_name}:") + delete_space + pynutil.delete("\"") + quoted_text + pynutil.delete("\"") +def extract_wrapper_content(wrapper_type: str, content_graph): + """Helper to extract content from wrapper like 'decimal { ... }'""" + return pynutil.delete(f"{wrapper_type} {{") + delete_space + content_graph + delete_space + pynutil.delete("}") + + def convert_space(fst) -> "pynini.FstLike": """ Converts space to nonbreaking space. diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index 8d12c9e72..59bb86d26 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -16,7 +16,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst, insert_space -from nemo_text_processing.text_normalization.vi.utils import get_abs_path +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels class CardinalFst(GraphFst): @@ -31,15 +31,11 @@ def __init__(self, deterministic: bool = True): } self.zero, self.digit, self.teen, self.ties = resources.values() - with open(get_abs_path("data/numbers/magnitudes.tsv"), 'r', encoding='utf-8') as f: - self.magnitudes = {parts[0]: parts[1] for line in f if len(parts := line.strip().split('\t')) == 2} + magnitudes_labels = load_labels(get_abs_path("data/numbers/magnitudes.tsv")) + self.magnitudes = {parts[0]: parts[1] for parts in magnitudes_labels if len(parts) == 2} - with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f: - special = { - parts[0]: {'std': parts[1], 'alt': parts[2]} - for line in f - if len(parts := line.strip().split('\t')) >= 3 - } + digit_special_labels = load_labels(get_abs_path("data/numbers/digit_special.tsv")) + special = {parts[0]: {'std': parts[1], 'alt': parts[2]} for parts in digit_special_labels if len(parts) >= 3} self.special_digits = pynini.union( *[pynini.cross(k, v["alt"]) for k, v in special.items() if k in ["1", "4", "5"]] @@ -73,69 +69,30 @@ def __init__(self, deterministic: bool = True): + self.linh_digits, # XYZ: một trăm hai mười ba, etc. self.single_digit + insert_space + pynutil.insert(hundred_word) + insert_space + self.two_digit, + # 0YZ: Handle numbers starting with 0 (e.g., 087 -> tám mươi bảy) + pynutil.delete("0") + self.two_digit, + # 00Z: Handle numbers starting with 00 (e.g., 008 -> tám) + pynutil.delete("00") + self.single_digit, ) self.hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ self.hundreds_pattern - # Build magnitude patterns (thousands, millions, billions) - self.thousand = self._build_magnitude_pattern("thousand", 4, 6, 3) - self.million = self._build_magnitude_pattern("million", 7, 9, 6, self.thousand) - self.billion = self._build_magnitude_pattern("billion", 10, 12, 9, self.million) + self.magnitude_patterns = self._build_all_magnitude_patterns() + custom_patterns = self._build_all_patterns() - # Handle dot-separated numbers: 1.000, 1.000.000, etc. - delete_dot = pynutil.delete(".") - dot_patterns = [] - - # Thousand with dots: 1.000 - dot_patterns.append( - pynini.compose( - (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 2) + delete_dot + NEMO_DIGIT**3, self.thousand - ) - ) - - # Million with dots: 1.000.000 - dot_patterns.append( - pynini.compose( - (NEMO_DIGIT - "0") - + pynini.closure(NEMO_DIGIT, 0, 2) - + delete_dot - + NEMO_DIGIT**3 - + delete_dot - + NEMO_DIGIT**3, - self.million, - ) - ) - - # Billion with dots: 1.000.000.000 - dot_patterns.append( - pynini.compose( - (NEMO_DIGIT - "0") - + pynini.closure(NEMO_DIGIT, 0, 2) - + delete_dot - + NEMO_DIGIT**3 - + delete_dot - + NEMO_DIGIT**3 - + delete_dot - + NEMO_DIGIT**3, - self.billion, - ) - ) - - self.graph = pynini.union( - self.billion, - self.million, - self.thousand, + all_patterns = [ + *custom_patterns, + *self.magnitude_patterns.values(), self.hundreds, self.two_digit, self.single_digit, self.zero, - *dot_patterns, - ).optimize() + ] + self.graph = pynini.union(*all_patterns).optimize() self.single_digits_graph = self.single_digit | self.zero self.graph_with_and = self.graph - # Build final FST with optional negative and integer wrapper negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) final_graph = negative + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") self.fst = self.add_tokens(final_graph).optimize() @@ -143,89 +100,136 @@ def __init__(self, deterministic: bool = True): def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, prev_pattern=None): magnitude_word = self.magnitudes[name] linh_word = self.magnitudes["linh"] - patterns = [] + for digits in range(min_digits, max_digits + 1): leading_digits = digits - zero_count - - # Choose leading pattern based on digit count if leading_digits == 1: leading_fst = self.single_digit elif leading_digits == 2: leading_fst = self.two_digit - else: # 3 digits + else: leading_fst = self.hundreds_pattern prefix = leading_fst + insert_space + pynutil.insert(magnitude_word) - digit_patterns = [] - - # Case 1: All trailing zeros (e.g., 1000 -> một nghìn) - digit_patterns.append(prefix + pynutil.delete("0" * zero_count)) + digit_patterns = [prefix + pynutil.delete("0" * zero_count)] - # Case 2: Has lower magnitude (e.g., 1001000 -> một triệu một nghìn) - if prev_pattern: + if prev_pattern and name not in ["quadrillion", "quintillion"]: digit_patterns.append(prefix + insert_space + prev_pattern) - # Case 3: Trailing patterns with linh (e.g., 1001 -> một nghìn linh một) for trailing_zeros in range(zero_count): remaining_digits = zero_count - trailing_zeros trailing_prefix = prefix + pynutil.delete("0" * trailing_zeros) if remaining_digits == 1: - digit_patterns.append( + linh_pattern = ( trailing_prefix + insert_space + pynutil.insert(linh_word) + insert_space + self.linh_digits ) + digit_patterns.append(pynutil.add_weight(linh_pattern, -0.1)) elif remaining_digits == 2: digit_patterns.append(trailing_prefix + insert_space + self.two_digit) elif remaining_digits == 3: digit_patterns.append(trailing_prefix + insert_space + self.hundreds_pattern) - if name == "million" and digits == 7: - # Handle patterns like 1001001 -> một triệu một nghìn linh một - digit_patterns.extend( - [ - prefix - + pynutil.delete("00") - + insert_space - + self.single_digit - + insert_space - + pynutil.insert(self.magnitudes["thousand"]) - + pynutil.delete("00") - + insert_space - + pynutil.insert(linh_word) - + insert_space - + self.linh_digits, - prefix - + pynutil.delete("0") - + insert_space - + self.two_digit - + insert_space - + pynutil.insert(self.magnitudes["thousand"]) - + pynutil.delete("00") - + insert_space - + pynutil.insert(linh_word) - + insert_space - + self.linh_digits, - ] - ) - elif name == "billion" and digits == 10: - # Handle patterns like 1001001001 - digit_patterns.append( - prefix - + pynutil.delete("00") - + insert_space - + self.single_digit - + insert_space - + pynutil.insert(self.magnitudes["million"]) - + pynutil.delete("00") - + insert_space - + self.single_digit - + insert_space - + pynutil.insert(self.magnitudes["thousand"]) - + insert_space - + self.hundreds_pattern - ) - patterns.append(pynini.closure(NEMO_DIGIT, digits, digits) @ pynini.union(*digit_patterns)) return pynini.union(*patterns) + + def _build_all_magnitude_patterns(self): + magnitude_config = [ + ("thousand", 4, 6, 3), + ("million", 7, 9, 6), + ("billion", 10, 12, 9), + ("trillion", 13, 15, 12), + ("quadrillion", 16, 18, 15), + ("quintillion", 19, 21, 18), + ] + patterns = {} + prev_pattern = None + for name, min_digits, max_digits, zero_count in magnitude_config: + if name in self.magnitudes: + patterns[name] = self._build_magnitude_pattern(name, min_digits, max_digits, zero_count, prev_pattern) + prev_pattern = patterns[name] + else: + break + return patterns + + def _get_zero_or_magnitude_pattern(self, digits, magnitude_key): + """Create pattern that handles all-zeros or normal magnitude processing""" + all_zeros = "0" * digits + return pynini.union(pynini.cross(all_zeros, ""), NEMO_DIGIT**digits @ self.magnitude_patterns[magnitude_key]) + + def _build_all_patterns(self): + patterns = [] + delete_dot = pynutil.delete(".") + + # Large number split patterns (>12 digits): front + "tỷ" + back(9 digits) + if "billion" in self.magnitudes: + billion_word = self.magnitudes["billion"] + back_digits = 9 + + for total_digits in range(13, 22): + front_digits = total_digits - back_digits + front_pattern = self._get_pattern_for_digits(front_digits) + if front_pattern: + back_pattern = self._get_zero_or_magnitude_pattern(back_digits, "million") + split_pattern = ( + front_pattern + insert_space + pynutil.insert(billion_word) + insert_space + back_pattern + ) + patterns.append(NEMO_DIGIT**total_digits @ pynutil.add_weight(split_pattern, -0.5)) + + # Dot patterns + dot_configs = [(6, None), (5, None), (4, None), (3, "billion"), (2, "million"), (1, "thousand")] + for dots, magnitude in dot_configs: + pattern = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 2) + for _ in range(dots): + pattern += delete_dot + NEMO_DIGIT**3 + + if magnitude and magnitude in self.magnitude_patterns: + patterns.append(pynini.compose(pynutil.add_weight(pattern, -0.3), self.magnitude_patterns[magnitude])) + elif not magnitude: + if dots == 4: + digit_range = [13, 14, 15] + elif dots == 5: + digit_range = [16, 17, 18] + elif dots == 6: + digit_range = [19, 20, 21] + else: + digit_range = [] + + for digit_count in digit_range: + if 13 <= digit_count <= 21: + front_digits = digit_count - back_digits + front_pattern = self._get_pattern_for_digits(front_digits) + if front_pattern: + back_pattern = self._get_zero_or_magnitude_pattern(back_digits, "million") + split = ( + (NEMO_DIGIT**front_digits @ front_pattern) + + insert_space + + pynutil.insert(self.magnitudes["billion"]) + + insert_space + + back_pattern + ) + patterns.append( + pynini.compose(pattern, NEMO_DIGIT**digit_count @ pynutil.add_weight(split, -1.0)) + ) + + return patterns + + def _get_pattern_for_digits(self, digit_count): + if digit_count <= 0: + return None + elif digit_count == 1: + return self.single_digit + elif digit_count == 2: + return self.two_digit + elif digit_count == 3: + return self.hundreds_pattern + elif digit_count <= 6: + return self.magnitude_patterns.get("thousand") + elif digit_count <= 9: + return self.magnitude_patterns.get("million") + elif digit_count <= 12: + return self.magnitude_patterns.get("billion") + else: + return None diff --git a/nemo_text_processing/text_normalization/vi/taggers/fraction.py b/nemo_text_processing/text_normalization/vi/taggers/fraction.py index ed3394120..ca1d11ebf 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/taggers/fraction.py @@ -62,6 +62,11 @@ def __init__(self, cardinal: CardinalFst, deterministic: bool = True): simple_fraction = numerator + denominator mixed_fraction = integer_part + pynutil.delete(" ") + numerator + denominator + + # Create graph without negative for reuse in other FSTs (like measure) + fraction_wo_negative = simple_fraction | mixed_fraction + self.final_graph_wo_negative = fraction_wo_negative.optimize() + optional_graph_negative = (pynutil.insert("negative: ") + pynini.cross("-", "\"true\" ")).ques self.fst = self.add_tokens(optional_graph_negative + (simple_fraction | mixed_fraction)).optimize() diff --git a/nemo_text_processing/text_normalization/vi/taggers/measure.py b/nemo_text_processing/text_normalization/vi/taggers/measure.py new file mode 100644 index 000000000..19650510c --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/taggers/measure.py @@ -0,0 +1,151 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import ( + NEMO_COMMA, + NEMO_DIGIT, + NEMO_SPACE, + GraphFst, + delete_space, +) +from nemo_text_processing.text_normalization.vi.utils import get_abs_path + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure for Vietnamese, e.g. + 12kg -> measure { cardinal { integer: "mười hai" } units: "ki lô gam" } + 1kg -> measure { cardinal { integer: "một" } units: "ki lô gam" } + 0.5kg -> measure { decimal { fractional_part: "năm" } units: "ki lô gam" } + -12kg -> measure { negative: "true" cardinal { integer: "mười hai" } units: "ki lô gam" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + fraction: FractionFst + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def _create_measure_subgraph(self, measure_type: str, number_graph, optional_negative, graph_unit): + """Helper to create measure subgraph pattern - reduces duplication""" + return ( + optional_negative + + pynutil.insert(f"{measure_type} {{ ") + + number_graph + + pynutil.insert(" }") + + delete_space + + pynutil.insert(" units: \"") + + graph_unit + + pynutil.insert('"') + ) + + def __init__( + self, + cardinal: GraphFst, + decimal: GraphFst, + fraction: GraphFst, + deterministic: bool = True, + ): + super().__init__(name="measure", kind="classify", deterministic=deterministic) + + cardinal_graph = cardinal.graph + + # Load minimal measurement files (massive redundancy removed via subfst) + measurements_path = get_abs_path("data/measure/measurements_minimal.tsv") + prefixes_path = get_abs_path("data/measure/prefixes.tsv") + base_units_path = get_abs_path("data/measure/base_units.tsv") + + # Create subfst for metric units: prefix + space + base_unit + graph_prefixes = pynini.string_file(prefixes_path) + graph_base_units = pynini.string_file(base_units_path) + space = pynutil.insert(NEMO_SPACE) + graph_metric_units = graph_prefixes + space + graph_base_units + + # Load non-metric and special units + graph_special_units = pynini.string_file(measurements_path) + + # Also allow base units without prefixes (e.g., 'g' not just 'kg') + graph_standalone_units = graph_base_units + + # Combine all unit mappings + graph_unit = graph_metric_units | graph_special_units | graph_standalone_units + + # Add compound unit support (unit/unit patterns like km/h) + graph_unit_compound = pynini.cross("/", " trên ") + pynutil.insert(NEMO_SPACE) + graph_unit + + optional_graph_unit_compound = pynini.closure( + pynutil.insert(NEMO_SPACE) + graph_unit_compound, + 0, + 1, + ) + + # Update unit graph to include compound units + graph_unit = graph_unit + optional_graph_unit_compound | graph_unit_compound + + # Create unit symbol pattern using FST operations (no loops needed) + prefix_symbols = pynini.project(graph_prefixes, "input") # Extract prefix symbols + base_symbols = pynini.project(graph_base_units, "input") # Extract base symbols + special_symbols = pynini.project(graph_special_units, "input") # Extract special symbols + + # Build unit pattern: metric combinations | standalone bases | special units + metric_pattern = prefix_symbols + base_symbols # All prefix+base combinations + simple_unit_pattern = metric_pattern | base_symbols | special_symbols + + # Add compound unit patterns to recognition + compound_pattern = simple_unit_pattern + "/" + simple_unit_pattern + unit_pattern = simple_unit_pattern | compound_pattern + + number = pynini.closure(NEMO_DIGIT, 1) + decimal_number = number + NEMO_COMMA + pynini.closure(NEMO_DIGIT, 1) + + # Optional negative sign handling for Vietnamese + optional_graph_negative = pynini.closure( + pynini.cross(pynini.union("âm", "trừ"), "negative: \"true\" "), + 0, + 1, + ) + + # Domain restriction patterns - only match core number+unit patterns + # Remove punctuation handling to let punctuation tagger handle it separately + optional_space = pynini.closure(NEMO_SPACE, 0, 1) + integer_measure_domain = number + optional_space + unit_pattern + decimal_measure_domain = decimal_number + optional_space + unit_pattern + fraction_measure_domain = number + "/" + number + optional_space + unit_pattern + + cardinal_number_graph = pynutil.insert('integer: "') + (number @ cardinal_graph) + pynutil.insert('"') + + subgraph_cardinal = self._create_measure_subgraph( + "cardinal", cardinal_number_graph, optional_graph_negative, graph_unit + ) + subgraph_decimal = self._create_measure_subgraph( + "decimal", decimal.final_graph_wo_negative, optional_graph_negative, graph_unit + ) + subgraph_fraction = self._create_measure_subgraph( + "fraction", fraction.final_graph_wo_negative, optional_graph_negative, graph_unit + ) + + # Apply domain restrictions to ensure we only match complete number+unit patterns + subgraph_cardinal = pynini.compose(integer_measure_domain, subgraph_cardinal) + subgraph_decimal = pynini.compose(decimal_measure_domain, subgraph_decimal) + subgraph_fraction = pynini.compose(fraction_measure_domain, subgraph_fraction) + + # Final graph combining main patterns + final_graph = subgraph_cardinal | subgraph_decimal | subgraph_fraction + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/vi/taggers/money.py b/nemo_text_processing/text_normalization/vi/taggers/money.py index eed524d73..540094591 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/money.py +++ b/nemo_text_processing/text_normalization/vi/taggers/money.py @@ -47,7 +47,29 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = currency_major_labels = load_labels(get_abs_path("data/money/currency.tsv")) currency_minor_labels = load_labels(get_abs_path("data/money/currency_minor.tsv")) quantity_graph = pynini.string_file(get_abs_path("data/numbers/quantity_abbr.tsv")) - per_unit_graph = pynini.string_file(get_abs_path("data/money/per_unit.tsv")) + + # Load optimized per_unit files using subfst approach + per_unit_non_metric_path = get_abs_path("data/money/per_unit_non_metric.tsv") + per_unit_prefixes_path = get_abs_path("data/money/per_unit_prefixes.tsv") + per_unit_bases_path = get_abs_path("data/money/per_unit_bases.tsv") + + # Create subfst for metric per_unit patterns + graph_prefixes = pynini.string_file(per_unit_prefixes_path) + graph_bases = pynini.string_file(per_unit_bases_path) + + # Build metric combinations: "/kg" -> "một ki lô gam" + slash = pynutil.delete("/") + one_space = pynutil.insert("một ") + space = pynutil.insert(NEMO_SPACE) + + graph_metric_per_units = slash + one_space + graph_prefixes + space + graph_bases + graph_standalone_per_units = slash + one_space + graph_bases + + # Load non-metric per_unit entries + graph_non_metric_per_units = pynini.string_file(per_unit_non_metric_path) + + # Combine all per_unit mappings + per_unit_graph = graph_metric_per_units | graph_standalone_per_units | graph_non_metric_per_units # Basic components cardinal_graph = cardinal.graph diff --git a/nemo_text_processing/text_normalization/vi/taggers/range.py b/nemo_text_processing/text_normalization/vi/taggers/range.py index 8f8f0d23f..f52341d9d 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/range.py +++ b/nemo_text_processing/text_normalization/vi/taggers/range.py @@ -41,6 +41,7 @@ def __init__( date: GraphFst, decimal: GraphFst, money: GraphFst, + measure: GraphFst, deterministic: bool = True, ): super().__init__(name="range", kind="classify", deterministic=deterministic) @@ -50,11 +51,11 @@ def __init__( # Pattern: X-Y -> X đến Y # This will handle time ranges, date ranges, decimal ranges, and money ranges with dash range_pattern = ( - (time | date | decimal | money) + (time | date | decimal | money | measure) + delete_space + pynini.cross("-", " đến ") + delete_space - + (time | date | decimal | money) + + (time | date | decimal | money | measure) ) self.graph = range_pattern diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 9e97ac940..4588ebe75 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -28,6 +28,7 @@ from nemo_text_processing.text_normalization.vi.taggers.date import DateFst from nemo_text_processing.text_normalization.vi.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.taggers.fraction import FractionFst +from nemo_text_processing.text_normalization.vi.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.vi.taggers.money import MoneyFst from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst @@ -39,6 +40,8 @@ from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst as VCardinalFst from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst as VDateFst from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst as VDecimalFst +from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst as VFractionFst +from nemo_text_processing.text_normalization.vi.verbalizers.measure import MeasureFst as VMeasureFst from nemo_text_processing.text_normalization.vi.verbalizers.money import MoneyFst as VMoneyFst from nemo_text_processing.text_normalization.vi.verbalizers.time import TimeFst as VTimeFst from nemo_text_processing.utils.logging import logger @@ -122,6 +125,11 @@ def __init__( money_graph = money.fst logger.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") + start_time = time.time() + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) + measure_graph = measure.fst + logger.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") + # Create composed verbalizers for range processing start_time = time.time() v_cardinal = VCardinalFst(deterministic=deterministic) @@ -137,9 +145,20 @@ def __init__( v_money = VMoneyFst(deterministic=deterministic) money_final = pynini.compose(money_graph, v_money.fst) + v_fraction = VFractionFst(deterministic=deterministic) + v_measure = VMeasureFst( + decimal=v_decimal, cardinal=v_cardinal, fraction=v_fraction, deterministic=deterministic + ) + measure_final = pynini.compose(measure_graph, v_measure.fst) + # Create range graph range_fst = RangeFst( - time=time_final, date=date_final, decimal=decimal_final, money=money_final, deterministic=deterministic + time=time_final, + date=date_final, + decimal=decimal_final, + money=money_final, + measure=measure_final, + deterministic=deterministic, ) range_graph = range_fst.fst logger.debug(f"range: {time.time() - start_time: .2f}s -- {range_graph.num_states()} nodes") @@ -155,6 +174,7 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.1) | pynutil.add_weight(time_graph, 1.1) + | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) punct = ( diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py index 328bbcded..675d959df 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py @@ -50,4 +50,6 @@ def __init__(self, deterministic: bool = True): simple_fraction = fraction_part mixed_fraction = integer_tagged + delete_space + pynutil.insert(" và ") + fraction_part - self.fst = self.delete_tokens(optional_sign + (simple_fraction | mixed_fraction)).optimize() + self.numbers = optional_sign + (simple_fraction | mixed_fraction) + + self.fst = self.delete_tokens(self.numbers).optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/measure.py b/nemo_text_processing/text_normalization/vi/verbalizers/measure.py new file mode 100644 index 000000000..aa5ae2ca4 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/verbalizers/measure.py @@ -0,0 +1,59 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.vi.graph_utils import ( + GraphFst, + delete_preserve_order, + delete_space, + extract_field, + extract_wrapper_content, + insert_space, +) + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing measure for Vietnamese, e.g. + measure { negative: "true" cardinal { integer: "mười hai" } units: "ki lô gam" } -> âm mười hai ki lô gam + measure { decimal { integer_part: "mười hai" fractional_part: "năm" } units: "ki lô gam" } -> mười hai phẩy năm ki lô gam + measure { cardinal { integer: "một" } units: "ki lô gam" } -> một ki lô gam + + Args: + decimal: DecimalFst verbalizer + cardinal: CardinalFst verbalizer + fraction: FractionFst verbalizer + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True): + super().__init__(name="measure", kind="verbalize", deterministic=deterministic) + + # Extract components + unit = extract_field("units") + + # Combine all number types into single graph + number_graph = ( + extract_wrapper_content("decimal", decimal.numbers) + | extract_wrapper_content("cardinal", cardinal.numbers) + | extract_wrapper_content("fraction", fraction.numbers) + ) + + # Main pattern: number + space + unit (most common case) + graph = number_graph + delete_space + insert_space + unit + + # Handle preserve_order: unit + space + number + graph |= unit + delete_space + insert_space + number_graph + delete_preserve_order + + self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/money.py b/nemo_text_processing/text_normalization/vi/verbalizers/money.py index 035b4acd1..028fd0a35 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/money.py @@ -18,6 +18,7 @@ from nemo_text_processing.text_normalization.vi.graph_utils import ( NEMO_COMMA_VI, NEMO_NOT_QUOTE, + NEMO_SPACE, GraphFst, delete_preserve_order, delete_space, @@ -96,8 +97,22 @@ def __init__(self, deterministic: bool = True): | graph_integer # Handle simple cases (most common, lowest priority) ) - # Add per-unit support (following English pattern) - per_units = pynini.string_file(get_abs_path("data/money/per_unit.tsv")) + per_units_non_metric = pynini.string_file(get_abs_path("data/money/per_unit_non_metric.tsv")) + + per_unit_prefixes = pynini.string_file(get_abs_path("data/money/per_unit_prefixes.tsv")) + per_unit_bases = pynini.string_file(get_abs_path("data/money/per_unit_bases.tsv")) + + prefixes_vn = pynini.project(per_unit_prefixes, "output") + bases_vn = pynini.project(per_unit_bases, "output") + + one = pynini.accep("một") + + # Accept metric combinations: "một ki lô gam" + metric_per_units = one + insert_space + prefixes_vn + insert_space + bases_vn + standalone_per_units = one + insert_space + bases_vn + + # Combine all per_unit recognitions + per_units = per_units_non_metric | metric_per_units | standalone_per_units per_units_normalized = pynini.project(per_units, "output") per_unit_pattern = ( pynutil.delete(' morphosyntactic_features: "') + insert_space + per_units_normalized + pynutil.delete('"') diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py new file mode 100644 index 000000000..499251e33 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py @@ -0,0 +1,139 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict, List + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_SIGMA, generator_main +from nemo_text_processing.utils.logging import logger + + +class PostProcessingFst: + """ + Finite state transducer that post-processes an entire Vietnamese sentence after verbalization is complete, e.g. + removes extra spaces around punctuation marks " ( một trăm hai mươi ba ) " -> "(một trăm hai mươi ba)" + + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, "vi_tn_post_processing.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["post_process_graph"] + logger.info(f'Post processing graph was restored from {far_file}.') + else: + self.set_punct_dict() + self.fst = self.get_punct_postprocess_graph() + + if far_file: + generator_main(far_file, {"post_process_graph": self.fst}) + + def get_vietnamese_punct_config(self) -> Dict[str, List[str]]: + """ + Returns Vietnamese-specific punctuation configuration. + This method can be easily modified or extended for different Vietnamese punctuation rules. + """ + return { + # Punctuation that should not have space before them + 'no_space_before': [",", ".", "!", "?", ":", ";", ")", r"\]", "}", "\""], + # Punctuation that should not have space after them + 'no_space_after': ["(", r"\[", "{"], + # Punctuation that can have space before them (exceptions) + 'allow_space_before': ["&", "-", "—", "–", "(", r"\[", "{", "\"", "'", "«", "»"], + # Special Vietnamese punctuation handling + 'vietnamese_special': { + # Vietnamese quotation marks + 'quotes': ["\"", "'", "«", "»", """, """, "'", "'"], + # Vietnamese dashes and separators + 'dashes': ["-", "—", "–"], + # Vietnamese brackets + 'brackets': ["(", ")", r"\[", r"\]", "{", "}"], + }, + } + + def set_punct_dict(self): + # Vietnamese punctuation marks that might need special handling + self.punct_marks = { + "'": [ + "'", + '´', + 'ʹ', + 'ʻ', + 'ʼ', + 'ʽ', + 'ʾ', + 'ˈ', + 'ˊ', + 'ˋ', + '˴', + 'ʹ', + '΄', + '`', + '´', + '’', + '‛', + '′', + '‵', + 'ꞌ', + ''', + '`', + ], + } + + def get_punct_postprocess_graph(self): + """ + Returns graph to post process punctuation marks for Vietnamese. + + Uses dynamic configuration for flexible punctuation handling. + Vietnamese punctuation spacing rules are defined in get_vietnamese_punct_config(). + """ + # Get dynamic punctuation configuration + punct_config = self.get_vietnamese_punct_config() + + # Extract configuration + no_space_before_punct = punct_config['no_space_before'] + no_space_after_punct = punct_config['no_space_after'] + + # Create FSTs for punctuation rules + no_space_before_punct_fst = pynini.union(*no_space_before_punct) + no_space_after_punct_fst = pynini.union(*no_space_after_punct) + + delete_space = pynutil.delete(" ") + + # Rule 1: Remove space before punctuation (primary rule) + remove_space_before = pynini.cdrewrite( + delete_space + no_space_before_punct_fst, # " ," -> "," + "", # any context before + "", # any context after + NEMO_SIGMA, + ).optimize() + + # Rule 2: Remove space after opening brackets + remove_space_after = pynini.cdrewrite( + no_space_after_punct_fst + delete_space, "", "", NEMO_SIGMA # "( " -> "(" + ).optimize() + + # Combine the two main rules + graph = pynini.compose(remove_space_before, remove_space_after) + + return graph.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index d241301ff..851cd35a9 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -17,6 +17,7 @@ from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst +from nemo_text_processing.text_normalization.vi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.vi.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.vi.verbalizers.roman import RomanFst @@ -59,6 +60,9 @@ def __init__(self, deterministic: bool = True): money = MoneyFst(deterministic=deterministic) money_graph = money.fst + measure = MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic) + measure_graph = measure.fst + graph = ( cardinal_graph | whitelist_graph @@ -70,6 +74,7 @@ def __init__(self, deterministic: bool = True): | roman_graph | time_graph | money_graph + | measure_graph ) self.fst = graph diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt index aad7ae8c1..74d2b7e98 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt @@ -53,11 +53,6 @@ -100~âm một trăm -1000~âm một nghìn 0~không -1000~một nghìn -1001~một nghìn linh một -101~một trăm linh một -104~một trăm linh bốn -105~một trăm linh năm 24~hai mươi tư 35~ba mươi lăm 41~bốn mươi mốt @@ -104,4 +99,11 @@ 1000101~một triệu một trăm linh một 1010001~một triệu mười nghìn linh một 10000000000~mười tỷ -150~một trăm năm mươi \ No newline at end of file +150~một trăm năm mươi +1000000000000~một nghìn tỷ +1234567890123~một nghìn hai trăm ba mươi tư tỷ năm trăm sáu mươi bảy triệu tám trăm chín mươi nghìn một trăm hai mươi ba +9876543210987~chín nghìn tám trăm bảy mươi sáu tỷ năm trăm bốn mươi ba triệu hai trăm mười nghìn chín trăm tám mươi bảy +1000000000000000~một triệu tỷ +1111111111111111~một triệu một trăm mười một nghìn một trăm mười một tỷ một trăm mười một triệu một trăm mười một nghìn một trăm mười một +5432109876543210~năm triệu bốn trăm ba mươi hai nghìn một trăm linh chín tỷ tám trăm bảy mươi sáu triệu năm trăm bốn mươi ba nghìn hai trăm mười +1000000000000000000~một tỷ tỷ \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..d3a7adeaa --- /dev/null +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_measure.txt @@ -0,0 +1,63 @@ +204m~hai trăm linh bốn mét +12kg~mười hai ki lô gam +1kg~một ki lô gam +100g~một trăm gam +500mg~năm trăm mi li gam +175cm~một trăm bảy mươi lăm xăng ti mét +2m~hai mét +100mm~một trăm mi li mét +5km~năm ki lô mét +1inch~một inch +500ml~năm trăm mi li lít +2l~hai lít +1m³~một mét khối +100cm³~một trăm xăng ti mét khối +2gb~hai gi ga bai +1tb~một terabyte +512Mb~năm trăm mười hai mê ga bai +64kb~sáu mươi tư ki lô bai +25°c~hai mươi lăm độ c +100°f~một trăm độ f +273°k~hai trăm bảy mươi ba độ k +50%~năm mươi phần trăm +100%~một trăm phần trăm +25%~hai mươi lăm phần trăm +220v~hai trăm hai mươi vôn +1kw~một ki lô oát +500mV~năm trăm mi li vôn +1000mA~một nghìn mi li am pe +50hz~năm mươi hẹc +2ghz~hai gi ga hẹc +100Mhz~một trăm mê ga hẹc +1000kw~một nghìn ki lô oát +5hp~năm mã lực +1tw~một tê ra oát +100m²~một trăm mét vuông +5km²~năm ki lô mét vuông +1km2~một ki lô mét vuông +8,5m2~tám phẩy năm mét vuông +1ha~một héc ta +1/2kg~một phần hai ki lô gam +3/4m~ba phần tư mét +1/3l~một phần ba lít +Tôi có 12kg gạo~Tôi có mười hai ki lô gam gạo +Chiều cao 175cm~Chiều cao một trăm bảy mươi lăm xăng ti mét +Dung lượng 2gb~Dung lượng hai gi ga bai +Nhiệt độ 25°c~Nhiệt độ hai mươi lăm độ c +Cân nặng 1/2kg~Cân nặng một phần hai ki lô gam +Điện áp 220v~Điện áp hai trăm hai mươi vôn +Tỷ lệ 50%~Tỷ lệ năm mươi phần trăm +Bộ nhớ 1tb~Bộ nhớ một terabyte +Thể tích 500ml~Thể tích năm trăm mi li lít +1234kg~một nghìn hai trăm ba mươi tư ki lô gam +2500m~hai nghìn năm trăm mét +10000gb~mười nghìn gi ga bai +Kích thước 100cm x 50cm~Kích thước một trăm xăng ti mét x năm mươi xăng ti mét +1,5m2~một phẩy năm mét vuông +1,5m~một phẩy năm mét +120km/h~một trăm hai mươi ki lô mét trên giờ +100 km/h~một trăm ki lô mét trên giờ +50m/s~năm mươi mét trên giây +30 m/min~ba mươi mét trên phút +5cm/s~năm xăng ti mét trên giây +200mg/ml~hai trăm mi li gam trên mi li lít \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_money.txt index b5ef741ac..755a1030a 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_money.txt @@ -26,4 +26,5 @@ 0,01$~một xu 2,50€~hai ơ rô năm mươi xu 1000,50 VND~một nghìn phẩy năm không đồng -5,99$~năm đô la chín mươi chín xu \ No newline at end of file +5,99$~năm đô la chín mươi chín xu +30đ/TB~ba mươi đồng một tê ra bai \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt index 64263db2d..ea858dc4f 100644 --- a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_range.txt @@ -5,4 +5,7 @@ 1t-2t~một tỷ đến hai tỷ 10:00-11:00~mười giờ đến mười một giờ 10$-20$~mười đô la đến hai mươi đô la -50.000đ-100.000đ~năm mươi nghìn đồng đến một trăm nghìn đồng \ No newline at end of file +50.000đ-100.000đ~năm mươi nghìn đồng đến một trăm nghìn đồng +3kg-6kg~ba ki lô gam đến sáu ki lô gam +15cm-25cm~mười lăm xăng ti mét đến hai mươi lăm xăng ti mét +31Mhz-44Mhz~ba mươi mốt mê ga hẹc đến bốn mươi tư mê ga hẹc \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/test_measure.py b/tests/nemo_text_processing/vi/test_measure.py index 991cbc487..4cb89cf80 100644 --- a/tests/nemo_text_processing/vi/test_measure.py +++ b/tests/nemo_text_processing/vi/test_measure.py @@ -20,6 +20,7 @@ try: from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from nemo_text_processing.text_normalization.normalize import Normalizer PYNINI_AVAILABLE = True except (ImportError, ModuleNotFoundError): @@ -41,3 +42,18 @@ class TestMeasure: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer( + input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + + @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_measure.txt')) + @pytest.mark.skipif( + not PYNINI_AVAILABLE, + reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh", + ) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) + assert pred == expected, f"input: {test_input}" diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh index 4ab27882a..c11d66ef0 100644 --- a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh @@ -58,10 +58,10 @@ testTNTime() { runtest $input } -# testTNMeasure() { -# input=$PROJECT_DIR/vi/data_text_normalization/test_cases_measure.txt -# runtest $input -# } +testTNMeasure() { + input=$PROJECT_DIR/vi/data_text_normalization/test_cases_measure.txt + runtest $input +} testTNMoney() { input=$PROJECT_DIR/vi/data_text_normalization/test_cases_money.txt diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index bc19f428d..445c71c98 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -243,6 +243,9 @@ def parse_args(): from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) + from nemo_text_processing.text_normalization.vi.verbalizers.post_processing import ( + PostProcessingFst as TNPostProcessingFst, + ) from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst elif args.language == 'zh': from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import (