diff --git a/Jenkinsfile b/Jenkinsfile index 84c0c3206..3e8dd559d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -476,4 +476,4 @@ pipeline { cleanWs() } } -} +} \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py index 366282985..d7c2cc874 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py @@ -1,32 +1,32 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space - - -class WordFst(GraphFst): - ''' - tokens { char: "一" } -> 一 - ''' - - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="char", kind="verbalize", deterministic=deterministic) - - graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") - graph = pynini.closure(delete_space) + graph + pynini.closure(delete_space) - self.fst = graph.optimize() +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class WordFst(GraphFst): + ''' + tokens { char: "一" } -> 一 + ''' + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="char", kind="verbalize", deterministic=deterministic) + + graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") + graph = pynini.closure(delete_space) + graph + pynini.closure(delete_space) + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index c534428bf..b0988ab94 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -52,6 +52,8 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT) + accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input") accepted_common_domains = pynini.project( pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input" @@ -135,6 +137,18 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) graph |= cc_phrases + if deterministic: + # credit card cues + numbers = pynini.closure(NEMO_DIGIT, 4, 16) + cc_phrases = ( + pynutil.insert("protocol: \"") + + cc_cues + + pynutil.insert("\" domain: \"") + + numbers + + pynutil.insert("\"") + ) + graph |= cc_phrases + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 786319627..d0c6d2ca5 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -19,7 +19,6 @@ from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -# def get_quantity(decimal): suffix = pynini.union( "万", "十万", @@ -107,7 +106,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa # larger money as decimals graph_decimal = ( pynutil.insert('integer_part: \"') - + pynini.closure( + + ( pynini.closure(cardinal, 1) + pynutil.delete('.') + pynutil.insert('点') @@ -117,14 +116,16 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa ) graph_decimal_money = ( pynini.closure(graph_decimal, 1) - + pynini.closure(pynutil.insert(' quantity: \"') + suffix + pynutil.insert('\"')) + + pynini.closure((pynutil.insert(' quantity: \"') + suffix + pynutil.insert('\"')), 0, 1) + pynutil.insert(" ") + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) - + pynini.closure(pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')) + + pynini.closure( + (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 + ) ) graph = ( @@ -134,7 +135,5 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = graph - - final_graph = self.add_tokens(final_graph) + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh index 705f4bdaf..610cd6c20 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh @@ -82,4 +82,4 @@ testITNWord() { shift $# # Load shUnit2 -. /workspace/shunit2/shunit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh index 8c701e06a..fe622bbe7 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh @@ -82,4 +82,4 @@ testITNWord() { shift $# # Load shUnit2 -. /workspace/shunit2/shunit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh index 3d5f7ae19..f15f2290d 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh @@ -119,4 +119,4 @@ testTNMath() { shift $# # Load shUnit2 -. /workspace/shunit2/shunit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/mr/test_cardinal.py b/tests/nemo_text_processing/mr/test_cardinal.py index e7bd452fd..588b7ebba 100644 --- a/tests/nemo_text_processing/mr/test_cardinal.py +++ b/tests/nemo_text_processing/mr/test_cardinal.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file -class TestCardinal: +class TestPreprocess: + inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_cardinal.txt')) diff --git a/tests/nemo_text_processing/mr/test_date.py b/tests/nemo_text_processing/mr/test_date.py index 4ad5eb74d..b58c1e532 100644 --- a/tests/nemo_text_processing/mr/test_date.py +++ b/tests/nemo_text_processing/mr/test_date.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt index 23270bf82..81ef6cdb3 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt @@ -4,4 +4,22 @@ 只有智商超过一定数值的人才能破解~只有智商超过一定数值的人才能破解 这是由人工智能控制的系统~这是由人工智能控制的系统 欧洲旅游目的地多到不知道怎么选~欧洲旅游目的地多到不知道怎么选 -马斯科卖掉豪宅住进折叠屋~马斯科卖掉豪宅住进折叠屋 \ No newline at end of file +马斯科卖掉豪宅住进折叠屋~马斯科卖掉豪宅住进折叠屋 +免除GOOGLE在一桩诽谤官司中的法律责任。~免除GOOGLE在一桩诽谤官司中的法律责任。 +这对CHROME是有利的。~这对CHROME是有利的。 +这可能是PILde使用者。~这可能是PILde使用者。 +CSI侧重科学办案,也就是现场搜正和鉴识。~CSI侧重科学办案,也就是现场搜正和鉴识。 +我以前非常喜欢一个软体,DRAW。~我以前非常喜欢一个软体,DRAW。 +我爱你病毒。~我爱你病毒。 +微软举办了RACETOMARKETCHALLENGE竞赛。~微软举办了RACETOMARKETCHALLENGE竞赛。 +苹果销售量的复苏程度远超PC市场。~苹果销售量的复苏程度远超PC市场。 +第三季还有两款ANDROID手机亮相。~第三季还有两款ANDROID手机亮相。 +反而应试著让所有GOOGLE服务更加社交化。~反而应试著让所有GOOGLE服务更加社交化。 +GOOGLE已提供一项NATIVECLIENT软体。~GOOGLE已提供一项NATIVECLIENT软体。 +这些程式都支援PRE与ITUNES同步化。~这些程式都支援PRE与ITUNES同步化。 +可以推断此次NTT可能也会将同样的策略用在LTE上。~可以推断此次NTT可能也会将同样的策略用在LTE上。 +现今许多小型企业因成本考量被迫采用一般PC作为伺服器。~现今许多小型企业因成本考量被迫采用一般PC作为伺服器。 +部落格宣布GOOGLECHROMES的诞生。~部落格宣布GOOGLECHROMES的诞生。 +由ZIP订购机场接送或观光景点共乘服务。~由ZIP订购机场接送或观光景点共乘服务。 +PAQUE表示短时间应该还不会全面开放。~PAQUE表示短时间应该还不会全面开放。 +CBS是美国一家重要的广播电视网路公司。~CBS是美国一家重要的广播电视网路公司。 diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh index dd352b42b..5089427b6 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh @@ -1,7 +1,7 @@ #! /bin/sh GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} -PROJECT_DIR=${2:-"/workspace/tests/en"} +PROJECT_DIR=${2:-"/workspace/tests"} runtest () { input=$1 diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh index 82d4d4179..2e4a0b998 100644 --- a/tools/text_processing_deployment/export_grammars.sh +++ b/tools/text_processing_deployment/export_grammars.sh @@ -107,4 +107,3 @@ else echo "done mode: $MODE" exit 0 fi - diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index c7607ca17..d6ceb84f2 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -266,6 +266,10 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + from nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, diff --git a/tools/text_processing_deployment/sh_test.sh b/tools/text_processing_deployment/sh_test.sh index 32b5f9774..3e31de37c 100644 --- a/tools/text_processing_deployment/sh_test.sh +++ b/tools/text_processing_deployment/sh_test.sh @@ -63,4 +63,4 @@ VERBALIZE_FAR=${CACHE_DIR}_${GRAMMARS}_${INPUT_CASE}/verbalize/verbalize.far CONFIG=${LANGUAGE}_${GRAMMARS}_${INPUT_CASE} cp $CLASSIFY_FAR /workspace/sparrowhawk/documentation/grammars_${CONFIG}/en_toy/classify/ -cp $VERBALIZE_FAR /workspace/sparrowhawk/documentation/grammars_${CONFIG}/en_toy/verbalize/ +cp $VERBALIZE_FAR /workspace/sparrowhawk/documentation/grammars_${CONFIG}/en_toy/verbalize/ \ No newline at end of file