From 39e1a0d170e9474431b4413b6bc0e50d9337359c Mon Sep 17 00:00:00 2001
From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Date: Thu, 8 Dec 2022 16:50:56 -0800
Subject: [PATCH] remove useless files.

Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
---
 nemo/collections/common/data/vocabs.py | 383 -------------------------
 1 file changed, 383 deletions(-)
 delete mode 100644 nemo/collections/common/data/vocabs.py

diff --git a/nemo/collections/common/data/vocabs.py b/nemo/collections/common/data/vocabs.py
deleted file mode 100644
index 2a4123ff2c69..000000000000
--- a/nemo/collections/common/data/vocabs.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import itertools
-import re
-import string
-import time
-import unicodedata
-from abc import ABC, abstractmethod
-from builtins import str as unicode
-from contextlib import contextmanager
-from typing import List
-
-import nltk
-import torch
-
-from nemo.collections.common.parts.preprocessing import parsers
-from nemo.utils import logging
-from nemo.utils.get_rank import is_global_rank_zero
-
-_words_re = re.compile("([a-z\-]+'[a-z\-]+|[a-z\-]+)|([^a-z{}]+)")
-
-
-def _text_preprocessing(text):
-    text = unicode(text)
-    text = ''.join(char for char in unicodedata.normalize('NFD', text) if unicodedata.category(char) != 'Mn')
-    text = text.lower()
-    text = re.sub("[^ a-z'\".,?!()\[\]:;\-]", "", text)
-    return text
-
-
-def _word_tokenize(text):
-    words = _words_re.findall(text)
-    words = [re.sub(r'\s(\d)', r'\1', word[1].upper()) if word[0] == '' else word[0] for word in words]
-    return words
-
-
-def download_corpora():
-    # Download NLTK datasets if this class is to be instantiated
-    try:
-        nltk.data.find('taggers/averaged_perceptron_tagger.zip')
-    except LookupError:
-        nltk.download('averaged_perceptron_tagger', quiet=True)
-    try:
-        nltk.data.find('corpora/cmudict.zip')
-    except LookupError:
-        nltk.download('cmudict', quiet=True)
-
-
-class G2p:
-    def __init__(
-        self,
-        g2p_library,
-        phoneme_dict_path=None,
-        use_seq2seq_for_oov=False,
-        ignore_ambiguous_words=True,
-        text_preprocessing_func=_text_preprocessing,
-        word_tokenize_func=_word_tokenize,
-    ):
-        self._g2p = g2p_library
-        self.homograph2features = self._g2p.homograph2features
-        self.g2p_dict = self._construct_grapheme2phoneme_dict(phoneme_dict_path)
-        self.use_seq2seq_for_oov = use_seq2seq_for_oov
-        self.ignore_ambiguous_words = ignore_ambiguous_words
-
-        self.text_preprocessing_func = text_preprocessing_func
-        self.word_tokenize_func = word_tokenize_func
-
-    @staticmethod
-    def _construct_grapheme2phoneme_dict(phoneme_dict_path=None, encoding='latin-1'):
-        if phoneme_dict_path is None:
-            from nltk.corpus import cmudict
-
-            return cmudict.dict()
-
-        _alt_re = re.compile(r'\([0-9]+\)')
-        g2p_dict = {}
-        with open(phoneme_dict_path, encoding=encoding) as file:
-            for line in file:
-                if len(line) > 0 and ('A' <= line[0] <= 'Z' or line[0] == "'"):
-                    parts = line.split('  ')
-                    word = re.sub(_alt_re, '', parts[0])
-                    word = word.lower()
-
-                    pronunciation = parts[1].strip().split(" ")
-                    if word in g2p_dict:
-                        g2p_dict[word].append(pronunciation)
-                    else:
-                        g2p_dict[word] = [pronunciation]
-        return g2p_dict
-
-    def handle_ambiguous(self, word):
-        if not self.ignore_ambiguous_words or len(self.g2p_dict[word]) == 1:
-            return True
-        return False
-
-    def __call__(self, text):
-        text = self.text_preprocessing_func(text)
-        words = self.word_tokenize_func(text)
-        words_and_pos_tags = nltk.pos_tag(words)
-
-        prons = []
-        for word, pos in words_and_pos_tags:
-            word_by_hyphen = word.split("-")
-
-            # punctuation
-            if re.search("[a-zA-Z]", word) is None:
-                pron = list(word)
-            # homograph
-            elif word in self.homograph2features:
-                pron1, pron2, pos1 = self.homograph2features[word]
-                if pos.startswith(pos1):
-                    pron = pron1
-                else:
-                    pron = pron2
-            # `'s` suffix
-            elif (
-                len(word) > 2
-                and word.endswith("'s")
-                and (word not in self.g2p_dict)
-                and (word[:-2] in self.g2p_dict)
-                and self.handle_ambiguous(word[:-2])
-            ):
-                pron = self.g2p_dict[word[:-2]][0] + ["Z"]
-            # `s` suffix
-            elif (
-                len(word) > 1
-                and word.endswith("s")
-                and (word not in self.g2p_dict)
-                and (word[:-1] in self.g2p_dict)
-                and self.handle_ambiguous(word[:-1])
-            ):
-                pron = self.g2p_dict[word[:-1]][0] + ["Z"]
-            # g2p dict
-            elif word in self.g2p_dict and self.handle_ambiguous(word):
-                pron = self.g2p_dict[word][0]
-            # word with hyphens
-            elif len(word_by_hyphen) > 1 and all(
-                [sub_word in self.g2p_dict and self.handle_ambiguous(sub_word) for sub_word in word_by_hyphen]
-            ):
-                pron = []
-                for sub_word in word_by_hyphen:
-                    pron.extend(self.g2p_dict[sub_word][0])
-                    pron.extend(["-"])
-                pron.pop()
-            else:
-                if self.use_seq2seq_for_oov:
-                    # run gru-based seq2seq model from _g2p
-                    pron = self._g2p.predict(word)
-                else:
-                    pron = word
-
-            prons.extend(pron)
-
-        return prons
-
-
-class Base(ABC):
-    """Vocabulary for turning str text to list of int tokens."""
-
-    # fmt: off
-    PUNCT = (  # Derived from LJSpeech and "/" additionally
-        ',', '.', '!', '?', '-',
-        ':', ';', '/', '"', '(',
-        ')', '[', ']', '{', '}',
-    )
-    # fmt: on
-    PAD, BLANK, OOV = '<pad>', '<blank>', '<oov>'
-
-    def __init__(self, labels, *, pad=PAD, blank=BLANK, oov=OOV, sep='', add_blank_at="last_but_one"):
-        super().__init__()
-
-        labels = list(labels)
-        self.pad, labels = len(labels), labels + [pad]  # Padding
-
-        if add_blank_at is not None:
-            self.blank, labels = len(labels), labels + [blank]  # Reserved for blank from QN
-        else:
-            # use add_blank_at=None only for ASR where blank is added automatically
-            self.blank = -1
-
-        self.oov, labels = len(labels), labels + [oov]  # Out Of Vocabulary
-
-        if add_blank_at == "last":
-            labels[-1], labels[-2] = labels[-2], labels[-1]
-            self.oov, self.blank = self.blank, self.oov
-
-        self.labels = labels
-        self.sep = sep
-
-        self._util_ids = {self.pad, self.blank, self.oov}
-        self._label2id = {l: i for i, l in enumerate(labels)}
-        self._id2label = labels
-
-    def __call__(self, text: str) -> List[int]:
-        return self.encode(text)
-
-    @abstractmethod
-    def encode(self, text: str) -> List[int]:
-        """Turns str text into int tokens."""
-
-    def decode(self, tokens: List[int]) -> str:
-        """Turns int tokens into str text."""
-        return self.sep.join(self._id2label[t] for t in tokens if t not in self._util_ids)
-
-
-class Chars(Base):
-    """Chars vocabulary."""
-
-    def __init__(
-        self, punct=True, spaces=False, apostrophe=True, add_blank_at="last_but_one",
-    ):
-        labels = []
-        self.space, labels = len(labels), labels + [' ']  # Space
-        labels.extend(string.ascii_lowercase)
-        if apostrophe:
-            labels.append("'")  # Apostrophe for saving "don't" and "Joe's"
-
-        if punct:
-            labels.extend(self.PUNCT)
-
-        super().__init__(labels, add_blank_at=add_blank_at)
-
-        self.punct = punct
-        self.spaces = spaces
-
-        self._parser = parsers.ENCharParser(labels)
-
-    def encode(self, text):
-        """See base class."""
-        text = self._parser._normalize(text)  # noqa
-
-        if self.spaces:
-            for p in set(text) & set(self.PUNCT):
-                text = text.replace(p, f' {p} ')
-            text = text.strip().replace('  ', ' ')
-
-        return self._parser._tokenize(text)  # noqa
-
-
-class Phonemes(Base):
-    """Phonemes vocabulary."""
-
-    # fmt: off
-    VOWELS = (
-        'AA', 'AE', 'AH', 'AO', 'AW',
-        'AY', 'EH', 'ER', 'EY', 'IH',
-        'IY', 'OW', 'OY', 'UH', 'UW',
-    )
-    CONSONANTS = (
-        'B', 'CH', 'D', 'DH', 'F', 'G',
-        'HH', 'JH', 'K', 'L', 'M', 'N',
-        'NG', 'P', 'R', 'S', 'SH', 'T',
-        'TH', 'V', 'W', 'Y', 'Z', 'ZH',
-    )
-    # fmt: on
-
-    def __init__(
-        self,
-        punct=True,
-        stresses=False,
-        spaces=True,
-        chars=False,
-        *,
-        space=' ',
-        silence=None,
-        apostrophe=True,
-        oov=Base.OOV,
-        sep='|',  # To be able to distinguish between 2/3-letter codes.
-        add_blank_at="last_but_one",
-        pad_with_space=False,
-        improved_version_g2p=False,
-        phoneme_dict_path=None,
-    ):
-        labels = []
-        self.space, labels = len(labels), labels + [space]  # Space
-
-        if silence is not None:
-            self.silence, labels = len(labels), labels + [silence]  # Silence
-
-        labels.extend(self.CONSONANTS)
-        vowels = list(self.VOWELS)
-
-        if stresses:
-            vowels = [f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))]
-        labels.extend(vowels)
-
-        if chars:
-            labels.extend(string.ascii_lowercase)
-
-        if apostrophe:
-            labels.append("'")  # Apostrophe
-
-        if punct:
-            labels.extend(self.PUNCT)
-
-        super().__init__(labels, oov=oov, sep=sep, add_blank_at=add_blank_at)
-
-        self.punct = punct
-        self.stresses = stresses
-        self.spaces = spaces
-        self.pad_with_space = pad_with_space
-
-        # g2p_en tries to run download_corpora() on import but it is not rank zero guarded
-        # Try to check if torch distributed is available, if not get global rank zero to download corpora and make
-        # all other ranks sleep for a minute
-        if torch.distributed.is_available() and torch.distributed.is_initialized():
-            group = torch.distributed.group.WORLD
-            if is_global_rank_zero():
-                download_corpora()
-            torch.distributed.barrier(group=group)
-        elif is_global_rank_zero():
-            logging.error(
-                f"Torch distributed needs to be initialized before you initialized {self}. This class is prone to "
-                "data access race conditions. Now downloading corpora from global rank 0. If other ranks pass this "
-                "before rank 0, errors might result."
-            )
-            download_corpora()
-        else:
-            logging.error(
-                f"Torch distributed needs to be initialized before you initialized {self}. This class is prone to "
-                "data access race conditions. This process is not rank 0, and now going to sleep for 1 min. If this "
-                "rank wakes from sleep prior to rank 0 finishing downloading, errors might result."
-            )
-            time.sleep(60)
-
-        import g2p_en  # noqa pylint: disable=import-outside-toplevel
-
-        _g2p = g2p_en.G2p()
-        _g2p.variables = None
-
-        if improved_version_g2p:
-            self.g2p = G2p(_g2p, phoneme_dict_path)
-        else:
-            self.g2p = _g2p
-
-    def encode(self, text):
-        """See base class."""
-        ps, space, labels = [], self.labels[self.space], set(self.labels)
-
-        for p in self.g2p(text):  # noqa
-            # Remove stress
-            if p.isalnum() and len(p) == 3 and not self.stresses:
-                p = p[:2]
-
-            # Add space if last one isn't one
-            if p == space and len(ps) > 0 and ps[-1] != space:
-                ps.append(p)
-
-            # Add next phoneme
-            if (p.isalnum() or p == "'") and p in labels:
-                ps.append(p)
-
-            # Add punct and remove space if needed
-            if (p in self.PUNCT) and self.punct:
-                if not self.spaces and len(ps) > 0 and ps[-1] == space:
-                    ps.pop()
-                ps.append(p)
-
-        # Remove trailing spaces
-        while ps[-1] == space:
-            ps.pop()
-
-        if self.pad_with_space:
-            ps = [space] + ps + [space]
-
-        return [self._label2id[p] for p in ps]
-
-    @contextmanager
-    def set_phone_prob(self, prob=None):
-        # Do nothing since this class doesn't support mixed g2p
-        yield
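
Note: for reference, a rough sketch of how the deleted Chars and Phonemes vocabularies were driven before this removal. This is illustrative only, not part of the patch: it assumes a pre-removal checkout where the old import path nemo.collections.common.data.vocabs still exists, plus g2p_en and the NLTK corpora installed; exact token ids and decoded strings depend on the constructor flags, the parser's normalization, and the phoneme dictionary in use.

    # Illustrative usage against the pre-removal module (not runnable after this patch).
    from nemo.collections.common.data.vocabs import Chars, Phonemes

    # Character-level vocabulary: normalize text, then map each char to an int id.
    chars = Chars(punct=True, apostrophe=True)
    char_ids = chars.encode("Hello, world!")  # list of int label ids
    print(chars.decode(char_ids))             # normalized text, e.g. "hello, world!"

    # Phoneme-level vocabulary: runs text through g2p_en (ARPABET);
    # sep='|' in decode() keeps 2/3-letter phoneme codes distinguishable.
    phonemes = Phonemes(stresses=False, punct=True)
    phone_ids = phonemes.encode("Hello, world!")
    print(phonemes.decode(phone_ids))          # phoneme labels joined by '|'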