From da622be4a66387a6c27ea93cce7f958b2cf6621f Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 16 Jul 2024 00:35:10 +0800 Subject: [PATCH 01/44] Add multi-dictionary preprocessing and training --- basics/base_binarizer.py | 98 +++++--- basics/base_exporter.py | 15 ++ basics/base_task.py | 26 +- deployment/exporters/acoustic_exporter.py | 17 +- deployment/exporters/variance_exporter.py | 17 +- deployment/modules/fastspeech2.py | 2 +- inference/ds_acoustic.py | 18 +- inference/ds_variance.py | 18 +- modules/fastspeech/acoustic_encoder.py | 2 +- modules/fastspeech/variance_encoder.py | 2 +- preprocessing/acoustic_binarizer.py | 12 +- preprocessing/variance_binarizer.py | 10 +- training/acoustic_task.py | 2 +- training/variance_task.py | 4 +- utils/phoneme_utils.py | 279 ++++++++++++++-------- utils/text_encoder.py | 53 ---- 16 files changed, 318 insertions(+), 257 deletions(-) delete mode 100644 utils/text_encoder.py diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index ddad6e02e..896bfd3a4 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -13,9 +13,8 @@ from utils.hparams import hparams from utils.indexed_datasets import IndexedDatasetBuilder from utils.multiprocess_utils import chunked_multiprocess_run -from utils.phoneme_utils import build_phoneme_list, locate_dictionary +from utils.phoneme_utils import load_phoneme_dictionary from utils.plot import distribution_to_figure -from utils.text_encoder import TokenTextEncoder class BinarizationError(Exception): @@ -58,17 +57,22 @@ def __init__(self, data_dir=None, data_attrs=None): self.augmentation_args = hparams.get('augmentation_args', {}) self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.spk_map = None + self.spk_map = {} self.spk_ids = hparams['spk_ids'] self.speakers = hparams['speakers'] self.build_spk_map() + self.lang_map = {} + self.dictionaries = hparams['dictionaries'] + self.languages = hparams['languages'] + self.build_lang_map() + self.items = {} self.item_names: list = None self._train_item_names: list = None self._valid_item_names: list = None - self.phone_encoder = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() self.timestep = hparams['hop_size'] / hparams['audio_sample_rate'] def build_spk_map(self): @@ -83,7 +87,6 @@ def build_spk_map(self): assert max(self.spk_ids) < hparams['num_spk'], \ f'Index in spk_id sequence {self.spk_ids} is out of range. All values should be smaller than num_spk.' - self.spk_map = {} for spk_name, spk_id in zip(self.speakers, self.spk_ids): if spk_name in self.spk_map and self.spk_map[spk_name] != spk_id: raise ValueError(f'Invalid speaker ID assignment. Name \'{spk_name}\' is assigned ' @@ -92,7 +95,19 @@ def build_spk_map(self): print("| spk_map: ", self.spk_map) - def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): + def build_lang_map(self): + assert isinstance(self.languages, list), 'Languages must be a list' + assert len(self.languages) == len(self.raw_data_dirs), \ + 'Number of raw data dirs must equal number of language names!' + for lang in self.languages: + assert lang in self.dictionaries, f'Unrecognized language name: {lang}' + assert len(self.dictionaries.keys()) <= hparams['num_lang'], \ + 'Number of languages must not be greater than num_lang!' 
+ + for lang_id, lang_name in enumerate(sorted(self.dictionaries.keys())): + self.lang_map[lang_name] = lang_id + + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): raise NotImplementedError() def split_train_valid_set(self, item_names): @@ -167,8 +182,8 @@ def meta_data_iterator(self, prefix): def process(self): # load each dataset - for ds_id, spk_id, data_dir in zip(range(len(self.raw_data_dirs)), self.spk_ids, self.raw_data_dirs): - self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk_id=spk_id) + for ds_id, (data_dir, spk, lang) in enumerate(zip(self.raw_data_dirs, self.speakers, self.languages)): + self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk=spk, lang=lang) self.item_names = sorted(list(self.items.keys())) self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names) @@ -177,11 +192,15 @@ def process(self): self.binary_data_dir.mkdir(parents=True, exist_ok=True) - # Copy spk_map and dictionary to binary data dir + # Copy spk_map, lang_map and dictionary to binary data dir spk_map_fn = self.binary_data_dir / 'spk_map.json' with open(spk_map_fn, 'w', encoding='utf-8') as f: - json.dump(self.spk_map, f) - shutil.copy(locate_dictionary(), self.binary_data_dir / 'dictionary.txt') + json.dump(self.spk_map, f, ensure_ascii=False) + lang_map_fn = self.binary_data_dir / 'lang_map.json' + with open(lang_map_fn, 'w', encoding='utf-8') as f: + json.dump(self.spk_map, f, ensure_ascii=False) + for lang, dict_path in hparams['dictionaries'].items(): + shutil.copy(dict_path, self.binary_data_dir / f'dictionary-{lang}.txt') self.check_coverage() # Process valid set and train set @@ -197,40 +216,45 @@ def process(self): def check_coverage(self): # Group by phonemes in the dictionary. - ph_required = set(build_phoneme_list()) - phoneme_map = {} - for ph in ph_required: - phoneme_map[ph] = 0 - ph_occurred = [] + ph_idx_required = set(range(1, len(self.phoneme_dictionary))) + ph_idx_occurred = set() + ph_idx_count_map = { + idx: 0 + for idx in ph_idx_required + } # Load and count those phones that appear in the actual data for item_name in self.items: - ph_occurred += self.items[item_name]['ph_seq'] - if len(ph_occurred) == 0: - raise BinarizationError(f'Empty tokens in {item_name}.') - for ph in ph_occurred: - if ph not in ph_required: - continue - phoneme_map[ph] += 1 - ph_occurred = set(ph_occurred) + ph_idx_occurred.update(self.items[item_name]['ph_seq']) + for idx in self.items[item_name]['ph_seq']: + ph_idx_count_map[idx] += 1 + ph_count_map = { + self.phoneme_dictionary.decode_one(idx, scalar=False): count + for idx, count in ph_idx_count_map.items() + } print('===== Phoneme Distribution Summary =====') - for i, key in enumerate(sorted(phoneme_map.keys())): - if i == len(ph_required) - 1: + keys = sorted(ph_count_map.keys(), key=lambda v: v[0] if isinstance(v, tuple) else v) + for i, key in enumerate(keys): + if i == len(ph_count_map) - 1: end = '\n' elif i % 10 == 9: end = ',\n' else: end = ', ' - print(f'\'{key}\': {phoneme_map[key]}', end=end) + if isinstance(key, tuple): + key_disp = '(' + ', '.join(key) + ')' + else: + key_disp = key + print(f'{key_disp}: {ph_count_map[key]}', end=end) # Draw graph. 
- x = sorted(phoneme_map.keys()) - values = [phoneme_map[k] for k in x] + xs = [str(k) for k in keys] + ys = [ph_count_map[k] for k in keys] plt = distribution_to_figure( title='Phoneme Distribution Summary', x_label='Phoneme', y_label='Number of occurrences', - items=x, values=values + items=xs, values=ys ) filename = self.binary_data_dir / 'phoneme_distribution.jpg' plt.savefig(fname=filename, @@ -239,12 +263,14 @@ def check_coverage(self): print(f'| save summary to \'{filename}\'') # Check unrecognizable or missing phonemes - if ph_occurred != ph_required: - unrecognizable_phones = ph_occurred.difference(ph_required) - missing_phones = ph_required.difference(ph_occurred) - raise BinarizationError('transcriptions and dictionary mismatch.\n' - f' (+) {sorted(unrecognizable_phones)}\n' - f' (-) {sorted(missing_phones)}') + if ph_idx_occurred != ph_idx_required: + missing_phones = sorted({ + self.phoneme_dictionary.decode_one(idx, scalar=False) + for idx in ph_idx_required.difference(ph_idx_occurred) + }, key=lambda v: v[0] if isinstance(v, tuple) else v) + raise BinarizationError( + f'The following phonemes are not covered in transcriptions: {sorted(missing_phones)}' + ) def process_dataset(self, prefix, num_workers=0, apply_augmentation=False): args = [] diff --git a/basics/base_exporter.py b/basics/base_exporter.py index cc016004a..e2e65f534 100644 --- a/basics/base_exporter.py +++ b/basics/base_exporter.py @@ -1,4 +1,6 @@ import json +import pathlib +import shutil from pathlib import Path from typing import Union @@ -44,6 +46,19 @@ def export_model(self, path: Path): """ raise NotImplementedError() + # noinspection PyMethodMayBeStatic + def export_dictionaries(self, path: Path): + dicts = hparams.get('dictionaries') + if dicts is not None: + for lang in dicts.keys(): + fn = f'dictionary-{lang}.txt' + shutil.copy(pathlib.Path(hparams['work_dir']) / fn, path) + print(f'| export dictionary => {path / fn}') + else: + fn = 'dictionary.txt' + shutil.copy(pathlib.Path(hparams['work_dir']) / fn, path) + print(f'| export dictionary => {path / fn}') + def export_attachments(self, path: Path): """ Exports related files and configs (e.g. the dictionary) to the target directory. 
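
For reference, the export logic above assumes a configuration of roughly the following shape. This is a hedged sketch: the language names, dictionary paths and counts are invented placeholders, not values shipped with this change.

    # Hypothetical hparams fragment for a bilingual setup (all values invented):
    hparams = {
        'dictionaries': {                 # one pronunciation dictionary per language
            'zh': 'dictionaries/zh-dict.txt',
            'en': 'dictionaries/en-dict.txt',
        },
        'languages': ['zh', 'zh', 'en'],  # one language tag per entry in raw_data_dirs
        'num_lang': 2,                    # must be at least len(dictionaries)
        'work_dir': 'checkpoints/my_exp',
    }
    # With this config, export_dictionaries() copies
    # checkpoints/my_exp/dictionary-zh.txt and checkpoints/my_exp/dictionary-en.txt
    # into the export directory; a config without 'dictionaries' falls back to the
    # single legacy checkpoints/my_exp/dictionary.txt.
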
diff --git a/basics/base_task.py b/basics/base_task.py index 768f8e311..767f1458d 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -8,7 +8,6 @@ import matplotlib import utils -from utils.text_encoder import TokenTextEncoder matplotlib.use('Agg') @@ -24,7 +23,7 @@ DsBatchSampler, DsTensorBoardLogger, get_latest_checkpoint_path, get_strategy ) -from utils.phoneme_utils import locate_dictionary, build_phoneme_list +from utils.phoneme_utils import load_phoneme_dictionary torch.multiprocessing.set_sharing_strategy(os.getenv('TORCH_SHARE_STRATEGY', 'file_system')) @@ -71,7 +70,7 @@ def __init__(self, *args, **kwargs): self.skip_immediate_validation = False self.skip_immediate_ckpt_save = False - self.phone_encoder = self.build_phone_encoder() + self.phoneme_dictionary = load_phoneme_dictionary() self.build_model() self.valid_losses: Dict[str, Metric] = {} @@ -165,11 +164,6 @@ def load_pre_train_model(self): else: raise RuntimeError("") - @staticmethod - def build_phone_encoder(): - phone_list = build_phoneme_list() - return TokenTextEncoder(vocab_list=phone_list) - def _build_model(self): raise NotImplementedError() @@ -448,21 +442,19 @@ def start(cls): if not hparams['infer']: # train @rank_zero_only def train_payload_copy(): - # Copy spk_map.json and dictionary.txt to work dir + # Copy files to work_dir binary_dir = pathlib.Path(hparams['binary_data_dir']) spk_map = work_dir / 'spk_map.json' spk_map_src = binary_dir / 'spk_map.json' if not spk_map.exists() and spk_map_src.exists(): shutil.copy(spk_map_src, spk_map) print(f'| Copied spk map to {spk_map}.') - dictionary = work_dir / 'dictionary.txt' - dict_src = binary_dir / 'dictionary.txt' - if not dictionary.exists(): - if dict_src.exists(): - shutil.copy(dict_src, dictionary) - else: - shutil.copy(locate_dictionary(), dictionary) - print(f'| Copied dictionary to {dictionary}.') + for lang in hparams['dictionaries'].keys(): + dict_dst = work_dir / f'dictionary-{lang}.txt' + dict_src = binary_dir / f'dictionary-{lang}.txt' + if not dict_dst.exists(): + shutil.copy(dict_src, dict_dst) + print(f'| Copied dictionary for language \'{lang}\' to {dict_dst}.') train_payload_copy() trainer.fit(task, ckpt_path=get_latest_checkpoint_path(work_dir)) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 4f0f533e2..1f56a9ce4 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -1,4 +1,3 @@ -import shutil from pathlib import Path from typing import List, Union, Tuple, Dict @@ -12,8 +11,7 @@ from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST from utils import load_ckpt, onnx_helper, remove_suffix from utils.hparams import hparams -from utils.phoneme_utils import locate_dictionary, build_phoneme_list -from utils.text_encoder import TokenTextEncoder +from utils.phoneme_utils import load_phoneme_dictionary class DiffSingerAcousticExporter(BaseExporter): @@ -32,7 +30,7 @@ def __init__( self.model_name: str = hparams['exp_name'] self.ckpt_steps: int = ckpt_steps self.spk_map: dict = self.build_spk_map() - self.vocab = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() self.model = self.build_model() self.fs2_aux_cache_path = self.cache_dir / ( 'fs2_aux.onnx' if self.model.use_shallow_diffusion else 'fs2.onnx' @@ -80,7 +78,7 @@ def __init__( def build_model(self) -> DiffSingerAcousticONNX: model = DiffSingerAcousticONNX( - vocab_size=len(self.vocab), + 
vocab_size=len(self.phoneme_dictionary), out_dims=hparams['audio_num_mel_bins'] ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=self.ckpt_steps, @@ -111,7 +109,7 @@ def export_attachments(self, path: Path): path / f'{self.model_name}.{spk[0]}.emb', self._perform_spk_mix(spk[1]) ) - self._export_dictionary(path / 'dictionary.txt') + self.export_dictionaries(path) self._export_phonemes(path / f'{self.model_name}.phonemes.txt') model_name = self.model_name @@ -395,11 +393,6 @@ def _export_spk_embed(self, path: Path, spk_embed: torch.Tensor): f.write(spk_embed.cpu().numpy().tobytes()) print(f'| export spk embed => {path}') - # noinspection PyMethodMayBeStatic - def _export_dictionary(self, path: Path): - print(f'| export dictionary => {path}') - shutil.copy(locate_dictionary(), path) - def _export_phonemes(self, path: Path): - self.vocab.store_to_file(path) + self.phoneme_dictionary.dump(path) print(f'| export phonemes => {path}') diff --git a/deployment/exporters/variance_exporter.py b/deployment/exporters/variance_exporter.py index 1af433ae4..4e594c407 100644 --- a/deployment/exporters/variance_exporter.py +++ b/deployment/exporters/variance_exporter.py @@ -1,4 +1,3 @@ -import shutil from pathlib import Path from typing import Union, List, Tuple, Dict @@ -12,8 +11,7 @@ from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST from utils import load_ckpt, onnx_helper, remove_suffix from utils.hparams import hparams -from utils.phoneme_utils import locate_dictionary, build_phoneme_list -from utils.text_encoder import TokenTextEncoder +from utils.phoneme_utils import load_phoneme_dictionary class DiffSingerVarianceExporter(BaseExporter): @@ -32,7 +30,7 @@ def __init__( self.model_name: str = hparams['exp_name'] self.ckpt_steps: int = ckpt_steps self.spk_map: dict = self.build_spk_map() - self.vocab = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() self.model = self.build_model() self.linguistic_encoder_cache_path = self.cache_dir / 'linguistic.onnx' self.dur_predictor_cache_path = self.cache_dir / 'dur.onnx' @@ -83,7 +81,7 @@ def __init__( def build_model(self) -> DiffSingerVarianceONNX: model = DiffSingerVarianceONNX( - vocab_size=len(self.vocab) + vocab_size=len(self.phoneme_dictionary) ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=self.ckpt_steps, prefix_in_ckpt='model', strict=True, device=self.device) @@ -142,7 +140,7 @@ def export_attachments(self, path: Path): path / f'{self.model_name}.{spk[0]}.emb', self._perform_spk_mix(spk[1]) ) - self._export_dictionary(path / 'dictionary.txt') + self.export_dictionaries(path) self._export_phonemes((path / f'{self.model_name}.phonemes.txt')) model_name = self.model_name @@ -771,11 +769,6 @@ def _export_spk_embed(self, path: Path, spk_embed: torch.Tensor): f.write(spk_embed.cpu().numpy().tobytes()) print(f'| export spk embed => {path}') - # noinspection PyMethodMayBeStatic - def _export_dictionary(self, path: Path): - print(f'| export dictionary => {path}') - shutil.copy(locate_dictionary(), path) - def _export_phonemes(self, path: Path): - self.vocab.store_to_file(path) + self.phoneme_dictionary.dump(path) print(f'| export phonemes => {path}') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index d0a3c7b5a..48a3afb40 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -9,7 +9,7 @@ from modules.fastspeech.acoustic_encoder import FastSpeech2Acoustic from 
modules.fastspeech.variance_encoder import FastSpeech2Variance from utils.hparams import hparams -from utils.text_encoder import PAD_INDEX +from utils.phoneme_utils import PAD_INDEX f0_bin = 256 f0_max = 1100.0 diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index a67f5b166..7b93d38cb 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -1,12 +1,11 @@ -from collections import OrderedDict - -import tqdm import json import pathlib +from collections import OrderedDict +from typing import Dict import numpy as np import torch -from typing import Dict +import tqdm from basics.base_svs_infer import BaseSVSInfer from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST @@ -16,8 +15,7 @@ from utils import load_ckpt from utils.hparams import hparams from utils.infer_utils import cross_fade, resample_align_curve, save_wav -from utils.phoneme_utils import build_phoneme_list -from utils.text_encoder import TokenTextEncoder +from utils.phoneme_utils import load_phoneme_dictionary class DiffSingerAcousticInfer(BaseSVSInfer): @@ -37,7 +35,7 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N if hparams.get('use_tension_embed', False): self.variances_to_embed.add('tension') - self.ph_encoder = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() if hparams['use_spk_id']: with open(pathlib.Path(hparams['work_dir']) / 'spk_map.json', 'r', encoding='utf8') as f: self.spk_map = json.load(f) @@ -50,7 +48,7 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N def build_model(self, ckpt_steps=None): model = DiffSingerAcoustic( - vocab_size=len(self.ph_encoder), + vocab_size=len(self.phoneme_dictionary), out_dims=hparams['audio_num_mel_bins'] ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=ckpt_steps, @@ -73,7 +71,9 @@ def preprocess_input(self, param, idx=0): """ batch = {} summary = OrderedDict() - txt_tokens = torch.LongTensor([self.ph_encoder.encode(param['ph_seq'])]).to(self.device) # => [B, T_txt] + txt_tokens = torch.LongTensor([ + self.phoneme_dictionary.encode(param['ph_seq']) + ]).to(self.device) # => [B, T_txt] batch['tokens'] = txt_tokens ph_dur = torch.from_numpy(np.array(param['ph_dur'].split(), np.float32)).to(self.device) diff --git a/inference/ds_variance.py b/inference/ds_variance.py index c8a9b090a..86abd1abd 100644 --- a/inference/ds_variance.py +++ b/inference/ds_variance.py @@ -1,31 +1,29 @@ import copy import json - -import tqdm import pathlib from collections import OrderedDict +from typing import List, Tuple import librosa import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +import tqdm from scipy import interpolate -from typing import List, Tuple from basics.base_svs_infer import BaseSVSInfer +from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST from modules.fastspeech.tts_modules import ( LengthRegulator, RhythmRegulator, mel2ph_to_dur ) -from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST from modules.toplevel import DiffSingerVariance from utils import load_ckpt from utils.hparams import hparams from utils.infer_utils import resample_align_curve -from utils.phoneme_utils import build_phoneme_list +from utils.phoneme_utils import load_phoneme_dictionary from utils.pitch_utils import interp_f0 -from utils.text_encoder import TokenTextEncoder class DiffSingerVarianceInfer(BaseSVSInfer): @@ -34,7 +32,7 @@ def __init__( predictions: set = None ): 
super().__init__(device=device) - self.ph_encoder = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() if hparams['use_spk_id']: with open(pathlib.Path(hparams['work_dir']) / 'spk_map.json', 'r', encoding='utf8') as f: self.spk_map = json.load(f) @@ -76,7 +74,7 @@ def __init__( def build_model(self, ckpt_steps=None): model = DiffSingerVariance( - vocab_size=len(self.ph_encoder) + vocab_size=len(self.phoneme_dictionary) ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=ckpt_steps, prefix_in_ckpt='model', strict=True, device=self.device) @@ -97,7 +95,9 @@ def preprocess_input( """ batch = {} summary = OrderedDict() - txt_tokens = torch.LongTensor([self.ph_encoder.encode(param['ph_seq'].split())]).to(self.device) # [B=1, T_ph] + txt_tokens = torch.LongTensor([ + self.phoneme_dictionary.encode(param['ph_seq'].split()) + ]).to(self.device) # [B=1, T_ph] T_ph = txt_tokens.shape[1] batch['tokens'] = txt_tokens ph_num = torch.from_numpy(np.array([param['ph_num'].split()], np.int64)).to(self.device) # [B=1, T_w] diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 9ab4ed633..b1507837e 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -8,7 +8,7 @@ ) from modules.fastspeech.tts_modules import FastSpeech2Encoder, mel2ph_to_dur from utils.hparams import hparams -from utils.text_encoder import PAD_INDEX +from utils.phoneme_utils import PAD_INDEX class FastSpeech2Acoustic(nn.Module): diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 82e0a88e8..2031d89ce 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -8,7 +8,7 @@ ) from modules.fastspeech.tts_modules import FastSpeech2Encoder, DurationPredictor from utils.hparams import hparams -from utils.text_encoder import PAD_INDEX +from utils.phoneme_utils import PAD_INDEX class FastSpeech2Variance(nn.Module): diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index b61c88f88..bf799abe5 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -67,17 +67,19 @@ def __init__(self): "See https://github.com/openvpi/DiffSinger/releases/tag/v2.3.0 for more details." ) - def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): meta_data_dict = {} with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: for utterance_label in csv.DictReader(f): item_name = utterance_label['name'] temp_dict = { 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': utterance_label['ph_seq'].split(), + 'ph_seq': self.phoneme_dictionary.encode(utterance_label['ph_seq'], lang=lang), 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], - 'spk_id': spk_id, - 'spk_name': self.speakers[ds_id], + 'spk_id': self.spk_map[spk], + 'spk_name': spk, + 'language_id': self.lang_map[lang], + 'language_name': lang, } assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
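
Since `ph_seq` is now stored as encoded IDs at load time, here is a small usage sketch of the `PhonemeDictionary` class this patch introduces in utils/phoneme_utils.py (further below). The dictionary files are toy data written to a temporary directory purely for illustration; nothing here comes from the shipped dictionaries.

    import pathlib
    import tempfile

    from utils.phoneme_utils import PhonemeDictionary

    # Two toy dictionaries; with more than one language, every phoneme is
    # namespaced as '<lang>/<phoneme>' (AP and SP stay global).
    tmp = pathlib.Path(tempfile.mkdtemp())
    (tmp / 'zh.txt').write_text('a\ta\nshi\tsh ir\n', encoding='utf8')
    (tmp / 'en.txt').write_text('ah\taa\n', encoding='utf8')

    pd = PhonemeDictionary(
        dictionaries={'zh': tmp / 'zh.txt', 'en': tmp / 'en.txt'},
        merged_groups=[['zh/a', 'en/aa']],  # alias the two vowels onto one shared ID
    )
    ids = pd.encode('a SP en/aa', lang='zh')  # bare names get the 'zh/' prefix
    assert ids[0] == ids[2]                   # merged aliases encode to the same ID
    print(pd.decode(ids, lang='zh'))          # -> 'zh/a SP zh/a'
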
@@ -106,7 +108,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'seconds': seconds, 'length': length, 'mel': mel, - 'tokens': np.array(self.phone_encoder.encode(meta_data['ph_seq']), dtype=np.int64), + 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64), 'ph_dur': np.array(meta_data['ph_dur']).astype(np.float32), } diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 2882cf769..99ed65c31 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -108,7 +108,7 @@ def load_attr_from_ds(self, ds_id, name, attr, idx=0): ds = ds[idx] return ds.get(attr) - def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): meta_data_dict = {} with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf8') as f: @@ -130,10 +130,12 @@ def require(attr): temp_dict = { 'ds_idx': item_idx, - 'spk_id': spk_id, + 'spk_id': self.spk_map[spk], 'spk_name': self.speakers[ds_id], + 'language_id': self.lang_map[lang], + 'language_name': lang, 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': require('ph_seq').split(), + 'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), 'ph_dur': [float(x) for x in require('ph_dur').split()] } @@ -249,7 +251,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'spk_name': meta_data['spk_name'], 'seconds': seconds, 'length': length, - 'tokens': np.array(self.phone_encoder.encode(meta_data['ph_seq']), dtype=np.int64) + 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64) } ph_dur_sec = torch.FloatTensor(meta_data['ph_dur']).to(self.device) diff --git a/training/acoustic_task.py b/training/acoustic_task.py index de6a9adb5..79f05003f 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -92,7 +92,7 @@ def __init__(self): def _build_model(self): return DiffSingerAcoustic( - vocab_size=len(self.phone_encoder), + vocab_size=len(self.phoneme_dictionary), out_dims=hparams['audio_num_mel_bins'] ) diff --git a/training/variance_task.py b/training/variance_task.py index 88a844952..0a33301e6 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -113,7 +113,7 @@ def __init__(self): def _build_model(self): return DiffSingerVariance( - vocab_size=len(self.phone_encoder), + vocab_size=len(self.phoneme_dictionary), ) # noinspection PyAttributeOutsideInit @@ -295,7 +295,7 @@ def sample_get(key, idx, abs_idx): def plot_dur(self, data_idx, gt_dur, pred_dur, txt=None): gt_dur = gt_dur[0].cpu().numpy() pred_dur = pred_dur[0].cpu().numpy() - txt = self.phone_encoder.decode(txt[0].cpu().numpy()).split() + txt = self.phoneme_dictionary.decode(txt[0].cpu().numpy()).split() title_text = f"{self.valid_dataset.metadata['spk_names'][data_idx]} - {self.valid_dataset.metadata['names'][data_idx]}" self.logger.all_rank_experiment.add_figure(f'dur_{data_idx}', dur_to_figure( gt_dur, pred_dur, txt, title_text diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 269122a6d..1547bc9eb 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -1,99 +1,190 @@ +import json import pathlib - -try: - from lightning.pytorch.utilities.rank_zero import rank_zero_info -except ModuleNotFoundError: - rank_zero_info = print +from typing import Dict, List, Union from utils.hparams import hparams -_initialized = False -_ALL_CONSONANTS_SET = set() -_ALL_VOWELS_SET = set() -_dictionary = { - 'AP': ['AP'], - 'SP': ['SP'] -} 
-_phoneme_list: list - - -def locate_dictionary(): - """ - Search and locate the dictionary file. - Order: - 1. hparams['dictionary'] - 2. hparams['g2p_dictionary'] - 3. 'dictionary.txt' in hparams['work_dir'] - 4. file with same name as hparams['g2p_dictionary'] in hparams['work_dir'] - :return: pathlib.Path of the dictionary file - """ - assert 'dictionary' in hparams or 'g2p_dictionary' in hparams, \ - 'Please specify a dictionary file in your config.' - config_dict_path = pathlib.Path(hparams['dictionary']) - if config_dict_path.exists(): - return config_dict_path - work_dir = pathlib.Path(hparams['work_dir']) - ckpt_dict_path = work_dir / config_dict_path.name - if ckpt_dict_path.exists(): - return ckpt_dict_path - ckpt_dict_path = work_dir / 'dictionary.txt' - if ckpt_dict_path.exists(): - return ckpt_dict_path - raise FileNotFoundError('Unable to locate the dictionary file. ' - 'Please specify the right dictionary in your config.') - - -def _build_dict_and_list(): - global _dictionary, _phoneme_list - - _set = set() - with open(locate_dictionary(), 'r', encoding='utf8') as _df: - _lines = _df.readlines() - for _line in _lines: - _pinyin, _ph_str = _line.strip().split('\t') - _dictionary[_pinyin] = _ph_str.split() - for _list in _dictionary.values(): - [_set.add(ph) for ph in _list] - _phoneme_list = sorted(list(_set)) - rank_zero_info('| load phoneme set: ' + str(_phoneme_list)) - - -def _initialize_consonants_and_vowels(): - # Currently we only support two-part consonant-vowel phoneme systems. - for _ph_list in _dictionary.values(): - _ph_count = len(_ph_list) - if _ph_count == 0 or _ph_list[0] in ['AP', 'SP']: - continue - elif len(_ph_list) == 1: - _ALL_VOWELS_SET.add(_ph_list[0]) +PAD_INDEX = 0 + + +class PhonemeDictionary: + def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[List[str]] = None): + all_phonemes = {'AP', 'SP'} + self._multi_langs = len(dictionaries) > 1 + for lang, dict_path in dictionaries.items(): + with open(dict_path, 'r', encoding='utf8') as dict_file: + for line in dict_file: + _, phonemes = line.strip().split('\t') + phonemes = phonemes.split() + for phoneme in phonemes: + if '/' in phoneme: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in dictionary '{dict_path}': " + f"should not contain the reserved character '/'." + ) + if self._multi_langs: + all_phonemes.add(f'{lang}/{phoneme}') + else: + all_phonemes.add(phoneme) + if merged_groups is None: + merged_groups = [] else: - _ALL_CONSONANTS_SET.add(_ph_list[0]) - _ALL_VOWELS_SET.add(_ph_list[1]) - - -def _initialize(): - global _initialized - if not _initialized: - _build_dict_and_list() - _initialize_consonants_and_vowels() - _initialized = True - - -def get_all_consonants(): - _initialize() - return sorted(_ALL_CONSONANTS_SET) - - -def get_all_vowels(): - _initialize() - return sorted(_ALL_VOWELS_SET) - - -def build_dictionary() -> dict: - _initialize() - return _dictionary - - -def build_phoneme_list() -> list: - _initialize() - return _phoneme_list + if self._multi_langs: + for group in merged_groups: + for phoneme in group: + if '/' not in phoneme: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + "should specify language by '/' prefix." + ) + lang, name = phoneme.split('/', maxsplit=1) + if lang not in dictionaries: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + f"unrecognized language name '{lang}'." 
+ ) + merged_groups = [set(phones) for phones in merged_groups if len(phones) > 1] + else: + _merged_groups = [] + for group in merged_groups: + _group = [] + for phoneme in group: + if '/' in phoneme: + lang, name = phoneme.split('/', maxsplit=1) + if lang not in dictionaries: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + f"unrecognized language name '{lang}'." + ) + _group.append(name) + else: + _group.append(phoneme) + _merged_groups.append(_group) + merged_groups = [set(phones) for phones in _merged_groups if len(phones) > 1] + merged_phonemes_inverted_index = {} + for idx, group in enumerate(merged_groups): + other_idx = None + for phoneme in group: + if phoneme in merged_phonemes_inverted_index: + other_idx = merged_phonemes_inverted_index[phoneme] + break + target_idx = idx if other_idx is None else other_idx + for phoneme in group: + merged_phonemes_inverted_index[phoneme] = target_idx + if other_idx is not None: + merged_groups[other_idx] |= group + phone_to_id = {} + id_to_phone = [] + idx = 1 + for phoneme in sorted(all_phonemes): + if phoneme in merged_phonemes_inverted_index: + has_assigned = True + for alias in merged_groups[merged_phonemes_inverted_index[phoneme]]: + if alias not in phone_to_id: + has_assigned = False + phone_to_id[alias] = idx + if not has_assigned: + id_to_phone.append(tuple(sorted(merged_groups[merged_phonemes_inverted_index[phoneme]]))) + idx += 1 + else: + phone_to_id[phoneme] = idx + id_to_phone.append(phoneme) + idx += 1 + self._phone_to_id: Dict[str, int] = phone_to_id + self._id_to_phone: List[Union[str, tuple]] = id_to_phone + + @property + def vocab_size(self): + return len(self._id_to_phone) + 1 + + def __len__(self): + return self.vocab_size + + def encode_one(self, phone, lang=None): + if lang is None or not self._multi_langs or phone in self._phone_to_id: + return self._phone_to_id[phone] + if '/' not in phone: + phone = f'{lang}/{phone}' + return self._phone_to_id[phone] + + def encode(self, sentence, lang=None): + phones = sentence.strip().split() if isinstance(sentence, str) else sentence + return [self.encode_one(phone, lang=lang) for phone in phones] + + def decode_one(self, idx, lang=None, scalar=True): + if idx <= 0: + return None + phone = self._id_to_phone[idx - 1] + if not scalar or isinstance(phone, str): + return phone + if lang is None or not self._multi_langs: + return phone[0] + for alias in phone: + if alias.startswith(f'{lang}/'): + return alias + return phone[0] + + def decode(self, ids, lang=None, scalar=True): + ids = list(ids) + return ' '.join([ + self.decode_one(i, lang=lang, scalar=scalar) + for i in ids + if i >= 1 + ]) + + def dump(self, filename): + with open(filename, 'w', encoding='utf8') as fp: + json.dump(self._phone_to_id, fp, ensure_ascii=False, indent=2) + + +_dictionary = None + + +def load_phoneme_dictionary() -> PhonemeDictionary: + if _dictionary is not None: + return _dictionary + config_dicts = hparams.get('dictionaries') + if config_dicts is not None: + dicts = {} + for lang, config_dict_path in config_dicts.items(): + config_dict_path = pathlib.Path(config_dict_path) + if not config_dict_path.exists(): + config_dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' + if not config_dict_path.exists(): + raise FileNotFoundError( + f"Could not locate dictionary for language '{lang}'." 
+ ) + dicts[lang] = config_dict_path + else: + config_dict_path = pathlib.Path(hparams['dictionary']) + if not config_dict_path.exists(): + config_dict_path = pathlib.Path(hparams['work_dir']) / 'dictionary.txt' + if not config_dict_path.exists(): + raise FileNotFoundError( + f"Could not locate dictionary file." + ) + dicts = { + 'default': config_dict_path + } + return PhonemeDictionary( + dictionaries=dicts, + merged_groups=hparams.get('merged_phoneme_groups') + ) + + +if __name__ == '__main__': + d = PhonemeDictionary( + dictionaries={ + 'zh': 'dictionaries/opencpop-extension.txt', + # 'en': 'dictionaries/opencpop-extension.txt', + }, + merged_groups=[ + ['zh/a', 'zh/b', 'c'], + ['a', 'd', 'e'], + ['e', 'f'] + ] + ) + ph_ids = d.encode('sh ir zh e j v y i b a SP', lang='en') + ph_seq = d.decode(ph_ids) + print(ph_ids) + print(ph_seq) diff --git a/utils/text_encoder.py b/utils/text_encoder.py deleted file mode 100644 index 4b7815c46..000000000 --- a/utils/text_encoder.py +++ /dev/null @@ -1,53 +0,0 @@ -import numpy as np - -PAD = '' -PAD_INDEX = 0 - - -class TokenTextEncoder: - """Encoder based on a user-supplied vocabulary (file or list).""" - - def __init__(self, vocab_list): - """Initialize from a file or list, one token per line. - - Handling of reserved tokens works as follows: - - When initializing from a list, we add reserved tokens to the vocab. - - Args: - vocab_list: If not None, a list of elements of the vocabulary. - """ - self.vocab_list = sorted(vocab_list) - - def encode(self, sentence): - """Converts a space-separated string of phones to a list of ids.""" - phones = sentence.strip().split() if isinstance(sentence, str) else sentence - return [self.vocab_list.index(ph) + 1 if ph != PAD else PAD_INDEX for ph in phones] - - def decode(self, ids, strip_padding=False): - if strip_padding: - ids = np.trim_zeros(ids) - ids = list(ids) - return ' '.join([ - self.vocab_list[_id - 1] if _id >= 1 else PAD - for _id in ids - ]) - - @property - def vocab_size(self): - return len(self.vocab_list) + 1 - - def __len__(self): - return self.vocab_size - - def store_to_file(self, filename): - """Write vocab file to disk. - - Vocab files have one token per line. The file ends in a newline. Reserved - tokens are written to the vocab file as well. - - Args: - filename: Full path of the file to store the vocab to. 
- """ - with open(filename, 'w', encoding='utf8') as f: - print(PAD, file=f) - [print(tok, file=f) for tok in self.vocab_list] From a151ecf5dc99d847fefe4518b905ff5e03b8c95e Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 16 Jul 2024 01:28:24 +0800 Subject: [PATCH 02/44] Fix lang_map.json copy --- basics/base_binarizer.py | 2 +- basics/base_task.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index 896bfd3a4..8ec22c279 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -198,7 +198,7 @@ def process(self): json.dump(self.spk_map, f, ensure_ascii=False) lang_map_fn = self.binary_data_dir / 'lang_map.json' with open(lang_map_fn, 'w', encoding='utf-8') as f: - json.dump(self.spk_map, f, ensure_ascii=False) + json.dump(self.lang_map, f, ensure_ascii=False) for lang, dict_path in hparams['dictionaries'].items(): shutil.copy(dict_path, self.binary_data_dir / f'dictionary-{lang}.txt') self.check_coverage() diff --git a/basics/base_task.py b/basics/base_task.py index 767f1458d..b53133d39 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -444,17 +444,18 @@ def start(cls): def train_payload_copy(): # Copy files to work_dir binary_dir = pathlib.Path(hparams['binary_data_dir']) - spk_map = work_dir / 'spk_map.json' + spk_map_dst = work_dir / 'spk_map.json' spk_map_src = binary_dir / 'spk_map.json' - if not spk_map.exists() and spk_map_src.exists(): - shutil.copy(spk_map_src, spk_map) - print(f'| Copied spk map to {spk_map}.') + shutil.copy(spk_map_src, spk_map_dst) + lang_map_dst = work_dir / 'lang_map.json' + lang_map_src = binary_dir / 'lang_map.json' + shutil.copy(lang_map_src, lang_map_dst) + print(f'| Copied spk map to {spk_map_dst}.') for lang in hparams['dictionaries'].keys(): dict_dst = work_dir / f'dictionary-{lang}.txt' dict_src = binary_dir / f'dictionary-{lang}.txt' - if not dict_dst.exists(): - shutil.copy(dict_src, dict_dst) - print(f'| Copied dictionary for language \'{lang}\' to {dict_dst}.') + shutil.copy(dict_src, dict_dst) + print(f'| Copied dictionary for language \'{lang}\' to {dict_dst}.') train_payload_copy() trainer.fit(task, ckpt_path=get_latest_checkpoint_path(work_dir)) From b5a876b8c65140a08d9434e3d3fb703d5aa54108 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 16 Jul 2024 16:28:34 +0800 Subject: [PATCH 03/44] Add language embed (inject to txt_embed) for acoustic models --- basics/base_binarizer.py | 4 +++- modules/fastspeech/acoustic_encoder.py | 9 ++++++++- preprocessing/acoustic_binarizer.py | 8 ++++++-- training/acoustic_task.py | 11 ++++++++++- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index 8ec22c279..b1bd6b97a 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -104,9 +104,11 @@ def build_lang_map(self): assert len(self.dictionaries.keys()) <= hparams['num_lang'], \ 'Number of languages must not be greater than num_lang!' 
- for lang_id, lang_name in enumerate(sorted(self.dictionaries.keys())): + for lang_id, lang_name in enumerate(sorted(self.dictionaries.keys()), start=1): self.lang_map[lang_name] = lang_id + print("| lang_map: ", self.lang_map) + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): raise NotImplementedError() diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index b1507837e..2b53ace20 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -56,6 +56,9 @@ def __init__(self, vocab_size): self.use_spk_id = hparams['use_spk_id'] if self.use_spk_id: self.spk_embed = Embedding(hparams['num_spk'], hparams['hidden_size']) + self.use_lang_id = hparams.get('use_lang_id', False) + if self.use_lang_id: + self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): if self.use_variance_embeds: @@ -78,9 +81,13 @@ def forward_variance_embedding(self, condition, key_shift=None, speed=None, **va def forward( self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, - spk_embed_id=None, **kwargs + spk_embed_id=None, languages=None, + **kwargs ): txt_embed = self.txt_embed(txt_tokens) + if self.use_lang_id: + lang_embed = self.lang_embed(languages) + txt_embed += lang_embed dur = mel2ph_to_dur(mel2ph, txt_tokens.shape[1]).float() dur_embed = self.dur_embed(dur[:, :, None]) encoder_out = self.encoder(txt_embed, dur_embed, txt_tokens == 0) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index bf799abe5..18a6cd478 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -36,6 +36,7 @@ ACOUSTIC_ITEM_ATTRIBUTES = [ 'spk_id', 'mel', + 'languages', 'tokens', 'mel2ph', 'f0', @@ -74,12 +75,14 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): item_name = utterance_label['name'] temp_dict = { 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), + 'lang_seq': [ + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + for p in utterance_label['ph_seq'].split() + ], 'ph_seq': self.phoneme_dictionary.encode(utterance_label['ph_seq'], lang=lang), 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], 'spk_id': self.spk_map[spk], 'spk_name': spk, - 'language_id': self.lang_map[lang], - 'language_name': lang, } assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
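
The `lang_seq` construction above deserves a worked example. Language IDs are now 1-based (sorted dictionary keys, `start=1`) so that ID 0 stays free as the padding index of the new `lang_embed`, whose output is added onto `txt_embed`. A phoneme picks its language from an explicit 'xx/' prefix when present and otherwise falls back to the dataset's default language; note that AP/SP simply take the default language's ID here. A minimal sketch with invented values:

    # lang_map as built from dictionaries = {'zh': ..., 'en': ...}:
    lang_map = {'en': 1, 'zh': 2}   # sorted keys, enumerate(..., start=1)
    lang = 'zh'                     # default language of this raw_data_dir
    ph_seq = 'SP sh ir en/aa SP'.split()
    lang_seq = [
        lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]]
        for p in ph_seq
    ]
    print(lang_seq)  # [2, 2, 2, 1, 2]; the 'en/' prefix overrides the default
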
@@ -108,6 +111,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'seconds': seconds, 'length': length, 'mel': mel, + 'languages': np.array(meta_data['lang_seq'], dtype=np.int64), 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64), 'ph_dur': np.array(meta_data['ph_dur']).astype(np.float32), } diff --git a/training/acoustic_task.py b/training/acoustic_task.py index 79f05003f..ca6a71c65 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -35,6 +35,7 @@ def __init__(self, prefix, preload=False): self.need_key_shift = hparams['use_key_shift_embed'] self.need_speed = hparams['use_speed_embed'] self.need_spk_id = hparams['use_spk_id'] + self.need_lang_id = hparams['use_lang_id'] def collater(self, samples): batch = super().collater(samples) @@ -60,6 +61,9 @@ def collater(self, samples): if self.need_spk_id: spk_ids = torch.LongTensor([s['spk_id'] for s in samples]) batch['spk_ids'] = spk_ids + if self.need_lang_id: + languages = utils.collate_nd([s['languages'] for s in samples], 0) + batch['languages'] = languages return batch @@ -128,9 +132,14 @@ def run_model(self, sample, infer=False): spk_embed_id = sample['spk_ids'] else: spk_embed_id = None + if hparams['use_lang_id']: + languages = sample['languages'] + else: + languages = None output: ShallowDiffusionOutput = self.model( txt_tokens, mel2ph=mel2ph, f0=f0, **variances, - key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, + key_shift=key_shift, speed=speed, + spk_embed_id=spk_embed_id, languages=languages, gt_mel=target, infer=infer ) From d282e28edfb60d50bb4a1de044a67b3dab08fba6 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 18 Jul 2024 23:49:58 +0800 Subject: [PATCH 04/44] Save language sequence in variance preprocessing --- preprocessing/variance_binarizer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 99ed65c31..00027954e 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -30,6 +30,7 @@ os.environ["OMP_NUM_THREADS"] = "1" VARIANCE_ITEM_ATTRIBUTES = [ 'spk_id', # index number of dataset/speaker, int64 + 'languages', # index numbers of phoneme languages, int64[T_ph,] 'tokens', # index numbers of phonemes, int64[T_ph,] 'ph_dur', # durations of phonemes, in number of frames, int64[T_ph,] 'midi', # phoneme-level mean MIDI pitch, int64[T_ph,] @@ -135,7 +136,10 @@ def require(attr): 'language_id': self.lang_map[lang], 'language_name': lang, 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), + 'lang_seq': [ + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + for p in utterance_label['ph_seq'].split() + ],'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), 'ph_dur': [float(x) for x in require('ph_dur').split()] } @@ -251,6 +255,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'spk_name': meta_data['spk_name'], 'seconds': seconds, 'length': length, + 'languages': np.array(meta_data['lang_seq'], dtype=np.int64), 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64) } From 70676ccc5cdcfa2b76adda823a9fc667ba6bb482 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 19 Jul 2024 02:19:24 +0800 Subject: [PATCH 05/44] Display merged phoneme groups properly in distribution plots --- basics/base_binarizer.py | 14 ++++++++------ utils/plot.py | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) 
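
Because merged groups map several aliases onto one ID, `check_coverage` decodes IDs with `scalar=False` and can get a tuple back for a merged entry. The diff below factors the label formatting into a `display_phoneme` helper; a sketch of its effect (the summary counts are invented):

    def display_phoneme(phoneme):
        if isinstance(phoneme, tuple):
            return f'({", ".join(phoneme)})'
        return phoneme

    print(display_phoneme('zh/ai'))            # zh/ai
    print(display_phoneme(('en/aa', 'zh/a')))  # (en/aa, zh/a)
    # A summary line then reads e.g.: (en/aa, zh/a): 356, zh/ai: 210, ...
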
diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index b1bd6b97a..b0342d43b 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -235,6 +235,11 @@ def check_coverage(self): for idx, count in ph_idx_count_map.items() } + def display_phoneme(phoneme): + if isinstance(phoneme, tuple): + return f'({", ".join(phoneme)})' + return phoneme + print('===== Phoneme Distribution Summary =====') keys = sorted(ph_count_map.keys(), key=lambda v: v[0] if isinstance(v, tuple) else v) for i, key in enumerate(keys): @@ -244,19 +249,16 @@ def check_coverage(self): end = ',\n' else: end = ', ' - if isinstance(key, tuple): - key_disp = '(' + ', '.join(key) + ')' - else: - key_disp = key + key_disp = display_phoneme(key) print(f'{key_disp}: {ph_count_map[key]}', end=end) # Draw graph. - xs = [str(k) for k in keys] + xs = [display_phoneme(k) for k in keys] ys = [ph_count_map[k] for k in keys] plt = distribution_to_figure( title='Phoneme Distribution Summary', x_label='Phoneme', y_label='Number of occurrences', - items=xs, values=ys + items=xs, values=ys, rotate=len(self.dictionaries) > 1 ) filename = self.binary_data_dir / 'phoneme_distribution.jpg' plt.savefig(fname=filename, diff --git a/utils/plot.py b/utils/plot.py index b76e0726c..48cb9c430 100644 --- a/utils/plot.py +++ b/utils/plot.py @@ -106,7 +106,7 @@ def curve_to_figure(curve_gt, curve_pred=None, curve_base=None, grid=None, title return fig -def distribution_to_figure(title, x_label, y_label, items: list, values: list, zoom=0.8): +def distribution_to_figure(title, x_label, y_label, items: list, values: list, zoom=0.8, rotate=False): fig = plt.figure(figsize=(int(len(items) * zoom), 10)) plt.bar(x=items, height=values) plt.tick_params(labelsize=15) @@ -117,4 +117,6 @@ def distribution_to_figure(title, x_label, y_label, items: list, values: list, z plt.title(title, fontsize=30) plt.xlabel(x_label, fontsize=20) plt.ylabel(y_label, fontsize=20) + if rotate: + fig.autofmt_xdate(rotation=45) return fig From 62c093ed3ccc65cfd7e1422ddecbb178cf740ed4 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 19 Jul 2024 18:07:47 +0800 Subject: [PATCH 06/44] Add multi-dictionary inference --- basics/base_svs_infer.py | 7 ++++++- inference/ds_acoustic.py | 27 ++++++++++++++++++++++++--- inference/ds_variance.py | 24 ++++++++++++++++++++++-- scripts/infer.py | 18 ++++++++++++++++-- 4 files changed, 68 insertions(+), 8 deletions(-) diff --git a/basics/base_svs_infer.py b/basics/base_svs_infer.py index e040993a7..2b23d0112 100644 --- a/basics/base_svs_infer.py +++ b/basics/base_svs_infer.py @@ -29,6 +29,7 @@ def __init__(self, device=None): self.device = device self.timestep = hparams['hop_size'] / hparams['audio_sample_rate'] self.spk_map = {} + self.lang_map = {} self.model: torch.nn.Module = None def build_model(self, ckpt_steps=None) -> torch.nn.Module: @@ -50,7 +51,11 @@ def load_speaker_mix(self, param_src: dict, summary_dst: dict, spk_mix_map = param_src.get(param_key) # { spk_name: value } or { spk_name: "value value value ..." } dynamic = False if spk_mix_map is None: - # Get the first speaker + assert len(self.spk_map) == 1, ( + "This is a multi-speaker model. " + "Please specify a speaker or speaker mix by --spk option." 
+ ) + # Get the only speaker for name in self.spk_map.keys(): spk_mix_map = {name: 1.0} break diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 7b93d38cb..d8dbcc13d 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -41,6 +41,10 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N self.spk_map = json.load(f) assert isinstance(self.spk_map, dict) and len(self.spk_map) > 0, 'Invalid or empty speaker map!' assert len(self.spk_map) == len(set(self.spk_map.values())), 'Duplicate speaker id in speaker map!' + lang_map_fn = pathlib.Path(hparams['work_dir']) / 'lang_map.json' + if lang_map_fn.exists(): + with open(lang_map_fn, 'r', encoding='utf8') as f: + self.lang_map = json.load(f) self.model = self.build_model(ckpt_steps=ckpt_steps) self.lr = LengthRegulator().to(self.device) if load_vocoder: @@ -71,8 +75,23 @@ def preprocess_input(self, param, idx=0): """ batch = {} summary = OrderedDict() + + lang = param.get('lang') + if lang is None: + assert len(self.lang_map) <= 1, ( + "This is a multilingual model. " + "Please specify a language by --lang option." + ) + else: + assert lang in self.lang_map, f'Unrecognized language name: \'{lang}\'.' + if hparams.get('use_lang_id', False): + languages = torch.LongTensor([ + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + for p in param['ph_seq'].split() + ]).to(self.device) # => [B, T_txt] + batch['languages'] = languages txt_tokens = torch.LongTensor([ - self.phoneme_dictionary.encode(param['ph_seq']) + self.phoneme_dictionary.encode(param['ph_seq'], lang=lang) ]).to(self.device) # => [B, T_txt] batch['tokens'] = txt_tokens @@ -175,9 +194,11 @@ def forward_model(self, sample): else: spk_mix_embed = None mel_pred: ShallowDiffusionOutput = self.model( - txt_tokens, mel2ph=sample['mel2ph'], f0=sample['f0'], **variances, + txt_tokens, languages=sample.get('languages'), + mel2ph=sample['mel2ph'], f0=sample['f0'], **variances, key_shift=sample.get('key_shift'), speed=sample.get('speed'), - spk_mix_embed=spk_mix_embed, infer=True + spk_mix_embed=spk_mix_embed, + infer=True ) return mel_pred.diff_out diff --git a/inference/ds_variance.py b/inference/ds_variance.py index 86abd1abd..f5a401c3e 100644 --- a/inference/ds_variance.py +++ b/inference/ds_variance.py @@ -38,6 +38,10 @@ def __init__( self.spk_map = json.load(f) assert isinstance(self.spk_map, dict) and len(self.spk_map) > 0, 'Invalid or empty speaker map!' assert len(self.spk_map) == len(set(self.spk_map.values())), 'Duplicate speaker id in speaker map!' + lang_map_fn = pathlib.Path(hparams['work_dir']) / 'lang_map.json' + if lang_map_fn.exists(): + with open(lang_map_fn, 'r', encoding='utf8') as f: + self.lang_map = json.load(f) self.model: DiffSingerVariance = self.build_model(ckpt_steps=ckpt_steps) self.lr = LengthRegulator() self.rr = RhythmRegulator() @@ -95,8 +99,23 @@ def preprocess_input( """ batch = {} summary = OrderedDict() + + lang = param.get('lang') + if lang is None: + assert len(self.lang_map) <= 1, ( + "This is a multilingual model. " + "Please specify a language by --lang option." + ) + else: + assert lang in self.lang_map, f'Unrecognized language name: \'{lang}\'.' 
+ if hparams.get('use_lang_id', False): + languages = torch.LongTensor([ + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + for p in param['ph_seq'].split() + ]).to(self.device) # [B=1, T_ph] + batch['languages'] = languages txt_tokens = torch.LongTensor([ - self.phoneme_dictionary.encode(param['ph_seq'].split()) + self.phoneme_dictionary.encode(param['ph_seq'], lang=lang) ]).to(self.device) # [B=1, T_ph] T_ph = txt_tokens.shape[1] batch['tokens'] = txt_tokens @@ -305,7 +324,8 @@ def forward_model(self, sample): ph_spk_mix_embed = spk_mix_embed = None dur_pred, pitch_pred, variance_pred = self.model( - txt_tokens, midi=midi, ph2word=ph2word, word_dur=word_dur, ph_dur=ph_dur, mel2ph=mel2ph, + txt_tokens, languages=sample.get('languages'), + midi=midi, ph2word=ph2word, word_dur=word_dur, ph_dur=ph_dur, mel2ph=mel2ph, note_midi=note_midi, note_rest=note_rest, note_dur=note_dur, note_glide=note_glide, mel2note=mel2note, base_pitch=base_pitch, pitch=pitch, pitch_expr=expr, ph_spk_mix_embed=ph_spk_mix_embed, spk_mix_embed=spk_mix_embed, diff --git a/scripts/infer.py b/scripts/infer.py index 83a5cabb7..ae08f5d12 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -61,6 +61,11 @@ def main(): required=False, help='Speaker name or mixture of speakers' ) +@click.option( + '--lang', type=click.STRING, + required=False, + help='Default language name' +) @click.option( '--out', type=click.Path( file_okay=False, dir_okay=True, path_type=pathlib.Path @@ -112,6 +117,7 @@ def acoustic( exp: str, ckpt: int, spk: str, + lang: str, out: pathlib.Path, title: str, num: int, @@ -195,9 +201,10 @@ def acoustic( for param in params: if gender is not None and hparams['use_key_shift_embed']: param['gender'] = gender - if spk_mix is not None: param['spk_mix'] = spk_mix + if lang is not None: + param['lang'] = lang from inference.ds_acoustic import DiffSingerAcousticInfer infer_ins = DiffSingerAcousticInfer(load_vocoder=not mel, ckpt_steps=ckpt) @@ -241,6 +248,11 @@ def acoustic( required=False, help='Speaker name or mixture of speakers' ) +@click.option( + '--lang', type=click.STRING, + required=False, + help='Default language name' +) @click.option( '--out', type=click.Path( file_okay=False, dir_okay=True, path_type=pathlib.Path @@ -282,6 +294,7 @@ def variance( exp: str, ckpt: int, spk: str, + lang: str, predict: Tuple[str], out: pathlib.Path, title: str, @@ -344,11 +357,12 @@ def variance( for param in params: if expr is not None: param['expr'] = expr - if spk_mix is not None: param['ph_spk_mix_backup'] = param.get('ph_spk_mix') param['spk_mix_backup'] = param.get('spk_mix') param['ph_spk_mix'] = param['spk_mix'] = spk_mix + if lang is not None: + param['lang'] = lang from inference.ds_variance import DiffSingerVarianceInfer infer_ins = DiffSingerVarianceInfer(ckpt_steps=ckpt, predictions=set(predict)) From 96b9a602cdbb85bdd4756efd29bb1be9d062696a Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 19 Jul 2024 20:31:32 +0800 Subject: [PATCH 07/44] Save original phoneme texts for duration plots --- basics/base_binarizer.py | 5 ++++- preprocessing/acoustic_binarizer.py | 6 ++++-- preprocessing/variance_binarizer.py | 9 ++++++--- training/variance_task.py | 3 ++- utils/phoneme_utils.py | 18 ------------------ 5 files changed, 16 insertions(+), 25 deletions(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index b0342d43b..92e583d61 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -281,7 +281,7 @@ def process_dataset(self, prefix, num_workers=0, 
apply_augmentation=False): builder = IndexedDatasetBuilder(self.binary_data_dir, prefix=prefix, allowed_attr=self.data_attrs) total_sec = {k: 0.0 for k in self.spk_map} total_raw_sec = {k: 0.0 for k in self.spk_map} - extra_info = {'names': {}, 'spk_ids': {}, 'spk_names': {}, 'lengths': {}} + extra_info = {'names': {}, 'ph_texts': {}, 'spk_ids': {}, 'spk_names': {}, 'lengths': {}} max_no = -1 for item_name, meta_data in self.meta_data_iterator(prefix): @@ -301,6 +301,7 @@ def postprocess(_item): extra_info[k] = {} extra_info[k][item_no] = v.shape[0] extra_info['names'][item_no] = _item['name'].split(':', 1)[-1] + extra_info['ph_texts'][item_no] = _item['ph_text'] extra_info['spk_ids'][item_no] = _item['spk_id'] extra_info['spk_names'][item_no] = _item['spk_name'] extra_info['lengths'][item_no] = _item['length'] @@ -317,6 +318,7 @@ def postprocess(_item): extra_info[k] = {} extra_info[k][aug_item_no] = v.shape[0] extra_info['names'][aug_item_no] = aug_item['name'].split(':', 1)[-1] + extra_info['ph_texts'][aug_item_no] = aug_item['ph_text'] extra_info['spk_ids'][aug_item_no] = aug_item['spk_id'] extra_info['spk_names'][aug_item_no] = aug_item['spk_name'] extra_info['lengths'][aug_item_no] = aug_item['length'] @@ -345,6 +347,7 @@ def postprocess(_item): builder.finalize() if prefix == "train": extra_info.pop("names") + extra_info.pop('ph_texts') extra_info.pop("spk_names") with open(self.binary_data_dir / f"{prefix}.meta", "wb") as f: # noinspection PyTypeChecker diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 18a6cd478..efb97a1ec 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -75,14 +75,15 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): item_name = utterance_label['name'] temp_dict = { 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), + 'spk_id': self.spk_map[spk], + 'spk_name': spk, 'lang_seq': [ self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] for p in utterance_label['ph_seq'].split() ], 'ph_seq': self.phoneme_dictionary.encode(utterance_label['ph_seq'], lang=lang), 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], - 'spk_id': self.spk_map[spk], - 'spk_name': spk, + 'ph_text': utterance_label['ph_seq'], } assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
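
With this patch, the raw `ph_seq` string travels through binarization as `ph_text` and lands in the dataset metadata, so validation plots can label durations without decoding IDs back through merged groups (it is popped for the train set, where no plots are drawn). An illustrative sketch of the resulting metadata, with all values invented:

    extra_info = {
        'names':     {0: 'item_0001'},
        'ph_texts':  {0: 'SP sh ir en/aa SP'},  # raw phoneme text, valid set only
        'spk_ids':   {0: 0},
        'spk_names': {0: 'singer_a'},
        'lengths':   {0: 1376},
    }
    # plot_dur() can then label its bars with:
    ph_labels = extra_info['ph_texts'][0].split()
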
@@ -114,6 +115,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'languages': np.array(meta_data['lang_seq'], dtype=np.int64), 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64), 'ph_dur': np.array(meta_data['ph_dur']).astype(np.float32), + 'ph_text': meta_data['ph_text'], } # get ground truth dur diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 00027954e..3feda896d 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -139,8 +139,10 @@ def require(attr): 'lang_seq': [ self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] for p in utterance_label['ph_seq'].split() - ],'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), - 'ph_dur': [float(x) for x in require('ph_dur').split()] + ], + 'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), + 'ph_dur': [float(x) for x in require('ph_dur').split()], + 'ph_text': require('ph_seq'), } assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ @@ -256,7 +258,8 @@ def process_item(self, item_name, meta_data, binarization_args): 'seconds': seconds, 'length': length, 'languages': np.array(meta_data['lang_seq'], dtype=np.int64), - 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64) + 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64), + 'ph_text': meta_data['ph_text'], } ph_dur_sec = torch.FloatTensor(meta_data['ph_dur']).to(self.device) diff --git a/training/variance_task.py b/training/variance_task.py index 0a33301e6..e6e885944 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -295,7 +295,8 @@ def sample_get(key, idx, abs_idx): def plot_dur(self, data_idx, gt_dur, pred_dur, txt=None): gt_dur = gt_dur[0].cpu().numpy() pred_dur = pred_dur[0].cpu().numpy() - txt = self.phoneme_dictionary.decode(txt[0].cpu().numpy()).split() + if txt is None: + txt = self.valid_dataset.metadata['ph_texts'][data_idx].split() title_text = f"{self.valid_dataset.metadata['spk_names'][data_idx]} - {self.valid_dataset.metadata['names'][data_idx]}" self.logger.all_rank_experiment.add_figure(f'dur_{data_idx}', dur_to_figure( gt_dur, pred_dur, txt, title_text diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 1547bc9eb..1e8275330 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -170,21 +170,3 @@ def load_phoneme_dictionary() -> PhonemeDictionary: dictionaries=dicts, merged_groups=hparams.get('merged_phoneme_groups') ) - - -if __name__ == '__main__': - d = PhonemeDictionary( - dictionaries={ - 'zh': 'dictionaries/opencpop-extension.txt', - # 'en': 'dictionaries/opencpop-extension.txt', - }, - merged_groups=[ - ['zh/a', 'zh/b', 'c'], - ['a', 'd', 'e'], - ['e', 'f'] - ] - ) - ph_ids = d.encode('sh ir zh e j v y i b a SP', lang='en') - ph_seq = d.decode(ph_ids) - print(ph_ids) - print(ph_seq) From dbe3840c39c7285f47cb64a324c1d32b6d018153 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 19 Jul 2024 20:53:03 +0800 Subject: [PATCH 08/44] Fix duration plots displaying bug --- training/variance_task.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/training/variance_task.py b/training/variance_task.py index e6e885944..6a27a3b6d 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -262,7 +262,10 @@ def sample_get(key, idx, abs_idx): self.valid_metrics['ph_dur_acc'].update( pdur_pred=pred_dur, pdur_target=gt_dur, ph2word=ph2word, mask=mask ) - self.plot_dur(data_idx, gt_dur, pred_dur, tokens) + self.plot_dur( + 
data_idx, gt_dur, pred_dur, + txt=self.valid_dataset.metadata['ph_texts'][data_idx].split() + ) if pitch_preds is not None: pitch_len = self.valid_dataset.metadata['pitch'][data_idx] pred_pitch = sample_get('base_pitch', i, data_idx) + pitch_preds[i][:pitch_len].unsqueeze(0) @@ -295,8 +298,6 @@ def sample_get(key, idx, abs_idx): def plot_dur(self, data_idx, gt_dur, pred_dur, txt=None): gt_dur = gt_dur[0].cpu().numpy() pred_dur = pred_dur[0].cpu().numpy() - if txt is None: - txt = self.valid_dataset.metadata['ph_texts'][data_idx].split() title_text = f"{self.valid_dataset.metadata['spk_names'][data_idx]} - {self.valid_dataset.metadata['names'][data_idx]}" self.logger.all_rank_experiment.add_figure(f'dur_{data_idx}', dur_to_figure( gt_dur, pred_dur, txt, title_text From b5f20a587e5276064e8fef181bfdeb11866670de Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 20 Jul 2024 03:09:02 +0800 Subject: [PATCH 09/44] Explicit `languages` argument passing --- modules/toplevel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/toplevel.py b/modules/toplevel.py index 1976d09a9..777f42291 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -88,11 +88,12 @@ def __init__(self, vocab_size, out_dims): def forward( self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, - spk_embed_id=None, gt_mel=None, infer=True, **kwargs + spk_embed_id=None, languages=None, gt_mel=None, infer=True, **kwargs ) -> ShallowDiffusionOutput: condition = self.fs2( txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed, - spk_embed_id=spk_embed_id, **kwargs + spk_embed_id=spk_embed_id, languages=languages, + **kwargs ) if infer: if self.use_shallow_diffusion: From a7dbb9340a0814680c5930f6f56f57742941590b Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 20 Jul 2024 20:57:37 +0800 Subject: [PATCH 10/44] Add language embed (inject to txt_embed) for variance models --- modules/fastspeech/variance_encoder.py | 14 +++++++++++++- training/variance_task.py | 7 ++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 2031d89ce..aca443352 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -16,8 +16,11 @@ def __init__(self, vocab_size): super().__init__() self.predict_dur = hparams['predict_dur'] self.linguistic_mode = 'word' if hparams['predict_dur'] else 'phoneme' + self.use_lang_id = hparams['use_lang_id'] self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) + if self.use_lang_id: + self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) if self.predict_dur: self.onset_embed = Embedding(2, hparams['hidden_size']) @@ -45,7 +48,12 @@ def __init__(self, vocab_size): dur_loss_type=dur_hparams['loss_type'] ) - def forward(self, txt_tokens, midi, ph2word, ph_dur=None, word_dur=None, spk_embed=None, infer=True): + def forward( + self, txt_tokens, midi, ph2word, + ph_dur=None, word_dur=None, + spk_embed=None, languages=None, + infer=True + ): """ :param txt_tokens: (train, infer) [B, T_ph] :param midi: (train, infer) [B, T_ph] @@ -53,10 +61,14 @@ def forward(self, txt_tokens, midi, ph2word, ph_dur=None, word_dur=None, spk_emb :param ph_dur: (train, [infer]) [B, T_ph] :param word_dur: (infer) [B, T_w] :param spk_embed: (train) [B, T_ph, H] + :param languages (train, infer) [B, T_ph] :param infer: whether inference :return: encoder_out, ph_dur_pred """ txt_embed = self.txt_embed(txt_tokens) + if 
self.use_lang_id: + lang_embed = self.lang_embed(languages) + txt_embed += lang_embed if self.linguistic_mode == 'word': b = txt_tokens.shape[0] onset = torch.diff(ph2word, dim=1, prepend=ph2word.new_zeros(b, 1)) > 0 diff --git a/training/variance_task.py b/training/variance_task.py index 6a27a3b6d..2fdc599f6 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -41,6 +41,8 @@ def collater(self, samples): if hparams['use_spk_id']: batch['spk_ids'] = torch.LongTensor([s['spk_id'] for s in samples]) + if hparams['use_lang_id']: + batch['languages'] = utils.collate_nd([s['languages'] for s in samples], 0) if hparams['predict_dur']: batch['ph2word'] = utils.collate_nd([s['ph2word'] for s in samples], 0) batch['midi'] = utils.collate_nd([s['midi'] for s in samples], 0) @@ -85,6 +87,7 @@ def __init__(self): self.diffusion_type = hparams['diffusion_type'] self.use_spk_id = hparams['use_spk_id'] + self.use_lang_id = hparams['use_lang_id'] self.predict_dur = hparams['predict_dur'] if self.predict_dur: @@ -154,6 +157,7 @@ def build_losses_and_metrics(self): def run_model(self, sample, infer=False): spk_ids = sample['spk_ids'] if self.use_spk_id else None # [B,] + languages = sample['languages'] if self.use_lang_id else None # [B,] txt_tokens = sample['tokens'] # [B, T_ph] ph_dur = sample['ph_dur'] # [B, T_ph] ph2word = sample.get('ph2word') # [B, T_ph] @@ -188,7 +192,8 @@ def run_model(self, sample, infer=False): } output = self.model( - txt_tokens, midi=midi, ph2word=ph2word, + txt_tokens, languages=languages, + midi=midi, ph2word=ph2word, ph_dur=ph_dur, mel2ph=mel2ph, note_midi=note_midi, note_rest=note_rest, note_dur=note_dur, note_glide=note_glide, mel2note=mel2note, From 2a175615b0ae5c518939e646a1e65b46ec98b76e Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 20 Jul 2024 21:02:39 +0800 Subject: [PATCH 11/44] Fix argument passing --- modules/toplevel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/toplevel.py b/modules/toplevel.py index 777f42291..6db01eaff 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -212,7 +212,8 @@ def forward( note_midi=None, note_rest=None, note_dur=None, note_glide=None, mel2note=None, base_pitch=None, pitch=None, pitch_expr=None, pitch_retake=None, variance_retake: Dict[str, Tensor] = None, - spk_id=None, infer=True, **kwargs + spk_id=None, languages=None, + infer=True, **kwargs ): if self.use_spk_id: ph_spk_mix_embed = kwargs.get('ph_spk_mix_embed') @@ -228,7 +229,8 @@ def forward( encoder_out, dur_pred_out = self.fs2( txt_tokens, midi=midi, ph2word=ph2word, ph_dur=ph_dur, word_dur=word_dur, - spk_embed=ph_spk_embed, infer=infer + spk_embed=ph_spk_embed, languages=languages, + infer=infer ) if not self.predict_pitch and not self.predict_variances: From 8b215dbe89a2b56fe8c2f36dfe779a29d73d67bd Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 21 Jul 2024 02:24:53 +0800 Subject: [PATCH 12/44] Add log for lang_map.json copy --- basics/base_task.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/basics/base_task.py b/basics/base_task.py index b53133d39..065f8273a 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -447,10 +447,11 @@ def train_payload_copy(): spk_map_dst = work_dir / 'spk_map.json' spk_map_src = binary_dir / 'spk_map.json' shutil.copy(spk_map_src, spk_map_dst) + print(f'| Copied spk map to {spk_map_dst}.') lang_map_dst = work_dir / 'lang_map.json' lang_map_src = binary_dir / 'lang_map.json' shutil.copy(lang_map_src, lang_map_dst) - print(f'| Copied spk 
map to {spk_map_dst}.') + print(f'| Copied lang map to {lang_map_dst}.') for lang in hparams['dictionaries'].keys(): dict_dst = work_dir / f'dictionary-{lang}.txt' dict_src = binary_dir / f'dictionary-{lang}.txt' From 6f80697e34d803874fd36f51909ea344d8263587 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 21 Jul 2024 03:22:20 +0800 Subject: [PATCH 13/44] Add language embedding scale --- modules/fastspeech/acoustic_encoder.py | 13 +++++++++---- modules/fastspeech/variance_encoder.py | 15 +++++++++------ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 2b53ace20..3395f0c0e 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -1,3 +1,5 @@ +import math + import torch import torch.nn as nn from torch.nn import functional as F @@ -58,6 +60,7 @@ def __init__(self, vocab_size): self.spk_embed = Embedding(hparams['num_spk'], hparams['hidden_size']) self.use_lang_id = hparams.get('use_lang_id', False) if self.use_lang_id: + self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): @@ -85,12 +88,14 @@ def forward( **kwargs ): txt_embed = self.txt_embed(txt_tokens) - if self.use_lang_id: - lang_embed = self.lang_embed(languages) - txt_embed += lang_embed dur = mel2ph_to_dur(mel2ph, txt_tokens.shape[1]).float() dur_embed = self.dur_embed(dur[:, :, None]) - encoder_out = self.encoder(txt_embed, dur_embed, txt_tokens == 0) + if self.use_lang_id: + lang_embed = self.lang_embed(languages) + extra_embed = dur_embed + lang_embed * self.lang_embed_scale + else: + extra_embed = dur_embed + encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) encoder_out = F.pad(encoder_out, [0, 0, 1, 0]) mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]]) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index aca443352..a5be5ec6f 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -1,3 +1,5 @@ +import math + import torch import torch.nn as nn from torch.nn import functional as F @@ -20,6 +22,7 @@ def __init__(self, vocab_size): self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) if self.use_lang_id: + self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) if self.predict_dur: @@ -66,9 +69,6 @@ def forward( :return: encoder_out, ph_dur_pred """ txt_embed = self.txt_embed(txt_tokens) - if self.use_lang_id: - lang_embed = self.lang_embed(languages) - txt_embed += lang_embed if self.linguistic_mode == 'word': b = txt_tokens.shape[0] onset = torch.diff(ph2word, dim=1, prepend=ph2word.new_zeros(b, 1)) > 0 @@ -80,11 +80,14 @@ def forward( )[:, 1:] # [B, T_ph] => [B, T_w] word_dur = torch.gather(F.pad(word_dur, [1, 0], value=0), 1, ph2word) # [B, T_w] => [B, T_ph] word_dur_embed = self.word_dur_embed(word_dur.float()[:, :, None]) - - encoder_out = self.encoder(txt_embed, onset_embed + word_dur_embed, txt_tokens == 0) + extra_embed = onset_embed + word_dur_embed else: ph_dur_embed = self.ph_dur_embed(ph_dur.float()[:, :, None]) - encoder_out = self.encoder(txt_embed, ph_dur_embed, txt_tokens == 0) + 
extra_embed = ph_dur_embed + if self.use_lang_id: + lang_embed = self.lang_embed(languages) + extra_embed += lang_embed * self.lang_embed_scale + encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) if self.predict_dur: midi_embed = self.midi_embed(midi) # => [B, T_ph, H] From 655e9ba9611861793297956e79ccbf329313d7f2 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Mon, 22 Jul 2024 02:14:20 +0800 Subject: [PATCH 14/44] Add language embedding type --- modules/fastspeech/acoustic_encoder.py | 6 +++++- modules/fastspeech/variance_encoder.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 3395f0c0e..3b82cfbce 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -62,6 +62,7 @@ def __init__(self, vocab_size): if self.use_lang_id: self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) + self.lang_embed_type = hparams.get('lang_embed_type', 'before') def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): if self.use_variance_embeds: @@ -90,12 +91,15 @@ def forward( txt_embed = self.txt_embed(txt_tokens) dur = mel2ph_to_dur(mel2ph, txt_tokens.shape[1]).float() dur_embed = self.dur_embed(dur[:, :, None]) - if self.use_lang_id: + if self.use_lang_id and self.lang_embed_type == 'before': lang_embed = self.lang_embed(languages) extra_embed = dur_embed + lang_embed * self.lang_embed_scale else: extra_embed = dur_embed encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) + if self.use_lang_id and self.lang_embed_type == 'after': + lang_embed = self.lang_embed(languages) + encoder_out = encoder_out + lang_embed * self.lang_embed_scale encoder_out = F.pad(encoder_out, [0, 0, 1, 0]) mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]]) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index a5be5ec6f..eccded276 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -24,6 +24,7 @@ def __init__(self, vocab_size): if self.use_lang_id: self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) + self.lang_embed_type = hparams.get('lang_embed_type', 'before') if self.predict_dur: self.onset_embed = Embedding(2, hparams['hidden_size']) @@ -84,10 +85,13 @@ def forward( else: ph_dur_embed = self.ph_dur_embed(ph_dur.float()[:, :, None]) extra_embed = ph_dur_embed - if self.use_lang_id: + if self.use_lang_id and self.lang_embed_type == 'before': lang_embed = self.lang_embed(languages) extra_embed += lang_embed * self.lang_embed_scale encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) + if self.use_lang_id and self.lang_embed_type == 'after': + lang_embed = self.lang_embed(languages) + encoder_out = encoder_out + lang_embed * self.lang_embed_scale if self.predict_dur: midi_embed = self.midi_embed(midi) # => [B, T_ph, H] From c6b96cf877b920eda98bb455fd6059ecc4af68f7 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 23 Jul 2024 00:39:53 +0800 Subject: [PATCH 15/44] Preprocessing: only apply lang embed on cross-lingual phonemes --- preprocessing/acoustic_binarizer.py | 6 +++++- preprocessing/variance_binarizer.py | 6 +++++- 
utils/phoneme_utils.py | 16 +++++++++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index efb97a1ec..99d0aaf68 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -78,7 +78,11 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): 'spk_id': self.spk_map[spk], 'spk_name': spk, 'lang_seq': [ - self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + ( + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + if self.phoneme_dictionary.is_cross_lingual(p) + else 0 + ) for p in utterance_label['ph_seq'].split() ], 'ph_seq': self.phoneme_dictionary.encode(utterance_label['ph_seq'], lang=lang), diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 3feda896d..30c175b2c 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -137,7 +137,11 @@ def require(attr): 'language_name': lang, 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), 'lang_seq': [ - self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + ( + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + if self.phoneme_dictionary.is_cross_lingual(p) + else 0 + ) for p in utterance_label['ph_seq'].split() ], 'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 1e8275330..59a0924bc 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -73,8 +73,10 @@ def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[Li merged_phonemes_inverted_index[phoneme] = target_idx if other_idx is not None: merged_groups[other_idx] |= group + group.clear() phone_to_id = {} id_to_phone = [] + cross_lingual_phonemes = set() idx = 1 for phoneme in sorted(all_phonemes): if phoneme in merged_phonemes_inverted_index: @@ -84,14 +86,23 @@ def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[Li has_assigned = False phone_to_id[alias] = idx if not has_assigned: - id_to_phone.append(tuple(sorted(merged_groups[merged_phonemes_inverted_index[phoneme]]))) + merged_group = sorted(merged_groups[merged_phonemes_inverted_index[phoneme]]) + merged_from_langs = { + alias.split('/', maxsplit=1)[0] + for alias in merged_group + if '/' in alias + } + id_to_phone.append(tuple(merged_group)) idx += 1 + if len(merged_from_langs) > 1: + cross_lingual_phonemes.update(ph for ph in merged_group if '/' in ph) else: phone_to_id[phoneme] = idx id_to_phone.append(phoneme) idx += 1 self._phone_to_id: Dict[str, int] = phone_to_id self._id_to_phone: List[Union[str, tuple]] = id_to_phone + self._cross_lingual_phonemes = cross_lingual_phonemes @property def vocab_size(self): @@ -100,6 +111,9 @@ def vocab_size(self): def __len__(self): return self.vocab_size + def is_cross_lingual(self, phone): + return phone in self._cross_lingual_phonemes + def encode_one(self, phone, lang=None): if lang is None or not self._multi_langs or phone in self._phone_to_id: return self._phone_to_id[phone] From 8377728bd413452c62f3ab79fb6cf2050f9bea39 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 24 Jul 2024 23:30:44 +0800 Subject: [PATCH 16/44] Inference: only apply lang embed on cross-lingual phonemes --- inference/ds_acoustic.py | 6 +++++- inference/ds_variance.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/inference/ds_acoustic.py 
b/inference/ds_acoustic.py index d8dbcc13d..8b139f62f 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -86,7 +86,11 @@ def preprocess_input(self, param, idx=0): assert lang in self.lang_map, f'Unrecognized language name: \'{lang}\'.' if hparams.get('use_lang_id', False): languages = torch.LongTensor([ - self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + ( + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + if self.phoneme_dictionary.is_cross_lingual(p) + else 0 + ) for p in param['ph_seq'].split() ]).to(self.device) # => [B, T_txt] batch['languages'] = languages diff --git a/inference/ds_variance.py b/inference/ds_variance.py index f5a401c3e..aa74dcabd 100644 --- a/inference/ds_variance.py +++ b/inference/ds_variance.py @@ -110,7 +110,11 @@ def preprocess_input( assert lang in self.lang_map, f'Unrecognized language name: \'{lang}\'.' if hparams.get('use_lang_id', False): languages = torch.LongTensor([ - self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + ( + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + if self.phoneme_dictionary.is_cross_lingual(p) + else 0 + ) for p in param['ph_seq'].split() ]).to(self.device) # [B=1, T_ph] batch['languages'] = languages From 3d0a9ba3eaa4e03efe371ceb349f8d3bd56e5c11 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 24 Jul 2024 23:32:06 +0800 Subject: [PATCH 17/44] Revert "Add language embedding type" This reverts commit 655e9ba9611861793297956e79ccbf329313d7f2. --- modules/fastspeech/acoustic_encoder.py | 6 +----- modules/fastspeech/variance_encoder.py | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 3b82cfbce..3395f0c0e 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -62,7 +62,6 @@ def __init__(self, vocab_size): if self.use_lang_id: self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) - self.lang_embed_type = hparams.get('lang_embed_type', 'before') def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): if self.use_variance_embeds: @@ -91,15 +90,12 @@ def forward( txt_embed = self.txt_embed(txt_tokens) dur = mel2ph_to_dur(mel2ph, txt_tokens.shape[1]).float() dur_embed = self.dur_embed(dur[:, :, None]) - if self.use_lang_id and self.lang_embed_type == 'before': + if self.use_lang_id: lang_embed = self.lang_embed(languages) extra_embed = dur_embed + lang_embed * self.lang_embed_scale else: extra_embed = dur_embed encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) - if self.use_lang_id and self.lang_embed_type == 'after': - lang_embed = self.lang_embed(languages) - encoder_out = encoder_out + lang_embed * self.lang_embed_scale encoder_out = F.pad(encoder_out, [0, 0, 1, 0]) mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]]) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index eccded276..a5be5ec6f 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -24,7 +24,6 @@ def __init__(self, vocab_size): if self.use_lang_id: self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) - 
self.lang_embed_type = hparams.get('lang_embed_type', 'before') if self.predict_dur: self.onset_embed = Embedding(2, hparams['hidden_size']) @@ -85,13 +84,10 @@ def forward( else: ph_dur_embed = self.ph_dur_embed(ph_dur.float()[:, :, None]) extra_embed = ph_dur_embed - if self.use_lang_id and self.lang_embed_type == 'before': + if self.use_lang_id: lang_embed = self.lang_embed(languages) extra_embed += lang_embed * self.lang_embed_scale encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) - if self.use_lang_id and self.lang_embed_type == 'after': - lang_embed = self.lang_embed(languages) - encoder_out = encoder_out + lang_embed * self.lang_embed_scale if self.predict_dur: midi_embed = self.midi_embed(midi) # => [B, T_ph, H] From 932c4f425d35cc6f4ea4acbf916132b33cf3be25 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 27 Jul 2024 17:31:51 +0800 Subject: [PATCH 18/44] Revert lang_embed_scale --- modules/fastspeech/acoustic_encoder.py | 11 ++++------- modules/fastspeech/variance_encoder.py | 5 +---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 3395f0c0e..6c4e54f8b 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -1,5 +1,3 @@ -import math - import torch import torch.nn as nn from torch.nn import functional as F @@ -17,6 +15,9 @@ class FastSpeech2Acoustic(nn.Module): def __init__(self, vocab_size): super().__init__() self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) + self.use_lang_id = hparams.get('use_lang_id', False) + if self.use_lang_id: + self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) self.dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], @@ -58,10 +59,6 @@ def __init__(self, vocab_size): self.use_spk_id = hparams['use_spk_id'] if self.use_spk_id: self.spk_embed = Embedding(hparams['num_spk'], hparams['hidden_size']) - self.use_lang_id = hparams.get('use_lang_id', False) - if self.use_lang_id: - self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) - self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): if self.use_variance_embeds: @@ -92,7 +89,7 @@ def forward( dur_embed = self.dur_embed(dur[:, :, None]) if self.use_lang_id: lang_embed = self.lang_embed(languages) - extra_embed = dur_embed + lang_embed * self.lang_embed_scale + extra_embed = dur_embed + lang_embed else: extra_embed = dur_embed encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index a5be5ec6f..a02e6e010 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -1,5 +1,3 @@ -import math - import torch import torch.nn as nn from torch.nn import functional as F @@ -22,7 +20,6 @@ def __init__(self, vocab_size): self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) if self.use_lang_id: - self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) if self.predict_dur: @@ -86,7 +83,7 @@ def forward( extra_embed = ph_dur_embed if 
self.use_lang_id: lang_embed = self.lang_embed(languages) - extra_embed += lang_embed * self.lang_embed_scale + extra_embed += lang_embed encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) if self.predict_dur: From a0ec7e3a2ee293cdec7b61fcfc7af81ef4eeeed7 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 27 Jul 2024 23:18:46 +0800 Subject: [PATCH 19/44] Adapt ONNX exporters for multi-language models --- basics/base_exporter.py | 12 +++++ deployment/exporters/acoustic_exporter.py | 32 +++++++++++-- deployment/exporters/variance_exporter.py | 47 +++++++++++++----- deployment/modules/fastspeech2.py | 58 +++++++++++++++++++---- deployment/modules/toplevel.py | 24 ++++++---- utils/onnx_helper.py | 46 ++++++++++++++++-- utils/phoneme_utils.py | 6 ++- 7 files changed, 185 insertions(+), 40 deletions(-) diff --git a/basics/base_exporter.py b/basics/base_exporter.py index e2e65f534..77e5805a8 100644 --- a/basics/base_exporter.py +++ b/basics/base_exporter.py @@ -33,6 +33,18 @@ def build_spk_map(self) -> dict: else: return {} + # noinspection PyMethodMayBeStatic + def build_lang_map(self) -> dict: + lang_map_fn = pathlib.Path(hparams['work_dir']) / 'lang_map.json' + if lang_map_fn.exists(): + with open(lang_map_fn, 'r', encoding='utf8') as f: + lang_map = json.load(f) + assert isinstance(lang_map, dict) and len(lang_map) > 0, 'Invalid or empty language map!' + assert len(lang_map) == len(set(lang_map.values())), 'Duplicate language id in language map!' + return lang_map + else: + return {} + def build_model(self) -> nn.Module: """ Creates an instance of nn.Module and load its state dict on the target device. diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 1f56a9ce4..9160af6e3 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import List, Union, Tuple, Dict @@ -30,6 +31,7 @@ def __init__( self.model_name: str = hparams['exp_name'] self.ckpt_steps: int = ckpt_steps self.spk_map: dict = self.build_spk_map() + self.lang_map: dict = self.build_lang_map() self.phoneme_dictionary = load_phoneme_dictionary() self.model = self.build_model() self.fs2_aux_cache_path = self.cache_dir / ( @@ -79,7 +81,11 @@ def __init__( def build_model(self) -> DiffSingerAcousticONNX: model = DiffSingerAcousticONNX( vocab_size=len(self.phoneme_dictionary), - out_dims=hparams['audio_num_mel_bins'] + out_dims=hparams['audio_num_mel_bins'], + cross_lingual_token_idx=sorted({ + self.phoneme_dictionary.encode_one(p) + for p in self.phoneme_dictionary.cross_lingual_phonemes + }) ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=self.ckpt_steps, prefix_in_ckpt='model', strict=True, device=self.device) @@ -110,14 +116,15 @@ def export_attachments(self, path: Path): self._perform_spk_mix(spk[1]) ) self.export_dictionaries(path) - self._export_phonemes(path / f'{self.model_name}.phonemes.txt') + self._export_phonemes(path) model_name = self.model_name if self.freeze_spk is not None: model_name += '.' 
+ self.freeze_spk[0] dsconfig = { # basic configs - 'phonemes': f'{self.model_name}.phonemes.txt', + 'phonemes': f'{self.model_name}.phonemes.json', + 'use_lang_id': hparams.get('use_lang_id', False), 'acoustic': f'{model_name}.onnx', 'hidden_size': hparams['hidden_size'], 'vocoder': 'nsf_hifigan_44.1k_hop512_128bin_2024.02', @@ -209,6 +216,12 @@ def _torch_export_model(self): dynamix_axes['spk_embed'] = { 1: 'n_frames' } + if hparams.get('use_lang_id'): + kwargs['languages'] = torch.zeros_like(tokens) + input_names.append('languages') + dynamix_axes['languages'] = { + 1: 'n_tokens' + } dynamix_axes['condition'] = { 1: 'n_frames' } @@ -332,6 +345,10 @@ def _optimize_fs2_aux_graph(self, fs2: onnx.ModelProto) -> onnx.ModelProto: print(f'Running ONNX Simplifier on {self.fs2_aux_class_name}...') fs2, check = onnxsim.simplify(fs2, include_subgraph=True) assert check, 'Simplified ONNX model could not be validated' + onnx_helper.model_reorder_io_list( + fs2, 'input', + target_name='languages', insert_after_name='tokens' + ) print(f'| optimize graph: {self.fs2_aux_class_name}') return fs2 @@ -394,5 +411,10 @@ def _export_spk_embed(self, path: Path, spk_embed: torch.Tensor): print(f'| export spk embed => {path}') def _export_phonemes(self, path: Path): - self.phoneme_dictionary.dump(path) - print(f'| export phonemes => {path}') + ph_path = path / f'{self.model_name}.phonemes.json' + self.phoneme_dictionary.dump(ph_path) + print(f'| export phonemes => {ph_path}') + lang_path = path / 'languages.json' + with open(lang_path, 'w', encoding='utf8') as f: + json.dump(self.lang_map, f, ensure_ascii=False, indent=2) + print(f'| export languages => {lang_path}') diff --git a/deployment/exporters/variance_exporter.py b/deployment/exporters/variance_exporter.py index 4e594c407..76d061834 100644 --- a/deployment/exporters/variance_exporter.py +++ b/deployment/exporters/variance_exporter.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import Union, List, Tuple, Dict @@ -30,6 +31,7 @@ def __init__( self.model_name: str = hparams['exp_name'] self.ckpt_steps: int = ckpt_steps self.spk_map: dict = self.build_spk_map() + self.lang_map: dict = self.build_lang_map() self.phoneme_dictionary = load_phoneme_dictionary() self.model = self.build_model() self.linguistic_encoder_cache_path = self.cache_dir / 'linguistic.onnx' @@ -81,7 +83,11 @@ def __init__( def build_model(self) -> DiffSingerVarianceONNX: model = DiffSingerVarianceONNX( - vocab_size=len(self.phoneme_dictionary) + vocab_size=len(self.phoneme_dictionary), + cross_lingual_token_idx=sorted({ + self.phoneme_dictionary.encode_one(p) + for p in self.phoneme_dictionary.cross_lingual_phonemes + }) ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=self.ckpt_steps, prefix_in_ckpt='model', strict=True, device=self.device) @@ -141,14 +147,16 @@ def export_attachments(self, path: Path): self._perform_spk_mix(spk[1]) ) self.export_dictionaries(path) - self._export_phonemes((path / f'{self.model_name}.phonemes.txt')) + self._export_phonemes(path) model_name = self.model_name if self.freeze_spk is not None: model_name += '.' 
+ self.freeze_spk[0] dsconfig = { # basic configs - 'phonemes': f'{self.model_name}.phonemes.txt', + 'phonemes': f'{self.model_name}.phonemes.json', + 'languages': sorted(self.lang_map.keys()), + 'use_lang_id': hparams.get('use_lang_id', False), 'linguistic': f'{model_name}.linguistic.onnx', 'hidden_size': self.model.hidden_size, 'predict_dur': self.model.predict_dur, @@ -184,6 +192,7 @@ def _torch_export_model(self): ph_dur = torch.LongTensor([[3, 5, 2, 1, 4]]).to(self.device) word_div = torch.LongTensor([[2, 2, 1]]).to(self.device) word_dur = torch.LongTensor([[8, 3, 4]]).to(self.device) + languages = torch.LongTensor([[0] * 5]).to(self.device) encoder_out = torch.rand(1, 5, hparams['hidden_size'], dtype=torch.float32, device=self.device) x_masks = tokens == 0 ph_midi = torch.LongTensor([[60] * 5]).to(self.device) @@ -196,6 +205,7 @@ def _torch_export_model(self): 1: 'n_tokens' } } + input_lang_id = hparams.get('use_lang_id', False) input_spk_embed = hparams['use_spk_id'] and not self.freeze_spk print(f'Exporting {self.fs2_class_name}...') @@ -205,13 +215,15 @@ def _torch_export_model(self): ( tokens, word_div, - word_dur + word_dur, + *([languages] if input_lang_id else []) ), self.linguistic_encoder_cache_path, input_names=[ 'tokens', 'word_div', - 'word_dur' + 'word_dur', + *(['languages'] if input_lang_id else []) ], output_names=encoder_output_names, dynamic_axes={ @@ -224,7 +236,8 @@ def _torch_export_model(self): 'word_dur': { 1: 'n_words' }, - **encoder_common_axes + **encoder_common_axes, + **({'languages': {1: 'n_tokens'}} if input_lang_id else {}) }, opset_version=15 ) @@ -268,12 +281,14 @@ def _torch_export_model(self): self.model.view_as_linguistic_encoder(), ( tokens, - ph_dur + ph_dur, + *([languages] if input_lang_id else []) ), self.linguistic_encoder_cache_path, input_names=[ 'tokens', - 'ph_dur' + 'ph_dur', + *(['languages'] if input_lang_id else []) ], output_names=encoder_output_names, dynamic_axes={ @@ -283,7 +298,8 @@ def _torch_export_model(self): 'ph_dur': { 1: 'n_tokens' }, - **encoder_common_axes + **encoder_common_axes, + **({'languages': {1: 'n_tokens'}} if input_lang_id else {}) }, opset_version=15 ) @@ -635,6 +651,10 @@ def _optimize_linguistic_graph(self, linguistic: onnx.ModelProto) -> onnx.ModelP print(f'Running ONNX Simplifier on {self.fs2_class_name}...') linguistic, check = onnxsim.simplify(linguistic, include_subgraph=True) assert check, 'Simplified ONNX model could not be validated' + onnx_helper.model_reorder_io_list( + linguistic, 'input', + target_name='languages', insert_after_name='tokens' + ) print(f'| optimize graph: {self.fs2_class_name}') return linguistic @@ -770,5 +790,10 @@ def _export_spk_embed(self, path: Path, spk_embed: torch.Tensor): print(f'| export spk embed => {path}') def _export_phonemes(self, path: Path): - self.phoneme_dictionary.dump(path) - print(f'| export phonemes => {path}') + ph_path = path / f'{self.model_name}.phonemes.json' + self.phoneme_dictionary.dump(ph_path) + print(f'| export phonemes => {ph_path}') + lang_path = path / f'{self.model_name}.languages.json' + with open(lang_path, 'w', encoding='utf8') as fw: + json.dump(self.lang_map, fw, ensure_ascii=False, indent=2) + print(f'| export languages => {lang_path}') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index 48a3afb40..bae452aea 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -41,8 +41,13 @@ def forward(self, dur): class FastSpeech2AcousticONNX(FastSpeech2Acoustic): - def 
__init__(self, vocab_size): + def __init__(self, vocab_size, cross_lingual_token_idx=None): super().__init__(vocab_size=vocab_size) + self.register_buffer( + 'cross_lingual_token_idx', + torch.LongTensor(cross_lingual_token_idx), + persistent=False + ) # [N,] # for temporary compatibility; will be completely removed in the future self.f0_embed_type = hparams.get('f0_embed_type', 'continuous') @@ -56,14 +61,29 @@ def __init__(self, vocab_size): self.speed_min, self.speed_max = hparams['augmentation_args']['random_time_stretching']['range'] # noinspection PyMethodOverriding - def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity=None, spk_embed=None): + def forward( + self, tokens, durations, + f0, variances: dict, + gender=None, velocity=None, + spk_embed=None, + languages=None + ): txt_embed = self.txt_embed(tokens) durations = durations * (tokens > 0) mel2ph = self.lr(durations) f0 = f0 * (mel2ph > 0) mel2ph = mel2ph[..., None].repeat((1, 1, hparams['hidden_size'])) dur_embed = self.dur_embed(durations.float()[:, :, None]) - encoded = self.encoder(txt_embed, dur_embed, tokens == PAD_INDEX) + if self.use_lang_id: + lang_mask = torch.any( + tokens[..., None] == self.cross_lingual_token_idx[None, None], + dim=-1 + ) + lang_embed = self.lang_embed(languages * lang_mask) + extra_embed = dur_embed + lang_embed + else: + extra_embed = dur_embed + encoded = self.encoder(txt_embed, extra_embed, tokens == PAD_INDEX) encoded = F.pad(encoded, (0, 0, 1, 0)) condition = torch.gather(encoded, 1, mel2ph) @@ -109,25 +129,47 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= class FastSpeech2VarianceONNX(FastSpeech2Variance): - def __init__(self, vocab_size): + def __init__(self, vocab_size, cross_lingual_token_idx=None): super().__init__(vocab_size=vocab_size) + self.register_buffer( + 'cross_lingual_token_idx', + torch.LongTensor(cross_lingual_token_idx), + persistent=False + ) self.lr = LengthRegulator() - def forward_encoder_word(self, tokens, word_div, word_dur): + def forward_encoder_word(self, tokens, word_div, word_dur, languages=None): txt_embed = self.txt_embed(tokens) ph2word = self.lr(word_div) onset = ph2word > F.pad(ph2word, [1, -1]) onset_embed = self.onset_embed(onset.long()) ph_word_dur = torch.gather(F.pad(word_dur, [1, 0]), 1, ph2word) word_dur_embed = self.word_dur_embed(ph_word_dur.float()[:, :, None]) + extra_embed = onset_embed + word_dur_embed + if self.use_lang_id: + lang_mask = torch.any( + tokens[..., None] == self.cross_lingual_token_idx[None, None], + dim=-1 + ) + lang_embed = self.lang_embed(languages * lang_mask) + extra_embed += lang_embed x_masks = tokens == PAD_INDEX - return self.encoder(txt_embed, onset_embed + word_dur_embed, x_masks), x_masks + return self.encoder(txt_embed, extra_embed, x_masks), x_masks - def forward_encoder_phoneme(self, tokens, ph_dur): + def forward_encoder_phoneme(self, tokens, ph_dur, languages=None): txt_embed = self.txt_embed(tokens) ph_dur_embed = self.ph_dur_embed(ph_dur.float()[:, :, None]) + if self.use_lang_id: + lang_mask = torch.any( + tokens[..., None] == self.cross_lingual_token_idx[None, None], + dim=-1 + ) + lang_embed = self.lang_embed(languages * lang_mask) + extra_embed = ph_dur_embed + lang_embed + else: + extra_embed = ph_dur_embed x_masks = tokens == PAD_INDEX - return self.encoder(txt_embed, ph_dur_embed, x_masks), x_masks + return self.encoder(txt_embed, extra_embed, x_masks), x_masks def forward_dur_predictor(self, encoder_out, x_masks, ph_midi, spk_embed=None): 
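        # Note on the two encoder entry points above: `languages` is gated by a
        # token mask, so only ids listed in cross_lingual_token_idx keep a
        # non-zero language id; all other positions fall back to id 0, the
        # padding row of lang_embed, i.e. an all-zero embedding. The broadcast
        # equality plus torch.any (rather than, say, torch.isin) is presumably
        # chosen for ONNX exportability. A toy trace with assumed values:
        #     tokens    = torch.LongTensor([[12, 3, 7]])   # token 7 is cross-lingual
        #     cross_idx = torch.LongTensor([7, 9])
        #     mask = torch.any(tokens[..., None] == cross_idx[None, None], dim=-1)
        #     # mask == [[False, False, True]]; languages * mask embeds only token 7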
midi_embed = self.midi_embed(ph_midi) diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py index 1dd4fe129..bfb281958 100644 --- a/deployment/modules/toplevel.py +++ b/deployment/modules/toplevel.py @@ -18,12 +18,13 @@ class DiffSingerAcousticONNX(DiffSingerAcoustic): - def __init__(self, vocab_size, out_dims): + def __init__(self, vocab_size, out_dims, cross_lingual_token_idx=None): super().__init__(vocab_size, out_dims) del self.fs2 del self.diffusion self.fs2 = FastSpeech2AcousticONNX( - vocab_size=vocab_size + vocab_size=vocab_size, + cross_lingual_token_idx=cross_lingual_token_idx ) if self.diffusion_type == 'ddpm': self.diffusion = GaussianDiffusionONNX( @@ -73,11 +74,13 @@ def forward_fs2_aux( variances: dict, gender: Tensor = None, velocity: Tensor = None, - spk_embed: Tensor = None + spk_embed: Tensor = None, + languages: Tensor = None ): condition = self.fs2( tokens, durations, f0, variances=variances, - gender=gender, velocity=velocity, spk_embed=spk_embed + gender=gender, velocity=velocity, spk_embed=spk_embed, + languages=languages ) if self.use_shallow_diffusion: aux_mel_pred = self.aux_decoder(condition, infer=True) @@ -135,11 +138,12 @@ def view_as_reflow(self) -> nn.Module: class DiffSingerVarianceONNX(DiffSingerVariance): - def __init__(self, vocab_size): + def __init__(self, vocab_size, cross_lingual_token_idx=None): super().__init__(vocab_size=vocab_size) del self.fs2 self.fs2 = FastSpeech2VarianceONNX( - vocab_size=vocab_size + vocab_size=vocab_size, + cross_lingual_token_idx=cross_lingual_token_idx ) self.hidden_size = hparams['hidden_size'] if self.predict_pitch: @@ -210,13 +214,13 @@ def embed_frozen_spk(self, encoder_out): encoder_out += self.frozen_spk_embed return encoder_out - def forward_linguistic_encoder_word(self, tokens, word_div, word_dur): - encoder_out, x_masks = self.fs2.forward_encoder_word(tokens, word_div, word_dur) + def forward_linguistic_encoder_word(self, tokens, word_div, word_dur, languages=None): + encoder_out, x_masks = self.fs2.forward_encoder_word(tokens, word_div, word_dur, languages=languages) encoder_out = self.embed_frozen_spk(encoder_out) return encoder_out, x_masks - def forward_linguistic_encoder_phoneme(self, tokens, ph_dur): - encoder_out, x_masks = self.fs2.forward_encoder_phoneme(tokens, ph_dur) + def forward_linguistic_encoder_phoneme(self, tokens, ph_dur, languages=None): + encoder_out, x_masks = self.fs2.forward_encoder_phoneme(tokens, ph_dur, languages=languages) encoder_out = self.embed_frozen_spk(encoder_out) return encoder_out, x_masks diff --git a/utils/onnx_helper.py b/utils/onnx_helper.py index 176df56dc..1470e47d6 100644 --- a/utils/onnx_helper.py +++ b/utils/onnx_helper.py @@ -1,5 +1,5 @@ import re -from typing import Dict, Tuple, Union +from typing import Dict, Tuple, Union, Literal import onnx from google.protobuf.internal.containers import RepeatedCompositeFieldContainer @@ -51,6 +51,42 @@ def _override_shapes( _override_shapes(model.graph.output, output_shapes) +def model_reorder_io_list( + model: ModelProto, + input_or_output: Literal['input', 'output'], + target_name: str, + insert_after_name: str, +): + """ + Reorder the input of the model graph by moving the target input after the specified input (in-place operation). + If the given names are not found, the operation will be ignored. 
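
    A hypothetical usage sketch (the IO names mirror how the exporters in this
    patch call it; the file name is assumed):

        import onnx
        model = onnx.load('linguistic.onnx')
        model_reorder_io_list(
            model, 'input',
            target_name='languages', insert_after_name='tokens'
        )
        onnx.save(model, 'linguistic.onnx')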
+ :param model: model to perform the operation on + :param input_or_output: 'input' or 'output' to specify the list to reorder + :param target_name: the name of the input to be reordered + :param insert_after_name: the name of the input to be inserted after (None for the first) + """ + def _reorder_input(input_list: RepeatedCompositeFieldContainer[ValueInfoProto]): + nonlocal input_or_output + target_idx = -1 + insert_after_idx = -1 + for i, value_info in enumerate(input_list): + if value_info.name == target_name: + target_idx = i + if value_info.name == insert_after_name: + insert_after_idx = i + if target_idx != -1 and insert_after_idx != -1: + target = input_list.pop(target_idx) + input_list.insert(insert_after_idx + 1, target) + _verbose(f'| reorder {input_or_output}: \'{target_name}\' after \'{insert_after_name}\'') + + if input_or_output == 'input': + _reorder_input(model.graph.input) + elif input_or_output == 'output': + _reorder_input(model.graph.output) + else: + raise ValueError('Argument \'input_or_output\' should be either \'input\' or \'output\'.') + + def model_add_prefixes( model: ModelProto, initializer_prefix=None, @@ -97,7 +133,7 @@ def _add_prefixes_recursive(subgraph): new_name = initializer_prefix + initializer.name _verbose('| add prefix:', initializer.name, '->', new_name) initializer.name = new_name - + for value_info in subgraph.value_info: if dim_prefix is not None: for dim in value_info.type.tensor_type.shape.dim: @@ -114,7 +150,7 @@ def _add_prefixes_recursive(subgraph): new_name = value_info_prefix + value_info.name _verbose('| add prefix:', value_info.name, '->', new_name) value_info.name = new_name - + if node_prefix is not None: for node in subgraph.node: if ignored_pattern is not None and re.match(ignored_pattern, node.name): @@ -122,7 +158,7 @@ def _add_prefixes_recursive(subgraph): new_name = node_prefix + node.name _verbose('| add prefix:', node.name, '->', new_name) node.name = new_name - + for node in subgraph.node: # For 'If' and 'Loop' nodes, add prefixes recursively if node.op_type == 'If': @@ -134,7 +170,7 @@ def _add_prefixes_recursive(subgraph): if attr.name == 'body': body = onnx.helper.get_attribute_value(attr) _add_prefixes_recursive(body) - + # For each node, rename its inputs and outputs for io_list in [node.input, node.output]: for i, io_value in enumerate(io_list): diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 59a0924bc..df40fd4b8 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -102,7 +102,7 @@ def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[Li idx += 1 self._phone_to_id: Dict[str, int] = phone_to_id self._id_to_phone: List[Union[str, tuple]] = id_to_phone - self._cross_lingual_phonemes = cross_lingual_phonemes + self._cross_lingual_phonemes = frozenset(cross_lingual_phonemes) @property def vocab_size(self): @@ -111,6 +111,10 @@ def vocab_size(self): def __len__(self): return self.vocab_size + @property + def cross_lingual_phonemes(self): + return self._cross_lingual_phonemes + def is_cross_lingual(self, phone): return phone in self._cross_lingual_phonemes From 4a4b2b0abbd51871eba0d77b4347ff46f69eb7bb Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 00:06:01 +0800 Subject: [PATCH 20/44] Refactor configuration schemas for datasets --- basics/base_binarizer.py | 81 +++++++++++++++-------------- preprocessing/acoustic_binarizer.py | 2 +- preprocessing/variance_binarizer.py | 4 +- 3 files changed, 46 insertions(+), 41 deletions(-) diff --git 
a/basics/base_binarizer.py b/basics/base_binarizer.py index 92e583d61..fb71614f6 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -43,13 +43,11 @@ class BaseBinarizer: the phoneme set. """ - def __init__(self, data_dir=None, data_attrs=None): - if data_dir is None: - data_dir = hparams['raw_data_dir'] - if not isinstance(data_dir, list): - data_dir = [data_dir] - - self.raw_data_dirs = [pathlib.Path(d) for d in data_dir] + def __init__(self, datasets=None, data_attrs=None): + if datasets is None: + datasets = hparams['datasets'] + self.datasets = datasets + self.raw_data_dirs = [pathlib.Path(ds['raw_data_dir']) for ds in self.datasets] self.binary_data_dir = pathlib.Path(hparams['binary_data_dir']) self.data_attrs = [] if data_attrs is None else data_attrs @@ -58,13 +56,11 @@ def __init__(self, data_dir=None, data_attrs=None): self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.spk_map = {} - self.spk_ids = hparams['spk_ids'] - self.speakers = hparams['speakers'] + self.spk_ids = None self.build_spk_map() self.lang_map = {} self.dictionaries = hparams['dictionaries'] - self.languages = hparams['languages'] self.build_lang_map() self.items = {} @@ -76,58 +72,58 @@ def __init__(self, data_dir=None, data_attrs=None): self.timestep = hparams['hop_size'] / hparams['audio_sample_rate'] def build_spk_map(self): - assert isinstance(self.speakers, list), 'Speakers must be a list' - assert len(self.speakers) == len(self.raw_data_dirs), \ - 'Number of raw data dirs must equal number of speaker names!' - if len(self.spk_ids) == 0: - self.spk_ids = list(range(len(self.raw_data_dirs))) - else: - assert len(self.spk_ids) == len(self.raw_data_dirs), \ - 'Length of explicitly given spk_ids must equal the number of raw datasets.' - assert max(self.spk_ids) < hparams['num_spk'], \ - f'Index in spk_id sequence {self.spk_ids} is out of range. All values should be smaller than num_spk.' - - for spk_name, spk_id in zip(self.speakers, self.spk_ids): + spk_ids = [ds.get('spk_id') for ds in self.datasets] + assigned_spk_ids = {spk_id for spk_id in spk_ids if spk_id is not None} + for i in range(len(spk_ids)): + if spk_ids[i] is not None: + continue + idx = 0 + while idx in assigned_spk_ids: + idx += 1 + spk_ids[i] = idx + assert max(spk_ids) < hparams['num_spk'], \ + f'Index in spk_id sequence {spk_ids} is out of range. All values should be smaller than num_spk.' + + for spk_id, dataset in zip(spk_ids, self.datasets): + spk_name = dataset['speaker'] if spk_name in self.spk_map and self.spk_map[spk_name] != spk_id: raise ValueError(f'Invalid speaker ID assignment. Name \'{spk_name}\' is assigned ' f'with different speaker IDs: {self.spk_map[spk_name]} and {spk_id}.') self.spk_map[spk_name] = spk_id + self.spk_ids = spk_ids print("| spk_map: ", self.spk_map) def build_lang_map(self): - assert isinstance(self.languages, list), 'Languages must be a list' - assert len(self.languages) == len(self.raw_data_dirs), \ - 'Number of raw data dirs must equal number of language names!' - for lang in self.languages: - assert lang in self.dictionaries, f'Unrecognized language name: {lang}' assert len(self.dictionaries.keys()) <= hparams['num_lang'], \ 'Number of languages must not be greater than num_lang!' 
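        # Each entry of hparams['datasets'] now bundles what previously lived in
        # the parallel speakers / spk_ids / languages lists. A sketch of one
        # entry (the field names come from this patch; the values are invented):
        #     {
        #         'raw_data_dir': 'data/opencpop/raw',
        #         'speaker': 'opencpop',
        #         'spk_id': 0,                  # optional; auto-assigned if omitted
        #         'language': 'zh',
        #         'test_prefixes': ['2044'],    # optional; defaults to []
        #     }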
+ for dataset in self.datasets: + assert dataset['language'] in self.dictionaries, f'Unrecognized language name: {dataset["language"]}' for lang_id, lang_name in enumerate(sorted(self.dictionaries.keys()), start=1): self.lang_map[lang_name] = lang_id print("| lang_map: ", self.lang_map) - def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang) -> dict: raise NotImplementedError() - def split_train_valid_set(self, item_names): + def split_train_valid_set(self, prefixes: list): """ Split the dataset into training set and validation set. :return: train_item_names, valid_item_names """ - prefixes = {str(pr): 1 for pr in hparams['test_prefixes']} + prefixes = {str(pr): 1 for pr in prefixes} valid_item_names = {} # Add prefixes that specified speaker index and matches exactly item name to test set for prefix in deepcopy(prefixes): - if prefix in item_names: + if prefix in self.item_names: valid_item_names[prefix] = 1 prefixes.pop(prefix) # Add prefixes that exactly matches item name without speaker id to test set for prefix in deepcopy(prefixes): matched = False - for name in item_names: + for name in self.item_names: if name.split(':')[-1] == prefix: valid_item_names[name] = 1 matched = True @@ -136,7 +132,7 @@ def split_train_valid_set(self, item_names): # Add names with one of the remaining prefixes to test set for prefix in deepcopy(prefixes): matched = False - for name in item_names: + for name in self.item_names: if name.startswith(prefix): valid_item_names[name] = 1 matched = True @@ -144,7 +140,7 @@ def split_train_valid_set(self, item_names): prefixes.pop(prefix) for prefix in deepcopy(prefixes): matched = False - for name in item_names: + for name in self.item_names: if name.split(':')[-1].startswith(prefix): valid_item_names[name] = 1 matched = True @@ -160,7 +156,7 @@ def split_train_valid_set(self, item_names): valid_item_names = list(valid_item_names.keys()) assert len(valid_item_names) > 0, 'Validation set is empty!' - train_item_names = [x for x in item_names if x not in set(valid_item_names)] + train_item_names = [x for x in self.item_names if x not in set(valid_item_names)] assert len(train_item_names) > 0, 'Training set is empty!' 
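        # Matching cascade for the collected test prefixes (a prefix is popped as
        # soon as one pass matches it):
        #     1. exact match on the full name 'ds_id:item_name';
        #     2. exact match on the bare item name after the ':';
        #     3. prefix match on the full name;
        #     4. prefix match on the bare item name.
        # With items ['0:2044_x', '1:2044_x'] (invented names), prefix '0:2044'
        # selects only the first via pass 3, while '2044' selects both via pass 4.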
return train_item_names, valid_item_names @@ -184,10 +180,19 @@ def meta_data_iterator(self, prefix): def process(self): # load each dataset - for ds_id, (data_dir, spk, lang) in enumerate(zip(self.raw_data_dirs, self.speakers, self.languages)): - self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk=spk, lang=lang) + test_prefixes = [] + for ds_id, dataset in enumerate(self.datasets): + items = self.load_meta_data( + pathlib.Path(dataset['raw_data_dir']), + ds_id=ds_id, spk=dataset['speaker'], lang=dataset['language'] + ) + self.items.update(items) + test_prefixes.extend( + f'{ds_id}:{prefix}' + for prefix in dataset.get('test_prefixes', []) + ) self.item_names = sorted(list(self.items.keys())) - self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names) + self._train_item_names, self._valid_item_names = self.split_train_valid_set(test_prefixes) if self.binarization_args['shuffle']: random.shuffle(self.item_names) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 99d0aaf68..0455c4f94 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -95,7 +95,7 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): f'Negative ph_dur found in \'{item_name}\'.' meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict - self.items.update(meta_data_dict) + return meta_data_dict @torch.no_grad() def process_item(self, item_name, meta_data, binarization_args): diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 30c175b2c..c88ae924c 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -132,7 +132,7 @@ def require(attr): temp_dict = { 'ds_idx': item_idx, 'spk_id': self.spk_map[spk], - 'spk_name': self.speakers[ds_id], + 'spk_name': spk, 'language_id': self.lang_map[lang], 'language_name': lang, 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), @@ -173,7 +173,7 @@ def require(attr): meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict - self.items.update(meta_data_dict) + return meta_data_dict def check_coverage(self): super().check_coverage() From 678e3e6fb19aeeebe7b71263c119af960db840a7 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 15:17:09 +0800 Subject: [PATCH 21/44] Add check of existence for merged phonemes --- utils/phoneme_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index df40fd4b8..a9ddb160b 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -43,6 +43,12 @@ def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[Li f"Invalid phoneme tag '{phoneme}' in merged group: " f"unrecognized language name '{lang}'." ) + unique_name = phoneme if self._multi_langs else name + if unique_name not in all_phonemes: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + f"not found in phoneme set." 
+ ) merged_groups = [set(phones) for phones in merged_groups if len(phones) > 1] else: _merged_groups = [] From d0d7b7319990e16e88003a7a07dc3ef5a5cecf24 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 15:17:25 +0800 Subject: [PATCH 22/44] Fix spk_id assignment --- basics/base_binarizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index fb71614f6..d1f812015 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -74,13 +74,14 @@ def __init__(self, datasets=None, data_attrs=None): def build_spk_map(self): spk_ids = [ds.get('spk_id') for ds in self.datasets] assigned_spk_ids = {spk_id for spk_id in spk_ids if spk_id is not None} + idx = 0 for i in range(len(spk_ids)): if spk_ids[i] is not None: continue - idx = 0 while idx in assigned_spk_ids: idx += 1 spk_ids[i] = idx + assigned_spk_ids.add(idx) assert max(spk_ids) < hparams['num_spk'], \ f'Index in spk_id sequence {spk_ids} is out of range. All values should be smaller than num_spk.' From f3a969c886eaf871fd99a39fd04d8b6750ff2f0b Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 17:43:48 +0800 Subject: [PATCH 23/44] Fix languages.json filename --- deployment/exporters/acoustic_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 9160af6e3..287e35099 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -414,7 +414,7 @@ def _export_phonemes(self, path: Path): ph_path = path / f'{self.model_name}.phonemes.json' self.phoneme_dictionary.dump(ph_path) print(f'| export phonemes => {ph_path}') - lang_path = path / 'languages.json' + lang_path = path / f'{self.model_name}.languages.json' with open(lang_path, 'w', encoding='utf8') as f: json.dump(self.lang_map, f, ensure_ascii=False, indent=2) print(f'| export languages => {lang_path}') From bf44910f4e2f0f884f48f2dded1a01c51500be6d Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 18:11:34 +0800 Subject: [PATCH 24/44] Fix `languages` key in dsconfig.yaml --- deployment/exporters/acoustic_exporter.py | 1 + deployment/exporters/variance_exporter.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 287e35099..0e31105e7 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -124,6 +124,7 @@ def export_attachments(self, path: Path): dsconfig = { # basic configs 'phonemes': f'{self.model_name}.phonemes.json', + 'languages': f'{self.model_name}.languages.json', 'use_lang_id': hparams.get('use_lang_id', False), 'acoustic': f'{model_name}.onnx', 'hidden_size': hparams['hidden_size'], diff --git a/deployment/exporters/variance_exporter.py b/deployment/exporters/variance_exporter.py index 76d061834..27e8e4d0b 100644 --- a/deployment/exporters/variance_exporter.py +++ b/deployment/exporters/variance_exporter.py @@ -155,7 +155,7 @@ def export_attachments(self, path: Path): dsconfig = { # basic configs 'phonemes': f'{self.model_name}.phonemes.json', - 'languages': sorted(self.lang_map.keys()), + 'languages': f'{self.model_name}.languages.json', 'use_lang_id': hparams.get('use_lang_id', False), 'linguistic': f'{model_name}.linguistic.onnx', 'hidden_size': self.model.hidden_size, From fb5f58922b62ddcf3533a0a3bfe5867b2ac8b175 Mon Sep 17 00:00:00 2001 From: 
yqzhishen Date: Thu, 1 Aug 2024 01:21:05 +0800 Subject: [PATCH 25/44] Set `use_lang_id` to false if there are no cross-lingual phonemes --- deployment/exporters/acoustic_exporter.py | 5 +++-- deployment/exporters/variance_exporter.py | 5 +++-- deployment/modules/fastspeech2.py | 4 ++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 0e31105e7..849dae5db 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -33,6 +33,7 @@ def __init__( self.spk_map: dict = self.build_spk_map() self.lang_map: dict = self.build_lang_map() self.phoneme_dictionary = load_phoneme_dictionary() + self.use_lang_id = hparams.get('use_lang_id', False) and len(self.phoneme_dictionary.cross_lingual_phonemes) > 0 self.model = self.build_model() self.fs2_aux_cache_path = self.cache_dir / ( 'fs2_aux.onnx' if self.model.use_shallow_diffusion else 'fs2.onnx' @@ -125,7 +126,7 @@ def export_attachments(self, path: Path): # basic configs 'phonemes': f'{self.model_name}.phonemes.json', 'languages': f'{self.model_name}.languages.json', - 'use_lang_id': hparams.get('use_lang_id', False), + 'use_lang_id': self.use_lang_id, 'acoustic': f'{model_name}.onnx', 'hidden_size': hparams['hidden_size'], 'vocoder': 'nsf_hifigan_44.1k_hop512_128bin_2024.02', @@ -217,7 +218,7 @@ def _torch_export_model(self): dynamix_axes['spk_embed'] = { 1: 'n_frames' } - if hparams.get('use_lang_id'): + if self.use_lang_id: kwargs['languages'] = torch.zeros_like(tokens) input_names.append('languages') dynamix_axes['languages'] = { diff --git a/deployment/exporters/variance_exporter.py b/deployment/exporters/variance_exporter.py index 27e8e4d0b..82808ec08 100644 --- a/deployment/exporters/variance_exporter.py +++ b/deployment/exporters/variance_exporter.py @@ -33,6 +33,7 @@ def __init__( self.spk_map: dict = self.build_spk_map() self.lang_map: dict = self.build_lang_map() self.phoneme_dictionary = load_phoneme_dictionary() + self.use_lang_id = hparams.get('use_lang_id', False) and len(self.phoneme_dictionary.cross_lingual_phonemes) > 0 self.model = self.build_model() self.linguistic_encoder_cache_path = self.cache_dir / 'linguistic.onnx' self.dur_predictor_cache_path = self.cache_dir / 'dur.onnx' @@ -156,7 +157,7 @@ def export_attachments(self, path: Path): # basic configs 'phonemes': f'{self.model_name}.phonemes.json', 'languages': f'{self.model_name}.languages.json', - 'use_lang_id': hparams.get('use_lang_id', False), + 'use_lang_id': self.use_lang_id, 'linguistic': f'{model_name}.linguistic.onnx', 'hidden_size': self.model.hidden_size, 'predict_dur': self.model.predict_dur, @@ -205,7 +206,7 @@ def _torch_export_model(self): 1: 'n_tokens' } } - input_lang_id = hparams.get('use_lang_id', False) + input_lang_id = self.use_lang_id input_spk_embed = hparams['use_spk_id'] and not self.freeze_spk print(f'Exporting {self.fs2_class_name}...') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index bae452aea..20dfdb0d7 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -48,6 +48,8 @@ def __init__(self, vocab_size, cross_lingual_token_idx=None): torch.LongTensor(cross_lingual_token_idx), persistent=False ) # [N,] + if len(cross_lingual_token_idx) == 0: + self.use_lang_id = False # for temporary compatibility; will be completely removed in the future self.f0_embed_type = hparams.get('f0_embed_type', 'continuous') @@ -136,6 +138,8 @@ def 
__init__(self, vocab_size, cross_lingual_token_idx=None): torch.LongTensor(cross_lingual_token_idx), persistent=False ) + if len(cross_lingual_token_idx) == 0: + self.use_lang_id = False self.lr = LengthRegulator() def forward_encoder_word(self, tokens, word_div, word_dur, languages=None): From 333d9ef8e939cb7ae101372a266e65cf46178619 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 3 Aug 2024 00:07:48 +0800 Subject: [PATCH 26/44] Support defining extra phonemes --- utils/phoneme_utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index a9ddb160b..012a65f35 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -8,8 +8,23 @@ class PhonemeDictionary: - def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[List[str]] = None): + def __init__( + self, + dictionaries: Dict[str, pathlib.Path], + extra_phonemes: List[str] = None, + merged_groups: List[List[str]] = None + ): all_phonemes = {'AP', 'SP'} + if extra_phonemes: + for ph in extra_phonemes: + if '/' in ph: + lang, name = ph.split('/', maxsplit=1) + if lang not in dictionaries: + raise ValueError( + f"Invalid phoneme tag '{ph}' in extra phonemes: " + f"unrecognized language name '{lang}'." + ) + all_phonemes.add(ph) self._multi_langs = len(dictionaries) > 1 for lang, dict_path in dictionaries.items(): with open(dict_path, 'r', encoding='utf8') as dict_file: @@ -192,5 +207,6 @@ def load_phoneme_dictionary() -> PhonemeDictionary: } return PhonemeDictionary( dictionaries=dicts, + extra_phonemes=hparams.get('extra_phonemes'), merged_groups=hparams.get('merged_phoneme_groups') ) From d3cd5cdb251798ee36fefc1dc86cfd3195405922 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 3 Aug 2024 00:45:53 +0800 Subject: [PATCH 27/44] Refactor configs --- configs/acoustic.yaml | 32 ++++++++++-------- configs/base.yaml | 2 +- configs/templates/config_acoustic.yaml | 46 +++++++++++++++++--------- configs/templates/config_variance.yaml | 44 +++++++++++++++--------- configs/variance.yaml | 39 ++++++++++++---------- 5 files changed, 102 insertions(+), 61 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 0364b5c15..f3cf127f4 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -2,17 +2,22 @@ base_config: - configs/base.yaml task_cls: training.acoustic_task.AcousticTask -num_spk: 1 -speakers: - - opencpop -spk_ids: [] -test_prefixes: [ - '2044', - '2086', - '2092', - '2093', - '2100', -] + +dictionaries: + zh: dictionaries/opencpop-extension.txt +extra_phonemes: [] +merged_phoneme_groups: [] +datasets: + - raw_data_dir: 'data/opencpop/raw' + speaker: opencpop + spk_id: 0 + language: zh + test_prefixes: + - '2044' + - '2086' + - '2092' + - '2093' + - '2100' vocoder: NsfHifiGAN vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt @@ -41,10 +46,8 @@ augmentation_args: range: [0.5, 2.] scale: 0.75 -raw_data_dir: 'data/opencpop/raw' binary_data_dir: 'data/opencpop/binary' binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer -dictionary: dictionaries/opencpop-extension.txt spec_min: [-12] spec_max: [0] mel_vmin: -14. 
@@ -55,7 +58,10 @@ breathiness_smooth_width: 0.12 voicing_smooth_width: 0.12 tension_smooth_width: 0.12 +use_lang_id: false +num_lang: 1 use_spk_id: false +num_spk: 1 use_energy_embed: false use_breathiness_embed: false use_voicing_embed: false diff --git a/configs/base.yaml b/configs/base.yaml index b2e610f95..ab33c5541 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -5,7 +5,7 @@ task_cls: null # dataset ############# sort_by_len: true -raw_data_dir: null +datasets: [] binary_data_dir: null binarizer_cls: null binarization_args: diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 198444bc7..21b8e5805 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -1,19 +1,33 @@ -base_config: configs/acoustic.yaml +base_config: + - configs/acoustic.yaml + +dictionaries: + zh: dictionaries/opencpop-extension.txt +extra_phonemes: [] +merged_phoneme_groups: [] + +datasets: + - raw_data_dir: data/xxx1/raw + speaker: speaker1 + spk_id: 0 + language: zh + test_prefixes: + - wav1 + - wav2 + - wav3 + - wav4 + - wav5 + - raw_data_dir: data/xxx2/raw + speaker: speaker2 + spk_id: 1 + language: zh + test_prefixes: + - wav1 + - wav2 + - wav3 + - wav4 + - wav5 -raw_data_dir: - - data/xxx1/raw - - data/xxx2/raw -speakers: - - speaker1 - - speaker2 -spk_ids: [] -test_prefixes: - - wav1 - - wav2 - - wav3 - - wav4 - - wav5 -dictionary: dictionaries/opencpop-extension.txt binary_data_dir: data/xxx/binary binarization_args: num_workers: 0 @@ -24,6 +38,8 @@ hnsep_ckpt: 'checkpoints/vr/model.pt' vocoder: NsfHifiGAN vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt +use_lang_id: false +num_lang: 1 use_spk_id: false num_spk: 1 diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index d75667797..952e994ae 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -1,29 +1,43 @@ base_config: - configs/variance.yaml -raw_data_dir: - - data/xxx1/raw - - data/xxx2/raw -speakers: - - speaker1 - - speaker2 -spk_ids: [] -test_prefixes: - - wav1 - - wav2 - - wav3 - - wav4 - - wav5 -dictionary: dictionaries/opencpop-extension.txt +dictionaries: + zh: dictionaries/opencpop-extension.txt +extra_phonemes: [] +merged_phoneme_groups: [] + +datasets: + - raw_data_dir: data/xxx1/raw + speaker: speaker1 + spk_id: 0 + language: zh + test_prefixes: + - wav1 + - wav2 + - wav3 + - wav4 + - wav5 + - raw_data_dir: data/xxx2/raw + speaker: speaker2 + spk_id: 1 + language: zh + test_prefixes: + - wav1 + - wav2 + - wav3 + - wav4 + - wav5 + binary_data_dir: data/xxx/binary binarization_args: num_workers: 0 - pe: parselmouth pe_ckpt: 'checkpoints/rmvpe/model.pt' hnsep: vr hnsep_ckpt: 'checkpoints/vr/model.pt' +use_lang_id: false +num_lang: 1 use_spk_id: false num_spk: 1 # NOTICE: before enabling variance modules, please read the docs at diff --git a/configs/variance.yaml b/configs/variance.yaml index 2c6d002da..9d7b59b4b 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -2,17 +2,22 @@ base_config: - configs/base.yaml task_cls: training.variance_task.VarianceTask -num_spk: 1 -speakers: - - opencpop -spk_ids: [] -test_prefixes: [ - '2044', - '2086', - '2092', - '2093', - '2100', -] + +dictionaries: + zh: dictionaries/opencpop-extension.txt +extra_phonemes: [] +merged_phoneme_groups: [] +datasets: + - raw_data_dir: 'data/opencpop/raw' + speaker: opencpop + spk_id: 0 + language: zh + test_prefixes: + - '2044' + - '2086' + - '2092' 
+ - '2093' + - '2100' audio_sample_rate: 44100 hop_size: 512 # Hop size. @@ -25,16 +30,13 @@ binarization_args: num_workers: 0 prefer_ds: false -raw_data_dir: 'data/opencpop_variance/raw' binary_data_dir: 'data/opencpop_variance/binary' binarizer_cls: preprocessing.variance_binarizer.VarianceBinarizer -dictionary: dictionaries/opencpop-extension.txt +use_lang_id: false +num_lang: 1 use_spk_id: false - -rel_pos: true -hidden_size: 256 - +num_spk: 1 predict_dur: true predict_pitch: true predict_energy: false @@ -42,6 +44,9 @@ predict_breathiness: false predict_voicing: false predict_tension: false +rel_pos: true +hidden_size: 256 + dur_prediction_args: arch: fs2 hidden_size: 512 From f729db859bea3ceab6785c7164644e88dbb55f1d Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 3 Aug 2024 22:52:36 +0800 Subject: [PATCH 28/44] Prefer file copies in work_dir when loading dictionaries --- utils/phoneme_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 012a65f35..584c55d84 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -186,9 +186,9 @@ def load_phoneme_dictionary() -> PhonemeDictionary: if config_dicts is not None: dicts = {} for lang, config_dict_path in config_dicts.items(): - config_dict_path = pathlib.Path(config_dict_path) + config_dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' if not config_dict_path.exists(): - config_dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' + config_dict_path = pathlib.Path(config_dict_path) if not config_dict_path.exists(): raise FileNotFoundError( f"Could not locate dictionary for language '{lang}'." From 453cb0fae5d6e03024f01695b95a548cfb4e15ce Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 4 Aug 2024 22:08:00 +0800 Subject: [PATCH 29/44] Fix cannot locate dictionary --- utils/phoneme_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 584c55d84..f1556068b 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -37,6 +37,8 @@ def __init__( f"Invalid phoneme tag '{phoneme}' in dictionary '{dict_path}': " f"should not contain the reserved character '/'." ) + if phoneme in all_phonemes: + continue if self._multi_langs: all_phonemes.add(f'{lang}/{phoneme}') else: @@ -186,10 +188,10 @@ def load_phoneme_dictionary() -> PhonemeDictionary: if config_dicts is not None: dicts = {} for lang, config_dict_path in config_dicts.items(): - config_dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' - if not config_dict_path.exists(): - config_dict_path = pathlib.Path(config_dict_path) - if not config_dict_path.exists(): + dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' + if not dict_path.exists(): + dict_path = pathlib.Path(config_dict_path) + if not dict_path.exists(): raise FileNotFoundError( f"Could not locate dictionary for language '{lang}'." 
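                     # reached when neither the work_dir copy nor the configured dictionary path exists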
) From 663db52b4cf4b19ba6d132af0568d3f9f7ba8692 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 17 Aug 2024 22:31:48 +0800 Subject: [PATCH 30/44] Fix unexpected loading error when dictionary changes --- utils/phoneme_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index f1556068b..50145979e 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -195,17 +195,17 @@ def load_phoneme_dictionary() -> PhonemeDictionary: raise FileNotFoundError( f"Could not locate dictionary for language '{lang}'." ) - dicts[lang] = config_dict_path + dicts[lang] = dict_path else: - config_dict_path = pathlib.Path(hparams['dictionary']) - if not config_dict_path.exists(): - config_dict_path = pathlib.Path(hparams['work_dir']) / 'dictionary.txt' - if not config_dict_path.exists(): + dict_path = pathlib.Path(hparams['work_dir']) / 'dictionary.txt' + if not dict_path.exists(): + dict_path = pathlib.Path(hparams['dictionary']) + if not dict_path.exists(): raise FileNotFoundError( f"Could not locate dictionary file." ) dicts = { - 'default': config_dict_path + 'default': dict_path } return PhonemeDictionary( dictionaries=dicts, From 6c7bb0836919e509e2c8fd8b8c5ed0d1731e0499 Mon Sep 17 00:00:00 2001 From: Anjo <87346264+AnAndroNerd@users.noreply.github.com> Date: Fri, 15 Nov 2024 22:28:09 -0700 Subject: [PATCH 31/44] Update toplevel.py (#219) --- modules/toplevel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/toplevel.py b/modules/toplevel.py index 5aedfed76..aceff1f70 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -154,7 +154,7 @@ def __init__(self, vocab_size): self.pitch_retake_embed = Embedding(2, hparams['hidden_size']) pitch_hparams = hparams['pitch_prediction_args'] self.pitch_backbone_type = compat.get_backbone_type(hparams, nested_config=pitch_hparams) - self.pitch_backbone_args = compat.get_backbone_args(hparams, backbone_type=self.pitch_backbone_type) + self.pitch_backbone_args = compat.get_backbone_args(pitch_hparams, backbone_type=self.pitch_backbone_type) if self.diffusion_type == 'ddpm': self.pitch_predictor = PitchDiffusion( vmin=pitch_hparams['pitd_norm_min'], From da79ef21653430d23ff2d9d7925929f15b115c92 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 4 Jan 2025 16:15:23 +0800 Subject: [PATCH 32/44] Fix unexpected config passing --- configs/acoustic.yaml | 15 ++------------- configs/variance.yaml | 15 ++------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 294471638..aad05ea15 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -3,21 +3,10 @@ base_config: task_cls: training.acoustic_task.AcousticTask -dictionaries: - zh: dictionaries/opencpop-extension.txt +dictionaries: {} extra_phonemes: [] merged_phoneme_groups: [] -datasets: - - raw_data_dir: 'data/opencpop/raw' - speaker: opencpop - spk_id: 0 - language: zh - test_prefixes: - - '2044' - - '2086' - - '2092' - - '2093' - - '2100' +datasets: [] vocoder: NsfHifiGAN vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt diff --git a/configs/variance.yaml b/configs/variance.yaml index 49e18ab7f..3e02e430e 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -3,21 +3,10 @@ base_config: task_cls: training.variance_task.VarianceTask -dictionaries: - zh: dictionaries/opencpop-extension.txt +dictionaries: {} extra_phonemes: [] merged_phoneme_groups: [] -datasets: - - raw_data_dir: 
'data/opencpop/raw' - speaker: opencpop - spk_id: 0 - language: zh - test_prefixes: - - '2044' - - '2086' - - '2092' - - '2093' - - '2100' +datasets: [] audio_sample_rate: 44100 hop_size: 512 # Hop size. From 5d5632914a9ccb3e83da0b6b41c76da9416eced4 Mon Sep 17 00:00:00 2001 From: yxlllc <33565655+yxlllc@users.noreply.github.com> Date: Fri, 17 Jan 2025 01:10:43 +0800 Subject: [PATCH 33/44] Update lynxnet backbone (#228) * Change the injection method of conditions on lynxnet (#225) * update configurations for new-lynxnet * update configurations for new-lynxnet * update configurations for new-lynxnet --------- Co-authored-by: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com> --- configs/acoustic.yaml | 1 + configs/templates/config_acoustic.yaml | 1 + configs/templates/config_variance.yaml | 4 +++ modules/backbones/lynxnet.py | 39 +++++++++++++++++--------- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index aad05ea15..99fd175bd 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -74,6 +74,7 @@ backbone_args: num_layers: 6 kernel_size: 31 dropout_rate: 0.0 + strong_cond: true main_loss_type: l2 main_loss_log_norm: false schedule_type: 'linear' diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 5bcfc2df4..263d936fa 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -80,6 +80,7 @@ backbone_args: num_layers: 6 kernel_size: 31 dropout_rate: 0.0 + strong_cond: true #backbone_type: 'wavenet' #backbone_args: # num_channels: 512 diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index a9e2a19d4..ad051c0ec 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -101,6 +101,8 @@ pitch_prediction_args: # backbone_args: # num_layers: 6 # num_channels: 512 +# dropout_rate: 0.0 +# strong_cond: true variances_prediction_args: total_repeat_bins: 48 @@ -113,6 +115,8 @@ variances_prediction_args: # backbone_args: # num_layers: 6 # num_channels: 384 +# dropout_rate: 0.0 +# strong_cond: true lambda_dur_loss: 1.0 lambda_pitch_loss: 1.0 diff --git a/modules/backbones/lynxnet.py b/modules/backbones/lynxnet.py index 744967c6b..18e7bf497 100644 --- a/modules/backbones/lynxnet.py +++ b/modules/backbones/lynxnet.py @@ -10,6 +10,12 @@ from utils.hparams import hparams +class Conv1d(torch.nn.Conv1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + nn.init.kaiming_normal_(self.weight) + + class SwiGLU(nn.Module): # Swish-Applies the gated linear unit function. 
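     # i.e. SwiGLU(x) = out * SiLU(gate), with out and gate the two halves of x
     # split along `dim` (Shazeer, "GLU Variants Improve Transformer", 2020).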
def __init__(self, dim=-1): @@ -39,7 +45,7 @@ def calc_same_padding(kernel_size): pad = kernel_size // 2 return pad, pad - (kernel_size + 1) % 2 - def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.): + def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.0): super().__init__() inner_dim = dim * expansion_factor activation_classes = { @@ -73,27 +79,30 @@ def forward(self, x): class LYNXNetResidualLayer(nn.Module): - def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.): + def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.0): super().__init__() self.diffusion_projection = nn.Conv1d(dim, dim, 1) self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1) self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout) - def forward(self, x, conditioner, diffusion_step): - res_x = x.transpose(1, 2) - x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner) - x = x.transpose(1, 2) - x = self.convmodule(x) # (#batch, dim, length) - x = x + res_x + def forward(self, x, conditioner, diffusion_step, front_cond_inject=False): + if front_cond_inject: + x = x + self.conditioner_projection(conditioner) + res_x = x + else: + res_x = x + x = x + self.conditioner_projection(conditioner) + x = x + self.diffusion_projection(diffusion_step) x = x.transpose(1, 2) - + x = self.convmodule(x) # (#batch, dim, length) + x = x.transpose(1, 2) + res_x return x # (#batch, length, dim) class LYNXNet(nn.Module): def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31, - activation='PReLU', dropout=0.): + activation='PReLU', dropout=0.0, strong_cond=False): """ LYNXNet(Linear Gated Depthwise Separable Convolution Network) TIPS:You can control the style of the generated results by modifying the 'activation', @@ -104,7 +113,7 @@ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansio super().__init__() self.in_dims = in_dims self.n_feats = n_feats - self.input_projection = nn.Conv1d(in_dims * n_feats, num_channels, 1) + self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1) self.diffusion_embedding = nn.Sequential( SinusoidalPosEmb(num_channels), nn.Linear(num_channels, num_channels * 4), @@ -125,7 +134,8 @@ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansio ] ) self.norm = nn.LayerNorm(num_channels) - self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1) + self.output_projection = Conv1d(num_channels, in_dims * n_feats, kernel_size=1) + self.strong_cond = strong_cond nn.init.zeros_(self.output_projection.weight) def forward(self, spec, diffusion_step, cond): @@ -142,12 +152,13 @@ def forward(self, spec, diffusion_step, cond): x = spec.flatten(start_dim=1, end_dim=2) # [B, F x M, T] x = self.input_projection(x) # x [B, residual_channel, T] - x = F.gelu(x) + if not self.strong_cond: + x = F.gelu(x) diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1) for layer in self.residual_layers: - x = layer(x, cond, diffusion_step) + x = layer(x, cond, diffusion_step, front_cond_inject=self.strong_cond) # post-norm x = self.norm(x.transpose(1, 2)).transpose(1, 2) From 3f8bc85280d66302389c8c86b7cee4df9b3e950e Mon Sep 17 00:00:00 2001 From: yxlllc <33565655+yxlllc@users.noreply.github.com> Date: Mon, 
10 Feb 2025 14:39:30 +0800 Subject: [PATCH 34/44] Improve fastspeech2 encoder using Rotary Position Embedding (RoPE) in multi-head self-attention (#234) * update multi-head self attention with RoPE * RoPE onnx (#230) * fix requirements.txt (#233) * fix rope for melody encoder * support swiglu activation for ffn * update dependencies --------- Co-authored-by: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com> --- configs/acoustic.yaml | 2 + configs/templates/config_acoustic.yaml | 2 + configs/templates/config_variance.yaml | 2 + configs/variance.yaml | 2 + modules/commons/common_layers.py | 125 +++++++-- modules/commons/rotary_embedding_torch.py | 320 ++++++++++++++++++++++ modules/fastspeech/acoustic_encoder.py | 3 +- modules/fastspeech/tts_modules.py | 38 +-- modules/fastspeech/variance_encoder.py | 6 +- requirements-onnx.txt | 1 + requirements.txt | 1 + 11 files changed, 465 insertions(+), 37 deletions(-) create mode 100644 modules/commons/rotary_embedding_torch.py diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 99fd175bd..9f27733f7 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -62,6 +62,8 @@ diffusion_type: reflow time_scale_factor: 1000 timesteps: 1000 max_beta: 0.02 +enc_ffn_kernel_size: 3 +use_rope: true rel_pos: true sampling_algorithm: euler sampling_steps: 20 diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 263d936fa..59778df99 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -69,6 +69,8 @@ augmentation_args: # diffusion and shallow diffusion diffusion_type: reflow +enc_ffn_kernel_size: 3 +use_rope: true use_shallow_diffusion: true T_start: 0.4 T_start_infer: 0.4 diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index ad051c0ec..7d5b211aa 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -63,6 +63,8 @@ voicing_db_max: -12.0 tension_logit_min: -10.0 tension_logit_max: 10.0 +enc_ffn_kernel_size: 3 +use_rope: true hidden_size: 256 dur_prediction_args: arch: fs2 diff --git a/configs/variance.yaml b/configs/variance.yaml index 3e02e430e..4f69e34f7 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -33,6 +33,8 @@ predict_breathiness: false predict_voicing: false predict_tension: false +enc_ffn_kernel_size: 3 +use_rope: true rel_pos: true hidden_size: 256 diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index b12cc7f96..3927cd272 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -1,7 +1,7 @@ from __future__ import annotations import math - +import numpy as np import torch import torch.nn.functional as F import torch.onnx.operators @@ -104,24 +104,43 @@ def max_positions(): return int(1e5) # an arbitrary large number +class SwiGLU(nn.Module): + # Swish-Applies the gated linear unit function. + def __init__(self, dim=-1): + super().__init__() + self.dim = dim + + def forward(self, x): + # out, gate = x.chunk(2, dim=self.dim) + # Using torch.split instead of chunk for ONNX export compatibility. 
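+        # Split the doubled feature dim into a value half and a gate half.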
+ out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim) + return out * F.silu(gate) + + class TransformerFFNLayer(nn.Module): def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0., act='gelu'): super().__init__() self.kernel_size = kernel_size self.dropout = dropout self.act = act - self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) + filter_size_1 = filter_size if self.act == 'relu': self.act_fn = ReLU() elif self.act == 'gelu': self.act_fn = GELU() elif self.act == 'swish': self.act_fn = SiLU() + elif self.act == 'swiglu': + self.act_fn = SwiGLU() + filter_size_1 = filter_size * 2 + else: + raise ValueError(f'{act} is not a valid activation') + self.ffn_1 = nn.Conv1d(hidden_size, filter_size_1, kernel_size, padding=kernel_size // 2) self.ffn_2 = XavierUniformInitLinear(filter_size, hidden_size) def forward(self, x): - # x: T x B x C - x = self.ffn_1(x.permute(1, 2, 0)).permute(2, 0, 1) + # x: B x T x C + x = self.ffn_1(x.transpose(1, 2)).transpose(1, 2) x = x * self.kernel_size ** -0.5 x = self.act_fn(x) @@ -130,15 +149,86 @@ def forward(self, x): return x +class MultiheadSelfAttentionWithRoPE(nn.Module): + def __init__(self, embed_dim, num_heads, dropout=0.1, bias=False, rotary_embed=None): + super().__init__() + assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads" + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + # Linear layers for Q, K, V projections + self.in_proj = nn.Linear(embed_dim, embed_dim * 3, bias=bias) + + # Final linear layer after concatenation + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + # Dropout layer + self.dropout = nn.Dropout(dropout) + + # Rotary Embeddings + self.rotary_embed = rotary_embed + + def forward(self, x, key_padding_mask=None): + # x: (B, L, C) + # key_padding_mask: (B, L) + batch_size, seq_len, embed_dim = x.size() + + # Project inputs to Q, K, V + Q, K, V = torch.split(self.in_proj(x), self.embed_dim, dim=-1) + + # Reshape Q, K, V for multi-head attention + Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) # (B, H, L, D) + K = K.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) # (B, H, L, D) + V = V.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) # (B, H, L, D) + + # Apply RoPE + if self.rotary_embed is not None: + Q = self.rotary_embed.rotate_queries_or_keys(Q) + K = self.rotary_embed.rotate_queries_or_keys(K) + + # Compute attention scores + scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.head_dim) # (B, H, L, L) + + # Apply key padding mask if provided + if key_padding_mask is not None: + # Expand mask to match attention scores shape + mask = key_padding_mask.unsqueeze(1).unsqueeze(1) # (B, 1, 1, L) + scores = scores.masked_fill(mask == 1, -np.inf) # Masked positions are set to -inf + + # Compute attention weights + attn_weights = F.softmax(scores, dim=-1) # (B, H, L, L) + attn_weights = self.dropout(attn_weights) + + # Apply attention weights to V + attn_output = torch.matmul(attn_weights, V) # (B, H, L, D) + + # Reshape and concatenate heads + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim) # (B, L, C) + + # Final linear projection + output = self.out_proj(attn_output) # (B, L, C) + + return output + + class EncSALayer(nn.Module): def __init__(self, c, num_heads, dropout, attention_dropout=0.1, - relu_dropout=0.1, kernel_size=9, 
act='gelu'): + relu_dropout=0.1, kernel_size=9, act='gelu', rotary_embed=None): super().__init__() self.dropout = dropout self.layer_norm1 = LayerNorm(c) - self.self_attn = MultiheadAttention( - c, num_heads, dropout=attention_dropout, bias=False, - ) + if rotary_embed is None: + self.self_attn = MultiheadAttention( + c, num_heads, dropout=attention_dropout, bias=False, batch_first=True + ) + self.use_rope = False + else: + self.self_attn = MultiheadSelfAttentionWithRoPE( + c, num_heads, dropout=attention_dropout, bias=False, rotary_embed=rotary_embed + ) + self.use_rope = True self.layer_norm2 = LayerNorm(c) self.ffn = TransformerFFNLayer( c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act @@ -151,22 +241,25 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): self.layer_norm2.training = layer_norm_training residual = x x = self.layer_norm1(x) - x, _, = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=encoder_padding_mask - ) + if self.use_rope: + x = self.self_attn(x, key_padding_mask=encoder_padding_mask) + else: + x, _, = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask + ) x = F.dropout(x, self.dropout, training=self.training) x = residual + x - x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + x = x * (1 - encoder_padding_mask.float())[..., None] residual = x x = self.layer_norm2(x) x = self.ffn(x) x = F.dropout(x, self.dropout, training=self.training) x = residual + x - x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + x = x * (1 - encoder_padding_mask.float())[..., None] return x diff --git a/modules/commons/rotary_embedding_torch.py b/modules/commons/rotary_embedding_torch.py new file mode 100644 index 000000000..4efcb514f --- /dev/null +++ b/modules/commons/rotary_embedding_torch.py @@ -0,0 +1,320 @@ +from __future__ import annotations +from math import pi, log + +import torch +from torch.amp import autocast +from torch.nn import Module, ModuleList +from torch import nn, einsum, broadcast_tensors, Tensor + +from einops import rearrange, repeat + +from typing import Literal + +# helper functions + +def exists(val): + return val is not None + +def default(val, d): + return val if exists(val) else d + +# broadcat, as tortoise-tts was using it + +def broadcat(tensors, dim = -1): + broadcasted_tensors = broadcast_tensors(*tensors) + return torch.cat(broadcasted_tensors, dim = dim) + +def slice_at_dim(t, dim_slice: slice, *, dim): + dim += (t.ndim if dim < 0 else 0) + colons = [slice(None)] * t.ndim + colons[dim] = dim_slice + return t[tuple(colons)] + +# rotary embedding helper functions + +def rotate_half(x): + x = rearrange(x, '... (d r) -> ... d r', r = 2) + x1, x2 = x.unbind(dim = -1) + x = torch.stack((-x2, x1), dim = -1) + return rearrange(x, '... d r -> ... 
(d r)') + +@autocast('cuda', enabled = False) +def apply_rotary_emb( + freqs, + t, + start_index = 0, + scale = 1., + seq_dim = -2, + freqs_seq_dim = None +): + dtype = t.dtype + + if not exists(freqs_seq_dim): + if freqs.ndim == 2 or t.ndim == 3: + freqs_seq_dim = 0 + + if t.ndim == 3 or exists(freqs_seq_dim): + seq_len = t.shape[seq_dim] + freqs = slice_at_dim(freqs, slice(-seq_len, None), dim = freqs_seq_dim) + + rot_dim = freqs.shape[-1] + end_index = start_index + rot_dim + + assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}' + + # Split t into three parts: left, middle (to be transformed), and right + t_left = t[..., :start_index] + t_middle = t[..., start_index:end_index] + t_right = t[..., end_index:] + + # Apply rotary embeddings without modifying t in place + t_transformed = (t_middle * freqs.cos() * scale) + (rotate_half(t_middle) * freqs.sin() * scale) + + out = torch.cat((t_left, t_transformed, t_right), dim=-1) + + return out.type(dtype) + +# learned rotation helpers + +def apply_learned_rotations(rotations, t, start_index = 0, freq_ranges = None): + if exists(freq_ranges): + rotations = einsum('..., f -> ... f', rotations, freq_ranges) + rotations = rearrange(rotations, '... r f -> ... (r f)') + + rotations = repeat(rotations, '... n -> ... (n r)', r = 2) + return apply_rotary_emb(rotations, t, start_index = start_index) + +# classes + +class RotaryEmbedding(Module): + def __init__( + self, + dim, + custom_freqs: Tensor | None = None, + freqs_for: Literal['lang', 'pixel', 'constant'] = 'lang', + theta = 10000, + max_freq = 10, + num_freqs = 1, + learned_freq = False, + use_xpos = False, + xpos_scale_base = 512, + interpolate_factor = 1., + theta_rescale_factor = 1., + seq_before_head_dim = False, + cache_if_possible = True, + cache_max_seq_len = 8192 + ): + super().__init__() + # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning + # has some connection to NTK literature + # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + + theta *= theta_rescale_factor ** (dim / (dim - 2)) + + self.freqs_for = freqs_for + + if exists(custom_freqs): + freqs = custom_freqs + elif freqs_for == 'lang': + freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) + elif freqs_for == 'pixel': + freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi + elif freqs_for == 'constant': + freqs = torch.ones(num_freqs).float() + + self.cache_if_possible = cache_if_possible + self.cache_max_seq_len = cache_max_seq_len + + self.register_buffer('cached_freqs', torch.zeros(cache_max_seq_len, dim), persistent = False) + self.cached_freqs_seq_len = 0 + + self.freqs = nn.Parameter(freqs, requires_grad = learned_freq) + + self.learned_freq = learned_freq + + # dummy for device + + self.register_buffer('dummy', torch.tensor(0), persistent = False) + + # default sequence dimension + + self.seq_before_head_dim = seq_before_head_dim + self.default_seq_dim = -3 if seq_before_head_dim else -2 + + # interpolation factors + + assert interpolate_factor >= 1. 
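+        # interpolate_factor > 1 divides the position indices in get_seq_pos
+        # (position interpolation), allowing inference beyond the trained length.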
+ self.interpolate_factor = interpolate_factor + + # xpos + + self.use_xpos = use_xpos + + if not use_xpos: + return + + scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim) + self.scale_base = xpos_scale_base + + self.register_buffer('scale', scale, persistent = False) + self.register_buffer('cached_scales', torch.zeros(cache_max_seq_len, dim), persistent = False) + self.cached_scales_seq_len = 0 + + # add apply_rotary_emb as static method + + self.apply_rotary_emb = staticmethod(apply_rotary_emb) + + @property + def device(self): + return self.dummy.device + + def get_seq_pos(self, seq_len, device, dtype, offset = 0): + return (torch.arange(seq_len, device = device, dtype = dtype) + offset) / self.interpolate_factor + + def rotate_queries_or_keys(self, t, seq_dim = None, offset = 0, scale = None): + seq_dim = default(seq_dim, self.default_seq_dim) + + assert not self.use_xpos or exists(scale), 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings' + + device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim] + + seq = self.get_seq_pos(seq_len, device = device, dtype = dtype, offset = offset) + + freqs = self.forward(seq, seq_len = seq_len, offset = offset) + + if seq_dim == -3: + freqs = rearrange(freqs, 'n d -> n 1 d') + + return apply_rotary_emb(freqs, t, scale = default(scale, 1.), seq_dim = seq_dim) + + def rotate_queries_with_cached_keys(self, q, k, seq_dim = None, offset = 0): + dtype, device, seq_dim = q.dtype, q.device, default(seq_dim, self.default_seq_dim) + + q_len, k_len = q.shape[seq_dim], k.shape[seq_dim] + assert q_len <= k_len + + q_scale = k_scale = 1. + + if self.use_xpos: + seq = self.get_seq_pos(k_len, dtype = dtype, device = device) + + q_scale = self.get_scale(seq[-q_len:]).type(dtype) + k_scale = self.get_scale(seq).type(dtype) + + rotated_q = self.rotate_queries_or_keys(q, seq_dim = seq_dim, scale = q_scale, offset = k_len - q_len + offset) + rotated_k = self.rotate_queries_or_keys(k, seq_dim = seq_dim, scale = k_scale ** -1) + + rotated_q = rotated_q.type(q.dtype) + rotated_k = rotated_k.type(k.dtype) + + return rotated_q, rotated_k + + def rotate_queries_and_keys(self, q, k, seq_dim = None): + seq_dim = default(seq_dim, self.default_seq_dim) + + assert self.use_xpos + device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim] + + seq = self.get_seq_pos(seq_len, dtype = dtype, device = device) + + freqs = self.forward(seq, seq_len = seq_len) + scale = self.get_scale(seq, seq_len = seq_len).to(dtype) + + if seq_dim == -3: + freqs = rearrange(freqs, 'n d -> n 1 d') + scale = rearrange(scale, 'n d -> n 1 d') + + rotated_q = apply_rotary_emb(freqs, q, scale = scale, seq_dim = seq_dim) + rotated_k = apply_rotary_emb(freqs, k, scale = scale ** -1, seq_dim = seq_dim) + + rotated_q = rotated_q.type(q.dtype) + rotated_k = rotated_k.type(k.dtype) + + return rotated_q, rotated_k + + def get_scale( + self, + t: Tensor, + seq_len: int | None = None, + offset = 0 + ): + assert self.use_xpos + + should_cache = ( + self.cache_if_possible and + exists(seq_len) and + (offset + seq_len) <= self.cache_max_seq_len + ) + + if ( + should_cache and \ + exists(self.cached_scales) and \ + (seq_len + offset) <= self.cached_scales_seq_len + ): + return self.cached_scales[offset:(offset + seq_len)] + + scale = 1. 
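+        # xpos: length-dependent decay computed relative to the sequence midpoint.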
+ if self.use_xpos: + power = (t - len(t) // 2) / self.scale_base + scale = self.scale ** rearrange(power, 'n -> n 1') + scale = repeat(scale, 'n d -> n (d r)', r = 2) + + if should_cache and offset == 0: + self.cached_scales[:seq_len] = scale.detach() + self.cached_scales_seq_len = seq_len + + return scale + + def get_axial_freqs(self, *dims): + Colon = slice(None) + all_freqs = [] + + for ind, dim in enumerate(dims): + if self.freqs_for == 'pixel': + pos = torch.linspace(-1, 1, steps = dim, device = self.device) + else: + pos = torch.arange(dim, device = self.device) + + freqs = self.forward(pos, seq_len = dim) + + all_axis = [None] * len(dims) + all_axis[ind] = Colon + + new_axis_slice = (Ellipsis, *all_axis, Colon) + all_freqs.append(freqs[new_axis_slice]) + + all_freqs = broadcast_tensors(*all_freqs) + return torch.cat(all_freqs, dim = -1) + + @autocast('cuda', enabled = False) + def forward( + self, + t: Tensor, + seq_len: int | None = None, + offset = 0 + ): + should_cache = ( + self.cache_if_possible and + not self.learned_freq and + exists(seq_len) and + self.freqs_for != 'pixel' and + (offset + seq_len) <= self.cache_max_seq_len + ) + + if ( + should_cache and \ + exists(self.cached_freqs) and \ + (offset + seq_len) <= self.cached_freqs_seq_len + ): + return self.cached_freqs[offset:(offset + seq_len)].detach() + + freqs = self.freqs + + freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs) + freqs = repeat(freqs, '... n -> ... (n r)', r = 2) + + if should_cache and offset == 0: + self.cached_freqs[:seq_len] = freqs.detach() + self.cached_freqs_seq_len = seq_len + + return freqs diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 6c4e54f8b..b6f986bb0 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -23,7 +23,8 @@ def __init__(self, vocab_size): hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], - use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] + use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams.get('rel_pos', False), + use_rope=hparams.get('use_rope', False) ) self.pitch_embed = Linear(1, hparams['hidden_size']) diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index 1dd164d17..391de11ab 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn from torch.nn import functional as F - +from modules.commons.rotary_embedding_torch import RotaryEmbedding from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer from modules.commons.espnet_positional_embedding import RelPositionalEncoding @@ -12,13 +12,13 @@ class TransformerEncoderLayer(nn.Module): - def __init__(self, hidden_size, dropout, kernel_size=None, act='gelu', num_heads=2): + def __init__(self, hidden_size, dropout, kernel_size=None, act='gelu', num_heads=2, rotary_embed=None): super().__init__() self.op = EncSALayer( hidden_size, num_heads, dropout=dropout, attention_dropout=0.0, relu_dropout=dropout, kernel_size=kernel_size, - act=act + act=act, rotary_embed=rotary_embed ) def forward(self, x, **kwargs): @@ -353,18 +353,21 @@ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None): class FastSpeech2Encoder(nn.Module): def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, ffn_act='gelu', - dropout=None, 
num_heads=2, use_pos_embed=True, rel_pos=True): + dropout=None, num_heads=2, use_pos_embed=True, rel_pos=True, use_rope=False): super().__init__() self.num_layers = num_layers embed_dim = self.hidden_size = hidden_size self.dropout = dropout self.use_pos_embed = use_pos_embed - + if use_pos_embed and use_rope: + rotary_embed = RotaryEmbedding(dim = embed_dim // num_heads) + else: + rotary_embed = None self.layers = nn.ModuleList([ TransformerEncoderLayer( self.hidden_size, self.dropout, kernel_size=ffn_kernel_size, act=ffn_act, - num_heads=num_heads + num_heads=num_heads, rotary_embed=rotary_embed ) for _ in range(self.num_layers) ]) @@ -373,7 +376,9 @@ def __init__(self, hidden_size, num_layers, self.embed_scale = math.sqrt(hidden_size) self.padding_idx = 0 self.rel_pos = rel_pos - if self.rel_pos: + if use_rope: + self.embed_positions = None + elif self.rel_pos: self.embed_positions = RelPositionalEncoding(hidden_size, dropout_rate=0.0) else: self.embed_positions = SinusoidalPositionalEmbedding( @@ -385,7 +390,7 @@ def forward_embedding(self, main_embed, extra_embed=None, padding_mask=None): x = self.embed_scale * main_embed if extra_embed is not None: x = x + extra_embed - if self.use_pos_embed: + if self.use_pos_embed and self.embed_positions is not None: if self.rel_pos: x = self.embed_positions(x) else: @@ -396,7 +401,7 @@ def forward_embedding(self, main_embed, extra_embed=None, padding_mask=None): def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_hiddens=False): x = self.forward_embedding(main_embed, extra_embed, padding_mask=padding_mask) # [B, T, H] - nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None] # [T, B, 1] + nonpadding_mask_BT = 1 - padding_mask.float()[:, :, None] # [B, T, 1] # NOTICE: # The following codes are commented out because @@ -411,16 +416,13 @@ def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_ # x = x + positions # x = F.dropout(x, p=self.dropout, training=self.training) - # B x T x C -> T x B x C - x = x.transpose(0, 1) * nonpadding_mask_TB + x = x * nonpadding_mask_BT hiddens = [] for layer in self.layers: - x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB - hiddens.append(x) - x = self.layer_norm(x) * nonpadding_mask_TB + x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_BT + if return_hiddens: + hiddens.append(x) + x = self.layer_norm(x) * nonpadding_mask_BT if return_hiddens: - x = torch.stack(hiddens, 0) # [L, T, B, C] - x = x.transpose(1, 2) # [L, B, T, C] - else: - x = x.transpose(0, 1) # [B, T, C] + x = torch.stack(hiddens, 0) # [L, B, T, C] return x diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index a02e6e010..deab9ee84 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -32,7 +32,8 @@ def __init__(self, vocab_size): hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], - use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] + use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams.get('rel_pos', False), + use_rope=hparams.get('use_rope', False) ) dur_hparams = hparams['dur_prediction_args'] @@ -121,7 +122,8 @@ def get_hparam(key): hidden_size=hidden_size, num_layers=get_hparam('enc_layers'), 
ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), ffn_act=get_hparam('ffn_act'), dropout=get_hparam('dropout'), num_heads=get_hparam('num_heads'), - use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos') + use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos'), + use_rope=get_hparam('use_rope') ) self.out_proj = Linear(hidden_size, hparams['hidden_size']) diff --git a/requirements-onnx.txt b/requirements-onnx.txt index 976591f70..dda531484 100644 --- a/requirements-onnx.txt +++ b/requirements-onnx.txt @@ -3,6 +3,7 @@ # See instructions at https://pytorch.org/get-started/previous-versions/ click +einops>=0.7.0 h5py librosa<0.10.0 lightning~=2.1.0 diff --git a/requirements.txt b/requirements.txt index 90f3f9c5e..8f79e2382 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ # See instructions at https://pytorch.org/get-started/locally/ click +einops>=0.7.0 h5py librosa<0.10.0 lightning~=2.3.0 From 575d0aba7229397a8a0d3f9c784b7002a02500f6 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Sun, 18 Aug 2024 00:30:04 +0800 Subject: [PATCH 35/44] support mini-nsf-hifigan vocoder --- modules/nsf_hifigan/env.py | 2 + modules/nsf_hifigan/models.py | 84 +++++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/modules/nsf_hifigan/env.py b/modules/nsf_hifigan/env.py index ebb9486d3..04abfd9dc 100644 --- a/modules/nsf_hifigan/env.py +++ b/modules/nsf_hifigan/env.py @@ -18,6 +18,8 @@ def __setitem__(self, key, value): return super(AttrDict, self).__setitem__(key, value) def __getitem__(self, name): + if name not in super(AttrDict, self).keys(): + return None return super(AttrDict, self).__getitem__(name) def __delitem__(self, name): diff --git a/modules/nsf_hifigan/models.py b/modules/nsf_hifigan/models.py index cc21039f7..085907d94 100644 --- a/modules/nsf_hifigan/models.py +++ b/modules/nsf_hifigan/models.py @@ -209,46 +209,74 @@ def __init__(self, h): self.h = h self.num_kernels = len(h.resblock_kernel_sizes) self.num_upsamples = len(h.upsample_rates) - self.m_source = SourceModuleHnNSF( - sampling_rate=h.sampling_rate, - harmonic_num=8 - ) - self.noise_convs = nn.ModuleList() - self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) - resblock = ResBlock1 if h.resblock == '1' else ResBlock2 - + self.mini_nsf = h.mini_nsf + + if h.mini_nsf: + self.source_sr = h.sampling_rate / int(np.prod(h.upsample_rates[2: ])) + self.upp = int(np.prod(h.upsample_rates[: 2])) + else: + self.source_sr = h.sampling_rate + self.upp = int(np.prod(h.upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=h.sampling_rate, + harmonic_num=8 + ) + self.noise_convs = nn.ModuleList() + + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): - c_cur = h.upsample_initial_channel // (2 ** (i + 1)) - self.ups.append(weight_norm( - ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)), - k, u, padding=(k - u) // 2))) - if i + 1 < len(h.upsample_rates): # - stride_f0 = int(np.prod(h.upsample_rates[i + 1:])) - self.noise_convs.append(Conv1d( - 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) - else: - self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) self.resblocks = nn.ModuleList() + resblock = ResBlock1 if h.resblock == '1' else ResBlock2 ch = 
h.upsample_initial_channel - for i in range(len(self.ups)): + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): ch //= 2 + self.ups.append(weight_norm(ConvTranspose1d(2 * ch, ch, k, u, padding=(k - u) // 2))) for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): self.resblocks.append(resblock(h, ch, k, d)) + if not h.mini_nsf: + if i + 1 < len(h.upsample_rates): # + stride_f0 = int(np.prod(h.upsample_rates[i + 1:])) + self.noise_convs.append(Conv1d( + 1, ch, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) + else: + self.noise_convs.append(Conv1d(1, ch, kernel_size=1)) + elif i == 1: + self.source_conv = Conv1d(1, ch, 1) + self.source_conv.apply(init_weights) self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) self.conv_post.apply(init_weights) - self.upp = int(np.prod(h.upsample_rates)) - + + def fastsinegen(self, f0): + n = torch.arange(1, self.upp + 1, device=f0.device) + s0 = f0.unsqueeze(-1) / self.source_sr + ds0 = F.pad(s0[:, 1:, :] - s0[:, :-1, :], (0, 0, 0, 1)) + rad = s0 * n + 0.5 * ds0 * n * (n - 1) / self.upp + rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 + rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) + rad += F.pad(rad_acc, (0, 0, 1, -1)) + rad = rad.reshape(f0.shape[0], 1, -1) + sines = torch.sin(2 * np.pi * rad) + return sines + def forward(self, x, f0): - har_source = self.m_source(f0, self.upp).transpose(1, 2) + if self.mini_nsf: + har_source = self.fastsinegen(f0) + else: + har_source = self.m_source(f0, self.upp).transpose(1, 2) x = self.conv_pre(x) for i in range(self.num_upsamples): x = F.leaky_relu(x, LRELU_SLOPE) x = self.ups[i](x) - x_source = self.noise_convs[i](har_source) - x = x + x_source + if not self.mini_nsf: + x_source = self.noise_convs[i](har_source) + x = x + x_source + elif i == 1: + x_source = self.source_conv(har_source) + x = x + x_source xs = None for j in range(self.num_kernels): if xs is None: @@ -259,14 +287,14 @@ def forward(self, x, f0): x = F.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) - return x def remove_weight_norm(self): - rank_zero_info('Removing weight norm...') + # rank_zero_info('Removing weight norm...') + print('Removing weight norm...') for l in self.ups: remove_weight_norm(l) for l in self.resblocks: l.remove_weight_norm() remove_weight_norm(self.conv_pre) - remove_weight_norm(self.conv_post) + remove_weight_norm(self.conv_post) \ No newline at end of file From 51da9ec499b5ed8b02f74b9118ece035ab0b3cb1 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Sun, 18 Aug 2024 17:15:22 +0800 Subject: [PATCH 36/44] discard negative pad --- modules/nsf_hifigan/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nsf_hifigan/models.py b/modules/nsf_hifigan/models.py index 085907d94..1bdfa21e1 100644 --- a/modules/nsf_hifigan/models.py +++ b/modules/nsf_hifigan/models.py @@ -137,7 +137,7 @@ def _f02sine(self, f0, upp): rad = f0 / self.sampling_rate * torch.arange(1, upp + 1, device=f0.device) rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) - rad += F.pad(rad_acc, (0, 0, 1, -1)) + rad += F.pad(rad_acc[:, :-1, :], (0, 0, 1, 0)) rad = rad.reshape(f0.shape[0], -1, 1) rad = torch.multiply(rad, torch.arange(1, self.dim + 1, device=f0.device).reshape(1, 1, -1)) rand_ini = torch.rand(1, 1, self.dim, device=f0.device) @@ -257,7 +257,7 @@ def fastsinegen(self, f0): rad = s0 * n + 0.5 * ds0 * n * (n - 1) / self.upp rad2 = 
torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) - rad += F.pad(rad_acc, (0, 0, 1, -1)) + rad += F.pad(rad_acc[:, :-1, :], (0, 0, 1, 0)) rad = rad.reshape(f0.shape[0], 1, -1) sines = torch.sin(2 * np.pi * rad) return sines From 960bf90a6bc64c5252028165987a845376559d22 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Fri, 14 Feb 2025 11:55:41 +0800 Subject: [PATCH 37/44] fix MHA inference using low torch version --- modules/backbones/lynxnet.py | 15 +-------------- modules/commons/common_layers.py | 4 +++- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/modules/backbones/lynxnet.py b/modules/backbones/lynxnet.py index 18e7bf497..5dbd1d0a1 100644 --- a/modules/backbones/lynxnet.py +++ b/modules/backbones/lynxnet.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F -from modules.commons.common_layers import SinusoidalPosEmb +from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU from utils.hparams import hparams @@ -16,19 +16,6 @@ def __init__(self, *args, **kwargs): nn.init.kaiming_normal_(self.weight) -class SwiGLU(nn.Module): - # Swish-Applies the gated linear unit function. - def __init__(self, dim=-1): - super().__init__() - self.dim = dim - - def forward(self, x): - # out, gate = x.chunk(2, dim=self.dim) - # Using torch.split instead of chunk for ONNX export compatibility. - out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim) - return out * F.silu(gate) - - class Transpose(nn.Module): def __init__(self, dims): super().__init__() diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 3927cd272..bf4a2822c 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -221,7 +221,7 @@ def __init__(self, c, num_heads, dropout, attention_dropout=0.1, self.layer_norm1 = LayerNorm(c) if rotary_embed is None: self.self_attn = MultiheadAttention( - c, num_heads, dropout=attention_dropout, bias=False, batch_first=True + c, num_heads, dropout=attention_dropout, bias=False, batch_first=False ) self.use_rope = False else: @@ -244,12 +244,14 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): if self.use_rope: x = self.self_attn(x, key_padding_mask=encoder_padding_mask) else: + x = x.transpose(0, 1) x, _, = self.self_attn( query=x, key=x, value=x, key_padding_mask=encoder_padding_mask ) + x = x.transpose(0, 1) x = F.dropout(x, self.dropout, training=self.training) x = residual + x x = x * (1 - encoder_padding_mask.float())[..., None] From 84b32ed2eb4b6e0477efc5f68dc6d0a287156967 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 16 Feb 2025 22:20:37 +0800 Subject: [PATCH 38/44] Fix missing phoneme list sorting --- basics/base_binarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index d1f812015..397bd8305 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -279,7 +279,7 @@ def display_phoneme(phoneme): for idx in ph_idx_required.difference(ph_idx_occurred) }, key=lambda v: v[0] if isinstance(v, tuple) else v) raise BinarizationError( - f'The following phonemes are not covered in transcriptions: {sorted(missing_phones)}' + f'The following phonemes are not covered in transcriptions: {missing_phones}' ) def process_dataset(self, prefix, num_workers=0, apply_augmentation=False): From 7741b5555ccd7b3dfc97f4c0d17dd344128a5d83 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Mon, 17 Feb 2025 23:58:29 +0800 Subject: [PATCH 39/44] Fix 
single-language dictionary parsing language tag --- utils/phoneme_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 50145979e..7dc27afa6 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -142,6 +142,8 @@ def is_cross_lingual(self, phone): return phone in self._cross_lingual_phonemes def encode_one(self, phone, lang=None): + if '/' in phone: + lang, phone = phone.split('/', maxsplit=1) if lang is None or not self._multi_langs or phone in self._phone_to_id: return self._phone_to_id[phone] if '/' not in phone: From 58edd2fefc239d1229f82d735716dc373dedc161 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 23 Mar 2025 00:04:00 +0800 Subject: [PATCH 40/44] Add `pitch_controllable` flag to vocoder exporter (cherry picked from commit a6deb6b5c3dcca554546e790328278d493bdf8e9) --- deployment/exporters/nsf_hifigan_exporter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deployment/exporters/nsf_hifigan_exporter.py b/deployment/exporters/nsf_hifigan_exporter.py index cbc052ce6..2f0a6b402 100644 --- a/deployment/exporters/nsf_hifigan_exporter.py +++ b/deployment/exporters/nsf_hifigan_exporter.py @@ -25,6 +25,7 @@ def __init__( super().__init__(device=device, cache_dir=cache_dir) self.model_path = model_path self.model_name = model_name + self.vocoder_pitch_controllable = False self.model = self.build_model() self.model_class_name = remove_suffix(self.model.__class__.__name__, 'ONNX') self.model_cache_path = (self.cache_dir / self.model_name).with_suffix('.onnx') @@ -38,6 +39,7 @@ def build_model(self) -> nn.Module: "See https://github.com/openvpi/DiffSinger/releases/tag/v2.3.0 for more details." ) model = NSFHiFiGANONNX(config).eval().to(self.device) + self.vocoder_pitch_controllable = config.get("pc_aug", False) load_ckpt(model.generator, str(self.model_path), prefix_in_ckpt=None, key_in_ckpt='generator', strict=True, device=self.device) @@ -73,6 +75,10 @@ def export_attachments(self, path: Path): 'mel_fmax': hparams['fmax'] if hparams['fmax'] is not None else hparams['audio_sample_rate'] / 2, 'mel_base': 'e', 'mel_scale': 'slaney', + 'pitch_controllable': self.vocoder_pitch_controllable, + # Some old vocoder versions may have severe performance issues on CUDA; + # the issues were fixed in newer versions, and this flag is to distinguish them + 'force_on_cpu': False, }, fw, sort_keys=False) print(f'| export configs => {config_path} **PLEASE EDIT BEFORE USE**') From 38335bf02f3d45335aca33f644b791baf2694ddb Mon Sep 17 00:00:00 2001 From: yxlllc Date: Sun, 23 Mar 2025 02:05:21 +0800 Subject: [PATCH 41/44] support noise injection --- modules/nsf_hifigan/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/nsf_hifigan/models.py b/modules/nsf_hifigan/models.py index 1bdfa21e1..084949886 100644 --- a/modules/nsf_hifigan/models.py +++ b/modules/nsf_hifigan/models.py @@ -210,7 +210,8 @@ def __init__(self, h): self.num_kernels = len(h.resblock_kernel_sizes) self.num_upsamples = len(h.upsample_rates) self.mini_nsf = h.mini_nsf - + self.noise_sigma = h.noise_sigma + if h.mini_nsf: self.source_sr = h.sampling_rate / int(np.prod(h.upsample_rates[2: ])) self.upp = int(np.prod(h.upsample_rates[: 2])) @@ -268,6 +269,8 @@ def forward(self, x, f0): else: har_source = self.m_source(f0, self.upp).transpose(1, 2) x = self.conv_pre(x) + if self.noise_sigma is not None and self.noise_sigma > 0: + x += self.noise_sigma * torch.randn_like(x) for i in range(self.num_upsamples): x = F.leaky_relu(x, 
LRELU_SLOPE) x = self.ups[i](x) From 4a56fce58d825ca8d1dba3158211ec5d0abdf50e Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 29 Mar 2025 00:20:30 +0800 Subject: [PATCH 42/44] Allow merging global phonemes and language-specific phonemes --- utils/phoneme_utils.py | 52 +++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 7dc27afa6..c208be705 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -14,6 +14,7 @@ def __init__( extra_phonemes: List[str] = None, merged_groups: List[List[str]] = None ): + # Step 1: Collect all phonemes all_phonemes = {'AP', 'SP'} if extra_phonemes: for ph in extra_phonemes: @@ -43,47 +44,36 @@ def __init__( all_phonemes.add(f'{lang}/{phoneme}') else: all_phonemes.add(phoneme) + # Step 2: Parse merged phoneme groups if merged_groups is None: merged_groups = [] else: - if self._multi_langs: - for group in merged_groups: - for phoneme in group: - if '/' not in phoneme: - raise ValueError( - f"Invalid phoneme tag '{phoneme}' in merged group: " - "should specify language by '/' prefix." - ) + _merged_groups = [] + for group in merged_groups: + _group = [] + for phoneme in group: + if '/' in phoneme: lang, name = phoneme.split('/', maxsplit=1) if lang not in dictionaries: raise ValueError( f"Invalid phoneme tag '{phoneme}' in merged group: " f"unrecognized language name '{lang}'." ) - unique_name = phoneme if self._multi_langs else name - if unique_name not in all_phonemes: - raise ValueError( - f"Invalid phoneme tag '{phoneme}' in merged group: " - f"not found in phoneme set." - ) - merged_groups = [set(phones) for phones in merged_groups if len(phones) > 1] - else: - _merged_groups = [] - for group in merged_groups: - _group = [] - for phoneme in group: - if '/' in phoneme: - lang, name = phoneme.split('/', maxsplit=1) - if lang not in dictionaries: - raise ValueError( - f"Invalid phoneme tag '{phoneme}' in merged group: " - f"unrecognized language name '{lang}'." - ) - _group.append(name) + if self._multi_langs: + element = phoneme else: - _group.append(phoneme) - _merged_groups.append(_group) - merged_groups = [set(phones) for phones in _merged_groups if len(phones) > 1] + element = name + else: + element = phoneme + if element not in all_phonemes: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + f"not found in phoneme set." + ) + _group.append(element) + _merged_groups.append(_group) + merged_groups = [set(phones) for phones in _merged_groups if len(phones) > 1] + # Step 3: Build phoneme index merged_phonemes_inverted_index = {} for idx, group in enumerate(merged_groups): other_idx = None From 21a0f6bff93d50d911295214af9d132977165f3c Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 29 Mar 2025 02:25:33 +0800 Subject: [PATCH 43/44] Check for conflicts between short names and global tags --- utils/phoneme_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index c208be705..ca1af6203 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -25,6 +25,11 @@ def __init__( f"Invalid phoneme tag '{ph}' in extra phonemes: " f"unrecognized language name '{lang}'." ) + if name in all_phonemes: + raise ValueError( + f"Invalid phoneme tag '{ph}' in extra phonemes: " + f"short name conflicts with existing tag." 
+                )
         all_phonemes.add(ph)
         self._multi_langs = len(dictionaries) > 1

From 7b58b46fee3ba3c3cf250cc0b0c402ff7347b9c3 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Sat, 29 Mar 2025 02:27:34 +0800
Subject: [PATCH 44/44] Finish documentation for multi-dictionary

---
 docs/BestPractices.md        | 173 ++++++++++++++++++++--------------
 docs/ConfigurationSchemas.md | 174 ++++++++++++++++++++++++-----------
 docs/GettingStarted.md       |   4 +-
 3 files changed, 225 insertions(+), 126 deletions(-)

diff --git a/docs/BestPractices.md b/docs/BestPractices.md
index 04426b836..cc9c26dd9 100644
--- a/docs/BestPractices.md
+++ b/docs/BestPractices.md
@@ -1,42 +1,126 @@
 # Best Practices
 
-## Materials for training and using models
+## Fundamental concepts and materials
 
-### Datasets
+### Configuration files
 
-A dataset mainly includes recordings and transcriptions, which is called a _raw dataset_. Raw datasets should be organized as the following folder structure:
+A configuration file is a YAML file that defines enabled features and model hyperparameters, and controls the behavior of the binarizer, trainer and inference. Almost all settings and controls in this repository, including the practices in this guidance, are achieved through configuration files.
 
-- my_raw_data/
-  - wavs/
-    - 001.wav
-    - 002.wav
-    - ... (more recording files)
-  - transcriptions.csv
+For more information on the configuration system and configurable attributes, see [Configuration Schemas](ConfigurationSchemas.md).
 
-In the example above, the _my_raw_data_ folder is the root directory of a raw dataset.
+### Languages
 
-The _transcriptions.csv_ file contains all labels of the recordings. The common column of the CSV file is `name`, which represents all recording items by their filenames **without extension**. Elements of sequence attributes should be split by `space`. Other required columns may vary according to the category of the model you are training, and will be introduced in the following sections.
+Each language you are dealing with should have a unique tag in the configuration file. **We highly recommend using ISO 639 language codes as language tags.** For example, `zh` and `zho` stand for Chinese (`cmn` specifically for Mandarin Chinese), `ja` and `jpn` for Japanese, `en` and `eng` for English, and `yue` for Cantonese (Yue). You can download a complete language code table from https://iso639-3.sil.org/code_tables/download_tables.
+
+### Phonemes
+
+Phonemes are the fundamental part of dictionaries and labels. There are two types of phonemes: language-specific phonemes and global phonemes.
+
+**Language-specific phonemes:** If there are multiple languages, all language-specific phonemes are prefixed with their language name, for example `zh/a`, `ja/o`, `en/eh`. These are called the **full names** of the phonemes, while `a`, `o`, `eh` are called the **short names**, which have a definite meaning only in a specific language context. If there is only one language, the short names are enough to determine each phoneme.
+
+**Global phonemes:** Some phonemes do not belong to any language. There are two reserved global phoneme tags: `SP` for space and `AP` for aspiration. There can also be other user-defined tags (`EP`, `GS`, `VF`, etc.). These tags are never prefixed with a language, and they take priority when phoneme names are resolved.
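To make the resolution rules above concrete, here is a minimal sketch of how a label could be mapped to its full name under a language context. This is not the repository's actual code; `GLOBAL_TAGS`, `DICTIONARIES` and `full_name` are illustrative names only.

```python
# A minimal sketch of the naming rules above; NOT the repository's actual
# implementation. GLOBAL_TAGS and DICTIONARIES are illustrative stand-ins.
GLOBAL_TAGS = {'SP', 'AP', 'EP'}   # reserved tags plus a user-defined one
DICTIONARIES = {'zh', 'ja', 'en'}  # language tags defined in the config

def full_name(phone: str, lang: str) -> str:
    """Resolve a phoneme label to its full name under a language context."""
    if phone in GLOBAL_TAGS:
        return phone                        # global tags are never prefixed
    if '/' in phone:
        tag, _ = phone.split('/', maxsplit=1)
        if tag not in DICTIONARIES:
            raise ValueError(f'unrecognized language tag: {tag}')
        return phone                        # already a full name
    return f'{lang}/{phone}'                # short name: apply the context

assert full_name('a', 'zh') == 'zh/a'
assert full_name('en/eh', 'zh') == 'en/eh'  # explicit prefix beats context
assert full_name('AP', 'zh') == 'AP'
```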
+
+Extra phonemes, including user-defined global phonemes and additional language-specific phonemes that are not present in the dictionaries, can be defined as a list in the configuration file (full names should be used):
+
+```yaml
+extra_phonemes: ['EP', 'ja/cl']
+```
+
+The phoneme set expands rapidly with the number of languages, but many similar phonemes across languages can be merged. Define the merging groups in your configuration file (full names should be used):
+
+```yaml
+merged_phoneme_groups:
+  - [zh/i, ja/i, en/iy]
+  - [zh/s, ja/s, en/s]
+  - [ja/cl, SP]  # global phonemes can also be merged
+  # ... (other groups omitted for brevity)
+use_lang_id: true  # whether to use language embedding; only takes effect if there are cross-lingual phonemes
+```
+
+Merging phonemes does not mean they become exactly the same to the dictionary. For cross-lingual merged phonemes, setting `use_lang_id` to true will still distinguish them by their language IDs.
+
+#### Phoneme naming principles
+
+- Short names of language-specific phonemes should not conflict with global phoneme names, including reserved ones.
+- `/` cannot be used because it is already used for splitting the language tag and the short name.
+- `-` and `+` cannot be used because they are defined as slur tags in most singing voice synthesis editors.
+- Other special characters, including but not limited to `@`, `#`, `&`, `|`, `<`, `>`, are not recommended because they may be used as special tags in future format changes.
+- ASCII characters are preferred for the best encoding compatibility, but all UTF-8 characters are acceptable.
 
 ### Dictionaries
 
-A dictionary is a .txt file, in which each line represents a mapping rule from one syllable to its phoneme sequence. The syllable and the phonemes are split by `tab`, and the phonemes are split by `space`:
+Each language should have a corresponding dictionary. Define languages and dictionaries in your configuration file:
+
+```yaml
+dictionaries:
+  zh: dictionaries/opencpop-extension.txt
+  ja: dictionaries/japanese_dict_full.txt
+  en: dictionaries/ds_cmudict-07b.txt
+num_lang: 3  # number of languages; should be >= number of defined languages
+```
+
+Each dictionary is a *.txt* file, in which each line represents a mapping rule from one syllable to its phoneme sequence. The syllable and the phonemes are split by `tab`, and the phonemes are split by `space`:
 
 ```
 <syllable>	<phoneme1> <phoneme2> ...
 ```
 
-Syllable names and phoneme names can be customized, but with the following limitations/suggestions:
+#### Syllable naming principles
 
-- `SP` (rest), `AP` (breath) and `<PAD>` (padding) cannot be used because they are reserved.
+- Try to use a standard writing or pronouncing system, for example pinyin for Mandarin Chinese, romaji for Japanese, and English words for English.
+- `AP` and `SP` cannot be used because they are reserved tags when using DiffSinger in editors.
+- `/` cannot be used because it is already used for splitting the language tag and the short name.
 - `-` and `+` cannot be used because they are defined as slur tags in most singing voice synthesis editors.
-Special characters including but not limited to `@`, `#`, `&`, `|`, `/`, `<`, `>`, etc. should be avoided because they may be used as special tags in the future format changes. Using them now is okay, and all modifications will be notified in advance.
+- Syllable names should not start with `.` because it may have special meanings in future editors.
+- Other special characters, including but not limited to `@`, `#`, `&`, `|`, `<`, `>`, are not recommended because they may be used as special tags in future format changes.
 - ASCII characters are preferred for the best encoding compatibility, but all UTF-8 characters are acceptable.
 
-There are some preset dictionaries in the [dictionaries/](../dictionaries) folder. For the guidance of using a custom dictionary, see [Using custom dictionaries](#using-custom-dictionaries).
+There are some example dictionaries in the [dictionaries/](../dictionaries) folder.
 
-### Configuration files
+### Datasets
 
-A configuration file is a YAML file that defines enabled features, model hyperparameters and controls the behavior of the binarizer, trainer and inference. For more information of the configuration system and configurable attributes, see [Configuration Schemas](ConfigurationSchemas.md).
+A dataset mainly includes recordings and transcriptions, which is called a _raw dataset_. Raw datasets should be organized as the following folder structure:
+
+- my_raw_data/
+  - wavs/
+    - 001.wav
+    - 002.wav
+    - ... (more recording files)
+  - transcriptions.csv
+
+In the example above, the _my_raw_data_ directory is the root directory of a raw dataset.
+
+The _transcriptions.csv_ file contains all labels of the recordings. The common column of the CSV file is `name`, which represents all recording items by their filenames **without extension**. Elements of sequence attributes should be split by `space`. Other required columns may vary according to the category of the model you are training, and will be introduced in the following sections.
+
+Each dataset should have a main language. If you have many recordings in multiple languages, it is recommended to separate them by language (you can merge their speaker IDs in the configuration). In each dataset, the main language is set as the language context, and phoneme labels in transcriptions.csv do not need a prefix (short names). It is also valid if there are phonemes from other languages, but all of them should be prefixed with their actual language (full names). Global phonemes should not be prefixed in any dataset.
+
+You can define your datasets in the configuration file like this:
+
+```yaml
+datasets:  # define all raw datasets
+  - raw_data_dir: data/spk1-zh/raw  # path to the root of a raw dataset
+    speaker: speaker1  # speaker name
+    spk_id: 0  # optional; use this to merge two datasets; otherwise automatically assigned
+    language: zh  # language tag (main language) of this dataset
+    test_prefixes:  # optional; validation samples from this dataset
+      - wav1
+      - wav2
+  - raw_data_dir: data/spk1-en/raw
+    speaker: speaker1
+    spk_id: 0  # specify the same speaker ID to merge into the previous one
+    language: en
+    test_prefixes:
+      - wav1
+      - wav2
+  - raw_data_dir: data/spk2/raw
+    speaker: speaker2
+    language: ja
+    test_prefixes:
+      - wav1
+      - wav2
+  # ... (other datasets omitted for brevity)
+num_spk: 2  # number of speakers; should be > maximum speaker ID
+```
 
 ### DS files
 
@@ -54,7 +138,7 @@ The [DiffSinger Community Vocoders Project](https://openvpi.github.io/vocoders)
 
 The pre-trained vocoder can be fine-tuned on your target dataset. It is highly recommended to do so because fine-tuned vocoder can generate much better results on specific (seen) datasets while does not need much computing resources. See the [vocoder training and fine-tuning repository](https://github.com/openvpi/SingingVocoders) for detailed instructions.
After you get the fine-tuned vocoder checkpoint, you can configure it by `vocoder_ckpt` key in your configuration file. The fine-tuned NSF-HiFiGAN vocoder checkpoints can be exported to ONNX format like other DiffSinger user models for further production purposes.
 
-Another unrecommended option: train a ultra-lightweight [DDSP vocoder](https://github.com/yxlllc/pc-ddsp) first by yourself, then configure it according to the relevant [instructions](https://github.com/yxlllc/pc-ddsp/blob/master/DiffSinger.md).
+Another unrecommended option: train an ultra-lightweight [DDSP vocoder](https://github.com/yxlllc/pc-ddsp) first by yourself, then configure it according to the relevant [instructions](https://github.com/yxlllc/pc-ddsp/blob/master/DiffSinger.md).
 
 #### Feature extractors or auxiliary models
 
@@ -108,57 +192,6 @@ Functionalities of variance models are defined by their outputs. There are three
 
 There may be some mutual influence between the modules above when they are enabled together. See [mutual influence between variance modules](#mutual-influence-between-variance-modules) for more details.
 
-## Using custom dictionaries
-
-This section is about using a custom grapheme-to-phoneme dictionary for any language(s).
-
-### Add a dictionary
-
-Assume that you have made a dictionary file named `my_dict.txt`. Edit your configuration file:
-
-```yaml
-dictionary: my_dict.txt
-```
-
-Then you can binarize your data as normal. The phonemes in your dataset must cover, and must only cover the phonemes appeared in your dictionary. Otherwise, the binarizer will raise an error:
-
-```
-AssertionError: transcriptions and dictionary mismatch.
- (+) ['E', 'En', 'i0', 'ir']
- (-) ['AP', 'SP']
-```
-
-This means there are 4 unexpected symbols in the data labels (`ir`, `i0`, `E`, `En`) and 2 missing phonemes that are not covered by the data labels (`AP`, `SP`).
-
-Once the coverage checks passed, a phoneme distribution summary will be saved into your binary data directory. Below is an example.
-
-![phoneme-distribution](resources/phoneme-distribution.jpg)
-
-During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. There are one padding index (marked as `<PAD>`).

diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md
--- a/docs/ConfigurationSchemas.md
+++ b/docs/ConfigurationSchemas.md
 
-### dictionary
+### datasets
 
-Path to the word-phoneme mapping dictionary file. Training data must fully cover phonemes in the dictionary.
+List of dataset configs for preprocessing.
 
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | normal |
+| type | List[dict] |
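As a rough illustration of the constraints documented in the `datasets[].*` entries below, a loader might validate each entry like this. This is a sketch under the assumption that the parsed config is a plain dict named `config`; `validate_datasets` is a hypothetical helper, not part of the repository.

```python
# Sketch of validating `datasets` entries against the documented constraints;
# `config` and `validate_datasets` are hypothetical names.
def validate_datasets(config: dict) -> None:
    for i, ds in enumerate(config['datasets']):
        for key in ('raw_data_dir', 'speaker', 'language'):
            if key not in ds:
                raise ValueError(f'datasets[{i}] is missing required key: {key}')
        if ds['language'] not in config['dictionaries']:
            raise ValueError(
                f'datasets[{i}]: language {ds["language"]!r} is not a key of dictionaries'
            )

validate_datasets({
    'dictionaries': {'zh': 'dictionaries/opencpop-extension.txt'},
    'datasets': [{'raw_data_dir': 'data/spk1/raw', 'speaker': 'spk1', 'language': 'zh'}],
})  # passes silently
```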
+
+### datasets[].language
+
+Language context of this dataset. Must be a key of [dictionaries](#dictionaries).
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | required |
+| type | str |
+
+### datasets[].raw_data_dir
+
+Path to the root directory of this raw dataset, which contains the wave files, transcriptions, etc.
+
+| visibility | all |
+| scope | preprocessing |
+| customizability | required |
+| type | str |
+
+### datasets[].speaker
+
+The name of the speaker of this dataset. Speaker names are mapped to speaker indexes and stored into spk_map.json when preprocessing.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | required |
+| type | str |
+
+### datasets[].spk_id
+
+The speaker ID assigned to this dataset. It will be assigned automatically if not given. IDs can be duplicate or discontinuous, which allows merging multiple datasets into one speaker.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | normal |
+| type | int |
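A sketch of how the automatic assignment described above could behave; `assign_spk_ids` is a hypothetical helper, not the binarizer's actual logic.

```python
# Hypothetical sketch of automatic speaker ID assignment: explicit IDs win,
# a missing spk_id gets the next free value, and duplicate IDs merge datasets
# into one speaker.
def assign_spk_ids(datasets: list) -> dict:
    spk_map = {}
    next_id = 0
    for ds in datasets:
        spk = ds['speaker']
        if 'spk_id' in ds:
            spk_map[spk] = ds['spk_id']     # explicit assignment
        elif spk not in spk_map:
            while next_id in spk_map.values():
                next_id += 1                # skip IDs already taken
            spk_map[spk] = next_id
    return spk_map

print(assign_spk_ids([
    {'speaker': 'speaker1', 'spk_id': 0},
    {'speaker': 'speaker1', 'spk_id': 0},   # merged: same name, same ID
    {'speaker': 'speaker2'},                # auto-assigned ID 1
]))  # -> {'speaker1': 0, 'speaker2': 1}
```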
+
+### datasets[].test_prefixes
+
+List of data item names or name prefixes in this dataset for the validation set. For each string `s` in the list:
+
+- If `s` equals an actual item name, add that item to the validation set.
+- If `s` does not equal any item name, add all items whose names start with `s` to the validation set.
+
+| visibility | all |
+| scope | preprocessing |
+| customizability | required |
+| type | list |
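The two rules above translate almost directly into code. A small sketch, illustrative rather than the repository's actual selection logic:

```python
# The two matching rules above, transcribed into a sketch.
def select_valid_items(item_names: list, test_prefixes: list) -> set:
    valid = set()
    for s in test_prefixes:
        if s in item_names:
            valid.add(s)  # rule 1: exact item name
        else:             # rule 2: name prefix
            valid.update(name for name in item_names if name.startswith(s))
    return valid

names = ['wav1', 'wav2_a', 'wav2_b']
assert select_valid_items(names, ['wav1', 'wav2']) == {'wav1', 'wav2_a', 'wav2_b'}
```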
+
+### dictionaries
+
+Map from language names to their corresponding dictionary file paths. The phonemes in these dictionaries are combined into the final phoneme set, and each phoneme is assigned a phoneme ID. Training data must fully cover all phoneme IDs.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | required |
+| type | Dict[str, str] |
+| default | {} |
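A rough sketch of how the final phoneme set could be assembled from these dictionaries, assuming the `syllable<TAB>phonemes` file format described in Best Practices; `build_phoneme_set` is an illustrative name, not the repository's API.

```python
# Rough sketch of assembling the phoneme set from multiple dictionaries;
# full names are used whenever more than one language is defined.
def build_phoneme_set(dictionaries: dict) -> list:
    phonemes = {'AP', 'SP'}            # reserved global tags
    multi = len(dictionaries) > 1
    for lang, path in dictionaries.items():
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                _, _, ph_str = line.rstrip('\n').partition('\t')
                for ph in ph_str.split():
                    phonemes.add(f'{lang}/{ph}' if multi else ph)
    return sorted(phonemes)            # a stable order yields stable phoneme IDs
```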
+
 
 ### diff_accelerator
 
 DDPM sampling acceleration method. The following methods are currently available:
 
@@ -655,6 +724,18 @@ Length of sinusoidal smoothing convolution kernel (in seconds) on extracted energy
 
 | default | 0.12 |
 
+### extra_phonemes
+
+Extra phonemes to be added to the phoneme set. This list can be used to define custom global phoneme tags besides `AP` and `SP`, or to contain phonemes that are not present in any of the dictionaries.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | normal |
+| type | list |
+| default | [] |
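A hedged sketch of the validation implied above: a `/`-prefixed entry must use a defined language tag, and short names must not collide with global tags. Function and variable names here are illustrative.

```python
# Sketch of the checks implied above, not the repository's actual code.
def validate_extra_phonemes(extra_phonemes, dictionaries, reserved=('AP', 'SP')):
    global_tags = set(reserved)
    for ph in extra_phonemes:
        if '/' in ph:
            lang, name = ph.split('/', maxsplit=1)
            if lang not in dictionaries:
                raise ValueError(f"unrecognized language name: '{lang}'")
            if name in global_tags:
                raise ValueError(f"short name of '{ph}' conflicts with a global tag")
        else:
            global_tags.add(ph)  # a new user-defined global tag

validate_extra_phonemes(['EP', 'ja/cl'], {'zh': '...', 'ja': '...'})  # passes
```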
+
 
 ### f0_max
 
 Maximum base frequency (F0) in Hz for pitch extraction.
 
@@ -1122,6 +1203,18 @@ Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, ...
 
 | type | dict |
 
+### merged_phoneme_groups
+
+Phoneme groups to merge. Each group is a list of phoneme names. The merged phonemes share the same ID and thus the same phoneme embedding.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | required |
+| type | list |
+| default | [] |
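One way shared IDs could be assigned, shown as an illustrative sketch (the actual implementation may differ): every member of a merged group maps to the ID of the group's first-seen member, while ID 0 is assumed reserved for padding here.

```python
# Illustrative shared-ID assignment for merged groups.
def assign_phoneme_ids(all_phonemes: list, merged_groups: list) -> dict:
    group_of = {ph: gi for gi, group in enumerate(merged_groups) for ph in group}
    ph_to_id, group_ids = {}, {}
    next_id = 1
    for ph in sorted(all_phonemes):
        gi = group_of.get(ph)
        if gi is not None and gi in group_ids:
            ph_to_id[ph] = group_ids[gi]   # reuse the group's shared ID
            continue
        ph_to_id[ph] = next_id
        if gi is not None:
            group_ids[gi] = next_id
        next_id += 1
    return ph_to_id

ids = assign_phoneme_ids(['en/iy', 'ja/i', 'zh/i', 'zh/a'], [['zh/i', 'ja/i', 'en/iy']])
assert ids['zh/i'] == ids['ja/i'] == ids['en/iy'] != ids['zh/a']
```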
+
 
 ### midi_smooth_width
 
 Length of sinusoidal smoothing convolution kernel (in seconds) on the step function representing MIDI sequence for base pitch calculation.
 
@@ -1170,6 +1263,17 @@ The number of attention heads of `torch.nn.MultiheadAttention` in FastSpeech2 encoder
 
 | default | 2 |
 
+### num_lang
+
+Number of languages. This value is used to allocate language embeddings in the linguistic encoder.
+
+| visibility | acoustic, variance |
+| scope | nn |
+| customizability | required |
+| type | int |
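In practice, "allocating language embeddings" plausibly amounts to reserving an embedding matrix with `num_lang` rows. A minimal sketch, with the class name and `hidden_size` invented for illustration:

```python
import torch.nn as nn

# What allocating language embeddings might look like inside the encoder.
class LinguisticEncoderStub(nn.Module):
    def __init__(self, num_lang: int, hidden_size: int = 256):
        super().__init__()
        # one row per language ID, which is why num_lang must be at least
        # the number of languages defined in `dictionaries`
        self.lang_embed = nn.Embedding(num_lang, hidden_size)
```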
+
 
 ### num_sanity_val_steps
 
 Number of sanity validation steps at the beginning.
 
@@ -1499,17 +1603,6 @@ Whether to enable voicing prediction.
 
 | default | true |
 
-### raw_data_dir
-
-Path(s) to the raw dataset including wave files, transcriptions, etc.
-
-| visibility | all |
-| scope | preprocessing |
-| customizability | required |
-| type | str, List[str] |
-
 
 ### rel_pos
 
 Whether to use relative positional encoding in FastSpeech2 module.
 
@@ -1674,29 +1767,6 @@ Whether to apply the _sorting by similar length_ algorithm described in [sampler
 
 | default | true |
 
-### speakers
-
-The names of speakers in a multi-speaker model. Speaker names are mapped to speaker indexes and stored into spk_map.json when preprocessing.
-
-| visibility | acoustic, variance |
-| scope | preprocessing |
-| customizability | required |
-| type | list |
-
-### spk_ids
-
-The IDs of speakers in a multi-speaker model. If an empty list is given, speaker IDs will be automatically generated as $0,1,2,...,N_{spk}-1$. IDs can be duplicate or discontinuous.
-
-| visibility | acoustic, variance |
-| scope | preprocessing |
-| customizability | required |
-| type | List[int] |
-| default | [] |
-
 
 ### spec_min
 
 Minimum mel spectrogram value used for normalization to [-1, 1]. Different mel bins can have different minimum values.
 
@@ -1801,22 +1871,6 @@ Length of sinusoidal smoothing convolution kernel (in seconds) on extracted tension
 
 | default | 0.12 |
 
-### test_prefixes
-
-List of data item names or name prefixes for the validation set. For each string `s` in the list:
-
-- If `s` equals to an actual item name, add that item to validation set.
-- If `s` does not equal to any item names, add all items whose names start with `s` to validation set.
-
-For multi-speaker combined datasets, "ds_id:name_prefix" can be used to apply the rules above within data from a specific sub-dataset, where ds_id represents the dataset index.
-
-| visibility | all |
-| scope | preprocessing |
-| customizability | required |
-| type | list |
-
 
 ### time_scale_factor
 
 The scale factor that will be multiplied on the time $t$ of Rectified Flow before embedding into the model.
 
@@ -1891,6 +1945,18 @@ Whether to embed key shifting values introduced by random pitch shifting augmentation
 
 | constraints | Must be true if random pitch shifting is enabled. |
 
+### use_lang_id
+
+Whether to embed the language ID from a multilingual dataset. This option only takes effect for those cross-lingual phonemes in the merged groups.
+
+| visibility | acoustic, variance |
+| scope | nn, preprocessing, inference |
+| customizability | recommended |
+| type | bool |
+| default | false |
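A sketch of the conditioning this flag implies: two cross-lingual merged phonemes share one phoneme embedding row but receive different language embeddings, so they remain distinguishable. All names and sizes here are illustrative, not the repository's actual modules.

```python
import torch
import torch.nn as nn

ph_embed = nn.Embedding(64, 256)   # phoneme IDs, shared across merged groups
lang_embed = nn.Embedding(3, 256)  # one row per language (num_lang = 3)

def encode(ph_ids: torch.Tensor, lang_ids: torch.Tensor, use_lang_id: bool) -> torch.Tensor:
    x = ph_embed(ph_ids)
    if use_lang_id:
        # same merged phoneme ID + different language IDs -> different vectors
        x = x + lang_embed(lang_ids)
    return x
```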
+
 
 ### use_melody_encoder
 
 Whether to enable melody encoder for the pitch predictor.
 
@@ -1941,7 +2007,7 @@ Whether to embed speed values introduced by random time stretching augmentation.
 
 ### use_spk_id
 
-Whether embed the speaker id from a multi-speaker dataset.
+Whether to embed the speaker ID from a multi-speaker dataset.
 
 | visibility | acoustic, variance |

diff --git a/docs/GettingStarted.md b/docs/GettingStarted.md
index a3422c3f0..92ddb395f 100644
--- a/docs/GettingStarted.md
+++ b/docs/GettingStarted.md
@@ -14,9 +14,9 @@ DiffSinger requires Python 3.8 or later. We strongly recommend you create a virtual environment.
 
 pip install -r requirements.txt
 ```
 
-### Materials and assets
+### Concepts and materials
 
-Some essential materials and assets are needed before continuing with this repository. See [materials for training and using models](BestPractices.md#materials-for-training-and-using-models) for detailed instructions.
+Before you proceed, it is necessary to understand some fundamental concepts in this repository and prepare some materials and assets. See [fundamental concepts and materials](BestPractices.md#fundamental-concepts-and-materials) for detailed information.
 
 ## Configuration