From d7427586b6fccbdf29f4741a6ac31fc7a86e571d Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 01:58:05 +0800 Subject: [PATCH 01/15] Drop support for discrete F0 embedding (reserved in ONNX exporter) --- configs/acoustic.yaml | 1 - deployment/modules/fastspeech2.py | 17 ++++++++++++++--- docs/ConfigurationSchemas.md | 16 ---------------- modules/fastspeech/acoustic_encoder.py | 18 +++--------------- utils/pitch_utils.py | 19 ------------------- 5 files changed, 17 insertions(+), 54 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index f32b878e3..974373cc7 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -56,7 +56,6 @@ energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 use_spk_id: false -f0_embed_type: continuous use_energy_embed: false use_breathiness_embed: false use_key_shift_embed: false diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index 52f6a4d28..b2ee086c4 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -1,17 +1,22 @@ import copy +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from modules.commons.common_layers import NormalInitEmbedding as Embedding from modules.fastspeech.acoustic_encoder import FastSpeech2Acoustic from modules.fastspeech.variance_encoder import FastSpeech2Variance from utils.hparams import hparams -from utils.pitch_utils import ( - f0_bin, f0_mel_min, f0_mel_max -) from utils.text_encoder import PAD_INDEX +f0_bin = 256 +f0_max = 1100.0 +f0_min = 50.0 +f0_mel_min = 1127 * np.log(1 + f0_min / 700) +f0_mel_max = 1127 * np.log(1 + f0_max / 700) + def f0_to_coarse(f0): f0_mel = 1127 * (1 + f0 / 700).log() @@ -38,6 +43,12 @@ def forward(self, dur): class FastSpeech2AcousticONNX(FastSpeech2Acoustic): def __init__(self, vocab_size): super().__init__(vocab_size=vocab_size) + + # for temporary compatibility; will be completely removed in the future + self.f0_embed_type = 
hparams.get('f0_embed_type', 'discrete') + if self.f0_embed_type == 'discrete': + self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) + self.lr = LengthRegulator() if hparams.get('use_key_shift_embed', False): self.shift_min, self.shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index f9fdf60cb..8f2abe364 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -648,22 +648,6 @@ Length of sinusoidal smoothing convolution kernel (in seconds) on extracted ener default0.12 -### f0_embed_type - -Map f0 to embedding using: - -- `torch.nn.Linear` if 'continuous' -- `torch.nn.Embedding` if 'discrete' - - - - - - - - -
visibilityacoustic
scopenn
customizabilitynormal
typestr
defaultcontinuous
constraintsChoose from 'continuous', 'discrete'.
- ### f0_max Maximum base frequency (F0) in Hz for pitch extraction. diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 61b04a4fa..5d1ab1699 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -8,7 +8,6 @@ ) from modules.fastspeech.tts_modules import FastSpeech2Encoder, mel2ph_to_dur from utils.hparams import hparams -from utils.pitch_utils import f0_to_coarse from utils.text_encoder import PAD_INDEX @@ -25,14 +24,7 @@ def __init__(self, vocab_size): use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) - self.f0_embed_type = hparams.get('f0_embed_type', 'discrete') - if self.f0_embed_type == 'discrete': - self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) - elif self.f0_embed_type == 'continuous': - self.pitch_embed = Linear(1, hparams['hidden_size']) - else: - raise ValueError('f0_embed_type must be \'discrete\' or \'continuous\'.') - + self.pitch_embed = Linear(1, hparams['hidden_size']) self.variance_embed_list = [] self.use_energy_embed = hparams.get('use_energy_embed', False) self.use_breathiness_embed = hparams.get('use_breathiness_embed', False) @@ -100,12 +92,8 @@ def forward( spk_embed = self.spk_embed(spk_embed_id)[:, None, :] condition += spk_embed - if self.f0_embed_type == 'discrete': - pitch = f0_to_coarse(f0) - pitch_embed = self.pitch_embed(pitch) - else: - f0_mel = (1 + f0 / 700).log() - pitch_embed = self.pitch_embed(f0_mel[:, :, None]) + f0_mel = (1 + f0 / 700).log() + pitch_embed = self.pitch_embed(f0_mel[:, :, None]) condition += pitch_embed condition = self.forward_variance_embedding( diff --git a/utils/pitch_utils.py b/utils/pitch_utils.py index fe706f94f..57ae943f7 100644 --- a/utils/pitch_utils.py +++ b/utils/pitch_utils.py @@ -1,23 +1,4 @@ import numpy as np -import torch - -f0_bin = 256 -f0_max = 1100.0 -f0_min = 50.0 -f0_mel_min = 1127 * np.log(1 + f0_min / 700) -f0_mel_max = 1127 * np.log(1 + f0_max / 700) - 
- -def f0_to_coarse(f0): - is_torch = isinstance(f0, torch.Tensor) - f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 - - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 - f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) - assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) - return f0_coarse def norm_f0(f0, uv=None): From 9505b5496d70a3403d06fe8841b8f32a76d551cd Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 02:15:21 +0800 Subject: [PATCH 02/15] Drop support for `interp_uv` configuration key --- augmentation/spec_stretch.py | 2 +- configs/acoustic.yaml | 1 - docs/ConfigurationSchemas.md | 12 ------------ preprocessing/acoustic_binarizer.py | 2 +- 4 files changed, 2 insertions(+), 15 deletions(-) diff --git a/augmentation/spec_stretch.py b/augmentation/spec_stretch.py index 5d90518ec..8944f56a6 100644 --- a/augmentation/spec_stretch.py +++ b/augmentation/spec_stretch.py @@ -48,7 +48,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) ).cpu().numpy() f0, _ = self.pe.get_pitch( - wav, aug_item['length'], hparams, speed=speed, interp_uv=hparams['interp_uv'] + wav, aug_item['length'], hparams, speed=speed, interp_uv=True ) aug_item['f0'] = f0.astype(np.float32) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 974373cc7..99beb58c4 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -51,7 +51,6 @@ spec_min: [-5] spec_max: [0] mel_vmin: -6. #-6. 
mel_vmax: 1.5 -interp_uv: true energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index 8f2abe364..fa7d56413 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -856,18 +856,6 @@ Hop size or step length (in number of waveform samples) of mel and feature extra default512 -### interp_uv - -Whether to apply linear interpolation to unvoiced parts in f0. - - - - - - - -
visibilityacoustic
scopepreprocessing
customizabilityreserved
typeboolean
defaulttrue
- ### lambda_aux_mel_loss Coefficient of aux mel loss when calculating total loss of acoustic model with shallow diffusion. diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index c6cf48b24..fdc87a22a 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -108,7 +108,7 @@ def process_item(self, item_name, meta_data, binarization_args): if pitch_extractor is None: pitch_extractor = initialize_pe() gt_f0, uv = pitch_extractor.get_pitch( - wav, length, hparams, interp_uv=hparams['interp_uv'] + wav, length, hparams, interp_uv=True ) if uv.all(): # All unvoiced print(f'Skipped \'{item_name}\': empty gt f0') From c56f3f35f2591a75ffe9772121c5b37bd92feecc Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 17:40:39 +0800 Subject: [PATCH 03/15] Drop support for `train_set_name` and `valid_set_name` configuration keys --- basics/base_task.py | 4 ++-- configs/base.yaml | 2 -- docs/ConfigurationSchemas.md | 24 ------------------------ 3 files changed, 2 insertions(+), 28 deletions(-) diff --git a/basics/base_task.py b/basics/base_task.py index 8a180e0b0..38e83386b 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -87,8 +87,8 @@ def _finish_init(self): # Training, validation and testing ########### def setup(self, stage): - self.train_dataset = self.dataset_cls(hparams['train_set_name']) - self.valid_dataset = self.dataset_cls(hparams['valid_set_name']) + self.train_dataset = self.dataset_cls('train') + self.valid_dataset = self.dataset_cls('valid') self.num_replicas = (self.trainer.distributed_sampler_kwargs or {}).get('num_replicas', 1) def get_need_freeze_state_dict_key(self, model_state_dict) -> list: diff --git a/configs/base.yaml b/configs/base.yaml index 57ec1c583..1171e85e2 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -67,8 +67,6 @@ max_batch_frames: 32000 max_batch_size: 100000 max_val_batch_frames: 60000 max_val_batch_size: 1 -train_set_name: 'train' 
-valid_set_name: 'valid' pe: 'parselmouth' pe_ckpt: '' f0_min: 65 diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index fa7d56413..e47cf7a38 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -1731,18 +1731,6 @@ Total number of diffusion steps. default1000 -### train_set_name - -Name of the training set used in binary filenames, TensorBoard keys, etc. - - - - - - - -
visibilityall
scopepreprocessing, training
customizabilityreserved
typestr
defaulttrain
- ### use_breathiness_embed Whether to accept and embed breathiness values into the model. @@ -1877,18 +1865,6 @@ Whether to load and use the vocoder to generate audio during validation. Validat defaulttrue -### valid_set_name - -Name of the validation set used in binary filenames, TensorBoard keys, etc. - - - - - - - -
visibilityall
scopepreprocessing, training
customizabilityreserved
typestr
defaultvalid
- ### variances_prediction_args Arguments for prediction of variance parameters other than pitch, like energy, breathiness, etc. From 9f79b75d46edb7132335d78efc37517a71cb5c4f Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 17:47:36 +0800 Subject: [PATCH 04/15] Drop support for linear domain of random time stretching augmentation --- configs/acoustic.yaml | 1 - configs/templates/config_acoustic.yaml | 1 - docs/ConfigurationSchemas.md | 16 ---------------- preprocessing/acoustic_binarizer.py | 11 ++--------- 4 files changed, 2 insertions(+), 27 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 99beb58c4..fd85e990c 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -39,7 +39,6 @@ augmentation_args: random_time_stretching: enabled: false range: [0.5, 2.] - domain: log # or linear scale: 0.75 raw_data_dir: 'data/opencpop/raw' diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index ef32ef5ac..84f6a7079 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -37,7 +37,6 @@ augmentation_args: random_time_stretching: enabled: true range: [0.5, 2.] - domain: log # or linear scale: 0.75 residual_channels: 512 diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index e47cf7a38..723fe9129 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -165,22 +165,6 @@ Arguments for random time stretching augmentation. typedict -### augmentation_args.random_time_stretching.domain - -The domain where random time stretching factors are uniformly distributed in. - -- If 'linear', stretching ratio $x$ will be uniformly distributed in $[V_{min}, V_{max}]$. -- If 'log', $\ln{x}$ will be uniformly distributed in $[\ln{V_{min}}, \ln{V_{max}}]$. - - - - - - - - -
visibilityacoustic
scopepreprocessing
customizabilitynot recommended
typestr
defaultlog
constraintChoose from 'log', 'linear'.
- ### augmentation_args.random_time_stretching.enabled Whether to apply random time stretching augmentation. diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index fdc87a22a..7ec1ae6f0 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -225,12 +225,10 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_time_stretching'] speed_min, speed_max = aug_args['range'] - domain = aug_args['domain'] assert hparams.get('use_speed_embed', False), \ 'Random time stretching augmentation requires use_speed_embed == True.' assert 0 < speed_min < 1 < speed_max, \ 'Random time stretching augmentation must have a range where 0 < min < 1 < max.' - assert domain in ['log', 'linear'], 'domain must be \'log\' or \'linear\'.' aug_ins = SpectrogramStretchAugmentation(self.raw_data_dirs, aug_args, pe=aug_pe) scale = aug_args['scale'] @@ -241,13 +239,8 @@ def arrange_data_augmentation(self, data_iterator): aug_items = random.choices(all_item_names, k=k_from_raw) + random.choices(aug_list, k=k_from_aug + k_mutate) for aug_type, aug_item in zip(aug_types, aug_items): - if domain == 'log': - # Uniform distribution in log domain - speed = speed_min * (speed_max / speed_min) ** random.random() - else: - # Uniform distribution in linear domain - rand = random.uniform(-1, 1) - speed = 1 + (speed_max - 1) * rand if rand >= 0 else 1 + (1 - speed_min) * rand + # Uniform distribution in log domain + speed = speed_min * (speed_max / speed_min) ** random.random() if aug_type == 0: aug_task = { 'name': aug_item, From 996786721973fa956adc452aefadb408df66ccc8 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 18:04:33 +0800 Subject: [PATCH 05/15] Drop support for `num_pad_tokens` configuration key --- configs/acoustic.yaml | 1 - configs/variance.yaml | 1 - docs/BestPractices.md | 6 +----- 
docs/ConfigurationSchemas.md | 14 -------------- utils/text_encoder.py | 23 ++++------------------- 5 files changed, 5 insertions(+), 40 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index fd85e990c..f3f8b40d5 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -45,7 +45,6 @@ raw_data_dir: 'data/opencpop/raw' binary_data_dir: 'data/opencpop/binary' binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer dictionary: dictionaries/opencpop-extension.txt -num_pad_tokens: 1 spec_min: [-5] spec_max: [0] mel_vmin: -6. #-6. diff --git a/configs/variance.yaml b/configs/variance.yaml index 1951f685e..4a780bdd8 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -29,7 +29,6 @@ raw_data_dir: 'data/opencpop_variance/raw' binary_data_dir: 'data/opencpop_variance/binary' binarizer_cls: preprocessing.variance_binarizer.VarianceBinarizer dictionary: dictionaries/opencpop-extension.txt -num_pad_tokens: 1 use_spk_id: false diff --git a/docs/BestPractices.md b/docs/BestPractices.md index b142a3873..23829de79 100644 --- a/docs/BestPractices.md +++ b/docs/BestPractices.md @@ -132,11 +132,7 @@ Once the coverage checks passed, a phoneme distribution summary will be saved in ![phoneme-distribution](resources/phoneme-distribution.jpg) -During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. By default, there are one padding index before all real phonemes IDs. You may edit the number of padding indices, but it is not recommended to do so: - -```yaml -num_pad_tokens: 1 -``` +During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. There are one padding index (marked as `default2 -### num_pad_tokens - -Number of padding phoneme indexes before all real tokens. - -Due to some historical reasons, old checkpoints may have 3 padding tokens called \, \ and \. 
After refactoring, all padding tokens are called \, and only the first one (token == 0) will be used. - - - - - - - -
visibilityacoustic, variance
scopenn, preprocess
customizabilitynot recommended
typeint
default1
- ### num_sanity_val_steps Number of sanity validation steps at the beginning. diff --git a/utils/text_encoder.py b/utils/text_encoder.py index 605b7e80e..4b7815c46 100644 --- a/utils/text_encoder.py +++ b/utils/text_encoder.py @@ -1,19 +1,9 @@ import numpy as np -from utils.hparams import hparams - PAD = '' PAD_INDEX = 0 -def strip_ids(ids, ids_to_strip): - """Strip ids_to_strip from the end ids.""" - ids = list(ids) - while ids and ids[-1] in ids_to_strip: - ids.pop() - return ids - - class TokenTextEncoder: """Encoder based on a user-supplied vocabulary (file or list).""" @@ -26,30 +16,25 @@ def __init__(self, vocab_list): Args: vocab_list: If not None, a list of elements of the vocabulary. """ - self.num_reserved_ids = hparams.get('num_pad_tokens', 3) - assert self.num_reserved_ids > 0, 'num_pad_tokens must be positive' self.vocab_list = sorted(vocab_list) def encode(self, sentence): """Converts a space-separated string of phones to a list of ids.""" phones = sentence.strip().split() if isinstance(sentence, str) else sentence - return [self.vocab_list.index(ph) + self.num_reserved_ids if ph != PAD else PAD_INDEX for ph in phones] + return [self.vocab_list.index(ph) + 1 if ph != PAD else PAD_INDEX for ph in phones] def decode(self, ids, strip_padding=False): if strip_padding: ids = np.trim_zeros(ids) ids = list(ids) return ' '.join([ - self.vocab_list[_id - self.num_reserved_ids] if _id >= self.num_reserved_ids else PAD + self.vocab_list[_id - 1] if _id >= 1 else PAD for _id in ids ]) - def pad(self): - pass - @property def vocab_size(self): - return len(self.vocab_list) + self.num_reserved_ids + return len(self.vocab_list) + 1 def __len__(self): return self.vocab_size @@ -64,5 +49,5 @@ def store_to_file(self, filename): filename: Full path of the file to store the vocab to. 
""" with open(filename, 'w', encoding='utf8') as f: - [print(PAD, file=f) for _ in range(self.num_reserved_ids)] + print(PAD, file=f) [print(tok, file=f) for tok in self.vocab_list] From a09221e072e6c559333083771c4dd034f550e3e8 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 18:50:53 +0800 Subject: [PATCH 06/15] Drop support for code backup before training --- basics/base_task.py | 10 ---------- configs/base.yaml | 5 ----- docs/ConfigurationSchemas.md | 12 ------------ 3 files changed, 27 deletions(-) diff --git a/basics/base_task.py b/basics/base_task.py index 38e83386b..e176f2c9a 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -3,7 +3,6 @@ import pathlib import shutil import sys -from datetime import datetime from typing import Dict import matplotlib @@ -16,7 +15,6 @@ import torch.utils.data from torchmetrics import Metric, MeanMetric import lightning.pytorch as pl -from lightning.pytorch.callbacks import LearningRateMonitor from lightning.pytorch.utilities.rank_zero import rank_zero_debug, rank_zero_info, rank_zero_only from basics.base_module import CategorizedModule @@ -451,14 +449,6 @@ def start(cls): if not hparams['infer']: # train @rank_zero_only def train_payload_copy(): - # copy_code = input(f'{hparams["save_codes"]} code backup? 
y/n: ') == 'y' - copy_code = True # backup code every time - if copy_code: - code_dir = work_dir / 'codes' / datetime.now().strftime('%Y%m%d%H%M%S') - code_dir.mkdir(exist_ok=True, parents=True) - for c in hparams['save_codes']: - shutil.copytree(c, code_dir / c, dirs_exist_ok=True) - print(f'| Copied codes to {code_dir}.') # Copy spk_map.json and dictionary.txt to work dir binary_dir = pathlib.Path(hparams['binary_data_dir']) spk_map = work_dir / 'spk_map.json' diff --git a/configs/base.yaml b/configs/base.yaml index 1171e85e2..34708c275 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,11 +1,6 @@ # task task_cls: null seed: 1234 -save_codes: - - configs - - modules - - training - - utils ############# # dataset diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index 5ee45f1a4..bd7ac7697 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -1487,18 +1487,6 @@ Training performance on some datasets may be very sensitive to this value. Chang default6 -### save_codes - -Files in these folders will be backed up every time a training starts. - - - - - - - -
visibilityall
scopetraining
customizabilitynormal
typelist
default[configs, modules, training, utils]
- ### schedule_type The diffusion schedule type. From bae5779e5d5a18415ae659a602bccc16711f705d Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 20:18:53 +0800 Subject: [PATCH 07/15] Drop support for `ffn_padding` configuration key --- configs/base.yaml | 1 - docs/ConfigurationSchemas.md | 14 +--- modules/commons/common_layers.py | 89 ++++++-------------------- modules/fastspeech/acoustic_encoder.py | 3 +- modules/fastspeech/tts_modules.py | 62 ++++++------------ modules/fastspeech/variance_encoder.py | 7 +- 6 files changed, 44 insertions(+), 132 deletions(-) diff --git a/configs/base.yaml b/configs/base.yaml index 34708c275..0fb73eaaa 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -31,7 +31,6 @@ enc_layers: 4 num_heads: 2 enc_ffn_kernel_size: 9 ffn_act: gelu -ffn_padding: 'SAME' use_spk_id: false ########### diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index bd7ac7697..6e3799f04 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -673,18 +673,6 @@ Activation function of TransformerFFNLayer in FastSpeech2 encoder: constraintsChoose from 'relu', 'gelu', 'swish'. -### ffn_padding - -Padding mode of TransformerFFNLayer convolution in FastSpeech2 encoder. - - - - - - - -
visibilityacoustic, variance
scopenn
customizabilitynot recommended
typestr
defaultSAME
- ### fft_size Fast Fourier Transforms parameter for mel extraction. @@ -1042,7 +1030,7 @@ Minimum mel spectrogram heatmap value for TensorBoard plotting. ### melody_encoder_args -Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, `enc_ffn_kernel_size`, `ffn_padding`, `ffn_act`, `dropout`, `num_heads`, `use_pos_embed`, `rel_pos`. If either of the parameter does not exist in this configuration key, it inherits from the linguistic encoder. +Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, `enc_ffn_kernel_size`, `ffn_act`, `dropout`, `num_heads`, `use_pos_embed`, `rel_pos`. If either of the parameter does not exist in this configuration key, it inherits from the linguistic encoder. diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 7e5fd08d4..9ea2c2638 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -104,35 +104,13 @@ def max_positions(): return int(1e5) # an arbitrary large number -class ConvTBC(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, padding=0): - super(ConvTBC, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.padding = padding - - self.weight = torch.nn.Parameter(torch.Tensor( - self.kernel_size, in_channels, out_channels)) - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) - - def forward(self, x): - return torch.conv_tbc(x.contiguous(), self.weight, self.bias, self.padding) - - class TransformerFFNLayer(nn.Module): - def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'): + def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0., act='gelu'): super().__init__() self.kernel_size = kernel_size self.dropout = dropout self.act = act - if padding == 'SAME': - self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) - elif padding 
== 'LEFT': - self.ffn_1 = nn.Sequential( - nn.ConstantPad1d((kernel_size - 1, 0), 0.0), - nn.Conv1d(hidden_size, filter_size, kernel_size) - ) + self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) if self.act == 'relu': self.act_fn = ReLU() elif self.act == 'gelu': @@ -152,44 +130,18 @@ def forward(self, x): return x -class BatchNorm1dTBC(nn.Module): - def __init__(self, c): - super(BatchNorm1dTBC, self).__init__() - self.bn = nn.BatchNorm1d(c) - - def forward(self, x): - """ - - :param x: [T, B, C] - :return: [T, B, C] - """ - x = x.permute(1, 2, 0) # [B, C, T] - x = self.bn(x) # [B, C, T] - x = x.permute(2, 0, 1) # [T, B, C] - return x - - class EncSALayer(nn.Module): def __init__(self, c, num_heads, dropout, attention_dropout=0.1, - relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'): + relu_dropout=0.1, kernel_size=9, act='gelu'): super().__init__() - self.c = c self.dropout = dropout - self.num_heads = num_heads - if num_heads > 0: - if norm == 'ln': - self.layer_norm1 = LayerNorm(c) - elif norm == 'bn': - self.layer_norm1 = BatchNorm1dTBC(c) - self.self_attn = MultiheadAttention( - self.c, num_heads, dropout=attention_dropout, bias=False, - ) - if norm == 'ln': - self.layer_norm2 = LayerNorm(c) - elif norm == 'bn': - self.layer_norm2 = BatchNorm1dTBC(c) + self.layer_norm1 = LayerNorm(c) + self.self_attn = MultiheadAttention( + c, num_heads, dropout=attention_dropout, bias=False, + ) + self.layer_norm2 = LayerNorm(c) self.ffn = TransformerFFNLayer( - c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act + c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act ) def forward(self, x, encoder_padding_mask=None, **kwargs): @@ -197,18 +149,17 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): if layer_norm_training is not None: self.layer_norm1.training = layer_norm_training self.layer_norm2.training = layer_norm_training - if self.num_heads > 0: - residual = x 
- x = self.layer_norm1(x) - x, _, = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=encoder_padding_mask - ) - x = F.dropout(x, self.dropout, training=self.training) - x = residual + x - x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + residual = x + x = self.layer_norm1(x) + x, _, = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask + ) + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] residual = x x = self.layer_norm2(x) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 5d1ab1699..494354cb2 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -18,8 +18,7 @@ def __init__(self, vocab_size): self.dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], - ffn_kernel_size=hparams['enc_ffn_kernel_size'], - ffn_padding=hparams['ffn_padding'], ffn_act=hparams['ffn_act'], + ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index bc5eff265..5b31909b2 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -4,7 +4,7 @@ import torch.nn as nn from torch.nn import functional as F -from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer, BatchNorm1dTBC +from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer from modules.commons.espnet_positional_embedding import RelPositionalEncoding DEFAULT_MAX_SOURCE_POSITIONS = 2000 @@ -12,14 +12,13 @@ class TransformerEncoderLayer(nn.Module): - 
def __init__(self, hidden_size, dropout, kernel_size=None, padding='SAME', act='gelu', num_heads=2, norm='ln'): + def __init__(self, hidden_size, dropout, kernel_size=None, act='gelu', num_heads=2): super().__init__() self.op = EncSALayer( hidden_size, num_heads, dropout=dropout, attention_dropout=0.0, relu_dropout=dropout, kernel_size=kernel_size, - padding=padding, - norm=norm, act=act + act=act ) def forward(self, x, **kwargs): @@ -63,7 +62,7 @@ class DurationPredictor(torch.nn.Module): """ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, - dropout_rate=0.1, offset=1.0, padding='SAME', dur_loss_type='mse'): + dropout_rate=0.1, offset=1.0, dur_loss_type='mse'): """Initialize duration predictor module. Args: in_dims (int): Input dimension. @@ -77,18 +76,14 @@ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, self.offset = offset self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.loss_type = dur_loss_type if self.loss_type in ['mse', 'huber']: @@ -141,7 +136,7 @@ def forward(self, xs, x_masks=None, infer=True): class VariancePredictor(torch.nn.Module): def __init__(self, vmin, vmax, in_dims, n_layers=5, n_chans=512, kernel_size=5, - dropout_rate=0.1, padding='SAME'): + dropout_rate=0.1): """Initialize variance predictor module. Args: in_dims (int): Input dimension. 
@@ -156,18 +151,14 @@ def __init__(self, vmin, vmax, in_dims, self.vmax = vmax self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.linear = torch.nn.Linear(n_chans, 1) self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096) self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) @@ -195,7 +186,7 @@ def forward(self, xs, infer=True): class PitchPredictor(torch.nn.Module): def __init__(self, vmin, vmax, num_bins, deviation, in_dims, n_layers=5, n_chans=384, kernel_size=5, - dropout_rate=0.1, padding='SAME'): + dropout_rate=0.1): """Initialize pitch predictor module. Args: in_dims (int): Input dimension. 
@@ -214,18 +205,14 @@ def __init__(self, vmin, vmax, num_bins, deviation, self.base_pitch_embed = torch.nn.Linear(1, in_dims) self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.linear = torch.nn.Linear(n_chans, num_bins) self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096) self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) @@ -364,31 +351,23 @@ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None): class FastSpeech2Encoder(nn.Module): def __init__(self, embed_tokens, hidden_size, num_layers, - ffn_kernel_size=9, ffn_padding='SAME', ffn_act='gelu', - dropout=None, num_heads=2, use_last_norm=True, norm='ln', - use_pos_embed=True, rel_pos=True): + ffn_kernel_size=9, ffn_act='gelu', + dropout=None, num_heads=2, use_pos_embed=True, rel_pos=True): super().__init__() self.num_layers = num_layers embed_dim = self.hidden_size = hidden_size self.dropout = dropout self.use_pos_embed = use_pos_embed - self.use_last_norm = use_last_norm self.layers = nn.ModuleList([ TransformerEncoderLayer( self.hidden_size, self.dropout, - kernel_size=ffn_kernel_size, padding=ffn_padding, act=ffn_act, + kernel_size=ffn_kernel_size, act=ffn_act, num_heads=num_heads ) for _ in range(self.num_layers) ]) - if self.use_last_norm: - if norm == 'ln': - self.layer_norm = nn.LayerNorm(embed_dim) - elif norm == 'bn': - self.layer_norm = BatchNorm1dTBC(embed_dim) - else: - self.layer_norm = None + self.layer_norm = 
nn.LayerNorm(embed_dim) self.embed_tokens = embed_tokens # redundant, but have to persist for compatibility with old checkpoints self.embed_scale = math.sqrt(hidden_size) @@ -438,8 +417,7 @@ def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_ for layer in self.layers: x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB hiddens.append(x) - if self.use_last_norm: - x = self.layer_norm(x) * nonpadding_mask_TB + x = self.layer_norm(x) * nonpadding_mask_TB if return_hiddens: x = torch.stack(hiddens, 0) # [L, T, B, C] x = x.transpose(1, 2) # [L, B, T, C] diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 8e2117f6b..82702b3b6 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -27,8 +27,7 @@ def __init__(self, vocab_size): self.encoder = FastSpeech2Encoder( self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], - ffn_kernel_size=hparams['enc_ffn_kernel_size'], - ffn_padding=hparams['ffn_padding'], ffn_act=hparams['ffn_act'], + ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) @@ -41,7 +40,6 @@ def __init__(self, vocab_size): n_chans=dur_hparams['hidden_size'], n_layers=dur_hparams['num_layers'], dropout_rate=dur_hparams['dropout'], - padding=hparams['ffn_padding'], kernel_size=dur_hparams['kernel_size'], offset=dur_hparams['log_offset'], dur_loss_type=dur_hparams['loss_type'] @@ -109,8 +107,7 @@ def get_hparam(key): self.encoder = FastSpeech2Encoder( None, hidden_size, num_layers=get_hparam('enc_layers'), - ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), - ffn_padding=get_hparam('ffn_padding'), ffn_act=get_hparam('ffn_act'), + ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), ffn_act=get_hparam('ffn_act'), 
dropout=get_hparam('dropout'), num_heads=get_hparam('num_heads'), use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos') ) From 204593946ed749368bc60535d01f2bcccaf43fab Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 20:37:08 +0800 Subject: [PATCH 08/15] Drop support for random seeding --- basics/base_binarizer.py | 1 - basics/base_task.py | 4 +--- configs/base.yaml | 1 - docs/ConfigurationSchemas.md | 12 ------------ 4 files changed, 1 insertion(+), 17 deletions(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index 38b191077..ddad6e02e 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -173,7 +173,6 @@ def process(self): self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names) if self.binarization_args['shuffle']: - random.seed(hparams['seed']) random.shuffle(self.item_names) self.binary_data_dir.mkdir(parents=True, exist_ok=True) diff --git a/basics/base_task.py b/basics/base_task.py index e176f2c9a..c933e2a99 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -342,8 +342,7 @@ def train_dataloader(self): size_reversed=True, required_batch_count_multiple=hparams['accumulate_grad_batches'], shuffle_sample=True, - shuffle_batch=True, - seed=hparams['seed'] + shuffle_batch=True ) return torch.utils.data.DataLoader( self.train_dataset, @@ -394,7 +393,6 @@ def on_test_end(self): @classmethod def start(cls): - pl.seed_everything(hparams['seed'], workers=True) task = cls() # if pre_train is not None: diff --git a/configs/base.yaml b/configs/base.yaml index 0fb73eaaa..4eed200e9 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,6 +1,5 @@ # task task_cls: null -seed: 1234 ############# # dataset diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index 6e3799f04..46f0f23f5 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -1488,18 +1488,6 @@ The diffusion schedule type.
typedict
constraintsChoose from 'linear', 'cosine'.
-### seed - -The global random seed used to shuffle data, initializing model weights, etc. - - - - - - - -
visibilityall
scopepreprocessing, training
customizabilitynormal
typeint
default1234
- ### shallow_diffusion_args Arguments for shallow_diffusion. From 1d2a9e80527347564ad7e12126d698ac3f9ff4c9 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 21:19:56 +0800 Subject: [PATCH 09/15] Add placeholder to load old checkpoint --- modules/fastspeech/tts_modules.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index 5b31909b2..590c4da92 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -79,6 +79,7 @@ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans self.conv.append(torch.nn.Sequential( + torch.nn.Identity(), # this is a placeholder for ConstantPad1d which is now merged into Conv1d torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), From 26e8f07f6b471217b6f2c6eff61c8ac4754b2832 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 21:21:00 +0800 Subject: [PATCH 10/15] Remove duplicate txt_embed layer (resuming may raise errors) --- modules/fastspeech/acoustic_encoder.py | 2 +- modules/fastspeech/tts_modules.py | 3 +-- modules/fastspeech/variance_encoder.py | 4 ++-- utils/__init__.py | 6 +++++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 494354cb2..2099c9008 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -17,7 +17,7 @@ def __init__(self, vocab_size): self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) self.dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( - self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], + hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], 
ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index 590c4da92..1dd164d17 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -351,7 +351,7 @@ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None): class FastSpeech2Encoder(nn.Module): - def __init__(self, embed_tokens, hidden_size, num_layers, + def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, ffn_act='gelu', dropout=None, num_heads=2, use_pos_embed=True, rel_pos=True): super().__init__() @@ -370,7 +370,6 @@ def __init__(self, embed_tokens, hidden_size, num_layers, ]) self.layer_norm = nn.LayerNorm(embed_dim) - self.embed_tokens = embed_tokens # redundant, but have to persist for compatibility with old checkpoints self.embed_scale = math.sqrt(hidden_size) self.padding_idx = 0 self.rel_pos = rel_pos diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 82702b3b6..82e0a88e8 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -26,7 +26,7 @@ def __init__(self, vocab_size): self.ph_dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( - self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], + hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] @@ -106,7 +106,7 @@ def get_hparam(key): self.note_glide_embed = Embedding(len(hparams['glide_types']) + 1, hidden_size, padding_idx=0) self.encoder = FastSpeech2Encoder( - None, hidden_size, num_layers=get_hparam('enc_layers'), + 
hidden_size=hidden_size, num_layers=get_hparam('enc_layers'), ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), ffn_act=get_hparam('ffn_act'), dropout=get_hparam('dropout'), num_heads=get_hparam('num_heads'), use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos') diff --git a/utils/__init__.py b/utils/__init__.py index 356fef7a5..abb5df151 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -165,9 +165,12 @@ def filter_kwargs(dict_to_filter, kwarg_obj): def load_ckpt( cur_model, ckpt_base_dir, ckpt_steps=None, - prefix_in_ckpt='model', key_in_ckpt='state_dict', + prefix_in_ckpt='model', ignored_prefixes=None, key_in_ckpt='state_dict', strict=True, device='cpu' ): + if ignored_prefixes is None: + # NOTICE: this is for compatibility with old checkpoints which have duplicate txt_embed layer in them. + ignored_prefixes = ['model.fs2.encoder.embed_tokens'] if not isinstance(ckpt_base_dir, pathlib.Path): ckpt_base_dir = pathlib.Path(ckpt_base_dir) if ckpt_base_dir.is_file(): @@ -197,6 +200,7 @@ def load_ckpt( state_dict = OrderedDict({ k[len(prefix_in_ckpt) + 1:]: v for k, v in state_dict.items() if k.startswith(f'{prefix_in_ckpt}.') + if all(not k.startswith(p) for p in ignored_prefixes) }) if not strict: cur_model_state_dict = cur_model.state_dict() From 49c23602ce55a7c943211951633f8d9e9a8e7935 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 8 Feb 2024 15:03:44 +0800 Subject: [PATCH 11/15] Remove migration script and error message for transcriptions.txt --- preprocessing/acoustic_binarizer.py | 35 ++++++------- scripts/migrate.py | 78 ----------------------------- 2 files changed, 14 insertions(+), 99 deletions(-) delete mode 100644 scripts/migrate.py diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 7ec1ae6f0..bba3b7b6b 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -55,27 +55,20 @@ def __init__(self): def load_meta_data(self, raw_data_dir: 
pathlib.Path, ds_id, spk_id): meta_data_dict = {} - if (raw_data_dir / 'transcriptions.csv').exists(): - with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: - for utterance_label in csv.DictReader(f): - item_name = utterance_label['name'] - temp_dict = { - 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': utterance_label['ph_seq'].split(), - 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], - 'spk_id': spk_id, - 'spk_name': self.speakers[ds_id], - } - assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ - f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' - meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict - else: - raise FileNotFoundError( - f'transcriptions.csv not found in {raw_data_dir}. ' - 'If this is a dataset with the old transcription format, please consider ' - 'migrating it to the new format via the following command:\n' - 'python scripts/migrate.py txt ' - ) + with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: + for utterance_label in csv.DictReader(f): + item_name = utterance_label['name'] + temp_dict = { + 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), + 'ph_seq': utterance_label['ph_seq'].split(), + 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], + 'spk_id': spk_id, + 'spk_name': self.speakers[ds_id], + } + assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ + f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
+ meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict + self.items.update(meta_data_dict) @torch.no_grad() diff --git a/scripts/migrate.py b/scripts/migrate.py deleted file mode 100644 index f1125f3d5..000000000 --- a/scripts/migrate.py +++ /dev/null @@ -1,78 +0,0 @@ -import pathlib -from collections import OrderedDict - -import click - - -@click.group() -def main(): - pass - - -@main.command(help='Migrate checkpoint files of MIDI-less acoustic models from old format') -@click.argument('input_ckpt', metavar='INPUT') -@click.argument('output_ckpt', metavar='OUTPUT') -@click.option('--overwrite', is_flag=True, show_default=True, help='Overwrite the existing file') -def ckpt( - input_ckpt: str, - output_ckpt: str, - overwrite: bool = False -): - input_ckpt = pathlib.Path(input_ckpt).resolve() - output_ckpt = pathlib.Path(output_ckpt).resolve() - assert input_ckpt.exists(), 'The input file does not exist.' - assert overwrite or not output_ckpt.exists(), \ - 'The output file already exists or is the same as the input file.\n' \ - 'This is not recommended because migration scripts may not be stable, ' \ - 'and you may be at risk of losing your model.\n' \ - 'If you are sure to OVERWRITE the existing file, please re-run this script with the \'--overwrite\' argument.' 
- - import torch - ckpt_loaded = torch.load(input_ckpt, map_location='cpu') - if 'category' in ckpt_loaded: - print('This checkpoint file is already in the new format.') - exit(0) - state_dict: OrderedDict = ckpt_loaded['state_dict'] - ckpt_loaded['optimizer_states'][0]['state'].clear() - new_state_dict = OrderedDict() - for key in state_dict: - if key.startswith('model.fs2'): - # keep model.fs2.xxx - new_state_dict[key] = state_dict[key] - else: - # model.xxx => model.diffusion.xxx - path = key.split('.', maxsplit=1)[1] - new_state_dict[f'model.diffusion.{path}'] = state_dict[key] - ckpt_loaded['category'] = 'acoustic' - ckpt_loaded['state_dict'] = new_state_dict - torch.save(ckpt_loaded, output_ckpt) - - -@main.command(help='Migrate transcriptions.txt in old datasets to transcriptions.csv') -@click.argument('input_txt', metavar='INPUT') -def txt( - input_txt: str -): - input_txt = pathlib.Path(input_txt).resolve() - assert input_txt.exists(), 'The input file does not exist.' - with open(input_txt, 'r', encoding='utf8') as f: - utterances = f.readlines() - utterances = [u.split('|') for u in utterances] - utterances = [ - { - 'name': u[0], - 'ph_seq': u[2], - 'ph_dur': u[5] - } - for u in utterances - ] - - import csv - with open(input_txt.with_suffix('.csv'), 'w', encoding='utf8', newline='') as f: - writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur']) - writer.writeheader() - writer.writerows(utterances) - - -if __name__ == '__main__': - main() From 1bc60dd2f01d5c51afe50d08a6a50b3a459acca8 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 8 Feb 2024 15:38:54 +0800 Subject: [PATCH 12/15] Remove seed from batch shuffling --- utils/training_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/training_utils.py b/utils/training_utils.py index 98fb6da47..d406985cd 100644 --- a/utils/training_utils.py +++ b/utils/training_utils.py @@ -103,7 +103,7 @@ def __init__(self, dataset, max_batch_frames, max_batch_size, 
sub_indices=None, def __form_batches(self): if self.formed == self.epoch + self.seed: return - rng = np.random.default_rng(self.seed + self.epoch) + rng = np.random.default_rng() # Create indices if self.shuffle_sample: if self.sub_indices is not None: From 969e313529fba3c88380fb0f06fdc76751954fd1 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 24 Feb 2024 01:32:14 +0800 Subject: [PATCH 13/15] Use direct access on some hparam keys --- augmentation/spec_stretch.py | 4 ++-- basics/base_task.py | 6 +++--- deployment/exporters/acoustic_exporter.py | 6 +++--- deployment/modules/fastspeech2.py | 10 +++++----- inference/ds_acoustic.py | 4 ++-- modules/diffusion/ddpm.py | 6 +++--- modules/pe/__init__.py | 2 +- preprocessing/acoustic_binarizer.py | 8 ++++---- scripts/binarize.py | 2 +- scripts/infer.py | 2 +- utils/phoneme_utils.py | 2 +- utils/training_utils.py | 2 +- 12 files changed, 27 insertions(+), 27 deletions(-) diff --git a/augmentation/spec_stretch.py b/augmentation/spec_stretch.py index 9c22e363b..7eeeda2dc 100644 --- a/augmentation/spec_stretch.py +++ b/augmentation/spec_stretch.py @@ -38,7 +38,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) aug_item['mel'] = mel - if speed != 1. or hparams.get('use_speed_embed', False): + if speed != 1. or hparams['use_speed_embed']: aug_item['length'] = mel.shape[0] aug_item['speed'] = int(np.round(hparams['hop_size'] * speed)) / hparams['hop_size'] # real speed aug_item['seconds'] /= aug_item['speed'] @@ -83,7 +83,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) align_length=aug_item['length'] ) - if key_shift != 0. or hparams.get('use_key_shift_embed', False): + if key_shift != 0. 
or hparams['use_key_shift_embed']: if replace_spk_id is None: aug_item['key_shift'] = key_shift else: diff --git a/basics/base_task.py b/basics/base_task.py index c933e2a99..eabfcae6e 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -128,9 +128,9 @@ def load_finetune_ckpt( self.load_state_dict(state_dict, strict=False) def load_pre_train_model(self): - pre_train_ckpt_path = hparams.get('finetune_ckpt_path') - blacklist = hparams.get('finetune_ignored_params') - # whitelist=hparams.get('pre_train_whitelist') + pre_train_ckpt_path = hparams['finetune_ckpt_path'] + blacklist = hparams['finetune_ignored_params'] + # whitelist=hparams['pre_train_whitelist'] if blacklist is None: blacklist = [] # if whitelist is None: diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index ab4252469..d637f34f1 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -58,7 +58,7 @@ def __init__( if hparams['use_spk_id'] else None self.export_spk: List[Tuple[str, Dict[str, float]]] = export_spk \ if hparams['use_spk_id'] and export_spk is not None else [] - if hparams.get('use_key_shift_embed', False) and not self.expose_gender: + if hparams['use_key_shift_embed'] and not self.expose_gender: shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] key_shift = freeze_gender * shift_max if freeze_gender >= 0. 
else freeze_gender * abs(shift_min) key_shift = max(min(key_shift, shift_max), shift_min) # clip key shift @@ -143,14 +143,14 @@ def _torch_export_model(self): for v_name in self.model.fs2.variance_embed_list } } - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: if self.expose_gender: kwargs['gender'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device) input_names.append('gender') dynamix_axes['gender'] = { 1: 'n_frames' } - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if self.expose_velocity: kwargs['velocity'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device) input_names.append('velocity') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index b2ee086c4..157e031d8 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -45,14 +45,14 @@ def __init__(self, vocab_size): super().__init__(vocab_size=vocab_size) # for temporary compatibility; will be completely removed in the future - self.f0_embed_type = hparams.get('f0_embed_type', 'discrete') + self.f0_embed_type = hparams['f0_embed_type'] if self.f0_embed_type == 'discrete': self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) self.lr = LengthRegulator() - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: self.shift_min, self.shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: self.speed_min, self.speed_max = hparams['augmentation_args']['random_time_stretching']['range'] # noinspection PyMethodOverriding @@ -82,7 +82,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= ], dim=-1).sum(-1) condition += variance_embeds - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: if hasattr(self, 'frozen_key_shift'): key_shift_embed = 
self.key_shift_embed(self.frozen_key_shift[:, None, None]) else: @@ -92,7 +92,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= key_shift_embed = self.key_shift_embed(key_shift[:, :, None]) condition += key_shift_embed - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if velocity is not None: velocity = torch.clip(velocity, min=self.speed_min, max=self.speed_max) speed_embed = self.speed_embed(velocity[:, :, None]) diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 02f6b3a92..a67f5b166 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -111,7 +111,7 @@ def preprocess_input(self, param, idx=0): )).to(self.device)[None] summary[v_name] = 'manual' - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] gender = param.get('gender') if gender is None: @@ -135,7 +135,7 @@ def preprocess_input(self, param, idx=0): min=shift_min, max=shift_max ) - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if param.get('velocity') is None: summary['velocity'] = 'default' batch['speed'] = torch.FloatTensor([1.]).to(self.device)[:, None] # => [B=1, T=1] diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py index cd8295a7c..89aea5eb4 100644 --- a/modules/diffusion/ddpm.py +++ b/modules/diffusion/ddpm.py @@ -36,7 +36,7 @@ def noise_like(shape, device, repeat=False): return repeat_noise() if repeat else noise() -def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.01)): +def linear_beta_schedule(timesteps, max_beta=0.01): """ linear schedule """ @@ -239,8 +239,8 @@ def inference(self, cond, b=1, x_start=None, device=None): assert x_start is not None, 'Missing shallow diffusion source.' 
x = x_start - if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and t_max > 0: - algorithm = hparams.get('diff_accelerator', 'ddim') + if hparams['pndm_speedup'] > 1 and t_max > 0: + algorithm = hparams['diff_accelerator'] if algorithm == 'dpm-solver': from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver # 1. Define the noise schedule. diff --git a/modules/pe/__init__.py b/modules/pe/__init__.py index 99d3dae95..edf747a32 100644 --- a/modules/pe/__init__.py +++ b/modules/pe/__init__.py @@ -6,7 +6,7 @@ def initialize_pe(): - pe = hparams.get('pe', 'parselmouth') + pe = hparams['pe'] pe_ckpt = hparams['pe_ckpt'] if pe == 'parselmouth': return ParselmouthPE() diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index ec5d88841..36b609ffb 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -189,10 +189,10 @@ def process_item(self, item_name, meta_data, binarization_args): processed_input['tension'] = tension.cpu().numpy() - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: processed_input['key_shift'] = 0. - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: processed_input['speed'] = 1. return processed_input @@ -207,7 +207,7 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_pitch_shifting'] key_shift_min, key_shift_max = aug_args['range'] - assert hparams.get('use_key_shift_embed', False), \ + assert hparams['use_key_shift_embed'], \ 'Random pitch shifting augmentation requires use_key_shift_embed == True.' assert key_shift_min < 0 < key_shift_max, \ 'Random pitch shifting augmentation must have a range where min < 0 < max.' 
@@ -273,7 +273,7 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_time_stretching'] speed_min, speed_max = aug_args['range'] - assert hparams.get('use_speed_embed', False), \ + assert hparams['use_speed_embed'], \ 'Random time stretching augmentation requires use_speed_embed == True.' assert 0 < speed_min < 1 < speed_max, \ 'Random time stretching augmentation must have a range where 0 < min < 1 < max.' diff --git a/scripts/binarize.py b/scripts/binarize.py index 767e947a3..74abd2ba2 100644 --- a/scripts/binarize.py +++ b/scripts/binarize.py @@ -13,7 +13,7 @@ def binarize(): - binarizer_cls = hparams.get("binarizer_cls", 'basics.base_binarizer.BaseBinarizer') + binarizer_cls = hparams["binarizer_cls"] pkg = ".".join(binarizer_cls.split(".")[:-1]) cls_name = binarizer_cls.split(".")[-1] binarizer_cls = getattr(importlib.import_module(pkg), cls_name) diff --git a/scripts/infer.py b/scripts/infer.py index 8c6e6e835..e56a71186 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -123,7 +123,7 @@ def acoustic( spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None for param in params: - if gender is not None and hparams.get('use_key_shift_embed'): + if gender is not None and hparams['use_key_shift_embed']: param['gender'] = gender if spk_mix is not None: diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 7c6f317f5..269122a6d 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -29,7 +29,7 @@ def locate_dictionary(): """ assert 'dictionary' in hparams or 'g2p_dictionary' in hparams, \ 'Please specify a dictionary file in your config.' 
- config_dict_path = pathlib.Path(hparams.get('dictionary', hparams.get('g2p_dictionary'))) + config_dict_path = pathlib.Path(hparams['dictionary']) if config_dict_path.exists(): return config_dict_path work_dir = pathlib.Path(hparams['work_dir']) diff --git a/utils/training_utils.py b/utils/training_utils.py index d406985cd..26d24eec5 100644 --- a/utils/training_utils.py +++ b/utils/training_utils.py @@ -113,7 +113,7 @@ def __form_batches(self): indices = rng.permutation(len(self.dataset)) if self.sort_by_similar_size: - grid = int(hparams.get('sampler_frame_count_grid', 6)) + grid = int(hparams['sampler_frame_count_grid']) assert grid > 0 sizes = (np.round(np.array(self.dataset.sizes)[indices] / grid) * grid).clip(grid, None) sizes *= (-1 if self.size_reversed else 1) From a95d49186f3b8794b032f0c7ffd6411ffad20ec8 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 25 Feb 2024 15:43:49 +0800 Subject: [PATCH 14/15] Fix duplicate keys in YAML --- configs/variance.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/configs/variance.yaml b/configs/variance.yaml index 8ad481ecb..e1c0338f1 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -79,10 +79,6 @@ energy_smooth_width: 0.12 breathiness_db_min: -96.0 breathiness_db_max: -20.0 breathiness_smooth_width: 0.12 -tension_logit_min: -10.0 -tension_logit_max: 10.0 -tension_smooth_width: 0.12 - voicing_db_min: -96.0 voicing_db_max: -12.0 voicing_smooth_width: 0.12 From 1e6f1540521a7bc69d00cdf3e30487769af65c60 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 25 Feb 2024 15:46:58 +0800 Subject: [PATCH 15/15] Rename `pndm_speedup` to `diff_speedup` --- configs/acoustic.yaml | 2 +- configs/variance.yaml | 2 +- docs/ConfigurationSchemas.md | 22 +++++++++++----------- modules/diffusion/ddpm.py | 10 +++++----- scripts/infer.py | 10 ++++++++-- 5 files changed, 26 insertions(+), 20 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index a5d2226ce..07a97be47 100644 --- 
a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -66,7 +66,7 @@ timesteps: 1000 max_beta: 0.02 rel_pos: true diff_accelerator: ddim -pndm_speedup: 10 +diff_speedup: 10 hidden_size: 256 residual_layers: 20 residual_channels: 512 diff --git a/configs/variance.yaml b/configs/variance.yaml index e1c0338f1..4136dffa7 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -104,7 +104,7 @@ max_beta: 0.02 diff_decoder_type: 'wavenet' diff_loss_type: l2 diff_accelerator: ddim -pndm_speedup: 10 +diff_speedup: 10 # train and eval num_sanity_val_steps: 1 diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index 46f0f23f5..171e13068 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -402,6 +402,17 @@ Loss type of the DDPM. constraintsChoose from 'l1', 'l2'. +### diff_speedup + +Diffusion sampling speed-up ratio. 1 means no speeding up. + + + + + + +
visibilityacoustic, variance
typeint
default10
constraintsMust be a factor of K_step.
+ ### dilation_cycle_length Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks. @@ -1351,17 +1362,6 @@ Strategy name for the Lightning trainer. defaultauto -### pndm_speedup - -Diffusion sampling speed-up ratio. 1 means no speeding up. - - - - - - -
visibilityacoustic, variance
typeint
default10
constraintsMust be a factor of K_step.
- ### predict_breathiness Whether to enable breathiness prediction. diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py index 89aea5eb4..7b91122fd 100644 --- a/modules/diffusion/ddpm.py +++ b/modules/diffusion/ddpm.py @@ -239,7 +239,7 @@ def inference(self, cond, b=1, x_start=None, device=None): assert x_start is not None, 'Missing shallow diffusion source.' x = x_start - if hparams['pndm_speedup'] > 1 and t_max > 0: + if hparams['diff_speedup'] > 1 and t_max > 0: algorithm = hparams['diff_accelerator'] if algorithm == 'dpm-solver': from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver @@ -270,7 +270,7 @@ def wrapped(x, t, **kwargs): # costs and the sample quality. dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") - steps = t_max // hparams["pndm_speedup"] + steps = t_max // hparams["diff_speedup"] self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False) x = dpm_solver.sample( x, @@ -308,7 +308,7 @@ def wrapped(x, t, **kwargs): # costs and the sample quality. 
uni_pc = UniPC(model_fn, noise_schedule, variant='bh2') - steps = t_max // hparams["pndm_speedup"] + steps = t_max // hparams["diff_speedup"] self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False) x = uni_pc.sample( x, @@ -320,7 +320,7 @@ def wrapped(x, t, **kwargs): self.bar.close() elif algorithm == 'pndm': self.noise_list = deque(maxlen=4) - iteration_interval = hparams['pndm_speedup'] + iteration_interval = hparams['diff_speedup'] for i in tqdm( reversed(range(0, t_max, iteration_interval)), desc='sample time step', total=t_max // iteration_interval, disable=not hparams['infer'], leave=False @@ -330,7 +330,7 @@ def wrapped(x, t, **kwargs): iteration_interval, cond=cond ) elif algorithm == 'ddim': - iteration_interval = hparams['pndm_speedup'] + iteration_interval = hparams['diff_speedup'] for i in tqdm( reversed(range(0, t_max, iteration_interval)), desc='sample time step', total=t_max // iteration_interval, disable=not hparams['infer'], leave=False diff --git a/scripts/infer.py b/scripts/infer.py index e56a71186..3618bcb6d 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -119,7 +119,10 @@ def acoustic( if speedup > 0: assert depth % speedup == 0, f'Acceleration ratio must be factor of diffusion depth {depth}.' - hparams['pndm_speedup'] = speedup + hparams['diff_speedup'] = speedup + elif 'diff_speedup' not in hparams: + # NOTICE: this is for compatibility + hparams['diff_speedup'] = hparams['pndm_speedup'] spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None for param in params: @@ -213,7 +216,10 @@ def variance( if speedup > 0: assert hparams['K_step'] % speedup == 0, f'Acceleration ratio must be factor of K_step {hparams["K_step"]}.' 
-        hparams['pndm_speedup'] = speedup
+        hparams['diff_speedup'] = speedup
+    elif 'diff_speedup' not in hparams:
+        # NOTICE: fall back to the legacy 'pndm_speedup' key so configs saved before the rename keep working
+        hparams['diff_speedup'] = hparams['pndm_speedup']

     spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None

     for param in params: