diff --git a/augmentation/spec_stretch.py b/augmentation/spec_stretch.py index 6f1a394ef..7eeeda2dc 100644 --- a/augmentation/spec_stretch.py +++ b/augmentation/spec_stretch.py @@ -38,7 +38,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) aug_item['mel'] = mel - if speed != 1. or hparams.get('use_speed_embed', False): + if speed != 1. or hparams['use_speed_embed']: aug_item['length'] = mel.shape[0] aug_item['speed'] = int(np.round(hparams['hop_size'] * speed)) / hparams['hop_size'] # real speed aug_item['seconds'] /= aug_item['speed'] @@ -50,7 +50,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) f0, _ = self.pe.get_pitch( wav, samplerate=hparams['audio_sample_rate'], length=aug_item['length'], hop_size=hparams['hop_size'], f0_min=hparams['f0_min'], f0_max=hparams['f0_max'], - speed=speed, interp_uv=hparams['interp_uv'] + speed=speed, interp_uv=True ) aug_item['f0'] = f0.astype(np.float32) @@ -83,7 +83,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) align_length=aug_item['length'] ) - if key_shift != 0. or hparams.get('use_key_shift_embed', False): + if key_shift != 0. or hparams['use_key_shift_embed']: if replace_spk_id is None: aug_item['key_shift'] = key_shift else: diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index 38b191077..ddad6e02e 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -173,7 +173,6 @@ def process(self): self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names) if self.binarization_args['shuffle']: - random.seed(hparams['seed']) random.shuffle(self.item_names) self.binary_data_dir.mkdir(parents=True, exist_ok=True) diff --git a/basics/base_task.py b/basics/base_task.py index 8a180e0b0..eabfcae6e 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -3,7 +3,6 @@ import pathlib import shutil import sys -from datetime import datetime from typing import Dict import matplotlib @@ -16,7 +15,6 @@ import torch.utils.data from torchmetrics import Metric, MeanMetric import lightning.pytorch as pl -from lightning.pytorch.callbacks import LearningRateMonitor from lightning.pytorch.utilities.rank_zero import rank_zero_debug, rank_zero_info, rank_zero_only from basics.base_module import CategorizedModule @@ -87,8 +85,8 @@ def _finish_init(self): # Training, validation and testing ########### def setup(self, stage): - self.train_dataset = self.dataset_cls(hparams['train_set_name']) - self.valid_dataset = self.dataset_cls(hparams['valid_set_name']) + self.train_dataset = self.dataset_cls('train') + self.valid_dataset = self.dataset_cls('valid') self.num_replicas = (self.trainer.distributed_sampler_kwargs or {}).get('num_replicas', 1) def get_need_freeze_state_dict_key(self, model_state_dict) -> list: @@ -130,9 +128,9 @@ def load_finetune_ckpt( self.load_state_dict(state_dict, strict=False) def load_pre_train_model(self): - pre_train_ckpt_path = hparams.get('finetune_ckpt_path') - blacklist = hparams.get('finetune_ignored_params') - # whitelist=hparams.get('pre_train_whitelist') + pre_train_ckpt_path = hparams['finetune_ckpt_path'] + blacklist = hparams['finetune_ignored_params'] + # whitelist=hparams['pre_train_whitelist'] if blacklist is None: blacklist = [] # if whitelist is None: @@ -344,8 +342,7 @@ def train_dataloader(self): size_reversed=True, required_batch_count_multiple=hparams['accumulate_grad_batches'], shuffle_sample=True, - shuffle_batch=True, - seed=hparams['seed'] + 
shuffle_batch=True ) return torch.utils.data.DataLoader( self.train_dataset, @@ -396,7 +393,6 @@ def on_test_end(self): @classmethod def start(cls): - pl.seed_everything(hparams['seed'], workers=True) task = cls() # if pre_train is not None: @@ -451,14 +447,6 @@ def start(cls): if not hparams['infer']: # train @rank_zero_only def train_payload_copy(): - # copy_code = input(f'{hparams["save_codes"]} code backup? y/n: ') == 'y' - copy_code = True # backup code every time - if copy_code: - code_dir = work_dir / 'codes' / datetime.now().strftime('%Y%m%d%H%M%S') - code_dir.mkdir(exist_ok=True, parents=True) - for c in hparams['save_codes']: - shutil.copytree(c, code_dir / c, dirs_exist_ok=True) - print(f'| Copied codes to {code_dir}.') # Copy spk_map.json and dictionary.txt to work dir binary_dir = pathlib.Path(hparams['binary_data_dir']) spk_map = work_dir / 'spk_map.json' diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index b7e7dcfda..07a97be47 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -39,26 +39,22 @@ augmentation_args: random_time_stretching: enabled: false range: [0.5, 2.] - domain: log # or linear scale: 0.75 raw_data_dir: 'data/opencpop/raw' binary_data_dir: 'data/opencpop/binary' binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer dictionary: dictionaries/opencpop-extension.txt -num_pad_tokens: 1 spec_min: [-5] spec_max: [0] mel_vmin: -6. #-6. mel_vmax: 1.5 -interp_uv: true energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 voicing_smooth_width: 0.12 tension_smooth_width: 0.12 use_spk_id: false -f0_embed_type: continuous use_energy_embed: false use_breathiness_embed: false use_voicing_embed: false @@ -70,7 +66,7 @@ timesteps: 1000 max_beta: 0.02 rel_pos: true diff_accelerator: ddim -pndm_speedup: 10 +diff_speedup: 10 hidden_size: 256 residual_layers: 20 residual_channels: 512 diff --git a/configs/base.yaml b/configs/base.yaml index 57ec1c583..4eed200e9 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,11 +1,5 @@ # task task_cls: null -seed: 1234 -save_codes: - - configs - - modules - - training - - utils ############# # dataset @@ -36,7 +30,6 @@ enc_layers: 4 num_heads: 2 enc_ffn_kernel_size: 9 ffn_act: gelu -ffn_padding: 'SAME' use_spk_id: false ########### @@ -67,8 +60,6 @@ max_batch_frames: 32000 max_batch_size: 100000 max_val_batch_frames: 60000 max_val_batch_size: 1 -train_set_name: 'train' -valid_set_name: 'valid' pe: 'parselmouth' pe_ckpt: '' f0_min: 65 diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 57c7c8653..cb0142502 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -43,7 +43,6 @@ augmentation_args: random_time_stretching: enabled: true range: [0.5, 2.] 
- domain: log # or linear scale: 0.75 residual_channels: 512 diff --git a/configs/variance.yaml b/configs/variance.yaml index 276ce3084..4136dffa7 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -29,7 +29,6 @@ raw_data_dir: 'data/opencpop_variance/raw' binary_data_dir: 'data/opencpop_variance/binary' binarizer_cls: preprocessing.variance_binarizer.VarianceBinarizer dictionary: dictionaries/opencpop-extension.txt -num_pad_tokens: 1 use_spk_id: false @@ -80,10 +79,6 @@ energy_smooth_width: 0.12 breathiness_db_min: -96.0 breathiness_db_max: -20.0 breathiness_smooth_width: 0.12 -tension_logit_min: -10.0 -tension_logit_max: 10.0 -tension_smooth_width: 0.12 - voicing_db_min: -96.0 voicing_db_max: -12.0 voicing_smooth_width: 0.12 @@ -109,7 +104,7 @@ max_beta: 0.02 diff_decoder_type: 'wavenet' diff_loss_type: l2 diff_accelerator: ddim -pndm_speedup: 10 +diff_speedup: 10 # train and eval num_sanity_val_steps: 1 diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index ab4252469..d637f34f1 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -58,7 +58,7 @@ def __init__( if hparams['use_spk_id'] else None self.export_spk: List[Tuple[str, Dict[str, float]]] = export_spk \ if hparams['use_spk_id'] and export_spk is not None else [] - if hparams.get('use_key_shift_embed', False) and not self.expose_gender: + if hparams['use_key_shift_embed'] and not self.expose_gender: shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] key_shift = freeze_gender * shift_max if freeze_gender >= 0. else freeze_gender * abs(shift_min) key_shift = max(min(key_shift, shift_max), shift_min) # clip key shift @@ -143,14 +143,14 @@ def _torch_export_model(self): for v_name in self.model.fs2.variance_embed_list } } - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: if self.expose_gender: kwargs['gender'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device) input_names.append('gender') dynamix_axes['gender'] = { 1: 'n_frames' } - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if self.expose_velocity: kwargs['velocity'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device) input_names.append('velocity') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index 52f6a4d28..157e031d8 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -1,17 +1,22 @@ import copy +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from modules.commons.common_layers import NormalInitEmbedding as Embedding from modules.fastspeech.acoustic_encoder import FastSpeech2Acoustic from modules.fastspeech.variance_encoder import FastSpeech2Variance from utils.hparams import hparams -from utils.pitch_utils import ( - f0_bin, f0_mel_min, f0_mel_max -) from utils.text_encoder import PAD_INDEX +f0_bin = 256 +f0_max = 1100.0 +f0_min = 50.0 +f0_mel_min = 1127 * np.log(1 + f0_min / 700) +f0_mel_max = 1127 * np.log(1 + f0_max / 700) + def f0_to_coarse(f0): f0_mel = 1127 * (1 + f0 / 700).log() @@ -38,10 +43,16 @@ def forward(self, dur): class FastSpeech2AcousticONNX(FastSpeech2Acoustic): def __init__(self, vocab_size): super().__init__(vocab_size=vocab_size) + + # for temporary compatibility; will be completely removed in the future + self.f0_embed_type = hparams['f0_embed_type'] + if self.f0_embed_type == 'discrete': + 
self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) + self.lr = LengthRegulator() - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: self.shift_min, self.shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: self.speed_min, self.speed_max = hparams['augmentation_args']['random_time_stretching']['range'] # noinspection PyMethodOverriding @@ -71,7 +82,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= ], dim=-1).sum(-1) condition += variance_embeds - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: if hasattr(self, 'frozen_key_shift'): key_shift_embed = self.key_shift_embed(self.frozen_key_shift[:, None, None]) else: @@ -81,7 +92,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= key_shift_embed = self.key_shift_embed(key_shift[:, :, None]) condition += key_shift_embed - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if velocity is not None: velocity = torch.clip(velocity, min=self.speed_min, max=self.speed_max) speed_embed = self.speed_embed(velocity[:, :, None]) diff --git a/docs/BestPractices.md b/docs/BestPractices.md index b142a3873..23829de79 100644 --- a/docs/BestPractices.md +++ b/docs/BestPractices.md @@ -132,11 +132,7 @@ Once the coverage checks passed, a phoneme distribution summary will be saved in ![phoneme-distribution](resources/phoneme-distribution.jpg) -During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. By default, there are one padding index before all real phonemes IDs. You may edit the number of padding indices, but it is not recommended to do so: - -```yaml -num_pad_tokens: 1 -``` +During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. There are one padding index (marked as `typedict -### augmentation_args.random_time_stretching.domain - -The domain where random time stretching factors are uniformly distributed in. - -- If 'linear', stretching ratio $x$ will be uniformly distributed in $[V_{min}, V_{max}]$. -- If 'log', $\ln{x}$ will be uniformly distributed in $[\ln{V_{min}}, \ln{V_{max}}]$. - - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic</td></tr>
<tr><td><b>scope</b></td><td>preprocessing</td></tr>
<tr><td><b>customizability</b></td><td>not recommended</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>log</td></tr>
<tr><td><b>constraints</b></td><td>Choose from 'log', 'linear'.</td></tr>
</table>
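For reference, a minimal sketch of the two sampling schemes this key selected between, mirroring the branch removed from `preprocessing/acoustic_binarizer.py` later in this diff:

```python
import random

def sample_stretch_ratio(speed_min: float, speed_max: float, domain: str = 'log') -> float:
    if domain == 'log':
        # ln(x) is uniform in [ln(speed_min), ln(speed_max)]
        return speed_min * (speed_max / speed_min) ** random.random()
    # 'linear': uniform on each side of 1.0, so the unstretched speed stays centered
    rand = random.uniform(-1, 1)
    return 1 + (speed_max - 1) * rand if rand >= 0 else 1 + (1 - speed_min) * rand
```

After this change, only the log-domain behavior remains.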
- ### augmentation_args.random_time_stretching.enabled Whether to apply random time stretching augmentation. @@ -418,6 +402,17 @@ Loss type of the DDPM. constraintsChoose from 'l1', 'l2'. +### diff_speedup + +Diffusion sampling speed-up ratio. 1 means no speeding up. + + + + + + +
<table>
<tr><td><b>visibility</b></td><td>acoustic, variance</td></tr>
<tr><td><b>type</b></td><td>int</td></tr>
<tr><td><b>default</b></td><td>10</td></tr>
<tr><td><b>constraints</b></td><td>Must be a factor of K_step.</td></tr>
</table>
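As a sketch of what the ratio controls (see the DDIM/PNDM loops in `modules/diffusion/ddpm.py` further down), the accelerated sampler only visits every `diff_speedup`-th timestep; the helper name here is hypothetical:

```python
def accelerated_timesteps(t_max: int, diff_speedup: int) -> list:
    # only every diff_speedup-th step of the schedule is denoised,
    # hence the requirement that the ratio divides K_step evenly
    assert t_max % diff_speedup == 0, 'diff_speedup must be a factor of K_step'
    return list(reversed(range(0, t_max, diff_speedup)))

print(len(accelerated_timesteps(1000, 10)))  # 100 denoising steps instead of 1000
```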
+ ### dilation_cycle_length Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks. @@ -648,22 +643,6 @@ Length of sinusoidal smoothing convolution kernel (in seconds) on extracted ener default0.12 -### f0_embed_type - -Map f0 to embedding using: - -- `torch.nn.Linear` if 'continuous' -- `torch.nn.Embedding` if 'discrete' - - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic</td></tr>
<tr><td><b>scope</b></td><td>nn</td></tr>
<tr><td><b>customizability</b></td><td>normal</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>continuous</td></tr>
<tr><td><b>constraints</b></td><td>Choose from 'continuous', 'discrete'.</td></tr>
</table>
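The two code paths this key selected look roughly as below; after this diff, only 'continuous' survives in `modules/fastspeech/acoustic_encoder.py`, while 'discrete' is kept in `deployment/modules/fastspeech2.py` solely for old checkpoints:

```python
import torch
import torch.nn as nn

hidden_size = 256
f0 = torch.full((1, 100), 440.0)  # [B, T] f0 curve in Hz

# 'continuous': mel-scaled f0 through a Linear layer (the path that remains)
pitch_linear = nn.Linear(1, hidden_size)
embed_continuous = pitch_linear((1 + f0 / 700).log()[:, :, None])  # [B, T, H]

# 'discrete' (legacy): f0 quantized into coarse bins, then an Embedding lookup,
# i.e. pitch_table(f0_to_coarse(f0)) with f0_to_coarse as defined above
pitch_table = nn.Embedding(300, hidden_size, padding_idx=0)
```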
- ### f0_max Maximum base frequency (F0) in Hz for pitch extraction. @@ -705,18 +684,6 @@ Activation function of TransformerFFNLayer in FastSpeech2 encoder: constraintsChoose from 'relu', 'gelu', 'swish'. -### ffn_padding - -Padding mode of TransformerFFNLayer convolution in FastSpeech2 encoder. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic, variance</td></tr>
<tr><td><b>scope</b></td><td>nn</td></tr>
<tr><td><b>customizability</b></td><td>not recommended</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>SAME</td></tr>
</table>
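The branch this key controlled in `TransformerFFNLayer` (removed from `modules/commons/common_layers.py` below) amounted to:

```python
import torch.nn as nn

hidden_size, filter_size, kernel_size = 256, 1024, 9

# 'SAME' (now the only behavior): symmetric padding, output aligned with input
ffn_same = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2)

# 'LEFT' (removed): causal left-padding, so each frame sees only past context
ffn_left = nn.Sequential(
    nn.ConstantPad1d((kernel_size - 1, 0), 0.0),
    nn.Conv1d(hidden_size, filter_size, kernel_size),
)
```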
- ### fft_size Fast Fourier Transforms parameter for mel extraction. @@ -872,18 +839,6 @@ Hop size or step length (in number of waveform samples) of mel and feature extra default512 -### interp_uv - -Whether to apply linear interpolation to unvoiced parts in f0. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic</td></tr>
<tr><td><b>scope</b></td><td>preprocessing</td></tr>
<tr><td><b>customizability</b></td><td>reserved</td></tr>
<tr><td><b>type</b></td><td>boolean</td></tr>
<tr><td><b>default</b></td><td>true</td></tr>
</table>
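A minimal NumPy sketch of what the flag asks for; the actual handling lives inside the `pe.get_pitch` implementations, so this is an assumption about behavior, not the extractor's code:

```python
import numpy as np

def interp_unvoiced(f0: np.ndarray, uv: np.ndarray) -> np.ndarray:
    # bridge unvoiced gaps linearly using the surrounding voiced frames
    f0 = f0.copy()
    if uv.any() and not uv.all():
        f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
    return f0
```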
- ### lambda_aux_mel_loss Coefficient of aux mel loss when calculating total loss of acoustic model with shallow diffusion. @@ -1086,7 +1041,7 @@ Minimum mel spectrogram heatmap value for TensorBoard plotting. ### melody_encoder_args -Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, `enc_ffn_kernel_size`, `ffn_padding`, `ffn_act`, `dropout`, `num_heads`, `use_pos_embed`, `rel_pos`. If either of the parameter does not exist in this configuration key, it inherits from the linguistic encoder. +Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, `enc_ffn_kernel_size`, `ffn_act`, `dropout`, `num_heads`, `use_pos_embed`, `rel_pos`. If either of the parameter does not exist in this configuration key, it inherits from the linguistic encoder. @@ -1140,20 +1095,6 @@ The number of attention heads of `torch.nn.MultiheadAttention` in FastSpeech2 en
typedict
default2
-### num_pad_tokens - -Number of padding phoneme indexes before all real tokens. - -Due to some historical reasons, old checkpoints may have 3 padding tokens called \, \ and \. After refactoring, all padding tokens are called \, and only the first one (token == 0) will be used. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic, variance</td></tr>
<tr><td><b>scope</b></td><td>nn, preprocess</td></tr>
<tr><td><b>customizability</b></td><td>not recommended</td></tr>
<tr><td><b>type</b></td><td>int</td></tr>
<tr><td><b>default</b></td><td>1</td></tr>
</table>
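With the key removed, the id layout is fixed; a sketch of what `TokenTextEncoder` (see `utils/text_encoder.py` below) now does, with a made-up three-phoneme vocabulary:

```python
vocab_list = sorted(['SP', 'a', 'ai'])   # real phonemes, sorted by name
PAD_INDEX = 0                            # the single reserved padding id
ids = {ph: i + 1 for i, ph in enumerate(vocab_list)}
vocab_size = len(vocab_list) + 1         # +1 for the padding token
```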
- ### num_sanity_val_steps Number of sanity validation steps at the beginning. @@ -1421,17 +1362,6 @@ Strategy name for the Lightning trainer. defaultauto -### pndm_speedup - -Diffusion sampling speed-up ratio. 1 means no speeding up. - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic, variance</td></tr>
<tr><td><b>type</b></td><td>int</td></tr>
<tr><td><b>default</b></td><td>10</td></tr>
<tr><td><b>constraints</b></td><td>Must be a factor of K_step.</td></tr>
</table>
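This key is renamed to `diff_speedup`; old configurations are mapped at load time by the compatibility shim added in `scripts/infer.py`:

```python
hparams = {'pndm_speedup': 10}  # stand-in for a legacy config
# compatibility shim, as added in scripts/infer.py in this diff
if 'diff_speedup' not in hparams:
    hparams['diff_speedup'] = hparams['pndm_speedup']
```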
- ### predict_breathiness Whether to enable breathiness prediction. @@ -1545,18 +1475,6 @@ Training performance on some datasets may be very sensitive to this value. Chang default6 -### save_codes - -Files in these folders will be backed up every time a training starts. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>all</td></tr>
<tr><td><b>scope</b></td><td>training</td></tr>
<tr><td><b>customizability</b></td><td>normal</td></tr>
<tr><td><b>type</b></td><td>list</td></tr>
<tr><td><b>default</b></td><td>[configs, modules, training, utils]</td></tr>
</table>
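The backup loop this key drove, removed from `basics/base_task.py` above, was roughly the following; `work_dir` here is a hypothetical run directory:

```python
import pathlib
import shutil
from datetime import datetime

work_dir = pathlib.Path('checkpoints/my_experiment')  # hypothetical
save_codes = ['configs', 'modules', 'training', 'utils']

code_dir = work_dir / 'codes' / datetime.now().strftime('%Y%m%d%H%M%S')
code_dir.mkdir(exist_ok=True, parents=True)
for c in save_codes:
    shutil.copytree(c, code_dir / c, dirs_exist_ok=True)
```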
- ### schedule_type The diffusion schedule type. @@ -1570,18 +1488,6 @@ The diffusion schedule type. constraintsChoose from 'linear', 'cosine'. -### seed - -The global random seed used to shuffle data, initializing model weights, etc. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>all</td></tr>
<tr><td><b>scope</b></td><td>preprocessing, training</td></tr>
<tr><td><b>customizability</b></td><td>normal</td></tr>
<tr><td><b>type</b></td><td>int</td></tr>
<tr><td><b>default</b></td><td>1234</td></tr>
</table>
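The two call sites this key fed, both removed in this diff, were:

```python
import random
import lightning.pytorch as pl

seed = 1234
pl.seed_everything(seed, workers=True)  # removed from BaseTask.start()
random.seed(seed)                       # removed from BaseBinarizer.process()
```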
- ### shallow_diffusion_args Arguments for shallow_diffusion. @@ -1759,18 +1665,6 @@ Total number of diffusion steps. default1000 -### train_set_name - -Name of the training set used in binary filenames, TensorBoard keys, etc. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>all</td></tr>
<tr><td><b>scope</b></td><td>preprocessing, training</td></tr>
<tr><td><b>customizability</b></td><td>reserved</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>train</td></tr>
</table>
- ### use_breathiness_embed Whether to accept and embed breathiness values into the model. @@ -1905,18 +1799,6 @@ Whether to load and use the vocoder to generate audio during validation. Validat defaulttrue -### valid_set_name - -Name of the validation set used in binary filenames, TensorBoard keys, etc. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>all</td></tr>
<tr><td><b>scope</b></td><td>preprocessing, training</td></tr>
<tr><td><b>customizability</b></td><td>reserved</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>valid</td></tr>
</table>
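Together with `train_set_name`, this key is replaced by fixed literals where the datasets are constructed; the new `setup()` in `basics/base_task.py` reads:

```python
def setup(self, stage):
    self.train_dataset = self.dataset_cls('train')  # was hparams['train_set_name']
    self.valid_dataset = self.dataset_cls('valid')  # was hparams['valid_set_name']
```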
- ### variances_prediction_args Arguments for prediction of variance parameters other than pitch, like energy, breathiness, etc. diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 02f6b3a92..a67f5b166 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -111,7 +111,7 @@ def preprocess_input(self, param, idx=0): )).to(self.device)[None] summary[v_name] = 'manual' - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] gender = param.get('gender') if gender is None: @@ -135,7 +135,7 @@ def preprocess_input(self, param, idx=0): min=shift_min, max=shift_max ) - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if param.get('velocity') is None: summary['velocity'] = 'default' batch['speed'] = torch.FloatTensor([1.]).to(self.device)[:, None] # => [B=1, T=1] diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 7e5fd08d4..9ea2c2638 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -104,35 +104,13 @@ def max_positions(): return int(1e5) # an arbitrary large number -class ConvTBC(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, padding=0): - super(ConvTBC, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.padding = padding - - self.weight = torch.nn.Parameter(torch.Tensor( - self.kernel_size, in_channels, out_channels)) - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) - - def forward(self, x): - return torch.conv_tbc(x.contiguous(), self.weight, self.bias, self.padding) - - class TransformerFFNLayer(nn.Module): - def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'): + def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0., act='gelu'): super().__init__() self.kernel_size = kernel_size self.dropout = dropout self.act = act - if padding == 'SAME': - self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) - elif padding == 'LEFT': - self.ffn_1 = nn.Sequential( - nn.ConstantPad1d((kernel_size - 1, 0), 0.0), - nn.Conv1d(hidden_size, filter_size, kernel_size) - ) + self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) if self.act == 'relu': self.act_fn = ReLU() elif self.act == 'gelu': @@ -152,44 +130,18 @@ def forward(self, x): return x -class BatchNorm1dTBC(nn.Module): - def __init__(self, c): - super(BatchNorm1dTBC, self).__init__() - self.bn = nn.BatchNorm1d(c) - - def forward(self, x): - """ - - :param x: [T, B, C] - :return: [T, B, C] - """ - x = x.permute(1, 2, 0) # [B, C, T] - x = self.bn(x) # [B, C, T] - x = x.permute(2, 0, 1) # [T, B, C] - return x - - class EncSALayer(nn.Module): def __init__(self, c, num_heads, dropout, attention_dropout=0.1, - relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'): + relu_dropout=0.1, kernel_size=9, act='gelu'): super().__init__() - self.c = c self.dropout = dropout - self.num_heads = num_heads - if num_heads > 0: - if norm == 'ln': - self.layer_norm1 = LayerNorm(c) - elif norm == 'bn': - self.layer_norm1 = BatchNorm1dTBC(c) - self.self_attn = MultiheadAttention( - self.c, num_heads, dropout=attention_dropout, bias=False, - ) - if norm == 'ln': - self.layer_norm2 = LayerNorm(c) - elif norm == 'bn': - self.layer_norm2 = BatchNorm1dTBC(c) + self.layer_norm1 = 
LayerNorm(c) + self.self_attn = MultiheadAttention( + c, num_heads, dropout=attention_dropout, bias=False, + ) + self.layer_norm2 = LayerNorm(c) self.ffn = TransformerFFNLayer( - c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act + c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act ) def forward(self, x, encoder_padding_mask=None, **kwargs): @@ -197,18 +149,17 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): if layer_norm_training is not None: self.layer_norm1.training = layer_norm_training self.layer_norm2.training = layer_norm_training - if self.num_heads > 0: - residual = x - x = self.layer_norm1(x) - x, _, = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=encoder_padding_mask - ) - x = F.dropout(x, self.dropout, training=self.training) - x = residual + x - x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + residual = x + x = self.layer_norm1(x) + x, _, = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask + ) + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] residual = x x = self.layer_norm2(x) diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py index cd8295a7c..7b91122fd 100644 --- a/modules/diffusion/ddpm.py +++ b/modules/diffusion/ddpm.py @@ -36,7 +36,7 @@ def noise_like(shape, device, repeat=False): return repeat_noise() if repeat else noise() -def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.01)): +def linear_beta_schedule(timesteps, max_beta=0.01): """ linear schedule """ @@ -239,8 +239,8 @@ def inference(self, cond, b=1, x_start=None, device=None): assert x_start is not None, 'Missing shallow diffusion source.' x = x_start - if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and t_max > 0: - algorithm = hparams.get('diff_accelerator', 'ddim') + if hparams['diff_speedup'] > 1 and t_max > 0: + algorithm = hparams['diff_accelerator'] if algorithm == 'dpm-solver': from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver # 1. Define the noise schedule. @@ -270,7 +270,7 @@ def wrapped(x, t, **kwargs): # costs and the sample quality. dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") - steps = t_max // hparams["pndm_speedup"] + steps = t_max // hparams["diff_speedup"] self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False) x = dpm_solver.sample( x, @@ -308,7 +308,7 @@ def wrapped(x, t, **kwargs): # costs and the sample quality. 
uni_pc = UniPC(model_fn, noise_schedule, variant='bh2') - steps = t_max // hparams["pndm_speedup"] + steps = t_max // hparams["diff_speedup"] self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False) x = uni_pc.sample( x, @@ -320,7 +320,7 @@ def wrapped(x, t, **kwargs): self.bar.close() elif algorithm == 'pndm': self.noise_list = deque(maxlen=4) - iteration_interval = hparams['pndm_speedup'] + iteration_interval = hparams['diff_speedup'] for i in tqdm( reversed(range(0, t_max, iteration_interval)), desc='sample time step', total=t_max // iteration_interval, disable=not hparams['infer'], leave=False @@ -330,7 +330,7 @@ def wrapped(x, t, **kwargs): iteration_interval, cond=cond ) elif algorithm == 'ddim': - iteration_interval = hparams['pndm_speedup'] + iteration_interval = hparams['diff_speedup'] for i in tqdm( reversed(range(0, t_max, iteration_interval)), desc='sample time step', total=t_max // iteration_interval, disable=not hparams['infer'], leave=False diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 666e7f659..9ab4ed633 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -8,7 +8,6 @@ ) from modules.fastspeech.tts_modules import FastSpeech2Encoder, mel2ph_to_dur from utils.hparams import hparams -from utils.pitch_utils import f0_to_coarse from utils.text_encoder import PAD_INDEX @@ -18,21 +17,13 @@ def __init__(self, vocab_size): self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) self.dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( - self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], - ffn_kernel_size=hparams['enc_ffn_kernel_size'], - ffn_padding=hparams['ffn_padding'], ffn_act=hparams['ffn_act'], + hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], + ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) - self.f0_embed_type = hparams.get('f0_embed_type', 'discrete') - if self.f0_embed_type == 'discrete': - self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) - elif self.f0_embed_type == 'continuous': - self.pitch_embed = Linear(1, hparams['hidden_size']) - else: - raise ValueError('f0_embed_type must be \'discrete\' or \'continuous\'.') - + self.pitch_embed = Linear(1, hparams['hidden_size']) self.variance_embed_list = [] self.use_energy_embed = hparams.get('use_energy_embed', False) self.use_breathiness_embed = hparams.get('use_breathiness_embed', False) @@ -106,12 +97,8 @@ def forward( spk_embed = self.spk_embed(spk_embed_id)[:, None, :] condition += spk_embed - if self.f0_embed_type == 'discrete': - pitch = f0_to_coarse(f0) - pitch_embed = self.pitch_embed(pitch) - else: - f0_mel = (1 + f0 / 700).log() - pitch_embed = self.pitch_embed(f0_mel[:, :, None]) + f0_mel = (1 + f0 / 700).log() + pitch_embed = self.pitch_embed(f0_mel[:, :, None]) condition += pitch_embed condition = self.forward_variance_embedding( diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index bc5eff265..1dd164d17 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -4,7 +4,7 @@ import torch.nn as nn from torch.nn import functional as F -from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer, BatchNorm1dTBC 
+from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer from modules.commons.espnet_positional_embedding import RelPositionalEncoding DEFAULT_MAX_SOURCE_POSITIONS = 2000 @@ -12,14 +12,13 @@ class TransformerEncoderLayer(nn.Module): - def __init__(self, hidden_size, dropout, kernel_size=None, padding='SAME', act='gelu', num_heads=2, norm='ln'): + def __init__(self, hidden_size, dropout, kernel_size=None, act='gelu', num_heads=2): super().__init__() self.op = EncSALayer( hidden_size, num_heads, dropout=dropout, attention_dropout=0.0, relu_dropout=dropout, kernel_size=kernel_size, - padding=padding, - norm=norm, act=act + act=act ) def forward(self, x, **kwargs): @@ -63,7 +62,7 @@ class DurationPredictor(torch.nn.Module): """ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, - dropout_rate=0.1, offset=1.0, padding='SAME', dur_loss_type='mse'): + dropout_rate=0.1, offset=1.0, dur_loss_type='mse'): """Initialize duration predictor module. Args: in_dims (int): Input dimension. @@ -77,18 +76,15 @@ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, self.offset = offset self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Identity(), # this is a placeholder for ConstantPad1d which is now merged into Conv1d + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.loss_type = dur_loss_type if self.loss_type in ['mse', 'huber']: @@ -141,7 +137,7 @@ def forward(self, xs, x_masks=None, infer=True): class VariancePredictor(torch.nn.Module): def __init__(self, vmin, vmax, in_dims, n_layers=5, n_chans=512, kernel_size=5, - dropout_rate=0.1, padding='SAME'): + dropout_rate=0.1): """Initialize variance predictor module. Args: in_dims (int): Input dimension. @@ -156,18 +152,14 @@ def __init__(self, vmin, vmax, in_dims, self.vmax = vmax self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.linear = torch.nn.Linear(n_chans, 1) self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096) self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) @@ -195,7 +187,7 @@ def forward(self, xs, infer=True): class PitchPredictor(torch.nn.Module): def __init__(self, vmin, vmax, num_bins, deviation, in_dims, n_layers=5, n_chans=384, kernel_size=5, - dropout_rate=0.1, padding='SAME'): + dropout_rate=0.1): """Initialize pitch predictor module. Args: in_dims (int): Input dimension. 
@@ -214,18 +206,14 @@ def __init__(self, vmin, vmax, num_bins, deviation, self.base_pitch_embed = torch.nn.Linear(1, in_dims) self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.linear = torch.nn.Linear(n_chans, num_bins) self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096) self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) @@ -363,34 +351,25 @@ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None): class FastSpeech2Encoder(nn.Module): - def __init__(self, embed_tokens, hidden_size, num_layers, - ffn_kernel_size=9, ffn_padding='SAME', ffn_act='gelu', - dropout=None, num_heads=2, use_last_norm=True, norm='ln', - use_pos_embed=True, rel_pos=True): + def __init__(self, hidden_size, num_layers, + ffn_kernel_size=9, ffn_act='gelu', + dropout=None, num_heads=2, use_pos_embed=True, rel_pos=True): super().__init__() self.num_layers = num_layers embed_dim = self.hidden_size = hidden_size self.dropout = dropout self.use_pos_embed = use_pos_embed - self.use_last_norm = use_last_norm self.layers = nn.ModuleList([ TransformerEncoderLayer( self.hidden_size, self.dropout, - kernel_size=ffn_kernel_size, padding=ffn_padding, act=ffn_act, + kernel_size=ffn_kernel_size, act=ffn_act, num_heads=num_heads ) for _ in range(self.num_layers) ]) - if self.use_last_norm: - if norm == 'ln': - self.layer_norm = nn.LayerNorm(embed_dim) - elif norm == 'bn': - self.layer_norm = BatchNorm1dTBC(embed_dim) - else: - self.layer_norm = None + self.layer_norm = nn.LayerNorm(embed_dim) - self.embed_tokens = embed_tokens # redundant, but have to persist for compatibility with old checkpoints self.embed_scale = math.sqrt(hidden_size) self.padding_idx = 0 self.rel_pos = rel_pos @@ -438,8 +417,7 @@ def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_ for layer in self.layers: x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB hiddens.append(x) - if self.use_last_norm: - x = self.layer_norm(x) * nonpadding_mask_TB + x = self.layer_norm(x) * nonpadding_mask_TB if return_hiddens: x = torch.stack(hiddens, 0) # [L, T, B, C] x = x.transpose(1, 2) # [L, B, T, C] diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 8e2117f6b..82e0a88e8 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -26,9 +26,8 @@ def __init__(self, vocab_size): self.ph_dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( - self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], - ffn_kernel_size=hparams['enc_ffn_kernel_size'], - ffn_padding=hparams['ffn_padding'], ffn_act=hparams['ffn_act'], + hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], + ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) @@ -41,7 
+40,6 @@ def __init__(self, vocab_size): n_chans=dur_hparams['hidden_size'], n_layers=dur_hparams['num_layers'], dropout_rate=dur_hparams['dropout'], - padding=hparams['ffn_padding'], kernel_size=dur_hparams['kernel_size'], offset=dur_hparams['log_offset'], dur_loss_type=dur_hparams['loss_type'] @@ -108,9 +106,8 @@ def get_hparam(key): self.note_glide_embed = Embedding(len(hparams['glide_types']) + 1, hidden_size, padding_idx=0) self.encoder = FastSpeech2Encoder( - None, hidden_size, num_layers=get_hparam('enc_layers'), - ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), - ffn_padding=get_hparam('ffn_padding'), ffn_act=get_hparam('ffn_act'), + hidden_size=hidden_size, num_layers=get_hparam('enc_layers'), + ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), ffn_act=get_hparam('ffn_act'), dropout=get_hparam('dropout'), num_heads=get_hparam('num_heads'), use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos') ) diff --git a/modules/pe/__init__.py b/modules/pe/__init__.py index 99d3dae95..edf747a32 100644 --- a/modules/pe/__init__.py +++ b/modules/pe/__init__.py @@ -6,7 +6,7 @@ def initialize_pe(): - pe = hparams.get('pe', 'parselmouth') + pe = hparams['pe'] pe_ckpt = hparams['pe_ckpt'] if pe == 'parselmouth': return ParselmouthPE() diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 6df8e7a55..36b609ffb 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -64,27 +64,20 @@ def __init__(self): def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): meta_data_dict = {} - if (raw_data_dir / 'transcriptions.csv').exists(): - with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: - for utterance_label in csv.DictReader(f): - item_name = utterance_label['name'] - temp_dict = { - 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': utterance_label['ph_seq'].split(), - 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], - 'spk_id': spk_id, - 'spk_name': self.speakers[ds_id], - } - assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ - f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' - meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict - else: - raise FileNotFoundError( - f'transcriptions.csv not found in {raw_data_dir}. ' - 'If this is a dataset with the old transcription format, please consider ' - 'migrating it to the new format via the following command:\n' - 'python scripts/migrate.py txt ' - ) + with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: + for utterance_label in csv.DictReader(f): + item_name = utterance_label['name'] + temp_dict = { + 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), + 'ph_seq': utterance_label['ph_seq'].split(), + 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], + 'spk_id': spk_id, + 'spk_name': self.speakers[ds_id], + } + assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ + f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
+ meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict + self.items.update(meta_data_dict) @torch.no_grad() @@ -119,7 +112,7 @@ def process_item(self, item_name, meta_data, binarization_args): gt_f0, uv = pitch_extractor.get_pitch( wav, samplerate=hparams['audio_sample_rate'], length=length, hop_size=hparams['hop_size'], f0_min=hparams['f0_min'], f0_max=hparams['f0_max'], - interp_uv=hparams['interp_uv'] + interp_uv=True ) if uv.all(): # All unvoiced print(f'Skipped \'{item_name}\': empty gt f0') @@ -196,10 +189,10 @@ def process_item(self, item_name, meta_data, binarization_args): processed_input['tension'] = tension.cpu().numpy() - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: processed_input['key_shift'] = 0. - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: processed_input['speed'] = 1. return processed_input @@ -214,7 +207,7 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_pitch_shifting'] key_shift_min, key_shift_max = aug_args['range'] - assert hparams.get('use_key_shift_embed', False), \ + assert hparams['use_key_shift_embed'], \ 'Random pitch shifting augmentation requires use_key_shift_embed == True.' assert key_shift_min < 0 < key_shift_max, \ 'Random pitch shifting augmentation must have a range where min < 0 < max.' @@ -280,12 +273,10 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_time_stretching'] speed_min, speed_max = aug_args['range'] - domain = aug_args['domain'] - assert hparams.get('use_speed_embed', False), \ + assert hparams['use_speed_embed'], \ 'Random time stretching augmentation requires use_speed_embed == True.' assert 0 < speed_min < 1 < speed_max, \ 'Random time stretching augmentation must have a range where 0 < min < 1 < max.' - assert domain in ['log', 'linear'], 'domain must be \'log\' or \'linear\'.' 
aug_ins = SpectrogramStretchAugmentation(self.raw_data_dirs, aug_args, pe=aug_pe) scale = aug_args['scale'] @@ -296,13 +287,8 @@ def arrange_data_augmentation(self, data_iterator): aug_items = random.choices(all_item_names, k=k_from_raw) + random.choices(aug_list, k=k_from_aug + k_mutate) for aug_type, aug_item in zip(aug_types, aug_items): - if domain == 'log': - # Uniform distribution in log domain - speed = speed_min * (speed_max / speed_min) ** random.random() - else: - # Uniform distribution in linear domain - rand = random.uniform(-1, 1) - speed = 1 + (speed_max - 1) * rand if rand >= 0 else 1 + (1 - speed_min) * rand + # Uniform distribution in log domain + speed = speed_min * (speed_max / speed_min) ** random.random() if aug_type == 0: aug_task = { 'name': aug_item, diff --git a/scripts/binarize.py b/scripts/binarize.py index 767e947a3..74abd2ba2 100644 --- a/scripts/binarize.py +++ b/scripts/binarize.py @@ -13,7 +13,7 @@ def binarize(): - binarizer_cls = hparams.get("binarizer_cls", 'basics.base_binarizer.BaseBinarizer') + binarizer_cls = hparams["binarizer_cls"] pkg = ".".join(binarizer_cls.split(".")[:-1]) cls_name = binarizer_cls.split(".")[-1] binarizer_cls = getattr(importlib.import_module(pkg), cls_name) diff --git a/scripts/infer.py b/scripts/infer.py index 8c6e6e835..3618bcb6d 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -119,11 +119,14 @@ def acoustic( if speedup > 0: assert depth % speedup == 0, f'Acceleration ratio must be factor of diffusion depth {depth}.' - hparams['pndm_speedup'] = speedup + hparams['diff_speedup'] = speedup + elif 'diff_speedup' not in hparams: + # NOTICE: this is for compatibility + hparams['diff_speedup'] = hparams['pndm_speedup'] spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None for param in params: - if gender is not None and hparams.get('use_key_shift_embed'): + if gender is not None and hparams['use_key_shift_embed']: param['gender'] = gender if spk_mix is not None: @@ -213,7 +216,10 @@ def variance( if speedup > 0: assert hparams['K_step'] % speedup == 0, f'Acceleration ratio must be factor of K_step {hparams["K_step"]}.' - hparams['pndm_speedup'] = speedup + hparams['diff_speedup'] = speedup + elif 'diff_speedup' not in hparams: + # NOTICE: this is for compatibility + hparams['diff_speedup'] = hparams['pndm_speedup'] spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None for param in params: diff --git a/scripts/migrate.py b/scripts/migrate.py deleted file mode 100644 index f1125f3d5..000000000 --- a/scripts/migrate.py +++ /dev/null @@ -1,78 +0,0 @@ -import pathlib -from collections import OrderedDict - -import click - - -@click.group() -def main(): - pass - - -@main.command(help='Migrate checkpoint files of MIDI-less acoustic models from old format') -@click.argument('input_ckpt', metavar='INPUT') -@click.argument('output_ckpt', metavar='OUTPUT') -@click.option('--overwrite', is_flag=True, show_default=True, help='Overwrite the existing file') -def ckpt( - input_ckpt: str, - output_ckpt: str, - overwrite: bool = False -): - input_ckpt = pathlib.Path(input_ckpt).resolve() - output_ckpt = pathlib.Path(output_ckpt).resolve() - assert input_ckpt.exists(), 'The input file does not exist.' 
- assert overwrite or not output_ckpt.exists(), \ - 'The output file already exists or is the same as the input file.\n' \ - 'This is not recommended because migration scripts may not be stable, ' \ - 'and you may be at risk of losing your model.\n' \ - 'If you are sure to OVERWRITE the existing file, please re-run this script with the \'--overwrite\' argument.' - - import torch - ckpt_loaded = torch.load(input_ckpt, map_location='cpu') - if 'category' in ckpt_loaded: - print('This checkpoint file is already in the new format.') - exit(0) - state_dict: OrderedDict = ckpt_loaded['state_dict'] - ckpt_loaded['optimizer_states'][0]['state'].clear() - new_state_dict = OrderedDict() - for key in state_dict: - if key.startswith('model.fs2'): - # keep model.fs2.xxx - new_state_dict[key] = state_dict[key] - else: - # model.xxx => model.diffusion.xxx - path = key.split('.', maxsplit=1)[1] - new_state_dict[f'model.diffusion.{path}'] = state_dict[key] - ckpt_loaded['category'] = 'acoustic' - ckpt_loaded['state_dict'] = new_state_dict - torch.save(ckpt_loaded, output_ckpt) - - -@main.command(help='Migrate transcriptions.txt in old datasets to transcriptions.csv') -@click.argument('input_txt', metavar='INPUT') -def txt( - input_txt: str -): - input_txt = pathlib.Path(input_txt).resolve() - assert input_txt.exists(), 'The input file does not exist.' - with open(input_txt, 'r', encoding='utf8') as f: - utterances = f.readlines() - utterances = [u.split('|') for u in utterances] - utterances = [ - { - 'name': u[0], - 'ph_seq': u[2], - 'ph_dur': u[5] - } - for u in utterances - ] - - import csv - with open(input_txt.with_suffix('.csv'), 'w', encoding='utf8', newline='') as f: - writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur']) - writer.writeheader() - writer.writerows(utterances) - - -if __name__ == '__main__': - main() diff --git a/utils/__init__.py b/utils/__init__.py index 356fef7a5..abb5df151 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -165,9 +165,12 @@ def filter_kwargs(dict_to_filter, kwarg_obj): def load_ckpt( cur_model, ckpt_base_dir, ckpt_steps=None, - prefix_in_ckpt='model', key_in_ckpt='state_dict', + prefix_in_ckpt='model', ignored_prefixes=None, key_in_ckpt='state_dict', strict=True, device='cpu' ): + if ignored_prefixes is None: + # NOTICE: this is for compatibility with old checkpoints which have duplicate txt_embed layer in them. + ignored_prefixes = ['model.fs2.encoder.embed_tokens'] if not isinstance(ckpt_base_dir, pathlib.Path): ckpt_base_dir = pathlib.Path(ckpt_base_dir) if ckpt_base_dir.is_file(): @@ -197,6 +200,7 @@ def load_ckpt( state_dict = OrderedDict({ k[len(prefix_in_ckpt) + 1:]: v for k, v in state_dict.items() if k.startswith(f'{prefix_in_ckpt}.') + if all(not k.startswith(p) for p in ignored_prefixes) }) if not strict: cur_model_state_dict = cur_model.state_dict() diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 7c6f317f5..269122a6d 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -29,7 +29,7 @@ def locate_dictionary(): """ assert 'dictionary' in hparams or 'g2p_dictionary' in hparams, \ 'Please specify a dictionary file in your config.' 
- config_dict_path = pathlib.Path(hparams.get('dictionary', hparams.get('g2p_dictionary'))) + config_dict_path = pathlib.Path(hparams['dictionary']) if config_dict_path.exists(): return config_dict_path work_dir = pathlib.Path(hparams['work_dir']) diff --git a/utils/pitch_utils.py b/utils/pitch_utils.py index fe706f94f..57ae943f7 100644 --- a/utils/pitch_utils.py +++ b/utils/pitch_utils.py @@ -1,23 +1,4 @@ import numpy as np -import torch - -f0_bin = 256 -f0_max = 1100.0 -f0_min = 50.0 -f0_mel_min = 1127 * np.log(1 + f0_min / 700) -f0_mel_max = 1127 * np.log(1 + f0_max / 700) - - -def f0_to_coarse(f0): - is_torch = isinstance(f0, torch.Tensor) - f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 - - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 - f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) - assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) - return f0_coarse def norm_f0(f0, uv=None): diff --git a/utils/text_encoder.py b/utils/text_encoder.py index 605b7e80e..4b7815c46 100644 --- a/utils/text_encoder.py +++ b/utils/text_encoder.py @@ -1,19 +1,9 @@ import numpy as np -from utils.hparams import hparams - PAD = '' PAD_INDEX = 0 -def strip_ids(ids, ids_to_strip): - """Strip ids_to_strip from the end ids.""" - ids = list(ids) - while ids and ids[-1] in ids_to_strip: - ids.pop() - return ids - - class TokenTextEncoder: """Encoder based on a user-supplied vocabulary (file or list).""" @@ -26,30 +16,25 @@ def __init__(self, vocab_list): Args: vocab_list: If not None, a list of elements of the vocabulary. """ - self.num_reserved_ids = hparams.get('num_pad_tokens', 3) - assert self.num_reserved_ids > 0, 'num_pad_tokens must be positive' self.vocab_list = sorted(vocab_list) def encode(self, sentence): """Converts a space-separated string of phones to a list of ids.""" phones = sentence.strip().split() if isinstance(sentence, str) else sentence - return [self.vocab_list.index(ph) + self.num_reserved_ids if ph != PAD else PAD_INDEX for ph in phones] + return [self.vocab_list.index(ph) + 1 if ph != PAD else PAD_INDEX for ph in phones] def decode(self, ids, strip_padding=False): if strip_padding: ids = np.trim_zeros(ids) ids = list(ids) return ' '.join([ - self.vocab_list[_id - self.num_reserved_ids] if _id >= self.num_reserved_ids else PAD + self.vocab_list[_id - 1] if _id >= 1 else PAD for _id in ids ]) - def pad(self): - pass - @property def vocab_size(self): - return len(self.vocab_list) + self.num_reserved_ids + return len(self.vocab_list) + 1 def __len__(self): return self.vocab_size @@ -64,5 +49,5 @@ def store_to_file(self, filename): filename: Full path of the file to store the vocab to. 
""" with open(filename, 'w', encoding='utf8') as f: - [print(PAD, file=f) for _ in range(self.num_reserved_ids)] + print(PAD, file=f) [print(tok, file=f) for tok in self.vocab_list] diff --git a/utils/training_utils.py b/utils/training_utils.py index 98fb6da47..26d24eec5 100644 --- a/utils/training_utils.py +++ b/utils/training_utils.py @@ -103,7 +103,7 @@ def __init__(self, dataset, max_batch_frames, max_batch_size, sub_indices=None, def __form_batches(self): if self.formed == self.epoch + self.seed: return - rng = np.random.default_rng(self.seed + self.epoch) + rng = np.random.default_rng() # Create indices if self.shuffle_sample: if self.sub_indices is not None: @@ -113,7 +113,7 @@ def __form_batches(self): indices = rng.permutation(len(self.dataset)) if self.sort_by_similar_size: - grid = int(hparams.get('sampler_frame_count_grid', 6)) + grid = int(hparams['sampler_frame_count_grid']) assert grid > 0 sizes = (np.round(np.array(self.dataset.sizes)[indices] / grid) * grid).clip(grid, None) sizes *= (-1 if self.size_reversed else 1)