diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 0364b5c15..2cbc45303 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -73,10 +73,12 @@ sampling_steps: 20 diff_accelerator: ddim diff_speedup: 10 hidden_size: 256 -residual_layers: 20 -residual_channels: 512 -dilation_cycle_length: 4 # * -backbone_type: 'wavenet' +backbone_type: 'lynxnet' +backbone_args: + num_channels: 1024 + num_layers: 6 + kernel_size: 31 + dropout_rate: 0.0 main_loss_type: l2 main_loss_log_norm: false schedule_type: 'linear' diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 198444bc7..a9453a368 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -51,16 +51,24 @@ augmentation_args: range: [0.5, 2.] scale: 0.75 -residual_channels: 512 -residual_layers: 20 - -# shallow diffusion +# diffusion and shallow diffusion diffusion_type: reflow use_shallow_diffusion: true T_start: 0.4 T_start_infer: 0.4 K_step: 300 K_step_infer: 300 +backbone_type: 'lynxnet' +backbone_args: + num_channels: 1024 + num_layers: 6 + kernel_size: 31 + dropout_rate: 0.0 +#backbone_type: 'wavenet' +#backbone_args: +# num_channels: 512 +# num_layers: 20 +# dilation_cycle_length: 4 shallow_diffusion_args: train_aux_decoder: true train_diffusion: true diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index d75667797..daa8e15dc 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -78,15 +78,27 @@ pitch_prediction_args: pitd_clip_min: -12.0 pitd_clip_max: 12.0 repeat_bins: 64 - residual_layers: 20 - residual_channels: 256 - dilation_cycle_length: 5 # * + backbone_type: 'wavenet' + backbone_args: + num_layers: 20 + num_channels: 256 + dilation_cycle_length: 5 +# backbone_type: 'lynxnet' +# backbone_args: +# num_layers: 6 +# num_channels: 512 variances_prediction_args: total_repeat_bins: 48 - residual_layers: 10 - 
residual_channels: 192 - dilation_cycle_length: 4 # * + backbone_type: 'wavenet' + backbone_args: + num_layers: 10 + num_channels: 192 + dilation_cycle_length: 4 +# backbone_type: 'lynxnet' +# backbone_args: +# num_layers: 6 +# num_channels: 384 lambda_dur_loss: 1.0 lambda_pitch_loss: 1.0 diff --git a/configs/variance.yaml b/configs/variance.yaml index 2c6d002da..e9a7764f2 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -68,9 +68,11 @@ pitch_prediction_args: pitd_clip_min: -12.0 pitd_clip_max: 12.0 repeat_bins: 64 - residual_layers: 20 - residual_channels: 256 - dilation_cycle_length: 5 # * + backbone_type: 'wavenet' + backbone_args: + num_layers: 20 + num_channels: 256 + dilation_cycle_length: 5 energy_db_min: -96.0 energy_db_max: -12.0 @@ -89,9 +91,11 @@ tension_smooth_width: 0.12 variances_prediction_args: total_repeat_bins: 48 - residual_layers: 10 - residual_channels: 192 - dilation_cycle_length: 4 # * + backbone_type: 'wavenet' + backbone_args: + num_layers: 10 + num_channels: 192 + dilation_cycle_length: 4 lambda_dur_loss: 1.0 lambda_pitch_loss: 1.0 @@ -103,7 +107,6 @@ schedule_type: 'linear' K_step: 1000 timesteps: 1000 max_beta: 0.02 -backbone_type: 'wavenet' main_loss_type: l2 main_loss_log_norm: true sampling_algorithm: euler diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py index 1dd4fe129..e358f25a0 100644 --- a/deployment/modules/toplevel.py +++ b/deployment/modules/toplevel.py @@ -31,12 +31,8 @@ def __init__(self, vocab_size, out_dims): num_feats=1, timesteps=hparams['timesteps'], k_step=hparams['K_step'], - backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')), - backbone_args={ - 'n_layers': hparams['residual_layers'], - 'n_chans': hparams['residual_channels'], - 'n_dilates': hparams['dilation_cycle_length'], - }, + backbone_type=self.backbone_type, + backbone_args=self.backbone_args, spec_min=hparams['spec_min'], spec_max=hparams['spec_max'] ) @@ -46,12 +42,8 @@ def __init__(self, 
vocab_size, out_dims): num_feats=1, t_start=hparams['T_start'], time_scale_factor=hparams['time_scale_factor'], - backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')), - backbone_args={ - 'n_layers': hparams['residual_layers'], - 'n_chans': hparams['residual_channels'], - 'n_dilates': hparams['dilation_cycle_length'], - }, + backbone_type=self.backbone_type, + backbone_args=self.backbone_args, spec_min=hparams['spec_min'], spec_max=hparams['spec_max'] ) @@ -155,12 +147,8 @@ def __init__(self, vocab_size): repeat_bins=pitch_hparams['repeat_bins'], timesteps=hparams['timesteps'], k_step=hparams['K_step'], - backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')), - backbone_args={ - 'n_layers': pitch_hparams['residual_layers'], - 'n_chans': pitch_hparams['residual_channels'], - 'n_dilates': pitch_hparams['dilation_cycle_length'], - } + backbone_type=self.pitch_backbone_type, + backbone_args=self.pitch_backbone_args ) elif self.diffusion_type == 'reflow': self.pitch_predictor = PitchRectifiedFlowONNX( @@ -170,12 +158,8 @@ def __init__(self, vocab_size): cmax=pitch_hparams['pitd_clip_max'], repeat_bins=pitch_hparams['repeat_bins'], time_scale_factor=hparams['time_scale_factor'], - backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')), - backbone_args={ - 'n_layers': pitch_hparams['residual_layers'], - 'n_chans': pitch_hparams['residual_channels'], - 'n_dilates': pitch_hparams['dilation_cycle_length'], - } + backbone_type=self.pitch_backbone_type, + backbone_args=self.pitch_backbone_args ) else: raise ValueError(f"Invalid diffusion type: {self.diffusion_type}") diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index d7297b1f5..b0e8dee66 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -201,6 +201,24 @@ Scale ratio of random time stretching augmentation. default0.75 +### backbone_args + +Keyword arguments for the backbone of main decoder module. 
+ + + + + +
visibilityacoustic, variance
scopenn
typedict
+ +Some available arguments are listed below. + +| argument name | for backbone type | description | +|:---------------------:|:-----------------:|:-----------------------------------------------------------------------------------------------------------:| +| num_layers | wavenet/lynxnet | Number of layer blocks, or depth of the network | +| num_channels | wavenet/lynxnet | Number of channels, or width of the network | +| dilation_cycle_length | wavenet | Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks. | + ### backbone_type Backbone type of the main decoder/predictor module. @@ -208,9 +226,10 @@ Backbone type of the main decoder/predictor module. - + - + +
visibilityacoustic, variance
scopenn
customizabilityreserved
customizabilitynormal
typestr
defaultwavenet
defaultlynxnet
constraintsChoose from 'wavenet', 'lynxnet'.
### base_config @@ -418,18 +437,6 @@ The type of ODE-based generative model algorithm. The following models are curre constraintsChoose from 'ddpm', 'reflow'. -### dilation_cycle_length - -Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks. - - - - - - - -
visibilityacoustic
scopenn
customizabilitynot recommended
typeint
default4
- ### dropout Dropout rate in some FastSpeech2 modules. @@ -1273,13 +1280,21 @@ Arguments for pitch prediction. typedict -### pitch_prediction_args.dilation_cycle_length +### pitch_prediction_args.backbone_args -Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the pitch predictor model. +Equivalent to [backbone_args](#backbone_args) but only for the pitch predictor model. If not set, use the root backbone type. - +
visibilityvariance
default5
+ +### pitch_prediction_args.backbone_type + +Equivalent to [backbone_type](#backbone_type) but only for the pitch predictor model. + + + +
visibilityvariance
defaultwavenet
### pitch_prediction_args.pitd_clip_max @@ -1340,24 +1355,6 @@ Number of repeating bins in the pitch predictor. default64 -### pitch_prediction_args.residual_channels - -Equivalent to [residual_channels](#residual_channels) but only for the pitch predictor. - - - - -
visibilityvariance
default256
- -### pitch_prediction_args.residual_layers - -Equivalent to [residual_layers](#residual_layers) but only for the pitch predictor. - - - - -
visibilityvariance
default20
- ### pl_trainer_accelerator Type of Lightning trainer hardware accelerator. @@ -1525,30 +1522,6 @@ Whether to use relative positional encoding in FastSpeech2 module. defaulttrue -### residual_channels - -Number of dilated convolution channels in residual blocks in WaveNet. - - - - - - - -
visibilityacoustic
scopenn
customizabilitynormal
typeint
default512
- -### residual_layers - -Number of residual blocks in WaveNet. - - - - - - - -
visibilityacoustic
scopenn
customizabilitynormal
typeint
default20
- ### sampler_frame_count_grid The batch sampler applies an algorithm called _sorting by similar length_ when collecting batches. Data samples are first grouped by their approximate lengths before they get shuffled within each group. Assume this value is set to $L_{grid}$, the approximate length of a data sample with length $L_{real}$ can be calculated through the following expression: @@ -2034,43 +2007,33 @@ Arguments for prediction of variance parameters other than pitch, like energy, b typedict -### variances_prediction_args.dilation_cycle_length +### variances_prediction_args.backbone_args -Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the multi-variance predictor model. +Equivalent to [backbone_args](#backbone_args) but only for the multi-variance predictor. -
visibilityvariance
default4
-### variances_prediction_args.total_repeat_bins +### variances_prediction_args.backbone_type -Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter. +Equivalent to [backbone_type](#backbone_type) but only for the multi-variance predictor model. If not set, use the root backbone type. - - - - -
visibilityvariance
scopenn, inference
customizabilityrecommended
typeint
default48
- -### variances_prediction_args.residual_channels - -Equivalent to [residual_channels](#residual_channels) but only for the multi-variance predictor. - - - - +
visibilityvariance
default192
defaultwavenet
-### variances_prediction_args.residual_layers +### variances_prediction_args.total_repeat_bins -Equivalent to [residual_layers](#residual_layers) but only for the multi-variance predictor. +Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter. - + + + +
visibilityvariance
default10
scopenn, inference
customizabilityrecommended
typeint
default48
### vocoder diff --git a/modules/backbones/__init__.py b/modules/backbones/__init__.py index 1061b8779..8fce796ab 100644 --- a/modules/backbones/__init__.py +++ b/modules/backbones/__init__.py @@ -1,5 +1,18 @@ +import torch.nn from modules.backbones.wavenet import WaveNet +from modules.backbones.lynxnet import LYNXNet +from utils import filter_kwargs BACKBONES = { - 'wavenet': WaveNet + 'wavenet': WaveNet, + 'lynxnet': LYNXNet } + + +def build_backbone( + out_dims: int, num_feats: int, + backbone_type: str, backbone_args: dict +) -> torch.nn.Module: + backbone = BACKBONES[backbone_type] + kwargs = filter_kwargs(backbone_args, backbone) + return BACKBONES[backbone_type](out_dims, num_feats, **kwargs) diff --git a/modules/backbones/lynxnet.py b/modules/backbones/lynxnet.py new file mode 100644 index 000000000..744967c6b --- /dev/null +++ b/modules/backbones/lynxnet.py @@ -0,0 +1,165 @@ +# refer to: +# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py +# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modules.commons.common_layers import SinusoidalPosEmb +from utils.hparams import hparams + + +class SwiGLU(nn.Module): + # Swish-Applies the gated linear unit function. + def __init__(self, dim=-1): + super().__init__() + self.dim = dim + + def forward(self, x): + # out, gate = x.chunk(2, dim=self.dim) + # Using torch.split instead of chunk for ONNX export compatibility. 
+ out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim) + return out * F.silu(gate) + + +class Transpose(nn.Module): + def __init__(self, dims): + super().__init__() + assert len(dims) == 2, 'dims must be a tuple of two dimensions' + self.dims = dims + + def forward(self, x): + return x.transpose(*self.dims) + + +class LYNXConvModule(nn.Module): + @staticmethod + def calc_same_padding(kernel_size): + pad = kernel_size // 2 + return pad, pad - (kernel_size + 1) % 2 + + def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.): + super().__init__() + inner_dim = dim * expansion_factor + activation_classes = { + 'SiLU': nn.SiLU, + 'ReLU': nn.ReLU, + 'PReLU': lambda: nn.PReLU(inner_dim) + } + activation = activation if activation is not None else 'PReLU' + if activation not in activation_classes: + raise ValueError(f'{activation} is not a valid activation') + _activation = activation_classes[activation]() + padding = self.calc_same_padding(kernel_size) + if float(dropout) > 0.: + _dropout = nn.Dropout(dropout) + else: + _dropout = nn.Identity() + self.net = nn.Sequential( + nn.LayerNorm(dim), + Transpose((1, 2)), + nn.Conv1d(dim, inner_dim * 2, 1), + SwiGLU(dim=1), + nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding[0], groups=inner_dim), + _activation, + nn.Conv1d(inner_dim, dim, 1), + Transpose((1, 2)), + _dropout + ) + + def forward(self, x): + return self.net(x) + + +class LYNXNetResidualLayer(nn.Module): + def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.): + super().__init__() + self.diffusion_projection = nn.Conv1d(dim, dim, 1) + self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1) + self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, + activation=activation, dropout=dropout) + + def forward(self, x, conditioner, diffusion_step): + res_x = x.transpose(1, 2) + x = x + 
self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner) + x = x.transpose(1, 2) + x = self.convmodule(x) # (#batch, dim, length) + x = x + res_x + x = x.transpose(1, 2) + + return x # (#batch, length, dim) + + +class LYNXNet(nn.Module): + def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31, + activation='PReLU', dropout=0.): + """ + LYNXNet(Linear Gated Depthwise Separable Convolution Network) + TIPS:You can control the style of the generated results by modifying the 'activation', + - 'PReLU'(default) : Similar to WaveNet + - 'SiLU' : Voice will be more pronounced, not recommended for use under DDPM + - 'ReLU' : Contrary to 'SiLU', Voice will be weakened + """ + super().__init__() + self.in_dims = in_dims + self.n_feats = n_feats + self.input_projection = nn.Conv1d(in_dims * n_feats, num_channels, 1) + self.diffusion_embedding = nn.Sequential( + SinusoidalPosEmb(num_channels), + nn.Linear(num_channels, num_channels * 4), + nn.GELU(), + nn.Linear(num_channels * 4, num_channels), + ) + self.residual_layers = nn.ModuleList( + [ + LYNXNetResidualLayer( + dim_cond=hparams['hidden_size'], + dim=num_channels, + expansion_factor=expansion_factor, + kernel_size=kernel_size, + activation=activation, + dropout=dropout + ) + for i in range(num_layers) + ] + ) + self.norm = nn.LayerNorm(num_channels) + self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1) + nn.init.zeros_(self.output_projection.weight) + + def forward(self, spec, diffusion_step, cond): + """ + :param spec: [B, F, M, T] + :param diffusion_step: [B, 1] + :param cond: [B, H, T] + :return: + """ + + if self.n_feats == 1: + x = spec[:, 0] # [B, M, T] + else: + x = spec.flatten(start_dim=1, end_dim=2) # [B, F x M, T] + + x = self.input_projection(x) # x [B, residual_channel, T] + x = F.gelu(x) + + diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1) + + for layer in self.residual_layers: 
+ x = layer(x, cond, diffusion_step) + + # post-norm + x = self.norm(x.transpose(1, 2)).transpose(1, 2) + + # MLP and GLU + x = self.output_projection(x) # [B, 128, T] + + if self.n_feats == 1: + x = x[:, None, :, :] + else: + # This is the temporary solution since PyTorch 1.13 + # does not support exporting aten::unflatten to ONNX + # x = x.unflatten(dim=1, sizes=(self.n_feats, self.in_dims)) + x = x.reshape(-1, self.n_feats, self.in_dims, x.shape[2]) + return x diff --git a/modules/backbones/wavenet.py b/modules/backbones/wavenet.py index 0a1400d30..08e57eff4 100644 --- a/modules/backbones/wavenet.py +++ b/modules/backbones/wavenet.py @@ -5,6 +5,7 @@ import torch.nn as nn import torch.nn.functional as F +from modules.commons.common_layers import SinusoidalPosEmb from utils.hparams import hparams @@ -14,21 +15,6 @@ def __init__(self, *args, **kwargs): nn.init.kaiming_normal_(self.weight) -class SinusoidalPosEmb(nn.Module): - def __init__(self, dim): - super().__init__() - self.dim = dim - - def forward(self, x): - device = x.device - half_dim = self.dim // 2 - emb = math.log(10000) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, device=device) * -emb) - emb = x[:, None] * emb[None, :] - emb = torch.cat((emb.sin(), emb.cos()), dim=-1) - return emb - - class ResidualBlock(nn.Module): def __init__(self, encoder_hidden, residual_channels, dilation): super().__init__() @@ -63,27 +49,27 @@ def forward(self, x, conditioner, diffusion_step): class WaveNet(nn.Module): - def __init__(self, in_dims, n_feats, *, n_layers=20, n_chans=256, n_dilates=4): + def __init__(self, in_dims, n_feats, *, num_layers=20, num_channels=256, dilation_cycle_length=4): super().__init__() self.in_dims = in_dims self.n_feats = n_feats - self.input_projection = Conv1d(in_dims * n_feats, n_chans, 1) - self.diffusion_embedding = SinusoidalPosEmb(n_chans) + self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1) + self.diffusion_embedding = SinusoidalPosEmb(num_channels) self.mlp 
= nn.Sequential( - nn.Linear(n_chans, n_chans * 4), + nn.Linear(num_channels, num_channels * 4), nn.Mish(), - nn.Linear(n_chans * 4, n_chans) + nn.Linear(num_channels * 4, num_channels) ) self.residual_layers = nn.ModuleList([ ResidualBlock( encoder_hidden=hparams['hidden_size'], - residual_channels=n_chans, - dilation=2 ** (i % n_dilates) + residual_channels=num_channels, + dilation=2 ** (i % dilation_cycle_length) ) - for i in range(n_layers) + for i in range(num_layers) ]) - self.skip_projection = Conv1d(n_chans, n_chans, 1) - self.output_projection = Conv1d(n_chans, in_dims * n_feats, 1) + self.skip_projection = Conv1d(num_channels, num_channels, 1) + self.output_projection = Conv1d(num_channels, in_dims * n_feats, 1) nn.init.zeros_(self.output_projection.weight) def forward(self, spec, diffusion_step, cond): diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 9ea2c2638..b12cc7f96 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -168,3 +168,18 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): x = residual + x x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] return x + + +class SinusoidalPosEmb(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + device = x.device + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, device=device) * -emb) + emb = x[:, None] * emb[None, :] + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb diff --git a/modules/compat.py b/modules/compat.py new file mode 100644 index 000000000..8311b16dd --- /dev/null +++ b/modules/compat.py @@ -0,0 +1,24 @@ +def get_backbone_type(root_config: dict, nested_config: dict = None): + if nested_config is None: + nested_config = root_config + return nested_config.get( + 'backbone_type', + root_config.get( + 'backbone_type', + root_config.get('diff_decoder_type', 'wavenet') + ) + ) 
+ + +def get_backbone_args(config: dict, backbone_type: str): + args = config.get('backbone_args') + if args is not None: + return args + elif backbone_type == 'wavenet': + return { + 'num_layers': config.get('residual_layers'), + 'num_channels': config.get('residual_channels'), + 'dilation_cycle_length': config.get('dilation_cycle_length'), + } + else: + return None diff --git a/modules/core/ddpm.py b/modules/core/ddpm.py index d79f21c79..6b0ae4803 100644 --- a/modules/core/ddpm.py +++ b/modules/core/ddpm.py @@ -9,7 +9,7 @@ from torch import nn from tqdm import tqdm -from modules.backbones import BACKBONES +from modules.backbones import build_backbone from utils.hparams import hparams @@ -57,7 +57,7 @@ def __init__(self, out_dims, num_feats=1, timesteps=1000, k_step=1000, backbone_type=None, backbone_args=None, betas=None, spec_min=None, spec_max=None): super().__init__() - self.denoise_fn: nn.Module = BACKBONES[backbone_type](out_dims, num_feats, **backbone_args) + self.denoise_fn: nn.Module = build_backbone(out_dims, num_feats, backbone_type, backbone_args) self.out_dims = out_dims self.num_feats = num_feats diff --git a/modules/core/reflow.py b/modules/core/reflow.py index 2a2b21fcb..f09eb2392 100644 --- a/modules/core/reflow.py +++ b/modules/core/reflow.py @@ -6,7 +6,7 @@ import torch.nn as nn from tqdm import tqdm -from modules.backbones import BACKBONES +from modules.backbones import build_backbone from utils.hparams import hparams @@ -15,7 +15,7 @@ def __init__(self, out_dims, num_feats=1, t_start=0., time_scale_factor=1000, backbone_type=None, backbone_args=None, spec_min=None, spec_max=None): super().__init__() - self.velocity_fn: nn.Module = BACKBONES[backbone_type](out_dims, num_feats, **backbone_args) + self.velocity_fn: nn.Module = build_backbone(out_dims, num_feats, backbone_type, backbone_args) self.out_dims = out_dims self.num_feats = num_feats self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False) diff --git 
a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py index e5668536b..77ebb8331 100644 --- a/modules/fastspeech/param_adaptor.py +++ b/modules/fastspeech/param_adaptor.py @@ -2,6 +2,7 @@ import torch +import modules.compat as compat from modules.core.ddpm import MultiVarianceDiffusion from utils import filter_kwargs from utils.hparams import hparams @@ -68,6 +69,8 @@ def build_adaptor(self, cls=MultiVarianceDiffusion): f'Total number of repeat bins must be divisible by number of ' \ f'variance parameters ({len(self.variance_prediction_list)}).' repeat_bins = total_repeat_bins // len(self.variance_prediction_list) + backbone_type = compat.get_backbone_type(hparams, nested_config=variances_hparams) + backbone_args = compat.get_backbone_args(variances_hparams, backbone_type=backbone_type) kwargs = filter_kwargs( { 'ranges': ranges, @@ -75,12 +78,8 @@ def build_adaptor(self, cls=MultiVarianceDiffusion): 'repeat_bins': repeat_bins, 'timesteps': hparams.get('timesteps'), 'time_scale_factor': hparams.get('time_scale_factor'), - 'backbone_type': hparams.get('backbone_type', hparams.get('diff_decoder_type')), - 'backbone_args': { - 'n_layers': variances_hparams['residual_layers'], - 'n_chans': variances_hparams['residual_channels'], - 'n_dilates': variances_hparams['dilation_cycle_length'], - } + 'backbone_type': backbone_type, + 'backbone_args': backbone_args }, cls ) diff --git a/modules/toplevel.py b/modules/toplevel.py index 1976d09a9..99f73d541 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from torch import Tensor +import modules.compat as compat from basics.base_module import CategorizedModule from modules.aux_decoder import AuxDecoderAdaptor from modules.commons.common_layers import ( @@ -53,18 +54,16 @@ def __init__(self, vocab_size, out_dims): aux_decoder_args=self.shallow_args['aux_decoder_args'] ) self.diffusion_type = hparams.get('diffusion_type', 'ddpm') + self.backbone_type = 
compat.get_backbone_type(hparams) + self.backbone_args = compat.get_backbone_args(hparams, self.backbone_type) if self.diffusion_type == 'ddpm': self.diffusion = GaussianDiffusion( out_dims=out_dims, num_feats=1, timesteps=hparams['timesteps'], k_step=hparams['K_step'], - backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')), - backbone_args={ - 'n_layers': hparams['residual_layers'], - 'n_chans': hparams['residual_channels'], - 'n_dilates': hparams['dilation_cycle_length'], - }, + backbone_type=self.backbone_type, + backbone_args=self.backbone_args, spec_min=hparams['spec_min'], spec_max=hparams['spec_max'] ) @@ -74,12 +73,8 @@ def __init__(self, vocab_size, out_dims): num_feats=1, t_start=hparams['T_start'], time_scale_factor=hparams['time_scale_factor'], - backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')), - backbone_args={ - 'n_layers': hparams['residual_layers'], - 'n_chans': hparams['residual_channels'], - 'n_dilates': hparams['dilation_cycle_length'], - }, + backbone_type=self.backbone_type, + backbone_args=self.backbone_args, spec_min=hparams['spec_min'], spec_max=hparams['spec_max'] ) @@ -157,7 +152,8 @@ def __init__(self, vocab_size): self.pitch_retake_embed = Embedding(2, hparams['hidden_size']) pitch_hparams = hparams['pitch_prediction_args'] - + self.pitch_backbone_type = compat.get_backbone_type(hparams, nested_config=pitch_hparams) + self.pitch_backbone_args = compat.get_backbone_args(pitch_hparams, backbone_type=self.pitch_backbone_type) if self.diffusion_type == 'ddpm': self.pitch_predictor = PitchDiffusion( vmin=pitch_hparams['pitd_norm_min'], @@ -167,12 +163,8 @@ def __init__(self, vocab_size): repeat_bins=pitch_hparams['repeat_bins'], timesteps=hparams['timesteps'], k_step=hparams['K_step'], - backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')), - backbone_args={ - 'n_layers': pitch_hparams['residual_layers'], - 'n_chans': pitch_hparams['residual_channels'], - 'n_dilates': 
pitch_hparams['dilation_cycle_length'], - } + backbone_type=self.pitch_backbone_type, + backbone_args=self.pitch_backbone_args ) elif self.diffusion_type == 'reflow': self.pitch_predictor = PitchRectifiedFlow( @@ -182,12 +174,8 @@ def __init__(self, vocab_size): cmax=pitch_hparams['pitd_clip_max'], repeat_bins=pitch_hparams['repeat_bins'], time_scale_factor=hparams['time_scale_factor'], - backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')), - backbone_args={ - 'n_layers': pitch_hparams['residual_layers'], - 'n_chans': pitch_hparams['residual_channels'], - 'n_dilates': pitch_hparams['dilation_cycle_length'], - } + backbone_type=self.pitch_backbone_type, + backbone_args=self.pitch_backbone_args ) else: raise ValueError(f"Invalid diffusion type: {self.diffusion_type}") diff --git a/scripts/export.py b/scripts/export.py index 537cdad9f..d666175d6 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -14,6 +14,13 @@ from utils.hparams import set_hparams, hparams +def check_pytorch_version(): + # Require PyTorch version to be exactly 1.13.x + if torch.__version__.startswith('1.13.'): + return + raise RuntimeError('This script requires PyTorch 1.13.x. Please install the correct version.') + + def find_exp(exp): if not (root_dir / 'checkpoints' / exp).exists(): for subdir in (root_dir / 'checkpoints').iterdir(): @@ -291,4 +298,5 @@ def nsf_hifigan( if __name__ == '__main__': + check_pytorch_version() main()