diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 0364b5c15..2cbc45303 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -73,10 +73,12 @@ sampling_steps: 20
diff_accelerator: ddim
diff_speedup: 10
hidden_size: 256
-residual_layers: 20
-residual_channels: 512
-dilation_cycle_length: 4 # *
-backbone_type: 'wavenet'
+backbone_type: 'lynxnet'
+backbone_args:
+ num_channels: 1024
+ num_layers: 6
+ kernel_size: 31
+ dropout_rate: 0.0
main_loss_type: l2
main_loss_log_norm: false
schedule_type: 'linear'
diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml
index 198444bc7..a9453a368 100644
--- a/configs/templates/config_acoustic.yaml
+++ b/configs/templates/config_acoustic.yaml
@@ -51,16 +51,24 @@ augmentation_args:
range: [0.5, 2.]
scale: 0.75
-residual_channels: 512
-residual_layers: 20
-
-# shallow diffusion
+# diffusion and shallow diffusion
diffusion_type: reflow
use_shallow_diffusion: true
T_start: 0.4
T_start_infer: 0.4
K_step: 300
K_step_infer: 300
+backbone_type: 'lynxnet'
+backbone_args:
+ num_channels: 1024
+ num_layers: 6
+ kernel_size: 31
+ dropout_rate: 0.0
+#backbone_type: 'wavenet'
+#backbone_args:
+# num_channels: 512
+# num_layers: 20
+# dilation_cycle_length: 4
shallow_diffusion_args:
train_aux_decoder: true
train_diffusion: true
diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml
index d75667797..daa8e15dc 100644
--- a/configs/templates/config_variance.yaml
+++ b/configs/templates/config_variance.yaml
@@ -78,15 +78,27 @@ pitch_prediction_args:
pitd_clip_min: -12.0
pitd_clip_max: 12.0
repeat_bins: 64
- residual_layers: 20
- residual_channels: 256
- dilation_cycle_length: 5 # *
+ backbone_type: 'wavenet'
+ backbone_args:
+ num_layers: 20
+ num_channels: 256
+ dilation_cycle_length: 5
+# backbone_type: 'lynxnet'
+# backbone_args:
+# num_layers: 6
+# num_channels: 512
variances_prediction_args:
total_repeat_bins: 48
- residual_layers: 10
- residual_channels: 192
- dilation_cycle_length: 4 # *
+ backbone_type: 'wavenet'
+ backbone_args:
+ num_layers: 10
+ num_channels: 192
+ dilation_cycle_length: 4
+# backbone_type: 'lynxnet'
+# backbone_args:
+# num_layers: 6
+# num_channels: 384
lambda_dur_loss: 1.0
lambda_pitch_loss: 1.0
diff --git a/configs/variance.yaml b/configs/variance.yaml
index 2c6d002da..e9a7764f2 100644
--- a/configs/variance.yaml
+++ b/configs/variance.yaml
@@ -68,9 +68,11 @@ pitch_prediction_args:
pitd_clip_min: -12.0
pitd_clip_max: 12.0
repeat_bins: 64
- residual_layers: 20
- residual_channels: 256
- dilation_cycle_length: 5 # *
+ backbone_type: 'wavenet'
+ backbone_args:
+ num_layers: 20
+ num_channels: 256
+ dilation_cycle_length: 5
energy_db_min: -96.0
energy_db_max: -12.0
@@ -89,9 +91,11 @@ tension_smooth_width: 0.12
variances_prediction_args:
total_repeat_bins: 48
- residual_layers: 10
- residual_channels: 192
- dilation_cycle_length: 4 # *
+ backbone_type: 'wavenet'
+ backbone_args:
+ num_layers: 10
+ num_channels: 192
+ dilation_cycle_length: 4
lambda_dur_loss: 1.0
lambda_pitch_loss: 1.0
@@ -103,7 +107,6 @@ schedule_type: 'linear'
K_step: 1000
timesteps: 1000
max_beta: 0.02
-backbone_type: 'wavenet'
main_loss_type: l2
main_loss_log_norm: true
sampling_algorithm: euler
diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py
index 1dd4fe129..e358f25a0 100644
--- a/deployment/modules/toplevel.py
+++ b/deployment/modules/toplevel.py
@@ -31,12 +31,8 @@ def __init__(self, vocab_size, out_dims):
num_feats=1,
timesteps=hparams['timesteps'],
k_step=hparams['K_step'],
- backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- backbone_args={
- 'n_layers': hparams['residual_layers'],
- 'n_chans': hparams['residual_channels'],
- 'n_dilates': hparams['dilation_cycle_length'],
- },
+ backbone_type=self.backbone_type,
+ backbone_args=self.backbone_args,
spec_min=hparams['spec_min'],
spec_max=hparams['spec_max']
)
@@ -46,12 +42,8 @@ def __init__(self, vocab_size, out_dims):
num_feats=1,
t_start=hparams['T_start'],
time_scale_factor=hparams['time_scale_factor'],
- backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- backbone_args={
- 'n_layers': hparams['residual_layers'],
- 'n_chans': hparams['residual_channels'],
- 'n_dilates': hparams['dilation_cycle_length'],
- },
+ backbone_type=self.backbone_type,
+ backbone_args=self.backbone_args,
spec_min=hparams['spec_min'],
spec_max=hparams['spec_max']
)
@@ -155,12 +147,8 @@ def __init__(self, vocab_size):
repeat_bins=pitch_hparams['repeat_bins'],
timesteps=hparams['timesteps'],
k_step=hparams['K_step'],
- backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- backbone_args={
- 'n_layers': pitch_hparams['residual_layers'],
- 'n_chans': pitch_hparams['residual_channels'],
- 'n_dilates': pitch_hparams['dilation_cycle_length'],
- }
+ backbone_type=self.pitch_backbone_type,
+ backbone_args=self.pitch_backbone_args
)
elif self.diffusion_type == 'reflow':
self.pitch_predictor = PitchRectifiedFlowONNX(
@@ -170,12 +158,8 @@ def __init__(self, vocab_size):
cmax=pitch_hparams['pitd_clip_max'],
repeat_bins=pitch_hparams['repeat_bins'],
time_scale_factor=hparams['time_scale_factor'],
- backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- backbone_args={
- 'n_layers': pitch_hparams['residual_layers'],
- 'n_chans': pitch_hparams['residual_channels'],
- 'n_dilates': pitch_hparams['dilation_cycle_length'],
- }
+ backbone_type=self.pitch_backbone_type,
+ backbone_args=self.pitch_backbone_args
)
else:
raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md
index d7297b1f5..b0e8dee66 100644
--- a/docs/ConfigurationSchemas.md
+++ b/docs/ConfigurationSchemas.md
@@ -201,6 +201,24 @@ Scale ratio of random time stretching augmentation.
| default | 0.75 |
+### backbone_args
+
+Keyword arguments for the backbone of the main decoder module.
+
+
+| visibility | acoustic, variance |
+| scope | nn |
+| type | dict |
+
+
+Some available arguments are listed below.
+
+| argument name | for backbone type | description |
+|:---------------------:|:-----------------:|:-----------------------------------------------------------------------------------------------------------:|
+| num_layers | wavenet/lynxnet | Number of layer blocks, or depth of the network. |
+| num_channels | wavenet/lynxnet | Number of channels, or width of the network. |
+| dilation_cycle_length | wavenet | Length k of the cycle $2^0, 2^1, \ldots, 2^k$ of convolution dilation factors through WaveNet residual blocks. |
+
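+For example, the acoustic template in this change enables the `lynxnet` backbone like this (a `wavenet` alternative is kept commented out in the same template):
+
+```yaml
+backbone_type: 'lynxnet'
+backbone_args:
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
+```
+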
### backbone_type
Backbone type of the main decoder/predictor module.
@@ -208,9 +226,10 @@ Backbone type of the main decoder/predictor module.
| visibility | acoustic, variance |
| scope | nn |
-| customizability | reserved |
+| customizability | normal |
| type | str |
-| default | wavenet |
+| default | lynxnet |
+| constraints | Choose from 'wavenet', 'lynxnet'. |
### base_config
@@ -418,18 +437,6 @@ The type of ODE-based generative model algorithm. The following models are curre
| constraints | Choose from 'ddpm', 'reflow'. |
-### dilation_cycle_length
-
-Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks.
-
-
-| visibility | acoustic |
-| scope | nn |
-| customizability | not recommended |
-| type | int |
-| default | 4 |
-
-
### dropout
Dropout rate in some FastSpeech2 modules.
@@ -1273,13 +1280,21 @@ Arguments for pitch prediction.
| type | dict |
-### pitch_prediction_args.dilation_cycle_length
+### pitch_prediction_args.backbone_args
-Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the pitch predictor model.
+Equivalent to [backbone_args](#backbone_args) but only for the pitch predictor model.
| visibility | variance |
-| default | 5 |
+
+
+### pitch_prediction_args.backbone_type
+
+Equivalent to [backbone_type](#backbone_type) but only for the pitch predictor model. If not set, use the root backbone type.
+
+
+| visibility | variance |
+| default | wavenet |
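+
+For example, the variance template in this change sets a per-predictor backbone like this:
+
+```yaml
+pitch_prediction_args:
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5
+```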
### pitch_prediction_args.pitd_clip_max
@@ -1340,24 +1355,6 @@ Number of repeating bins in the pitch predictor.
| default | 64 |
-### pitch_prediction_args.residual_channels
-
-Equivalent to [residual_channels](#residual_channels) but only for the pitch predictor.
-
-
-| visibility | variance |
-| default | 256 |
-
-
-### pitch_prediction_args.residual_layers
-
-Equivalent to [residual_layers](#residual_layers) but only for the pitch predictor.
-
-
-| visibility | variance |
-| default | 20 |
-
-
### pl_trainer_accelerator
Type of Lightning trainer hardware accelerator.
@@ -1525,30 +1522,6 @@ Whether to use relative positional encoding in FastSpeech2 module.
| default | true |
-### residual_channels
-
-Number of dilated convolution channels in residual blocks in WaveNet.
-
-
-| visibility | acoustic |
-| scope | nn |
-| customizability | normal |
-| type | int |
-| default | 512 |
-
-
-### residual_layers
-
-Number of residual blocks in WaveNet.
-
-
-| visibility | acoustic |
-| scope | nn |
-| customizability | normal |
-| type | int |
-| default | 20 |
-
-
### sampler_frame_count_grid
The batch sampler applies an algorithm called _sorting by similar length_ when collecting batches. Data samples are first grouped by their approximate lengths before they get shuffled within each group. Assume this value is set to $L_{grid}$, the approximate length of a data sample with length $L_{real}$ can be calculated through the following expression:
@@ -2034,43 +2007,33 @@ Arguments for prediction of variance parameters other than pitch, like energy, b
| type | dict |
-### variances_prediction_args.dilation_cycle_length
+### variances_prediction_args.backbone_args
-Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the multi-variance predictor model.
+Equivalent to [backbone_args](#backbone_args) but only for the multi-variance predictor model.
| visibility | variance |
-| default | 4 |
-### variances_prediction_args.total_repeat_bins
+### variances_prediction_args.backbone_type
-Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter.
+Equivalent to [backbone_type](#backbone_type) but only for the multi-variance predictor model. If not set, use the root backbone type.
| visibility | variance |
-| scope | nn, inference |
-| customizability | recommended |
-| type | int |
-| default | 48 |
-
-
-### variances_prediction_args.residual_channels
-
-Equivalent to [residual_channels](#residual_channels) but only for the multi-variance predictor.
-
-
-| visibility | variance |
-| default | 192 |
+| default | wavenet |
-### variances_prediction_args.residual_layers
+### variances_prediction_args.total_repeat_bins
-Equivalent to [residual_layers](#residual_layers) but only for the multi-variance predictor.
+Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter.
| visibility | variance |
-| default | 10 |
+| scope | nn, inference |
+| customizability | recommended |
+| type | int |
+| default | 48 |
### vocoder
diff --git a/modules/backbones/__init__.py b/modules/backbones/__init__.py
index 1061b8779..8fce796ab 100644
--- a/modules/backbones/__init__.py
+++ b/modules/backbones/__init__.py
@@ -1,5 +1,18 @@
+import torch.nn
from modules.backbones.wavenet import WaveNet
+from modules.backbones.lynxnet import LYNXNet
+from utils import filter_kwargs
BACKBONES = {
- 'wavenet': WaveNet
+ 'wavenet': WaveNet,
+ 'lynxnet': LYNXNet
}
+
+
+def build_backbone(
+ out_dims: int, num_feats: int,
+ backbone_type: str, backbone_args: dict
+) -> torch.nn.Module:
+ backbone = BACKBONES[backbone_type]
+ kwargs = filter_kwargs(backbone_args, backbone)
+ return backbone(out_dims, num_feats, **kwargs)
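+
+
+# Usage sketch (sizes are illustrative; the args mirror configs/acoustic.yaml in this change):
+#   net = build_backbone(
+#       out_dims=128, num_feats=1,
+#       backbone_type='lynxnet',
+#       backbone_args={'num_channels': 1024, 'num_layers': 6, 'kernel_size': 31, 'dropout_rate': 0.0},
+#   )
+# filter_kwargs keeps only the keys the selected backbone's __init__ accepts
+# (e.g. 'dropout_rate' is dropped for LYNXNet, whose parameter is named 'dropout'),
+# so configs may safely carry backbone-specific extras.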
diff --git a/modules/backbones/lynxnet.py b/modules/backbones/lynxnet.py
new file mode 100644
index 000000000..744967c6b
--- /dev/null
+++ b/modules/backbones/lynxnet.py
@@ -0,0 +1,165 @@
+# refer to:
+# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
+# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modules.commons.common_layers import SinusoidalPosEmb
+from utils.hparams import hparams
+
+
+class SwiGLU(nn.Module):
+ # SwiGLU: the Swish-gated linear unit, i.e. a GLU whose gate uses SiLU (Swish).
+ def __init__(self, dim=-1):
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x):
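+ # x has an even number of channels along self.dim; split into value and gate halves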
+ # out, gate = x.chunk(2, dim=self.dim)
+ # Using torch.split instead of chunk for ONNX export compatibility.
+ out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim)
+ return out * F.silu(gate)
+
+
+class Transpose(nn.Module):
+ def __init__(self, dims):
+ super().__init__()
+ assert len(dims) == 2, 'dims must be a tuple of two dimensions'
+ self.dims = dims
+
+ def forward(self, x):
+ return x.transpose(*self.dims)
+
+
+class LYNXConvModule(nn.Module):
+ @staticmethod
+ def calc_same_padding(kernel_size):
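+ # 'same' padding for the depthwise conv: kernel_size=31 -> (15, 15);
+ # even kernels pad asymmetrically, e.g. kernel_size=4 -> (2, 1)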
+ pad = kernel_size // 2
+ return pad, pad - (kernel_size + 1) % 2
+
+ def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
+ super().__init__()
+ inner_dim = dim * expansion_factor
+ activation_classes = {
+ 'SiLU': nn.SiLU,
+ 'ReLU': nn.ReLU,
+ 'PReLU': lambda: nn.PReLU(inner_dim)
+ }
+ activation = activation if activation is not None else 'PReLU'
+ if activation not in activation_classes:
+ raise ValueError(f'{activation} is not a valid activation')
+ _activation = activation_classes[activation]()
+ padding = self.calc_same_padding(kernel_size)
+ if float(dropout) > 0.:
+ _dropout = nn.Dropout(dropout)
+ else:
+ _dropout = nn.Identity()
+ self.net = nn.Sequential(
+ nn.LayerNorm(dim),
+ Transpose((1, 2)),
+ nn.Conv1d(dim, inner_dim * 2, 1),
+ SwiGLU(dim=1),
+ nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding[0], groups=inner_dim),
+ _activation,
+ nn.Conv1d(inner_dim, dim, 1),
+ Transpose((1, 2)),
+ _dropout
+ )
+
+ def forward(self, x):
+ return self.net(x)
+
+
+class LYNXNetResidualLayer(nn.Module):
+ def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
+ super().__init__()
+ self.diffusion_projection = nn.Conv1d(dim, dim, 1)
+ self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1)
+ self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size,
+ activation=activation, dropout=dropout)
+
+ def forward(self, x, conditioner, diffusion_step):
+ res_x = x.transpose(1, 2)
+ x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
+ x = x.transpose(1, 2)
+ x = self.convmodule(x)  # (#batch, length, dim)
+ x = x + res_x
+ x = x.transpose(1, 2)
+
+ return x  # (#batch, dim, length)
+
+
+class LYNXNet(nn.Module):
+ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31,
+ activation='PReLU', dropout=0.):
+ """
+ LYNXNet (Linear Gated Depthwise Separable Convolution Network)
+ TIP: you can control the style of the generated results by changing 'activation':
+ - 'PReLU' (default): similar to WaveNet
+ - 'SiLU': voice will be more pronounced; not recommended for use under DDPM
+ - 'ReLU': contrary to 'SiLU', voice will be weakened
+ """
+ super().__init__()
+ self.in_dims = in_dims
+ self.n_feats = n_feats
+ self.input_projection = nn.Conv1d(in_dims * n_feats, num_channels, 1)
+ self.diffusion_embedding = nn.Sequential(
+ SinusoidalPosEmb(num_channels),
+ nn.Linear(num_channels, num_channels * 4),
+ nn.GELU(),
+ nn.Linear(num_channels * 4, num_channels),
+ )
+ self.residual_layers = nn.ModuleList(
+ [
+ LYNXNetResidualLayer(
+ dim_cond=hparams['hidden_size'],
+ dim=num_channels,
+ expansion_factor=expansion_factor,
+ kernel_size=kernel_size,
+ activation=activation,
+ dropout=dropout
+ )
+ for _ in range(num_layers)
+ ]
+ )
+ self.norm = nn.LayerNorm(num_channels)
+ self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1)
+ nn.init.zeros_(self.output_projection.weight)
+
+ def forward(self, spec, diffusion_step, cond):
+ """
+ :param spec: [B, F, M, T]
+ :param diffusion_step: [B, 1]
+ :param cond: [B, H, T]
+ :return:
+ """
+
+ if self.n_feats == 1:
+ x = spec[:, 0] # [B, M, T]
+ else:
+ x = spec.flatten(start_dim=1, end_dim=2) # [B, F x M, T]
+
+ x = self.input_projection(x)  # [B, num_channels, T]
+ x = F.gelu(x)
+
+ diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1)
+
+ for layer in self.residual_layers:
+ x = layer(x, cond, diffusion_step)
+
+ # post-norm
+ x = self.norm(x.transpose(1, 2)).transpose(1, 2)
+
+ # output projection (1x1 conv back to F x M channels)
+ x = self.output_projection(x)  # [B, F x M, T]
+
+ if self.n_feats == 1:
+ x = x[:, None, :, :]
+ else:
+ # This is the temporary solution since PyTorch 1.13
+ # does not support exporting aten::unflatten to ONNX
+ # x = x.unflatten(dim=1, sizes=(self.n_feats, self.in_dims))
+ x = x.reshape(-1, self.n_feats, self.in_dims, x.shape[2])
+ return x
diff --git a/modules/backbones/wavenet.py b/modules/backbones/wavenet.py
index 0a1400d30..08e57eff4 100644
--- a/modules/backbones/wavenet.py
+++ b/modules/backbones/wavenet.py
@@ -5,6 +5,7 @@
import torch.nn as nn
import torch.nn.functional as F
+from modules.commons.common_layers import SinusoidalPosEmb
from utils.hparams import hparams
@@ -14,21 +15,6 @@ def __init__(self, *args, **kwargs):
nn.init.kaiming_normal_(self.weight)
-class SinusoidalPosEmb(nn.Module):
- def __init__(self, dim):
- super().__init__()
- self.dim = dim
-
- def forward(self, x):
- device = x.device
- half_dim = self.dim // 2
- emb = math.log(10000) / (half_dim - 1)
- emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
- emb = x[:, None] * emb[None, :]
- emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
- return emb
-
-
class ResidualBlock(nn.Module):
def __init__(self, encoder_hidden, residual_channels, dilation):
super().__init__()
@@ -63,27 +49,27 @@ def forward(self, x, conditioner, diffusion_step):
class WaveNet(nn.Module):
- def __init__(self, in_dims, n_feats, *, n_layers=20, n_chans=256, n_dilates=4):
+ def __init__(self, in_dims, n_feats, *, num_layers=20, num_channels=256, dilation_cycle_length=4):
super().__init__()
self.in_dims = in_dims
self.n_feats = n_feats
- self.input_projection = Conv1d(in_dims * n_feats, n_chans, 1)
- self.diffusion_embedding = SinusoidalPosEmb(n_chans)
+ self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1)
+ self.diffusion_embedding = SinusoidalPosEmb(num_channels)
self.mlp = nn.Sequential(
- nn.Linear(n_chans, n_chans * 4),
+ nn.Linear(num_channels, num_channels * 4),
nn.Mish(),
- nn.Linear(n_chans * 4, n_chans)
+ nn.Linear(num_channels * 4, num_channels)
)
self.residual_layers = nn.ModuleList([
ResidualBlock(
encoder_hidden=hparams['hidden_size'],
- residual_channels=n_chans,
- dilation=2 ** (i % n_dilates)
+ residual_channels=num_channels,
+ dilation=2 ** (i % dilation_cycle_length)
)
- for i in range(n_layers)
+ for i in range(num_layers)
])
- self.skip_projection = Conv1d(n_chans, n_chans, 1)
- self.output_projection = Conv1d(n_chans, in_dims * n_feats, 1)
+ self.skip_projection = Conv1d(num_channels, num_channels, 1)
+ self.output_projection = Conv1d(num_channels, in_dims * n_feats, 1)
nn.init.zeros_(self.output_projection.weight)
def forward(self, spec, diffusion_step, cond):
diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py
index 9ea2c2638..b12cc7f96 100644
--- a/modules/commons/common_layers.py
+++ b/modules/commons/common_layers.py
@@ -168,3 +168,18 @@ def forward(self, x, encoder_padding_mask=None, **kwargs):
x = residual + x
x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
return x
+
+
+class SinusoidalPosEmb(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x):
+ device = x.device
+ half_dim = self.dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+ emb = x[:, None] * emb[None, :]
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+ return emb
diff --git a/modules/compat.py b/modules/compat.py
new file mode 100644
index 000000000..8311b16dd
--- /dev/null
+++ b/modules/compat.py
@@ -0,0 +1,24 @@
+def get_backbone_type(root_config: dict, nested_config: dict = None):
+ if nested_config is None:
+ nested_config = root_config
+ return nested_config.get(
+ 'backbone_type',
+ root_config.get(
+ 'backbone_type',
+ root_config.get('diff_decoder_type', 'wavenet')
+ )
+ )
+
+
+def get_backbone_args(config: dict, backbone_type: str):
+ args = config.get('backbone_args')
+ if args is not None:
+ return args
+ elif backbone_type == 'wavenet':
+ return {
+ 'num_layers': config.get('residual_layers'),
+ 'num_channels': config.get('residual_channels'),
+ 'dilation_cycle_length': config.get('dilation_cycle_length'),
+ }
+ else:
+ return None
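+
+
+# Behavior sketch for a legacy (pre-refactor) config using the old keys:
+#   root = {'diff_decoder_type': 'wavenet', 'residual_layers': 20,
+#           'residual_channels': 512, 'dilation_cycle_length': 4}
+#   get_backbone_type(root)             # -> 'wavenet' (falls back to diff_decoder_type)
+#   get_backbone_args(root, 'wavenet')  # -> {'num_layers': 20, 'num_channels': 512,
+#                                       #     'dilation_cycle_length': 4}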
diff --git a/modules/core/ddpm.py b/modules/core/ddpm.py
index d79f21c79..6b0ae4803 100644
--- a/modules/core/ddpm.py
+++ b/modules/core/ddpm.py
@@ -9,7 +9,7 @@
from torch import nn
from tqdm import tqdm
-from modules.backbones import BACKBONES
+from modules.backbones import build_backbone
from utils.hparams import hparams
@@ -57,7 +57,7 @@ def __init__(self, out_dims, num_feats=1, timesteps=1000, k_step=1000,
backbone_type=None, backbone_args=None, betas=None,
spec_min=None, spec_max=None):
super().__init__()
- self.denoise_fn: nn.Module = BACKBONES[backbone_type](out_dims, num_feats, **backbone_args)
+ self.denoise_fn: nn.Module = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
self.out_dims = out_dims
self.num_feats = num_feats
diff --git a/modules/core/reflow.py b/modules/core/reflow.py
index 2a2b21fcb..f09eb2392 100644
--- a/modules/core/reflow.py
+++ b/modules/core/reflow.py
@@ -6,7 +6,7 @@
import torch.nn as nn
from tqdm import tqdm
-from modules.backbones import BACKBONES
+from modules.backbones import build_backbone
from utils.hparams import hparams
@@ -15,7 +15,7 @@ def __init__(self, out_dims, num_feats=1, t_start=0., time_scale_factor=1000,
backbone_type=None, backbone_args=None,
spec_min=None, spec_max=None):
super().__init__()
- self.velocity_fn: nn.Module = BACKBONES[backbone_type](out_dims, num_feats, **backbone_args)
+ self.velocity_fn: nn.Module = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
self.out_dims = out_dims
self.num_feats = num_feats
self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py
index e5668536b..77ebb8331 100644
--- a/modules/fastspeech/param_adaptor.py
+++ b/modules/fastspeech/param_adaptor.py
@@ -2,6 +2,7 @@
import torch
+import modules.compat as compat
from modules.core.ddpm import MultiVarianceDiffusion
from utils import filter_kwargs
from utils.hparams import hparams
@@ -68,6 +69,8 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
f'Total number of repeat bins must be divisible by number of ' \
f'variance parameters ({len(self.variance_prediction_list)}).'
repeat_bins = total_repeat_bins // len(self.variance_prediction_list)
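+ # e.g. total_repeat_bins=48 with 3 predicted variance parameters gives 16 bins each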
+ backbone_type = compat.get_backbone_type(hparams, nested_config=variances_hparams)
+ backbone_args = compat.get_backbone_args(variances_hparams, backbone_type=backbone_type)
kwargs = filter_kwargs(
{
'ranges': ranges,
@@ -75,12 +78,8 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
'repeat_bins': repeat_bins,
'timesteps': hparams.get('timesteps'),
'time_scale_factor': hparams.get('time_scale_factor'),
- 'backbone_type': hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- 'backbone_args': {
- 'n_layers': variances_hparams['residual_layers'],
- 'n_chans': variances_hparams['residual_channels'],
- 'n_dilates': variances_hparams['dilation_cycle_length'],
- }
+ 'backbone_type': backbone_type,
+ 'backbone_args': backbone_args
},
cls
)
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 1976d09a9..99f73d541 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -5,6 +5,7 @@
import torch.nn.functional as F
from torch import Tensor
+import modules.compat as compat
from basics.base_module import CategorizedModule
from modules.aux_decoder import AuxDecoderAdaptor
from modules.commons.common_layers import (
@@ -53,18 +54,16 @@ def __init__(self, vocab_size, out_dims):
aux_decoder_args=self.shallow_args['aux_decoder_args']
)
self.diffusion_type = hparams.get('diffusion_type', 'ddpm')
+ self.backbone_type = compat.get_backbone_type(hparams)
+ self.backbone_args = compat.get_backbone_args(hparams, self.backbone_type)
if self.diffusion_type == 'ddpm':
self.diffusion = GaussianDiffusion(
out_dims=out_dims,
num_feats=1,
timesteps=hparams['timesteps'],
k_step=hparams['K_step'],
- backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- backbone_args={
- 'n_layers': hparams['residual_layers'],
- 'n_chans': hparams['residual_channels'],
- 'n_dilates': hparams['dilation_cycle_length'],
- },
+ backbone_type=self.backbone_type,
+ backbone_args=self.backbone_args,
spec_min=hparams['spec_min'],
spec_max=hparams['spec_max']
)
@@ -74,12 +73,8 @@ def __init__(self, vocab_size, out_dims):
num_feats=1,
t_start=hparams['T_start'],
time_scale_factor=hparams['time_scale_factor'],
- backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- backbone_args={
- 'n_layers': hparams['residual_layers'],
- 'n_chans': hparams['residual_channels'],
- 'n_dilates': hparams['dilation_cycle_length'],
- },
+ backbone_type=self.backbone_type,
+ backbone_args=self.backbone_args,
spec_min=hparams['spec_min'],
spec_max=hparams['spec_max']
)
@@ -157,7 +152,8 @@ def __init__(self, vocab_size):
self.pitch_retake_embed = Embedding(2, hparams['hidden_size'])
pitch_hparams = hparams['pitch_prediction_args']
-
+ self.pitch_backbone_type = compat.get_backbone_type(hparams, nested_config=pitch_hparams)
+ self.pitch_backbone_args = compat.get_backbone_args(pitch_hparams, backbone_type=self.pitch_backbone_type)
if self.diffusion_type == 'ddpm':
self.pitch_predictor = PitchDiffusion(
vmin=pitch_hparams['pitd_norm_min'],
@@ -167,12 +163,8 @@ def __init__(self, vocab_size):
repeat_bins=pitch_hparams['repeat_bins'],
timesteps=hparams['timesteps'],
k_step=hparams['K_step'],
- backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- backbone_args={
- 'n_layers': pitch_hparams['residual_layers'],
- 'n_chans': pitch_hparams['residual_channels'],
- 'n_dilates': pitch_hparams['dilation_cycle_length'],
- }
+ backbone_type=self.pitch_backbone_type,
+ backbone_args=self.pitch_backbone_args
)
elif self.diffusion_type == 'reflow':
self.pitch_predictor = PitchRectifiedFlow(
@@ -182,12 +174,8 @@ def __init__(self, vocab_size):
cmax=pitch_hparams['pitd_clip_max'],
repeat_bins=pitch_hparams['repeat_bins'],
time_scale_factor=hparams['time_scale_factor'],
- backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
- backbone_args={
- 'n_layers': pitch_hparams['residual_layers'],
- 'n_chans': pitch_hparams['residual_channels'],
- 'n_dilates': pitch_hparams['dilation_cycle_length'],
- }
+ backbone_type=self.pitch_backbone_type,
+ backbone_args=self.pitch_backbone_args
)
else:
raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
diff --git a/scripts/export.py b/scripts/export.py
index 537cdad9f..d666175d6 100644
--- a/scripts/export.py
+++ b/scripts/export.py
@@ -14,6 +14,13 @@
from utils.hparams import set_hparams, hparams
+def check_pytorch_version():
+ # Require PyTorch version to be exactly 1.13.x
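+ # (other versions may emit different or unsupported ONNX ops; e.g. the aten::unflatten
+ # workaround in modules/backbones/lynxnet.py assumes 1.13 export behavior)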
+ if torch.__version__.startswith('1.13.'):
+ return
+ raise RuntimeError('This script requires PyTorch 1.13.x. Please install the correct version.')
+
+
def find_exp(exp):
if not (root_dir / 'checkpoints' / exp).exists():
for subdir in (root_dir / 'checkpoints').iterdir():
@@ -291,4 +298,5 @@ def nsf_hifigan(
if __name__ == '__main__':
+ check_pytorch_version()
main()