10 changes: 6 additions & 4 deletions configs/acoustic.yaml
@@ -73,10 +73,12 @@ sampling_steps: 20
 diff_accelerator: ddim
 diff_speedup: 10
 hidden_size: 256
-residual_layers: 20
-residual_channels: 512
-dilation_cycle_length: 4 # *
-backbone_type: 'wavenet'
+backbone_type: 'lynxnet'
+backbone_args:
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
 main_loss_type: l2
 main_loss_log_norm: false
 schedule_type: 'linear'
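For illustration only (this helper is not part of the PR): the old flat keys map one-to-one onto the new nested backbone_args section, with residual_layers becoming num_layers and residual_channels becoming num_channels. A minimal migration sketch in Python, assuming a plain dict-shaped config and a hypothetical helper name; the same mapping applies to the template and variance configs below.

# Hypothetical helper (not in this PR): rewrites the legacy flat WaveNet
# keys into the new nested backbone_type / backbone_args layout.
def migrate_backbone_config(config: dict) -> dict:
    legacy_keys = ('residual_layers', 'residual_channels', 'dilation_cycle_length')
    if not any(key in config for key in legacy_keys):
        return dict(config)  # already in the new layout
    migrated = {key: value for key, value in config.items() if key not in legacy_keys}
    migrated.setdefault('backbone_type', 'wavenet')
    migrated['backbone_args'] = {
        'num_layers': config.get('residual_layers', 20),
        'num_channels': config.get('residual_channels', 256),
        'dilation_cycle_length': config.get('dilation_cycle_length', 4),
    }
    return migrated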
16 changes: 12 additions & 4 deletions configs/templates/config_acoustic.yaml
@@ -51,16 +51,24 @@ augmentation_args:
     range: [0.5, 2.]
     scale: 0.75
 
-residual_channels: 512
-residual_layers: 20
-
-# shallow diffusion
+# diffusion and shallow diffusion
 diffusion_type: reflow
 use_shallow_diffusion: true
 T_start: 0.4
 T_start_infer: 0.4
 K_step: 300
 K_step_infer: 300
+backbone_type: 'lynxnet'
+backbone_args:
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
+# backbone_type: 'wavenet'
+# backbone_args:
+#   num_channels: 512
+#   num_layers: 20
+#   dilation_cycle_length: 4
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
16 changes: 10 additions & 6 deletions configs/templates/config_variance.yaml
@@ -78,15 +78,19 @@ pitch_prediction_args:
   pitd_clip_min: -12.0
   pitd_clip_max: 12.0
   repeat_bins: 64
-  residual_layers: 20
-  residual_channels: 256
-  dilation_cycle_length: 5 # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5
 
 variances_prediction_args:
   total_repeat_bins: 48
-  residual_layers: 10
-  residual_channels: 192
-  dilation_cycle_length: 4 # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 10
+    num_channels: 192
+    dilation_cycle_length: 4
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
16 changes: 10 additions & 6 deletions configs/variance.yaml
@@ -68,9 +68,11 @@ pitch_prediction_args:
   pitd_clip_min: -12.0
   pitd_clip_max: 12.0
   repeat_bins: 64
-  residual_layers: 20
-  residual_channels: 256
-  dilation_cycle_length: 5 # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5
 
 energy_db_min: -96.0
 energy_db_max: -12.0
@@ -89,9 +91,11 @@ tension_smooth_width: 0.12

 variances_prediction_args:
   total_repeat_bins: 48
-  residual_layers: 10
-  residual_channels: 192
-  dilation_cycle_length: 4 # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 10
+    num_channels: 192
+    dilation_cycle_length: 4
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
32 changes: 8 additions & 24 deletions deployment/modules/toplevel.py
@@ -31,12 +31,8 @@ def __init__(self, vocab_size, out_dims):
             num_feats=1,
             timesteps=hparams['timesteps'],
             k_step=hparams['K_step'],
-            backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-            backbone_args={
-                'n_layers': hparams['residual_layers'],
-                'n_chans': hparams['residual_channels'],
-                'n_dilates': hparams['dilation_cycle_length'],
-            },
+            backbone_type=self.backbone_type,
+            backbone_args=self.backbone_args,
             spec_min=hparams['spec_min'],
             spec_max=hparams['spec_max']
         )
@@ -46,12 +42,8 @@ def __init__(self, vocab_size, out_dims):
             num_feats=1,
             t_start=hparams['T_start'],
             time_scale_factor=hparams['time_scale_factor'],
-            backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-            backbone_args={
-                'n_layers': hparams['residual_layers'],
-                'n_chans': hparams['residual_channels'],
-                'n_dilates': hparams['dilation_cycle_length'],
-            },
+            backbone_type=self.backbone_type,
+            backbone_args=self.backbone_args,
             spec_min=hparams['spec_min'],
             spec_max=hparams['spec_max']
         )
@@ -155,12 +147,8 @@ def __init__(self, vocab_size):
                 repeat_bins=pitch_hparams['repeat_bins'],
                 timesteps=hparams['timesteps'],
                 k_step=hparams['K_step'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': pitch_hparams['residual_layers'],
-                    'n_chans': pitch_hparams['residual_channels'],
-                    'n_dilates': pitch_hparams['dilation_cycle_length'],
-                }
+                backbone_type=self.pitch_backbone_type,
+                backbone_args=self.pitch_backbone_args
             )
         elif self.diffusion_type == 'reflow':
             self.pitch_predictor = PitchRectifiedFlowONNX(
@@ -170,12 +158,8 @@
                 cmax=pitch_hparams['pitd_clip_max'],
                 repeat_bins=pitch_hparams['repeat_bins'],
                 time_scale_factor=hparams['time_scale_factor'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': pitch_hparams['residual_layers'],
-                    'n_chans': pitch_hparams['residual_channels'],
-                    'n_dilates': pitch_hparams['dilation_cycle_length'],
-                }
+                backbone_type=self.pitch_backbone_type,
+                backbone_args=self.pitch_backbone_args
             )
         else:
             raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
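Not shown in this diff: how self.backbone_type and self.backbone_args get populated. A plausible reading, sketched below under the assumption that they are resolved once from hparams with the same legacy fallback the deleted inline dicts provided; the helper name resolve_backbone_settings is invented for illustration.

# Hypothetical sketch, not code from this PR: resolve the backbone type
# and args once, falling back to the legacy flat keys (mirroring the
# deleted hparams.get(...) / inline-dict construction above).
def resolve_backbone_settings(hp: dict):
    backbone_type = hp.get('backbone_type', hp.get('diff_decoder_type'))
    backbone_args = hp.get('backbone_args')
    if backbone_args is None:  # legacy config without a backbone_args section
        backbone_args = {
            'num_layers': hp['residual_layers'],
            'num_channels': hp['residual_channels'],
            'dilation_cycle_length': hp['dilation_cycle_length'],
        }
    return backbone_type, backbone_args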
178 changes: 178 additions & 0 deletions modules/backbones/LYNXNet.py
@@ -0,0 +1,178 @@
# refer to:
# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from utils.hparams import hparams


class SwiGLU(nn.Module):
    # Swish-gated linear unit: splits the input in half along `dim` and
    # gates one half with SiLU of the other.
    def __init__(self, dim=-1):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        # out, gate = x.chunk(2, dim=self.dim)
        # Using torch.split instead of chunk for ONNX export compatibility.
        out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim)
        return out * F.silu(gate)


class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class Transpose(nn.Module):
    def __init__(self, dims):
        super().__init__()
        assert len(dims) == 2, 'dims must be a tuple of two dimensions'
        self.dims = dims

    def forward(self, x):
        return x.transpose(*self.dims)


class LYNXConvModule(nn.Module):
    @staticmethod
    def calc_same_padding(kernel_size):
        pad = kernel_size // 2
        return (pad, pad - (kernel_size + 1) % 2)

    def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
        super().__init__()
        inner_dim = dim * expansion_factor
        activation_classes = {
            'SiLU': nn.SiLU,
            'ReLU': nn.ReLU,
            'PReLU': lambda: nn.PReLU(inner_dim)
        }
        activation = activation if activation is not None else 'PReLU'
        if activation not in activation_classes:
            raise ValueError(f'{activation} is not a valid activation')
        _activation = activation_classes[activation]()
        padding = self.calc_same_padding(kernel_size)
        if float(dropout) > 0.:
            _dropout = nn.Dropout(dropout)
        else:
            _dropout = nn.Identity()
        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            Transpose((1, 2)),
            nn.Conv1d(dim, inner_dim * 2, 1),
            SwiGLU(dim=1),
            nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding[0], groups=inner_dim),
            _activation,
            nn.Conv1d(inner_dim, dim, 1),
            Transpose((1, 2)),
            _dropout
        )

    def forward(self, x):
        return self.net(x)


class LYNXNetResidualLayer(nn.Module):
    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
        super().__init__()
        self.diffusion_projection = nn.Conv1d(dim, dim, 1)
        self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1)
        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout)

    def forward(self, x, conditioner, diffusion_step):
        res_x = x.transpose(1, 2)
        x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
        x = x.transpose(1, 2)
        x = self.convmodule(x)  # (#batch, length, dim)
        x = x + res_x
        x = x.transpose(1, 2)

        return x  # (#batch, dim, length)


class LYNXNet(nn.Module):
    def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31, activation='PReLU', dropout=0.):
        """
        LYNXNet (Linear Gated Depthwise Separable Convolution Network)
        Tips: you can control the style of the generated results by changing 'activation':
        - 'PReLU' (default): similar to WaveNet
        - 'SiLU': the voice will be more pronounced; not recommended with DDPM
        - 'ReLU': contrary to 'SiLU', the voice will be weakened
        """
        super().__init__()
        self.in_dims = in_dims
        self.n_feats = n_feats
        self.input_projection = nn.Conv1d(in_dims * n_feats, num_channels, 1)
        self.diffusion_embedding = nn.Sequential(
            SinusoidalPosEmb(num_channels),
            nn.Linear(num_channels, num_channels * 4),
            nn.GELU(),
            nn.Linear(num_channels * 4, num_channels),
        )
        self.residual_layers = nn.ModuleList(
            [
                LYNXNetResidualLayer(
                    dim_cond=hparams['hidden_size'],
                    dim=num_channels,
                    expansion_factor=expansion_factor,
                    kernel_size=kernel_size,
                    activation=activation,
                    dropout=dropout
                )
                for i in range(num_layers)
            ]
        )
        self.norm = nn.LayerNorm(num_channels)
        self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1)
        nn.init.zeros_(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, F, M, T]
        :param diffusion_step: [B,]
        :param cond: [B, H, T]
        :return:
        """

        if self.n_feats == 1:
            x = spec.squeeze(1)  # [B, M, T]
        else:
            x = spec.flatten(start_dim=1, end_dim=2)  # [B, F x M, T]

        x = self.input_projection(x)  # [B, num_channels, T]
        x = F.gelu(x)

        diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1)

        for layer in self.residual_layers:
            x = layer(x, cond, diffusion_step)

        # post-norm over the channel dimension
        x = self.norm(x.transpose(1, 2)).transpose(1, 2)

        # final 1x1 projection back to the spectrogram dimensions
        x = self.output_projection(x)  # [B, in_dims * n_feats, T]

        if self.n_feats == 1:
            x = x[:, None, :, :]
        else:
            # This is a temporary solution since PyTorch 1.13
            # does not support exporting aten::unflatten to ONNX.
            # x = x.unflatten(dim=1, sizes=(self.n_feats, self.in_dims))
            x = x.reshape(-1, self.n_feats, self.in_dims, x.shape[2])
        return x
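For illustration (not part of the diff): a quick smoke test of the new backbone's tensor shapes. It assumes utils.hparams.hparams is the repo's plain global config dict and that the diffusion step arrives as a 1-D batch of timesteps.

import torch

from modules.backbones.LYNXNet import LYNXNet
from utils.hparams import hparams

# Assumption: hparams is a plain global dict; LYNXNet reads
# hparams['hidden_size'] as the conditioning width at construction time.
hparams['hidden_size'] = 256

net = LYNXNet(in_dims=128, n_feats=1, num_layers=6, num_channels=1024, kernel_size=31)
spec = torch.randn(2, 1, 128, 100)  # [B, F, M, T]
step = torch.full((2,), 0.5)        # 1-D step tensor, e.g. reflow time in [0, 1]
cond = torch.randn(2, 256, 100)     # [B, hidden_size, T]
out = net(spec, step, cond)
print(out.shape)                    # torch.Size([2, 1, 128, 100])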
14 changes: 13 additions & 1 deletion modules/backbones/__init__.py
@@ -1,5 +1,17 @@
 import torch.nn
 from modules.backbones.wavenet import WaveNet
+from modules.backbones.LYNXNet import LYNXNet
+from utils import filter_kwargs
+
 BACKBONES = {
-    'wavenet': WaveNet
+    'wavenet': WaveNet,
+    'lynxnet': LYNXNet
 }
+
+def build_backbone(
+        out_dims: int, num_feats: int,
+        backbone_type: str, backbone_args: dict
+) -> torch.nn.Module:
+    backbone = BACKBONES[backbone_type]
+    kwargs = filter_kwargs(backbone_args, backbone)
+    return BACKBONES[backbone_type](out_dims, num_feats, **kwargs)
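For illustration (not part of the diff): build_backbone looks the class up by name, and filter_kwargs presumably drops any backbone_args entries the chosen constructor does not declare, so one config schema can serve both backbones. A minimal usage sketch with the acoustic settings above:

from modules.backbones import build_backbone
from utils.hparams import hparams

hparams['hidden_size'] = 256  # LYNXNet reads this at construction time

# Constructs LYNXNet(128, 1, num_channels=1024, num_layers=6, kernel_size=31);
# keys the constructor does not accept would be filtered out, not raise.
backbone = build_backbone(
    out_dims=128, num_feats=1,
    backbone_type='lynxnet',
    backbone_args={'num_channels': 1024, 'num_layers': 6, 'kernel_size': 31},
)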
20 changes: 10 additions & 10 deletions modules/backbones/wavenet.py
@@ -63,27 +63,27 @@ def forward(self, x, conditioner, diffusion_step):


 class WaveNet(nn.Module):
-    def __init__(self, in_dims, n_feats, *, n_layers=20, n_chans=256, n_dilates=4):
+    def __init__(self, in_dims, n_feats, *, num_layers=20, num_channels=256, dilation_cycle_length=4):
         super().__init__()
         self.in_dims = in_dims
         self.n_feats = n_feats
-        self.input_projection = Conv1d(in_dims * n_feats, n_chans, 1)
-        self.diffusion_embedding = SinusoidalPosEmb(n_chans)
+        self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1)
+        self.diffusion_embedding = SinusoidalPosEmb(num_channels)
         self.mlp = nn.Sequential(
-            nn.Linear(n_chans, n_chans * 4),
+            nn.Linear(num_channels, num_channels * 4),
             nn.Mish(),
-            nn.Linear(n_chans * 4, n_chans)
+            nn.Linear(num_channels * 4, num_channels)
         )
         self.residual_layers = nn.ModuleList([
             ResidualBlock(
                 encoder_hidden=hparams['hidden_size'],
-                residual_channels=n_chans,
-                dilation=2 ** (i % n_dilates)
+                residual_channels=num_channels,
+                dilation=2 ** (i % dilation_cycle_length)
             )
-            for i in range(n_layers)
+            for i in range(num_layers)
         ])
-        self.skip_projection = Conv1d(n_chans, n_chans, 1)
-        self.output_projection = Conv1d(n_chans, in_dims * n_feats, 1)
+        self.skip_projection = Conv1d(num_channels, num_channels, 1)
+        self.output_projection = Conv1d(num_channels, in_dims * n_feats, 1)
         nn.init.zeros_(self.output_projection.weight)
 
     def forward(self, spec, diffusion_step, cond):
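A closing note on the renamed dilation_cycle_length kwarg: layer i gets dilation 2 ** (i % dilation_cycle_length), so the receptive field grows geometrically within each cycle and then resets. For the pitch predictor settings above (num_layers=20, dilation_cycle_length=5), the pattern is easy to verify:

# Dilation pattern for num_layers=20, dilation_cycle_length=5,
# as produced by 2 ** (i % dilation_cycle_length) in WaveNet.__init__.
dilations = [2 ** (i % 5) for i in range(20)]
print(dilations)
# [1, 2, 4, 8, 16, 1, 2, 4, 8, 16, 1, 2, 4, 8, 16, 1, 2, 4, 8, 16]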