# refer to:
# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py

import torch
import torch.nn as nn
import torch.nn.functional as F

from modules.backbones.LYNXNet import LYNXConvModule


class LYNXNetDecoderLayer(nn.Module):
    """
    LYNXNet decoder layer: one gated depthwise-separable conv module
    wrapped in a residual connection.

    Args:
        dim (int): channel dimension of the model
        expansion_factor (int): expansion factor of the conv module
        kernel_size (int): kernel size of the depthwise conv, default 31
        in_norm (bool): whether to apply LayerNorm inside the conv module
        activation (str): activation function name for the conv module
        dropout (float): dropout probability applied at the end of the conv
            module; 0. disables dropout entirely
    """

    def __init__(self, dim, expansion_factor, kernel_size=31, in_norm=False, activation='SiLU', dropout=0.):
        super().__init__()
        self.convmodule = LYNXConvModule(
            dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size,
            in_norm=in_norm, activation=activation, dropout=dropout
        )

    def forward(self, x) -> torch.Tensor:
        """Apply the conv module with a residual (identity) connection."""
        return x + self.convmodule(x)


class LYNXNetDecoder(nn.Module):
    """
    Auxiliary decoder: a 1x1 input projection, a stack of
    LYNXNetDecoderLayer blocks, and a 1x1 output projection.

    Args:
        in_dims (int): number of input channels
        out_dims (int): number of output channels
        num_channels (int): hidden channel count, default 512
        num_layers (int): number of decoder layers, default 6
        kernel_size (int): depthwise conv kernel size, default 31
        dropout_rate (float): dropout probability per layer; 0. disables it
    """

    def __init__(
            self, in_dims, out_dims, /, *,
            num_channels=512, num_layers=6, kernel_size=31, dropout_rate=0.
    ):
        super().__init__()
        self.input_projection = nn.Conv1d(in_dims, num_channels, 1)
        self.encoder_layers = nn.ModuleList(
            LYNXNetDecoderLayer(
                dim=num_channels,
                expansion_factor=2,
                kernel_size=kernel_size,
                in_norm=False,
                activation='SiLU',
                dropout=dropout_rate) for _ in range(num_layers)
        )
        self.output_projection = nn.Conv1d(num_channels, out_dims, kernel_size=1)

    def forward(self, x, infer=False):
        """
        Args:
            x (torch.Tensor): input tensor (#batch, length, in_dims)
            infer (bool): unused; kept for interface compatibility with
                other aux decoders
        Returns:
            torch.Tensor: output tensor (#batch, length, out_dims)
        """
        # Conv1d expects channel-first, the layers expect channel-last,
        # hence the transposes around each projection.
        x = self.input_projection(x.transpose(1, 2)).transpose(1, 2)
        for layer in self.encoder_layers:
            x = layer(x)
        x = self.output_projection(x.transpose(1, 2)).transpose(1, 2)
        return x
# refer to:
# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py

import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class SwiGLU(nn.Module):
    """Swish-gated linear unit: splits the input in two along `dim` and
    gates the first half with SiLU of the second half."""

    def __init__(self, dim=-1):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        # Using torch.split instead of chunk for ONNX export compatibility.
        out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim)
        return out * F.silu(gate)


class SinusoidalPosEmb(nn.Module):
    """Standard sinusoidal position/step embedding.

    Maps a batch of scalars (B,) to embeddings (B, dim): the first half
    of the channels are sines, the second half cosines, over a geometric
    frequency ladder.
    """

    def __init__(self, dim):
        super().__init__()
        # NOTE(review): half_dim - 1 below requires dim >= 4 (and even);
        # dim == 2 would divide by zero. In practice dim == n_chans (512).
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class Transpose(nn.Module):
    """nn.Module wrapper around Tensor.transpose so it can sit in nn.Sequential."""

    def __init__(self, dims):
        super().__init__()
        assert len(dims) == 2, 'dims must be a tuple of two dimensions'
        self.dims = dims

    def forward(self, x):
        return x.transpose(*self.dims)


class LYNXConvModule(nn.Module):
    """Gated depthwise-separable convolution block.

    Pipeline (input is channel-last, (#batch, length, dim)):
    optional LayerNorm -> 1x1 conv expanding to 2*inner_dim -> SwiGLU gate
    -> depthwise conv -> activation -> 1x1 conv back to dim -> dropout.
    """

    @staticmethod
    def calc_same_padding(kernel_size):
        # For odd kernel sizes both entries are equal; only padding[0] is
        # consumed by the Conv1d below.
        pad = kernel_size // 2
        return (pad, pad - (kernel_size + 1) % 2)

    def __init__(self, dim, expansion_factor, kernel_size=31, in_norm=False, activation='PReLU', dropout=0.):
        super().__init__()
        inner_dim = dim * expansion_factor
        # LayerNorm is also forced on for wide models (dim > 512) regardless
        # of the in_norm flag.
        _normalize = nn.LayerNorm(dim) if in_norm or dim > 512 else nn.Identity()
        activation_classes = {
            'SiLU': nn.SiLU,
            'ReLU': nn.ReLU,
            'PReLU': lambda: nn.PReLU(inner_dim)
        }
        activation = activation if activation is not None else 'PReLU'
        if activation not in activation_classes:
            raise ValueError(f'{activation} is not a valid activation')
        _activation = activation_classes[activation]()
        padding = self.calc_same_padding(kernel_size)
        if float(dropout) > 0.:
            _dropout = nn.Dropout(dropout)
        else:
            _dropout = nn.Identity()
        self.net = nn.Sequential(
            _normalize,
            Transpose((1, 2)),
            nn.Conv1d(dim, inner_dim * 2, 1),
            SwiGLU(dim=1),  # split along the channel axis (dim 1 after transpose)
            nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding[0], groups=inner_dim),
            _activation,
            nn.Conv1d(inner_dim, dim, 1),
            Transpose((1, 2)),
            _dropout
        )

    def forward(self, x):
        return self.net(x)


class LYNXNetResidualLayer(nn.Module):
    """One diffusion residual layer: inject diffusion-step and conditioner
    embeddings, run the conv module, and add the residual."""

    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, in_norm=False, activation='PReLU', dropout=0.):
        super().__init__()
        self.diffusion_projection = nn.Conv1d(dim, dim, 1)
        self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1)
        self.convmodule = LYNXConvModule(
            dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size,
            in_norm=in_norm, activation=activation, dropout=dropout
        )

    def forward(self, x, conditioner, diffusion_step):
        # x arrives channel-first (#batch, dim, length); keep the residual
        # pre-transposed to channel-last to match the conv module's output.
        res_x = x.transpose(1, 2)
        x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
        x = x.transpose(1, 2)
        x = self.convmodule(x)  # (#batch, length, dim)
        x = x + res_x
        x = x.transpose(1, 2)

        return x  # (#batch, dim, length)


class LYNXNet(nn.Module):
    def __init__(self, in_dims, n_feats, *, n_layers=6, n_chans=512, n_dilates=2, in_norm=False, activation='PReLU',
                 dropout=0.):
        """
        LYNXNet (Linear Gated Depthwise Separable Convolution Network)
        TIPS: You can control the style of the generated results by modifying the 'activation':
        - 'PReLU' (default): similar to WaveNet
        - 'SiLU': voice will be more pronounced, not recommended for use under DDPM
        - 'ReLU': contrary to 'SiLU', voice will be weakened
        """
        super().__init__()
        # Project config; imported lazily so this module stays importable
        # (and unit-testable) without the project environment.
        from utils.hparams import hparams
        self.input_projection = nn.Conv1d(in_dims * n_feats, n_chans, 1)
        self.diffusion_embedding = nn.Sequential(
            SinusoidalPosEmb(n_chans),
            nn.Linear(n_chans, n_chans * 4),
            nn.GELU(),
            nn.Linear(n_chans * 4, n_chans),
        )
        self.residual_layers = nn.ModuleList(
            [
                LYNXNetResidualLayer(
                    dim_cond=hparams['hidden_size'],
                    dim=n_chans,
                    expansion_factor=n_dilates,
                    kernel_size=31,
                    in_norm=in_norm,
                    activation=activation,
                    dropout=dropout
                )
                for _ in range(n_layers)
            ]
        )
        self.output_projection = nn.Conv1d(n_chans, in_dims * n_feats, kernel_size=1)
        # Zero-init the output so the network starts as an identity-like
        # (zero-residual) predictor, as is customary for diffusion backbones.
        nn.init.zeros_(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, F, M, T]
        :param diffusion_step: [B, 1]
        :param cond: [B, H, T]
        :return: noise prediction with the same shape as spec
        """

        # To keep compatibility with DiffSVC, [B, 1, M, T]
        x = spec
        use_4_dim = False
        if x.dim() == 4:
            x = x[:, 0]
            use_4_dim = True

        assert x.dim() == 3, f"mel must be 3 dim tensor, but got {x.dim()}"

        x = self.input_projection(x)  # x [B, residual_channel, T]
        x = F.gelu(x)

        # (B, n_chans) -> (B, n_chans, 1) so it broadcasts over time.
        diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1)

        for layer in self.residual_layers:
            x = layer(x, cond, diffusion_step)

        x = self.output_projection(x)  # [B, 128, T]

        return x[:, None] if use_4_dim else x