10 changes: 6 additions & 4 deletions configs/acoustic.yaml
@@ -73,10 +73,12 @@ sampling_steps: 20
 diff_accelerator: ddim
 diff_speedup: 10
 hidden_size: 256
-residual_layers: 20
-residual_channels: 512
-dilation_cycle_length: 4 # *
-backbone_type: 'wavenet'
+backbone_type: 'lynxnet'
+backbone_args:
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
 main_loss_type: l2
 main_loss_log_norm: false
 schedule_type: 'linear'
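For illustration only (this helper is not part of the PR): the old flat keys map one-to-one onto the new nested backbone_args section, with residual_layers becoming num_layers and residual_channels becoming num_channels. A minimal migration sketch in Python, assuming a plain dict-shaped config and a hypothetical helper name; the same mapping applies to the template and variance configs below.

# Hypothetical helper (not in this PR): rewrites the legacy flat WaveNet
# keys into the new nested backbone_type / backbone_args layout.
def migrate_backbone_config(config: dict) -> dict:
    legacy_keys = ('residual_layers', 'residual_channels', 'dilation_cycle_length')
    if not any(key in config for key in legacy_keys):
        return dict(config)  # already in the new layout
    migrated = {key: value for key, value in config.items() if key not in legacy_keys}
    migrated.setdefault('backbone_type', 'wavenet')
    migrated['backbone_args'] = {
        'num_layers': config.get('residual_layers', 20),
        'num_channels': config.get('residual_channels', 256),
        'dilation_cycle_length': config.get('dilation_cycle_length', 4),
    }
    return migrated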
16 changes: 12 additions & 4 deletions configs/templates/config_acoustic.yaml
@@ -51,16 +51,24 @@ augmentation_args:
     range: [0.5, 2.]
     scale: 0.75
 
-residual_channels: 512
-residual_layers: 20
-
-# shallow diffusion
+# diffusion and shallow diffusion
 diffusion_type: reflow
 use_shallow_diffusion: true
 T_start: 0.4
 T_start_infer: 0.4
 K_step: 300
 K_step_infer: 300
+backbone_type: 'lynxnet'
+backbone_args:
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
+# backbone_type: 'wavenet'
+# backbone_args:
+#   num_channels: 512
+#   num_layers: 20
+#   dilation_cycle_length: 4
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
16 changes: 10 additions & 6 deletions configs/templates/config_variance.yaml
@@ -78,15 +78,19 @@ pitch_prediction_args:
   pitd_clip_min: -12.0
   pitd_clip_max: 12.0
   repeat_bins: 64
-  residual_layers: 20
-  residual_channels: 256
-  dilation_cycle_length: 5 # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5
 
 variances_prediction_args:
   total_repeat_bins: 48
-  residual_layers: 10
-  residual_channels: 192
-  dilation_cycle_length: 4 # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 10
+    num_channels: 192
+    dilation_cycle_length: 4
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
16 changes: 10 additions & 6 deletions configs/variance.yaml
@@ -68,9 +68,11 @@ pitch_prediction_args:
   pitd_clip_min: -12.0
   pitd_clip_max: 12.0
   repeat_bins: 64
-  residual_layers: 20
-  residual_channels: 256
-  dilation_cycle_length: 5 # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5
 
 energy_db_min: -96.0
 energy_db_max: -12.0
@@ -89,9 +91,11 @@ tension_smooth_width: 0.12

 variances_prediction_args:
   total_repeat_bins: 48
-  residual_layers: 10
-  residual_channels: 192
-  dilation_cycle_length: 4 # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 10
+    num_channels: 192
+    dilation_cycle_length: 4
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
32 changes: 8 additions & 24 deletions deployment/modules/toplevel.py
@@ -31,12 +31,8 @@ def __init__(self, vocab_size, out_dims):
             num_feats=1,
             timesteps=hparams['timesteps'],
             k_step=hparams['K_step'],
-            backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-            backbone_args={
-                'n_layers': hparams['residual_layers'],
-                'n_chans': hparams['residual_channels'],
-                'n_dilates': hparams['dilation_cycle_length'],
-            },
+            backbone_type=self.backbone_type,
+            backbone_args=self.backbone_args,
             spec_min=hparams['spec_min'],
             spec_max=hparams['spec_max']
         )
@@ -46,12 +42,8 @@ def __init__(self, vocab_size, out_dims):
             num_feats=1,
             t_start=hparams['T_start'],
             time_scale_factor=hparams['time_scale_factor'],
-            backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-            backbone_args={
-                'n_layers': hparams['residual_layers'],
-                'n_chans': hparams['residual_channels'],
-                'n_dilates': hparams['dilation_cycle_length'],
-            },
+            backbone_type=self.backbone_type,
+            backbone_args=self.backbone_args,
             spec_min=hparams['spec_min'],
             spec_max=hparams['spec_max']
         )
@@ -155,12 +147,8 @@ def __init__(self, vocab_size):
                 repeat_bins=pitch_hparams['repeat_bins'],
                 timesteps=hparams['timesteps'],
                 k_step=hparams['K_step'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': pitch_hparams['residual_layers'],
-                    'n_chans': pitch_hparams['residual_channels'],
-                    'n_dilates': pitch_hparams['dilation_cycle_length'],
-                }
+                backbone_type=self.pitch_backbone_type,
+                backbone_args=self.pitch_backbone_args
             )
         elif self.diffusion_type == 'reflow':
             self.pitch_predictor = PitchRectifiedFlowONNX(
@@ -170,12 +158,8 @@
                 cmax=pitch_hparams['pitd_clip_max'],
                 repeat_bins=pitch_hparams['repeat_bins'],
                 time_scale_factor=hparams['time_scale_factor'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': pitch_hparams['residual_layers'],
-                    'n_chans': pitch_hparams['residual_channels'],
-                    'n_dilates': pitch_hparams['dilation_cycle_length'],
-                }
+                backbone_type=self.pitch_backbone_type,
+                backbone_args=self.pitch_backbone_args
             )
         else:
             raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
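Not shown in this diff: how self.backbone_type and self.backbone_args get populated. A plausible reading, sketched below under the assumption that they are resolved once from hparams with the same legacy fallback the deleted inline dicts provided; the helper name resolve_backbone_settings is invented for illustration.

# Hypothetical sketch, not code from this PR: resolve the backbone type
# and args once, falling back to the legacy flat keys (mirroring the
# deleted hparams.get(...) / inline-dict construction above).
def resolve_backbone_settings(hp: dict):
    backbone_type = hp.get('backbone_type', hp.get('diff_decoder_type'))
    backbone_args = hp.get('backbone_args')
    if backbone_args is None:  # legacy config without a backbone_args section
        backbone_args = {
            'num_layers': hp['residual_layers'],
            'num_channels': hp['residual_channels'],
            'dilation_cycle_length': hp['dilation_cycle_length'],
        }
    return backbone_type, backbone_args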
178 changes: 178 additions & 0 deletions modules/backbones/LYNXNet.py
@@ -0,0 +1,178 @@
# refer to:
# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from utils.hparams import hparams


class SwiGLU(nn.Module):
    # Swish-gated linear unit: splits the input in half along `dim` and
    # gates one half with SiLU of the other.
    def __init__(self, dim=-1):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        # out, gate = x.chunk(2, dim=self.dim)
        # Using torch.split instead of chunk for ONNX export compatibility.
        out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim)
        return out * F.silu(gate)


class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class Transpose(nn.Module):
    def __init__(self, dims):
        super().__init__()
        assert len(dims) == 2, 'dims must be a tuple of two dimensions'
        self.dims = dims

    def forward(self, x):
        return x.transpose(*self.dims)


class LYNXConvModule(nn.Module):
    @staticmethod
    def calc_same_padding(kernel_size):
        pad = kernel_size // 2
        return (pad, pad - (kernel_size + 1) % 2)

    def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
        super().__init__()
        inner_dim = dim * expansion_factor
        activation_classes = {
            'SiLU': nn.SiLU,
            'ReLU': nn.ReLU,
            'PReLU': lambda: nn.PReLU(inner_dim)
        }
        activation = activation if activation is not None else 'PReLU'
        if activation not in activation_classes:
            raise ValueError(f'{activation} is not a valid activation')
        _activation = activation_classes[activation]()
        padding = self.calc_same_padding(kernel_size)
        if float(dropout) > 0.:
            _dropout = nn.Dropout(dropout)
        else:
            _dropout = nn.Identity()
        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            Transpose((1, 2)),
            nn.Conv1d(dim, inner_dim * 2, 1),
            SwiGLU(dim=1),
            nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding[0], groups=inner_dim),
            _activation,
            nn.Conv1d(inner_dim, dim, 1),
            Transpose((1, 2)),
            _dropout
        )

    def forward(self, x):
        return self.net(x)


class LYNXNetResidualLayer(nn.Module):
    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
        super().__init__()
        self.diffusion_projection = nn.Conv1d(dim, dim, 1)
        self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1)
        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout)

    def forward(self, x, conditioner, diffusion_step):
        res_x = x.transpose(1, 2)
        x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
        x = x.transpose(1, 2)
        x = self.convmodule(x)  # (#batch, length, dim)
        x = x + res_x
        x = x.transpose(1, 2)

        return x  # (#batch, dim, length)


class LYNXNet(nn.Module):
    def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31, activation='PReLU', dropout=0.):
        """
        LYNXNet (Linear Gated Depthwise Separable Convolution Network)
        Tips: you can control the style of the generated results by changing 'activation':
        - 'PReLU' (default): similar to WaveNet
        - 'SiLU': the voice will be more pronounced; not recommended with DDPM
        - 'ReLU': contrary to 'SiLU', the voice will be weakened
        """
        super().__init__()
        self.in_dims = in_dims
        self.n_feats = n_feats
        self.input_projection = nn.Conv1d(in_dims * n_feats, num_channels, 1)
        self.diffusion_embedding = nn.Sequential(
            SinusoidalPosEmb(num_channels),
            nn.Linear(num_channels, num_channels * 4),
            nn.GELU(),
            nn.Linear(num_channels * 4, num_channels),
        )
        self.residual_layers = nn.ModuleList(
            [
                LYNXNetResidualLayer(
                    dim_cond=hparams['hidden_size'],
                    dim=num_channels,
                    expansion_factor=expansion_factor,
                    kernel_size=kernel_size,
                    activation=activation,
                    dropout=dropout
                )
                for i in range(num_layers)
            ]
        )
        self.norm = nn.LayerNorm(num_channels)
        self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1)
        nn.init.zeros_(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, F, M, T]
        :param diffusion_step: [B,]
        :param cond: [B, H, T]
        :return:
        """

        if self.n_feats == 1:
            x = spec.squeeze(1)  # [B, M, T]
        else:
            x = spec.flatten(start_dim=1, end_dim=2)  # [B, F x M, T]

        x = self.input_projection(x)  # [B, num_channels, T]
        x = F.gelu(x)

        diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1)

        for layer in self.residual_layers:
            x = layer(x, cond, diffusion_step)

        # post-norm over the channel dimension
        x = self.norm(x.transpose(1, 2)).transpose(1, 2)

        # final 1x1 projection back to the spectrogram dimensions
        x = self.output_projection(x)  # [B, in_dims * n_feats, T]

        if self.n_feats == 1:
            x = x[:, None, :, :]
        else:
            # This is a temporary solution since PyTorch 1.13
            # does not support exporting aten::unflatten to ONNX.
            # x = x.unflatten(dim=1, sizes=(self.n_feats, self.in_dims))
            x = x.reshape(-1, self.n_feats, self.in_dims, x.shape[2])
        return x
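For illustration (not part of the diff): a quick smoke test of the new backbone's tensor shapes. It assumes utils.hparams.hparams is the repo's plain global config dict and that the diffusion step arrives as a 1-D batch of timesteps.

import torch

from modules.backbones.LYNXNet import LYNXNet
from utils.hparams import hparams

# Assumption: hparams is a plain global dict; LYNXNet reads
# hparams['hidden_size'] as the conditioning width at construction time.
hparams['hidden_size'] = 256

net = LYNXNet(in_dims=128, n_feats=1, num_layers=6, num_channels=1024, kernel_size=31)
spec = torch.randn(2, 1, 128, 100)  # [B, F, M, T]
step = torch.full((2,), 0.5)        # 1-D step tensor, e.g. reflow time in [0, 1]
cond = torch.randn(2, 256, 100)     # [B, hidden_size, T]
out = net(spec, step, cond)
print(out.shape)                    # torch.Size([2, 1, 128, 100])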
14 changes: 13 additions & 1 deletion modules/backbones/__init__.py
@@ -1,5 +1,17 @@
 import torch.nn
 from modules.backbones.wavenet import WaveNet
+from modules.backbones.LYNXNet import LYNXNet
+from utils import filter_kwargs
+
 BACKBONES = {
-    'wavenet': WaveNet
+    'wavenet': WaveNet,
+    'lynxnet': LYNXNet
 }
+
+def build_backbone(
+        out_dims: int, num_feats: int,
+        backbone_type: str, backbone_args: dict
+) -> torch.nn.Module:
+    backbone = BACKBONES[backbone_type]
+    kwargs = filter_kwargs(backbone_args, backbone)
+    return BACKBONES[backbone_type](out_dims, num_feats, **kwargs)
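For illustration (not part of the diff): build_backbone looks the class up by name, and filter_kwargs presumably drops any backbone_args entries the chosen constructor does not declare, so one config schema can serve both backbones. A minimal usage sketch with the acoustic settings above:

from modules.backbones import build_backbone
from utils.hparams import hparams

hparams['hidden_size'] = 256  # LYNXNet reads this at construction time

# Constructs LYNXNet(128, 1, num_channels=1024, num_layers=6, kernel_size=31);
# keys the constructor does not accept would be filtered out, not raise.
backbone = build_backbone(
    out_dims=128, num_feats=1,
    backbone_type='lynxnet',
    backbone_args={'num_channels': 1024, 'num_layers': 6, 'kernel_size': 31},
)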
20 changes: 10 additions & 10 deletions modules/backbones/wavenet.py
@@ -63,27 +63,27 @@ def forward(self, x, conditioner, diffusion_step):


 class WaveNet(nn.Module):
-    def __init__(self, in_dims, n_feats, *, n_layers=20, n_chans=256, n_dilates=4):
+    def __init__(self, in_dims, n_feats, *, num_layers=20, num_channels=256, dilation_cycle_length=4):
         super().__init__()
         self.in_dims = in_dims
         self.n_feats = n_feats
-        self.input_projection = Conv1d(in_dims * n_feats, n_chans, 1)
-        self.diffusion_embedding = SinusoidalPosEmb(n_chans)
+        self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1)
+        self.diffusion_embedding = SinusoidalPosEmb(num_channels)
         self.mlp = nn.Sequential(
-            nn.Linear(n_chans, n_chans * 4),
+            nn.Linear(num_channels, num_channels * 4),
             nn.Mish(),
-            nn.Linear(n_chans * 4, n_chans)
+            nn.Linear(num_channels * 4, num_channels)
         )
         self.residual_layers = nn.ModuleList([
             ResidualBlock(
                 encoder_hidden=hparams['hidden_size'],
-                residual_channels=n_chans,
-                dilation=2 ** (i % n_dilates)
+                residual_channels=num_channels,
+                dilation=2 ** (i % dilation_cycle_length)
             )
-            for i in range(n_layers)
+            for i in range(num_layers)
         ])
-        self.skip_projection = Conv1d(n_chans, n_chans, 1)
-        self.output_projection = Conv1d(n_chans, in_dims * n_feats, 1)
+        self.skip_projection = Conv1d(num_channels, num_channels, 1)
+        self.output_projection = Conv1d(num_channels, in_dims * n_feats, 1)
         nn.init.zeros_(self.output_projection.weight)
 
     def forward(self, spec, diffusion_step, cond):
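A closing note on the renamed dilation_cycle_length kwarg: layer i gets dilation 2 ** (i % dilation_cycle_length), so the receptive field grows geometrically within each cycle and then resets. For the pitch predictor settings above (num_layers=20, dilation_cycle_length=5), the pattern is easy to verify:

# Dilation pattern for num_layers=20, dilation_cycle_length=5,
# as produced by 2 ** (i % dilation_cycle_length) in WaveNet.__init__.
dilations = [2 ** (i % 5) for i in range(20)]
print(dilations)
# [1, 2, 4, 8, 16, 1, 2, 4, 8, 16, 1, 2, 4, 8, 16, 1, 2, 4, 8, 16]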