From 96e8ac2831b1b78e20f0f5aaa6e59880494e5f0a Mon Sep 17 00:00:00 2001
From: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com>
Date: Wed, 28 Aug 2024 20:18:21 +0800
Subject: [PATCH 01/10] [DONE]New AUX_Decoder/Backbone Network : LYNXNet (#200)

* Update __init__.py

* Update LYNXNet

* add dropout
---
 modules/aux_decoder/LYNXNetDecoder.py |  70 +++++++++++
 modules/aux_decoder/__init__.py       |   7 +-
 modules/backbones/LYNXNet.py          | 171 ++++++++++++++++++++++++++
 modules/backbones/__init__.py         |   4 +-
 4 files changed, 249 insertions(+), 3 deletions(-)
 create mode 100644 modules/aux_decoder/LYNXNetDecoder.py
 create mode 100644 modules/backbones/LYNXNet.py

diff --git a/modules/aux_decoder/LYNXNetDecoder.py b/modules/aux_decoder/LYNXNetDecoder.py
new file mode 100644
index 000000000..42b552fbf
--- /dev/null
+++ b/modules/aux_decoder/LYNXNetDecoder.py
@@ -0,0 +1,70 @@
+# refer to： 
+# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
+# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modules.backbones.LYNXNet import LYNXConvModule
+
+
+class LYNXNetDecoderLayer(nn.Module):
+    """
+    LYNXNet Decoder Layer
+
+    Args:
+        dim (int): Dimension of model
+        expansion_factor (int): Expansion factor of conv module, default 2
+        kernel_size (int): Kernel size of conv module, default 31
+        in_norm (bool): Whether to use norm
+        activation (str): Activation Function for conv module
+    """
+
+    def __init__(self, dim, expansion_factor, kernel_size=31, in_norm=False, activation='SiLU', dropout=0.):
+        super().__init__()
+        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, in_norm=in_norm, activation=activation, dropout=dropout)
+
+    def forward(self, x) -> torch.Tensor:
+        residual = x
+        x = self.convmodule(x)
+        x = residual + x
+        
+        return x
+
+
+class LYNXNetDecoder(nn.Module):
+    def __init__(
+            self, in_dims, out_dims, /, *,
+            num_channels=512, num_layers=6, kernel_size=31, dropout_rate=0.
+    ):
+        super().__init__()
+        self.input_projection = nn.Conv1d(in_dims, num_channels, 1)
+        self.encoder_layers = nn.ModuleList(
+                LYNXNetDecoderLayer(
+                    dim=num_channels, 
+                    expansion_factor=2, 
+                    kernel_size=kernel_size, 
+                    in_norm=False, 
+                    activation='SiLU', 
+                    dropout=dropout_rate) for _ in range(num_layers)
+        )
+        self.output_projection = nn.Conv1d(num_channels, out_dims, kernel_size=1)
+
+    def forward(self, x, infer=False):
+        """
+        Args:
+            x (torch.Tensor): Input tensor (#batch, length, in_dims)
+        return:
+            torch.Tensor: Output tensor (#batch, length, out_dims)
+        """
+        x = x.transpose(1, 2)
+        x = self.input_projection(x)
+        x = x.transpose(1, 2)
+        for layer in self.encoder_layers:
+            x = layer(x)
+        x = x.transpose(1, 2)
+        x = self.output_projection(x)
+        x = x.transpose(1, 2)
+        
+        return x
\ No newline at end of file
diff --git a/modules/aux_decoder/__init__.py b/modules/aux_decoder/__init__.py
index 54ceb2113..4801b1156 100644
--- a/modules/aux_decoder/__init__.py
+++ b/modules/aux_decoder/__init__.py
@@ -2,13 +2,16 @@
 from torch import nn
 
 from .convnext import ConvNeXtDecoder
+from .LYNXNetDecoder import LYNXNetDecoder
 from utils import filter_kwargs
 
 AUX_DECODERS = {
-    'convnext': ConvNeXtDecoder
+    'convnext': ConvNeXtDecoder, 
+    'lynxnet': LYNXNetDecoder
 }
 AUX_LOSSES = {
-    'convnext': nn.L1Loss
+    'convnext': nn.L1Loss, 
+    'lynxnet': nn.L1Loss
 }
 
 
diff --git a/modules/backbones/LYNXNet.py b/modules/backbones/LYNXNet.py
new file mode 100644
index 000000000..ffc5a94e5
--- /dev/null
+++ b/modules/backbones/LYNXNet.py
@@ -0,0 +1,171 @@
+# refer to： 
+# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
+# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from utils.hparams import hparams
+
+
+class SwiGLU(nn.Module):
+    ## Swish-Applies the gated linear unit function.
+    def __init__(self, dim=-1):
+        super().__init__()
+        self.dim = dim
+    def forward(self, x):
+        # out, gate = x.chunk(2, dim=self.dim)
+        # Using torch.split instead of chunk for ONNX export compatibility.
+        out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim)
+        return out * F.silu(gate)
+
+
+class SinusoidalPosEmb(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+        emb = x[:, None] * emb[None, :]
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+
+
+class Transpose(nn.Module):
+    def __init__(self, dims):
+        super().__init__()
+        assert len(dims) == 2, 'dims must be a tuple of two dimensions'
+        self.dims = dims
+
+    def forward(self, x):
+        return x.transpose(*self.dims)
+
+
+class LYNXConvModule(nn.Module):
+    @staticmethod
+    def calc_same_padding(kernel_size):
+        pad = kernel_size // 2
+        return (pad, pad - (kernel_size + 1) % 2)
+
+    def __init__(self, dim, expansion_factor, kernel_size=31, in_norm=False, activation='PReLU', dropout=0.):
+        super().__init__()
+        inner_dim = dim * expansion_factor
+        _normalize = nn.LayerNorm(dim) if in_norm or dim > 512 else nn.Identity()
+        activation_classes = {
+            'SiLU': nn.SiLU,
+            'ReLU': nn.ReLU,
+            'PReLU': lambda: nn.PReLU(inner_dim)
+        }
+        activation = activation if activation is not None else 'PReLU'
+        if activation not in activation_classes:
+            raise ValueError(f'{activation} is not a valid activation')
+        _activation = activation_classes[activation]()
+        padding = self.calc_same_padding(kernel_size)
+        if float(dropout) > 0.:
+            _dropout = nn.Dropout(dropout)
+        else:
+            _dropout = nn.Identity()
+        self.net = nn.Sequential(
+            _normalize,
+            Transpose((1, 2)),
+            nn.Conv1d(dim, inner_dim * 2, 1),
+            SwiGLU(dim=1),
+            nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding[0], groups=inner_dim),
+            _activation,
+            nn.Conv1d(inner_dim, dim, 1),
+            Transpose((1, 2)),
+            _dropout
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class LYNXNetResidualLayer(nn.Module):
+    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, in_norm=False, activation='PReLU', dropout=0.):
+        super().__init__()
+        self.diffusion_projection = nn.Conv1d(dim, dim, 1)
+        self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1)
+        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, in_norm=in_norm, activation=activation, dropout=dropout)
+
+    def forward(self, x, conditioner, diffusion_step):
+        res_x = x.transpose(1, 2)
+        x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
+        x = x.transpose(1, 2)
+        x = self.convmodule(x)  # (#batch, dim, length)
+        x = x + res_x
+        x = x.transpose(1, 2)
+
+        return x  # (#batch, length, dim)
+
+
+class LYNXNet(nn.Module):
+    def __init__(self, in_dims, n_feats, *, n_layers=6, n_chans=512, n_dilates=2, in_norm=False, activation='PReLU', dropout=0.):
+        """
+        LYNXNet(Linear Gated Depthwise Separable Convolution Network)
+        TIPS:You can control the style of the generated results by modifying the 'activation', 
+            - 'PReLU'(default) : Similar to WaveNet
+            - 'SiLU' : Voice will be more pronounced, not recommended for use under DDPM
+            - 'ReLU' : Contrary to 'SiLU', Voice will be weakened
+        """
+        super().__init__()
+        self.input_projection = nn.Conv1d(in_dims * n_feats, n_chans, 1)
+        self.diffusion_embedding = nn.Sequential(
+            SinusoidalPosEmb(n_chans),
+            nn.Linear(n_chans, n_chans * 4),
+            nn.GELU(),
+            nn.Linear(n_chans * 4, n_chans),
+        )
+        self.residual_layers = nn.ModuleList(
+            [
+                LYNXNetResidualLayer(
+                    dim_cond=hparams['hidden_size'], 
+                    dim=n_chans, 
+                    expansion_factor=n_dilates, 
+                    kernel_size=31, 
+                    in_norm=in_norm, 
+                    activation=activation, 
+                    dropout=dropout
+                )
+                for i in range(n_layers)
+            ]
+        )
+        self.output_projection = nn.Conv1d(n_chans, in_dims * n_feats, kernel_size=1)
+        nn.init.zeros_(self.output_projection.weight)
+        
+    def forward(self, spec, diffusion_step, cond):
+        """
+        :param spec: [B, F, M, T]
+        :param diffusion_step: [B, 1]
+        :param cond: [B, H, T]
+        :return:
+        """
+        
+        # To keep compatibility with DiffSVC, [B, 1, M, T]
+        x = spec
+        use_4_dim = False
+        if x.dim() == 4:
+            x = x[:, 0]
+            use_4_dim = True
+
+        assert x.dim() == 3, f"mel must be 3 dim tensor, but got {x.dim()}"
+
+        x = self.input_projection(x)  # x [B, residual_channel, T]
+        x = F.gelu(x)
+        
+        diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1)
+        
+        for layer in self.residual_layers:
+            x = layer(x, cond, diffusion_step)
+        
+        # MLP and GLU
+        x = self.output_projection(x)  # [B, 128, T]
+        
+        return x[:, None] if use_4_dim else x
diff --git a/modules/backbones/__init__.py b/modules/backbones/__init__.py
index 1061b8779..a91578567 100644
--- a/modules/backbones/__init__.py
+++ b/modules/backbones/__init__.py
@@ -1,5 +1,7 @@
 from modules.backbones.wavenet import WaveNet
+from modules.backbones.LYNXNet import LYNXNet
 
 BACKBONES = {
-    'wavenet': WaveNet
+    'wavenet': WaveNet, 
+    'lynxnet': LYNXNet
 }

From 0a6a802860469521ca154e712532897bd146efd9 Mon Sep 17 00:00:00 2001
From: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com>
Date: Fri, 6 Sep 2024 22:29:12 +0800
Subject: [PATCH 02/10] Lynxnet outnorm (#206)

* post-norm

* fix

* add norm+mlp

* Update LYNXNet.py

* Update LYNXNetDecoder.py

* do not need mlp

* do not need mlp

* Add out norm for LYNXNET

* Add out norm for LYNXNETDecoder
---
 modules/aux_decoder/LYNXNetDecoder.py | 5 ++++-
 modules/backbones/LYNXNet.py          | 9 ++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/modules/aux_decoder/LYNXNetDecoder.py b/modules/aux_decoder/LYNXNetDecoder.py
index 42b552fbf..4ac5923ee 100644
--- a/modules/aux_decoder/LYNXNetDecoder.py
+++ b/modules/aux_decoder/LYNXNetDecoder.py
@@ -49,6 +49,7 @@ def __init__(
                     activation='SiLU', 
                     dropout=dropout_rate) for _ in range(num_layers)
         )
+        self.norm = nn.LayerNorm(num_channels)
         self.output_projection = nn.Conv1d(num_channels, out_dims, kernel_size=1)
 
     def forward(self, x, infer=False):
@@ -63,8 +64,10 @@ def forward(self, x, infer=False):
         x = x.transpose(1, 2)
         for layer in self.encoder_layers:
             x = layer(x)
+        x = self.norm(x)
         x = x.transpose(1, 2)
+        
         x = self.output_projection(x)
         x = x.transpose(1, 2)
         
-        return x
\ No newline at end of file
+        return x
diff --git a/modules/backbones/LYNXNet.py b/modules/backbones/LYNXNet.py
index ffc5a94e5..b4527f98b 100644
--- a/modules/backbones/LYNXNet.py
+++ b/modules/backbones/LYNXNet.py
@@ -57,7 +57,6 @@ def calc_same_padding(kernel_size):
     def __init__(self, dim, expansion_factor, kernel_size=31, in_norm=False, activation='PReLU', dropout=0.):
         super().__init__()
         inner_dim = dim * expansion_factor
-        _normalize = nn.LayerNorm(dim) if in_norm or dim > 512 else nn.Identity()
         activation_classes = {
             'SiLU': nn.SiLU,
             'ReLU': nn.ReLU,
@@ -73,7 +72,7 @@ def __init__(self, dim, expansion_factor, kernel_size=31, in_norm=False, activat
         else:
             _dropout = nn.Identity()
         self.net = nn.Sequential(
-            _normalize,
+            nn.LayerNorm(dim),
             Transpose((1, 2)),
             nn.Conv1d(dim, inner_dim * 2, 1),
             SwiGLU(dim=1),
@@ -137,9 +136,10 @@ def __init__(self, in_dims, n_feats, *, n_layers=6, n_chans=512, n_dilates=2, in
                 for i in range(n_layers)
             ]
         )
+        self.norm = nn.LayerNorm(n_chans)
         self.output_projection = nn.Conv1d(n_chans, in_dims * n_feats, kernel_size=1)
         nn.init.zeros_(self.output_projection.weight)
-        
+    
     def forward(self, spec, diffusion_step, cond):
         """
         :param spec: [B, F, M, T]
@@ -164,6 +164,9 @@ def forward(self, spec, diffusion_step, cond):
         
         for layer in self.residual_layers:
             x = layer(x, cond, diffusion_step)
+
+        # post-norm
+        x = self.norm(x.transpose(1, 2)).transpose(1, 2)
         
         # MLP and GLU
         x = self.output_projection(x)  # [B, 128, T]

From 2ac0158b51f69c26c3b4fc4f55ccaa0f45916fb8 Mon Sep 17 00:00:00 2001
From: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com>
Date: Sun, 20 Oct 2024 21:41:52 +0800
Subject: [PATCH 03/10] delete lynxnet aux_decoder (#212)

---
 modules/aux_decoder/LYNXNetDecoder.py | 73 ---------------------------
 modules/aux_decoder/__init__.py       |  7 +--
 2 files changed, 2 insertions(+), 78 deletions(-)
 delete mode 100644 modules/aux_decoder/LYNXNetDecoder.py

diff --git a/modules/aux_decoder/LYNXNetDecoder.py b/modules/aux_decoder/LYNXNetDecoder.py
deleted file mode 100644
index 4ac5923ee..000000000
--- a/modules/aux_decoder/LYNXNetDecoder.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# refer to： 
-# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
-# https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from modules.backbones.LYNXNet import LYNXConvModule
-
-
-class LYNXNetDecoderLayer(nn.Module):
-    """
-    LYNXNet Decoder Layer
-
-    Args:
-        dim (int): Dimension of model
-        expansion_factor (int): Expansion factor of conv module, default 2
-        kernel_size (int): Kernel size of conv module, default 31
-        in_norm (bool): Whether to use norm
-        activation (str): Activation Function for conv module
-    """
-
-    def __init__(self, dim, expansion_factor, kernel_size=31, in_norm=False, activation='SiLU', dropout=0.):
-        super().__init__()
-        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, in_norm=in_norm, activation=activation, dropout=dropout)
-
-    def forward(self, x) -> torch.Tensor:
-        residual = x
-        x = self.convmodule(x)
-        x = residual + x
-        
-        return x
-
-
-class LYNXNetDecoder(nn.Module):
-    def __init__(
-            self, in_dims, out_dims, /, *,
-            num_channels=512, num_layers=6, kernel_size=31, dropout_rate=0.
-    ):
-        super().__init__()
-        self.input_projection = nn.Conv1d(in_dims, num_channels, 1)
-        self.encoder_layers = nn.ModuleList(
-                LYNXNetDecoderLayer(
-                    dim=num_channels, 
-                    expansion_factor=2, 
-                    kernel_size=kernel_size, 
-                    in_norm=False, 
-                    activation='SiLU', 
-                    dropout=dropout_rate) for _ in range(num_layers)
-        )
-        self.norm = nn.LayerNorm(num_channels)
-        self.output_projection = nn.Conv1d(num_channels, out_dims, kernel_size=1)
-
-    def forward(self, x, infer=False):
-        """
-        Args:
-            x (torch.Tensor): Input tensor (#batch, length, in_dims)
-        return:
-            torch.Tensor: Output tensor (#batch, length, out_dims)
-        """
-        x = x.transpose(1, 2)
-        x = self.input_projection(x)
-        x = x.transpose(1, 2)
-        for layer in self.encoder_layers:
-            x = layer(x)
-        x = self.norm(x)
-        x = x.transpose(1, 2)
-        
-        x = self.output_projection(x)
-        x = x.transpose(1, 2)
-        
-        return x
diff --git a/modules/aux_decoder/__init__.py b/modules/aux_decoder/__init__.py
index 4801b1156..54ceb2113 100644
--- a/modules/aux_decoder/__init__.py
+++ b/modules/aux_decoder/__init__.py
@@ -2,16 +2,13 @@
 from torch import nn
 
 from .convnext import ConvNeXtDecoder
-from .LYNXNetDecoder import LYNXNetDecoder
 from utils import filter_kwargs
 
 AUX_DECODERS = {
-    'convnext': ConvNeXtDecoder, 
-    'lynxnet': LYNXNetDecoder
+    'convnext': ConvNeXtDecoder
 }
 AUX_LOSSES = {
-    'convnext': nn.L1Loss, 
-    'lynxnet': nn.L1Loss
+    'convnext': nn.L1Loss
 }
 
 

From 40f8488c812819224292162242c1a431609a697f Mon Sep 17 00:00:00 2001
From: yxlllc <llc1995@sina.com>
Date: Sun, 3 Nov 2024 00:10:52 +0800
Subject: [PATCH 04/10] refactor configuration options

---
 configs/acoustic.yaml                  | 10 +++--
 configs/templates/config_acoustic.yaml | 16 ++++++--
 configs/templates/config_variance.yaml | 16 +++++---
 configs/variance.yaml                  | 16 +++++---
 deployment/modules/toplevel.py         | 32 ++++------------
 modules/backbones/LYNXNet.py           | 52 ++++++++++++++------------
 modules/backbones/__init__.py          | 10 +++++
 modules/backbones/wavenet.py           | 20 +++++-----
 modules/core/ddpm.py                   |  4 +-
 modules/core/reflow.py                 |  4 +-
 modules/fastspeech/param_adaptor.py    | 16 +++++---
 modules/toplevel.py                    | 47 +++++++++++------------
 12 files changed, 130 insertions(+), 113 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 0364b5c15..6efe72367 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -73,10 +73,12 @@ sampling_steps: 20
 diff_accelerator: ddim
 diff_speedup: 10
 hidden_size: 256
-residual_layers: 20
-residual_channels: 512
-dilation_cycle_length: 4  # *
-backbone_type: 'wavenet'
+backbone_type: 'lynxnet'
+backbone_args:
+    num_channels: 1024
+    num_layers: 6
+    kernel_size: 31
+    dropout_rate: 0.0
 main_loss_type: l2
 main_loss_log_norm: false
 schedule_type: 'linear'
diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml
index 198444bc7..f0edef7a8 100644
--- a/configs/templates/config_acoustic.yaml
+++ b/configs/templates/config_acoustic.yaml
@@ -51,16 +51,24 @@ augmentation_args:
     range: [0.5, 2.]
     scale: 0.75
 
-residual_channels: 512
-residual_layers: 20
-
-# shallow diffusion
+# diffusion and shallow diffusion
 diffusion_type: reflow
 use_shallow_diffusion: true
 T_start: 0.4
 T_start_infer: 0.4
 K_step: 300
 K_step_infer: 300
+backbone_type: 'lynxnet'
+backbone_args:
+    num_channels: 1024
+    num_layers: 6
+    kernel_size: 31
+    dropout_rate: 0.0
+# backbone_type: 'wavenet'
+# backbone_args:
+#    num_channels: 512
+#    num_layers: 20
+#    dilation_cycle_length: 4
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml
index d75667797..842a76395 100644
--- a/configs/templates/config_variance.yaml
+++ b/configs/templates/config_variance.yaml
@@ -78,15 +78,19 @@ pitch_prediction_args:
   pitd_clip_min: -12.0
   pitd_clip_max: 12.0
   repeat_bins: 64
-  residual_layers: 20
-  residual_channels: 256
-  dilation_cycle_length: 5  # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5
 
 variances_prediction_args:
   total_repeat_bins: 48
-  residual_layers: 10
-  residual_channels: 192
-  dilation_cycle_length: 4  # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 10
+    num_channels: 192
+    dilation_cycle_length: 4
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
diff --git a/configs/variance.yaml b/configs/variance.yaml
index 2c6d002da..95a0781be 100644
--- a/configs/variance.yaml
+++ b/configs/variance.yaml
@@ -68,9 +68,11 @@ pitch_prediction_args:
   pitd_clip_min: -12.0
   pitd_clip_max: 12.0
   repeat_bins: 64
-  residual_layers: 20
-  residual_channels: 256
-  dilation_cycle_length: 5  # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5 
 
 energy_db_min: -96.0
 energy_db_max: -12.0
@@ -89,9 +91,11 @@ tension_smooth_width: 0.12
 
 variances_prediction_args:
   total_repeat_bins: 48
-  residual_layers: 10
-  residual_channels: 192
-  dilation_cycle_length: 4  # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 10
+    num_channels: 192
+    dilation_cycle_length: 4
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py
index 1dd4fe129..e358f25a0 100644
--- a/deployment/modules/toplevel.py
+++ b/deployment/modules/toplevel.py
@@ -31,12 +31,8 @@ def __init__(self, vocab_size, out_dims):
                 num_feats=1,
                 timesteps=hparams['timesteps'],
                 k_step=hparams['K_step'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': hparams['residual_layers'],
-                    'n_chans': hparams['residual_channels'],
-                    'n_dilates': hparams['dilation_cycle_length'],
-                },
+                backbone_type=self.backbone_type,
+                backbone_args=self.backbone_args,
                 spec_min=hparams['spec_min'],
                 spec_max=hparams['spec_max']
             )
@@ -46,12 +42,8 @@ def __init__(self, vocab_size, out_dims):
                 num_feats=1,
                 t_start=hparams['T_start'],
                 time_scale_factor=hparams['time_scale_factor'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': hparams['residual_layers'],
-                    'n_chans': hparams['residual_channels'],
-                    'n_dilates': hparams['dilation_cycle_length'],
-                },
+                backbone_type=self.backbone_type,
+                backbone_args=self.backbone_args,
                 spec_min=hparams['spec_min'],
                 spec_max=hparams['spec_max']
             )
@@ -155,12 +147,8 @@ def __init__(self, vocab_size):
                     repeat_bins=pitch_hparams['repeat_bins'],
                     timesteps=hparams['timesteps'],
                     k_step=hparams['K_step'],
-                    backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                    backbone_args={
-                        'n_layers': pitch_hparams['residual_layers'],
-                        'n_chans': pitch_hparams['residual_channels'],
-                        'n_dilates': pitch_hparams['dilation_cycle_length'],
-                    }
+                    backbone_type=self.pitch_backbone_type,
+                    backbone_args=self.pitch_backbone_args
                 )
             elif self.diffusion_type == 'reflow':
                 self.pitch_predictor = PitchRectifiedFlowONNX(
@@ -170,12 +158,8 @@ def __init__(self, vocab_size):
                     cmax=pitch_hparams['pitd_clip_max'],
                     repeat_bins=pitch_hparams['repeat_bins'],
                     time_scale_factor=hparams['time_scale_factor'],
-                    backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                    backbone_args={
-                        'n_layers': pitch_hparams['residual_layers'],
-                        'n_chans': pitch_hparams['residual_channels'],
-                        'n_dilates': pitch_hparams['dilation_cycle_length'],
-                    }
+                    backbone_type=self.pitch_backbone_type,
+                    backbone_args=self.pitch_backbone_args
                 )
             else:
                 raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
diff --git a/modules/backbones/LYNXNet.py b/modules/backbones/LYNXNet.py
index b4527f98b..ecf97a95c 100644
--- a/modules/backbones/LYNXNet.py
+++ b/modules/backbones/LYNXNet.py
@@ -54,7 +54,7 @@ def calc_same_padding(kernel_size):
         pad = kernel_size // 2
         return (pad, pad - (kernel_size + 1) % 2)
 
-    def __init__(self, dim, expansion_factor, kernel_size=31, in_norm=False, activation='PReLU', dropout=0.):
+    def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
         super().__init__()
         inner_dim = dim * expansion_factor
         activation_classes = {
@@ -88,11 +88,11 @@ def forward(self, x):
 
 
 class LYNXNetResidualLayer(nn.Module):
-    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, in_norm=False, activation='PReLU', dropout=0.):
+    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
         super().__init__()
         self.diffusion_projection = nn.Conv1d(dim, dim, 1)
         self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1)
-        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, in_norm=in_norm, activation=activation, dropout=dropout)
+        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout)
 
     def forward(self, x, conditioner, diffusion_step):
         res_x = x.transpose(1, 2)
@@ -106,7 +106,7 @@ def forward(self, x, conditioner, diffusion_step):
 
 
 class LYNXNet(nn.Module):
-    def __init__(self, in_dims, n_feats, *, n_layers=6, n_chans=512, n_dilates=2, in_norm=False, activation='PReLU', dropout=0.):
+    def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31, activation='PReLU', dropout=0.):
         """
         LYNXNet(Linear Gated Depthwise Separable Convolution Network)
         TIPS:You can control the style of the generated results by modifying the 'activation', 
@@ -115,29 +115,30 @@ def __init__(self, in_dims, n_feats, *, n_layers=6, n_chans=512, n_dilates=2, in
             - 'ReLU' : Contrary to 'SiLU', Voice will be weakened
         """
         super().__init__()
-        self.input_projection = nn.Conv1d(in_dims * n_feats, n_chans, 1)
+        self.in_dims = in_dims
+        self.n_feats = n_feats
+        self.input_projection = nn.Conv1d(in_dims * n_feats, num_channels, 1)
         self.diffusion_embedding = nn.Sequential(
-            SinusoidalPosEmb(n_chans),
-            nn.Linear(n_chans, n_chans * 4),
+            SinusoidalPosEmb(num_channels),
+            nn.Linear(num_channels, num_channels * 4),
             nn.GELU(),
-            nn.Linear(n_chans * 4, n_chans),
+            nn.Linear(num_channels * 4, num_channels),
         )
         self.residual_layers = nn.ModuleList(
             [
                 LYNXNetResidualLayer(
                     dim_cond=hparams['hidden_size'], 
-                    dim=n_chans, 
-                    expansion_factor=n_dilates, 
-                    kernel_size=31, 
-                    in_norm=in_norm, 
+                    dim=num_channels, 
+                    expansion_factor=expansion_factor, 
+                    kernel_size=kernel_size, 
                     activation=activation, 
                     dropout=dropout
                 )
-                for i in range(n_layers)
+                for i in range(num_layers)
             ]
         )
-        self.norm = nn.LayerNorm(n_chans)
-        self.output_projection = nn.Conv1d(n_chans, in_dims * n_feats, kernel_size=1)
+        self.norm = nn.LayerNorm(num_channels)
+        self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1)
         nn.init.zeros_(self.output_projection.weight)
     
     def forward(self, spec, diffusion_step, cond):
@@ -148,14 +149,10 @@ def forward(self, spec, diffusion_step, cond):
         :return:
         """
         
-        # To keep compatibility with DiffSVC, [B, 1, M, T]
-        x = spec
-        use_4_dim = False
-        if x.dim() == 4:
-            x = x[:, 0]
-            use_4_dim = True
-
-        assert x.dim() == 3, f"mel must be 3 dim tensor, but got {x.dim()}"
+        if self.n_feats == 1:
+            x = spec.squeeze(1)  # [B, M, T]
+        else:
+            x = spec.flatten(start_dim=1, end_dim=2)  # [B, F x M, T]
 
         x = self.input_projection(x)  # x [B, residual_channel, T]
         x = F.gelu(x)
@@ -171,4 +168,11 @@ def forward(self, spec, diffusion_step, cond):
         # MLP and GLU
         x = self.output_projection(x)  # [B, 128, T]
         
-        return x[:, None] if use_4_dim else x
+        if self.n_feats == 1:
+            x = x[:, None, :, :]
+        else:
+            # This is the temporary solution since PyTorch 1.13
+            # does not support exporting aten::unflatten to ONNX
+            # x = x.unflatten(dim=1, sizes=(self.n_feats, self.in_dims))
+            x = x.reshape(-1, self.n_feats, self.in_dims, x.shape[2])
+        return x
diff --git a/modules/backbones/__init__.py b/modules/backbones/__init__.py
index a91578567..c9cf0b8d5 100644
--- a/modules/backbones/__init__.py
+++ b/modules/backbones/__init__.py
@@ -1,7 +1,17 @@
+import torch.nn
 from modules.backbones.wavenet import WaveNet
 from modules.backbones.LYNXNet import LYNXNet
+from utils import filter_kwargs
 
 BACKBONES = {
     'wavenet': WaveNet, 
     'lynxnet': LYNXNet
 }
+
+def build_backbone(
+        out_dims: int, num_feats: int,
+        backbone_type: str, backbone_args: dict
+) -> torch.nn.Module:
+    backbone = BACKBONES[backbone_type]
+    kwargs = filter_kwargs(backbone_args, backbone)
+    return BACKBONES[backbone_type](out_dims, num_feats, **kwargs)
\ No newline at end of file
diff --git a/modules/backbones/wavenet.py b/modules/backbones/wavenet.py
index 0a1400d30..3ddbb4689 100644
--- a/modules/backbones/wavenet.py
+++ b/modules/backbones/wavenet.py
@@ -63,27 +63,27 @@ def forward(self, x, conditioner, diffusion_step):
 
 
 class WaveNet(nn.Module):
-    def __init__(self, in_dims, n_feats, *, n_layers=20, n_chans=256, n_dilates=4):
+    def __init__(self, in_dims, n_feats, *, num_layers=20, num_channels=256, dilation_cycle_length=4):
         super().__init__()
         self.in_dims = in_dims
         self.n_feats = n_feats
-        self.input_projection = Conv1d(in_dims * n_feats, n_chans, 1)
-        self.diffusion_embedding = SinusoidalPosEmb(n_chans)
+        self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1)
+        self.diffusion_embedding = SinusoidalPosEmb(num_channels)
         self.mlp = nn.Sequential(
-            nn.Linear(n_chans, n_chans * 4),
+            nn.Linear(num_channels, num_channels * 4),
             nn.Mish(),
-            nn.Linear(n_chans * 4, n_chans)
+            nn.Linear(num_channels * 4, num_channels)
         )
         self.residual_layers = nn.ModuleList([
             ResidualBlock(
                 encoder_hidden=hparams['hidden_size'],
-                residual_channels=n_chans,
-                dilation=2 ** (i % n_dilates)
+                residual_channels=num_channels,
+                dilation=2 ** (i % dilation_cycle_length)
             )
-            for i in range(n_layers)
+            for i in range(num_layers)
         ])
-        self.skip_projection = Conv1d(n_chans, n_chans, 1)
-        self.output_projection = Conv1d(n_chans, in_dims * n_feats, 1)
+        self.skip_projection = Conv1d(num_channels, num_channels, 1)
+        self.output_projection = Conv1d(num_channels, in_dims * n_feats, 1)
         nn.init.zeros_(self.output_projection.weight)
 
     def forward(self, spec, diffusion_step, cond):
diff --git a/modules/core/ddpm.py b/modules/core/ddpm.py
index d79f21c79..6b0ae4803 100644
--- a/modules/core/ddpm.py
+++ b/modules/core/ddpm.py
@@ -9,7 +9,7 @@
 from torch import nn
 from tqdm import tqdm
 
-from modules.backbones import BACKBONES
+from modules.backbones import build_backbone
 from utils.hparams import hparams
 
 
@@ -57,7 +57,7 @@ def __init__(self, out_dims, num_feats=1, timesteps=1000, k_step=1000,
                  backbone_type=None, backbone_args=None, betas=None,
                  spec_min=None, spec_max=None):
         super().__init__()
-        self.denoise_fn: nn.Module = BACKBONES[backbone_type](out_dims, num_feats, **backbone_args)
+        self.denoise_fn: nn.Module = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
         self.out_dims = out_dims
         self.num_feats = num_feats
 
diff --git a/modules/core/reflow.py b/modules/core/reflow.py
index 2a2b21fcb..f09eb2392 100644
--- a/modules/core/reflow.py
+++ b/modules/core/reflow.py
@@ -6,7 +6,7 @@
 import torch.nn as nn
 from tqdm import tqdm
 
-from modules.backbones import BACKBONES
+from modules.backbones import build_backbone
 from utils.hparams import hparams
 
 
@@ -15,7 +15,7 @@ def __init__(self, out_dims, num_feats=1, t_start=0., time_scale_factor=1000,
                  backbone_type=None, backbone_args=None,
                  spec_min=None, spec_max=None):
         super().__init__()
-        self.velocity_fn: nn.Module = BACKBONES[backbone_type](out_dims, num_feats, **backbone_args)
+        self.velocity_fn: nn.Module = build_backbone(out_dims, num_feats, backbone_type, backbone_args)
         self.out_dims = out_dims
         self.num_feats = num_feats
         self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py
index e5668536b..ace58ff41 100644
--- a/modules/fastspeech/param_adaptor.py
+++ b/modules/fastspeech/param_adaptor.py
@@ -68,6 +68,14 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
             f'Total number of repeat bins must be divisible by number of ' \
             f'variance parameters ({len(self.variance_prediction_list)}).'
         repeat_bins = total_repeat_bins // len(self.variance_prediction_list)
+        backbone_type = variances_hparams.get('backbone_type', 
+                                variances_hparams.get('backbone_type', 
+                                variances_hparams.get('diff_decoder_type', 'wavenet')))
+        backbone_args = variances_hparams.get('backbone_args', {
+                'num_layers': variances_hparams.get('residual_layers'),
+                'num_channels': variances_hparams.get('residual_channels'),
+                'dilation_cycle_length': variances_hparams.get('dilation_cycle_length'),
+        } if backbone_type == 'wavenet' else None)
         kwargs = filter_kwargs(
             {
                 'ranges': ranges,
@@ -75,12 +83,8 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
                 'repeat_bins': repeat_bins,
                 'timesteps': hparams.get('timesteps'),
                 'time_scale_factor': hparams.get('time_scale_factor'),
-                'backbone_type': hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                'backbone_args': {
-                    'n_layers': variances_hparams['residual_layers'],
-                    'n_chans': variances_hparams['residual_channels'],
-                    'n_dilates': variances_hparams['dilation_cycle_length'],
-                }
+                'backbone_type': backbone_type,
+                'backbone_args': backbone_args
             },
             cls
         )
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 1976d09a9..9ec16bec1 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -53,18 +53,20 @@ def __init__(self, vocab_size, out_dims):
                 aux_decoder_args=self.shallow_args['aux_decoder_args']
             )
         self.diffusion_type = hparams.get('diffusion_type', 'ddpm')
+        self.backbone_type = hparams.get('backbone_type', hparams.get('diff_decoder_type', 'wavenet'))
+        self.backbone_args = hparams.get('backbone_args', {
+                'num_layers': hparams.get('residual_layers'),
+                'num_channels': hparams.get('residual_channels'),
+                'dilation_cycle_length': hparams.get('dilation_cycle_length'),
+        } if self.backbone_type == 'wavenet' else None)
         if self.diffusion_type == 'ddpm':
             self.diffusion = GaussianDiffusion(
                 out_dims=out_dims,
                 num_feats=1,
                 timesteps=hparams['timesteps'],
                 k_step=hparams['K_step'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': hparams['residual_layers'],
-                    'n_chans': hparams['residual_channels'],
-                    'n_dilates': hparams['dilation_cycle_length'],
-                },
+                backbone_type=self.backbone_type,
+                backbone_args=self.backbone_args,
                 spec_min=hparams['spec_min'],
                 spec_max=hparams['spec_max']
             )
@@ -74,12 +76,8 @@ def __init__(self, vocab_size, out_dims):
                 num_feats=1,
                 t_start=hparams['T_start'],
                 time_scale_factor=hparams['time_scale_factor'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': hparams['residual_layers'],
-                    'n_chans': hparams['residual_channels'],
-                    'n_dilates': hparams['dilation_cycle_length'],
-                },
+                backbone_type=self.backbone_type,
+                backbone_args=self.backbone_args,
                 spec_min=hparams['spec_min'],
                 spec_max=hparams['spec_max']
             )
@@ -157,7 +155,14 @@ def __init__(self, vocab_size):
 
             self.pitch_retake_embed = Embedding(2, hparams['hidden_size'])
             pitch_hparams = hparams['pitch_prediction_args']
-
+            self.pitch_backbone_type = pitch_hparams.get('backbone_type',
+                                    hparams.get('backbone_type', 
+                                    hparams.get('diff_decoder_type', 'wavenet')))
+            self.pitch_backbone_args = pitch_hparams.get('backbone_args', {
+                'num_layers': pitch_hparams.get('residual_layers'),
+                'num_channels': pitch_hparams.get('residual_channels'),
+                'dilation_cycle_length': pitch_hparams.get('dilation_cycle_length'),
+            } if self.pitch_backbone_type == 'wavenet' else None)
             if self.diffusion_type == 'ddpm':
                 self.pitch_predictor = PitchDiffusion(
                     vmin=pitch_hparams['pitd_norm_min'],
@@ -167,12 +172,8 @@ def __init__(self, vocab_size):
                     repeat_bins=pitch_hparams['repeat_bins'],
                     timesteps=hparams['timesteps'],
                     k_step=hparams['K_step'],
-                    backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                    backbone_args={
-                        'n_layers': pitch_hparams['residual_layers'],
-                        'n_chans': pitch_hparams['residual_channels'],
-                        'n_dilates': pitch_hparams['dilation_cycle_length'],
-                    }
+                    backbone_type=self.pitch_backbone_type,
+                    backbone_args=self.pitch_backbone_args
                 )
             elif self.diffusion_type == 'reflow':
                 self.pitch_predictor = PitchRectifiedFlow(
@@ -182,12 +183,8 @@ def __init__(self, vocab_size):
                     cmax=pitch_hparams['pitd_clip_max'],
                     repeat_bins=pitch_hparams['repeat_bins'],
                     time_scale_factor=hparams['time_scale_factor'],
-                    backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                    backbone_args={
-                        'n_layers': pitch_hparams['residual_layers'],
-                        'n_chans': pitch_hparams['residual_channels'],
-                        'n_dilates': pitch_hparams['dilation_cycle_length'],
-                    }
+                    backbone_type=self.pitch_backbone_type,
+                    backbone_args=self.pitch_backbone_args
                 )
             else:
                 raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")

From 4a57e86842c806fd7f3de64c436d3e56be8130ac Mon Sep 17 00:00:00 2001
From: yxlllc <llc1995@sina.com>
Date: Mon, 4 Nov 2024 19:00:39 +0800
Subject: [PATCH 05/10] fix onnx exporter for lynxnet

---
 modules/backbones/LYNXNet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/backbones/LYNXNet.py b/modules/backbones/LYNXNet.py
index ecf97a95c..50789ab00 100644
--- a/modules/backbones/LYNXNet.py
+++ b/modules/backbones/LYNXNet.py
@@ -150,7 +150,7 @@ def forward(self, spec, diffusion_step, cond):
         """
         
         if self.n_feats == 1:
-            x = spec.squeeze(1)  # [B, M, T]
+            x = spec[:, 0]  # [B, M, T]
         else:
             x = spec.flatten(start_dim=1, end_dim=2)  # [B, F x M, T]
 

From 3c14fc3bb4b34ecddfa6f49fbda6d6b6703497dd Mon Sep 17 00:00:00 2001
From: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com>
Date: Tue, 5 Nov 2024 14:00:01 +0800
Subject: [PATCH 06/10] Add Pytorch version check when export onnx (#216)

---
 scripts/export.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/scripts/export.py b/scripts/export.py
index 537cdad9f..613b3ee45 100644
--- a/scripts/export.py
+++ b/scripts/export.py
@@ -14,6 +14,16 @@
 from utils.hparams import set_hparams, hparams
 
 
+def check_pytorch_version():
+    version = torch.__version__
+    print(f"PyTorch version: {version}")
+    major, minor, _ = version.split('.')
+    if major != '1' and minor != '13':
+        raise RuntimeError(f"Unsupported PyTorch Version: {version}. need 1.13.x.")
+    else:
+        pass
+
+
 def find_exp(exp):
     if not (root_dir / 'checkpoints' / exp).exists():
         for subdir in (root_dir / 'checkpoints').iterdir():
@@ -291,4 +301,5 @@ def nsf_hifigan(
 
 
 if __name__ == '__main__':
+    check_pytorch_version()
     main()

From 89d870f36188a14826cd0c328b1d64a101f0b759 Mon Sep 17 00:00:00 2001
From: yxlllc <llc1995@sina.com>
Date: Fri, 8 Nov 2024 15:17:52 +0800
Subject: [PATCH 07/10] recommended lynxnet hyperparameters for variance models

---
 configs/templates/config_variance.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml
index 842a76395..daa8e15dc 100644
--- a/configs/templates/config_variance.yaml
+++ b/configs/templates/config_variance.yaml
@@ -83,6 +83,10 @@ pitch_prediction_args:
     num_layers: 20
     num_channels: 256
     dilation_cycle_length: 5
+# backbone_type: 'lynxnet'
+# backbone_args:
+#   num_layers: 6
+#   num_channels: 512
 
 variances_prediction_args:
   total_repeat_bins: 48
@@ -91,6 +95,10 @@ variances_prediction_args:
     num_layers: 10
     num_channels: 192
     dilation_cycle_length: 4
+# backbone_type: 'lynxnet'
+# backbone_args:
+#   num_layers: 6
+#   num_channels: 384
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0

From 97bf14daa0b0c3ef3acc96100a492eb44c947dc6 Mon Sep 17 00:00:00 2001
From: yxlllc <llc1995@sina.com>
Date: Fri, 8 Nov 2024 19:19:09 +0800
Subject: [PATCH 08/10] remove invalid items

---
 configs/variance.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/configs/variance.yaml b/configs/variance.yaml
index 95a0781be..e9a7764f2 100644
--- a/configs/variance.yaml
+++ b/configs/variance.yaml
@@ -107,7 +107,6 @@ schedule_type: 'linear'
 K_step: 1000
 timesteps: 1000
 max_beta: 0.02
-backbone_type: 'wavenet'
 main_loss_type: l2
 main_loss_log_norm: true
 sampling_algorithm: euler

From 09b5a5484897cc512020b6065148d68a2c2c3745 Mon Sep 17 00:00:00 2001
From: yqzhishen <yangqian_1015@icloud.com>
Date: Fri, 15 Nov 2024 23:21:01 +0800
Subject: [PATCH 09/10] Refactor code

---
 configs/acoustic.yaml                        |  8 +--
 configs/templates/config_acoustic.yaml       | 18 +++----
 modules/backbones/__init__.py                |  7 +--
 modules/backbones/{LYNXNet.py => lynxnet.py} | 51 ++++++++------------
 modules/backbones/wavenet.py                 | 16 +-----
 modules/commons/common_layers.py             | 15 ++++++
 modules/compat.py                            | 24 +++++++++
 modules/fastspeech/param_adaptor.py          | 11 ++---
 modules/toplevel.py                          | 19 ++------
 scripts/export.py                            | 11 ++---
 10 files changed, 88 insertions(+), 92 deletions(-)
 rename modules/backbones/{LYNXNet.py => lynxnet.py} (84%)
 create mode 100644 modules/compat.py

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 6efe72367..2cbc45303 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -75,10 +75,10 @@ diff_speedup: 10
 hidden_size: 256
 backbone_type: 'lynxnet'
 backbone_args:
-    num_channels: 1024
-    num_layers: 6
-    kernel_size: 31
-    dropout_rate: 0.0
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
 main_loss_type: l2
 main_loss_log_norm: false
 schedule_type: 'linear'
diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml
index f0edef7a8..a9453a368 100644
--- a/configs/templates/config_acoustic.yaml
+++ b/configs/templates/config_acoustic.yaml
@@ -60,15 +60,15 @@ K_step: 300
 K_step_infer: 300
 backbone_type: 'lynxnet'
 backbone_args:
-    num_channels: 1024
-    num_layers: 6
-    kernel_size: 31
-    dropout_rate: 0.0
-# backbone_type: 'wavenet'
-# backbone_args:
-#    num_channels: 512
-#    num_layers: 20
-#    dilation_cycle_length: 4
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
+#backbone_type: 'wavenet'
+#backbone_args:
+#  num_channels: 512
+#  num_layers: 20
+#  dilation_cycle_length: 4
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
diff --git a/modules/backbones/__init__.py b/modules/backbones/__init__.py
index c9cf0b8d5..8fce796ab 100644
--- a/modules/backbones/__init__.py
+++ b/modules/backbones/__init__.py
@@ -1,17 +1,18 @@
 import torch.nn
 from modules.backbones.wavenet import WaveNet
-from modules.backbones.LYNXNet import LYNXNet
+from modules.backbones.lynxnet import LYNXNet
 from utils import filter_kwargs
 
 BACKBONES = {
-    'wavenet': WaveNet, 
+    'wavenet': WaveNet,
     'lynxnet': LYNXNet
 }
 
+
 def build_backbone(
         out_dims: int, num_feats: int,
         backbone_type: str, backbone_args: dict
 ) -> torch.nn.Module:
     backbone = BACKBONES[backbone_type]
     kwargs = filter_kwargs(backbone_args, backbone)
-    return BACKBONES[backbone_type](out_dims, num_feats, **kwargs)
\ No newline at end of file
+    return BACKBONES[backbone_type](out_dims, num_feats, **kwargs)
diff --git a/modules/backbones/LYNXNet.py b/modules/backbones/lynxnet.py
similarity index 84%
rename from modules/backbones/LYNXNet.py
rename to modules/backbones/lynxnet.py
index 50789ab00..744967c6b 100644
--- a/modules/backbones/LYNXNet.py
+++ b/modules/backbones/lynxnet.py
@@ -2,20 +2,20 @@
 # https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/model_conformer_naive.py
 # https://github.com/CNChTu/Diffusion-SVC/blob/v2.0_dev/diffusion/naive_v2/naive_v2_diff.py
 
-import math
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from modules.commons.common_layers import SinusoidalPosEmb
 from utils.hparams import hparams
 
 
 class SwiGLU(nn.Module):
-    ## Swish-Applies the gated linear unit function.
+    # Swish-Applies the gated linear unit function.
     def __init__(self, dim=-1):
         super().__init__()
         self.dim = dim
+
     def forward(self, x):
         # out, gate = x.chunk(2, dim=self.dim)
         # Using torch.split instead of chunk for ONNX export compatibility.
@@ -23,21 +23,6 @@ def forward(self, x):
         return out * F.silu(gate)
 
 
-class SinusoidalPosEmb(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-
-    def forward(self, x):
-        device = x.device
-        half_dim = self.dim // 2
-        emb = math.log(10000) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
-        emb = x[:, None] * emb[None, :]
-        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
-        return emb
-
-
 class Transpose(nn.Module):
     def __init__(self, dims):
         super().__init__()
@@ -52,7 +37,7 @@ class LYNXConvModule(nn.Module):
     @staticmethod
     def calc_same_padding(kernel_size):
         pad = kernel_size // 2
-        return (pad, pad - (kernel_size + 1) % 2)
+        return pad, pad - (kernel_size + 1) % 2
 
     def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
         super().__init__()
@@ -92,7 +77,8 @@ def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='
         super().__init__()
         self.diffusion_projection = nn.Conv1d(dim, dim, 1)
         self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1)
-        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout)
+        self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size,
+                                         activation=activation, dropout=dropout)
 
     def forward(self, x, conditioner, diffusion_step):
         res_x = x.transpose(1, 2)
@@ -106,7 +92,8 @@ def forward(self, x, conditioner, diffusion_step):
 
 
 class LYNXNet(nn.Module):
-    def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31, activation='PReLU', dropout=0.):
+    def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31,
+                 activation='PReLU', dropout=0.):
         """
         LYNXNet(Linear Gated Depthwise Separable Convolution Network)
         TIPS:You can control the style of the generated results by modifying the 'activation', 
@@ -127,11 +114,11 @@ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansio
         self.residual_layers = nn.ModuleList(
             [
                 LYNXNetResidualLayer(
-                    dim_cond=hparams['hidden_size'], 
-                    dim=num_channels, 
-                    expansion_factor=expansion_factor, 
-                    kernel_size=kernel_size, 
-                    activation=activation, 
+                    dim_cond=hparams['hidden_size'],
+                    dim=num_channels,
+                    expansion_factor=expansion_factor,
+                    kernel_size=kernel_size,
+                    activation=activation,
                     dropout=dropout
                 )
                 for i in range(num_layers)
@@ -140,7 +127,7 @@ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansio
         self.norm = nn.LayerNorm(num_channels)
         self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1)
         nn.init.zeros_(self.output_projection.weight)
-    
+
     def forward(self, spec, diffusion_step, cond):
         """
         :param spec: [B, F, M, T]
@@ -148,7 +135,7 @@ def forward(self, spec, diffusion_step, cond):
         :param cond: [B, H, T]
         :return:
         """
-        
+
         if self.n_feats == 1:
             x = spec[:, 0]  # [B, M, T]
         else:
@@ -156,18 +143,18 @@ def forward(self, spec, diffusion_step, cond):
 
         x = self.input_projection(x)  # x [B, residual_channel, T]
         x = F.gelu(x)
-        
+
         diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1)
-        
+
         for layer in self.residual_layers:
             x = layer(x, cond, diffusion_step)
 
         # post-norm
         x = self.norm(x.transpose(1, 2)).transpose(1, 2)
-        
+
         # MLP and GLU
         x = self.output_projection(x)  # [B, 128, T]
-        
+
         if self.n_feats == 1:
             x = x[:, None, :, :]
         else:
diff --git a/modules/backbones/wavenet.py b/modules/backbones/wavenet.py
index 3ddbb4689..08e57eff4 100644
--- a/modules/backbones/wavenet.py
+++ b/modules/backbones/wavenet.py
@@ -5,6 +5,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+from modules.commons.common_layers import SinusoidalPosEmb
 from utils.hparams import hparams
 
 
@@ -14,21 +15,6 @@ def __init__(self, *args, **kwargs):
         nn.init.kaiming_normal_(self.weight)
 
 
-class SinusoidalPosEmb(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-
-    def forward(self, x):
-        device = x.device
-        half_dim = self.dim // 2
-        emb = math.log(10000) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
-        emb = x[:, None] * emb[None, :]
-        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
-        return emb
-
-
 class ResidualBlock(nn.Module):
     def __init__(self, encoder_hidden, residual_channels, dilation):
         super().__init__()
diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py
index 9ea2c2638..b12cc7f96 100644
--- a/modules/commons/common_layers.py
+++ b/modules/commons/common_layers.py
@@ -168,3 +168,18 @@ def forward(self, x, encoder_padding_mask=None, **kwargs):
         x = residual + x
         x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
         return x
+
+
+class SinusoidalPosEmb(nn.Module):
+    """Embed scalars with sin/cos at frequencies decaying geometrically from 1 to 1/10000."""
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        half_dim = self.dim // 2
+        decay = math.log(10000) / (half_dim - 1)
+        freqs = torch.exp(torch.arange(half_dim, device=x.device) * -decay)
+        angles = x[:, None] * freqs[None, :]
+        return torch.cat((angles.sin(), angles.cos()), dim=-1)
diff --git a/modules/compat.py b/modules/compat.py
new file mode 100644
index 000000000..8311b16dd
--- /dev/null
+++ b/modules/compat.py
@@ -0,0 +1,24 @@
+def get_backbone_type(root_config: dict, nested_config: dict = None):
+    """Resolve the backbone type from the nested config, the root config, or legacy keys."""
+    scope = root_config if nested_config is None else nested_config
+    # Precedence: nested 'backbone_type', root 'backbone_type', then the legacy
+    # root key 'diff_decoder_type'; 'wavenet' is used when none of them is present.
+    lookups = ((scope, 'backbone_type'), (root_config, 'backbone_type'), (root_config, 'diff_decoder_type'))
+    for source, key in lookups:
+        if key in source:
+            return source[key]
+    return 'wavenet'
+
+
+def get_backbone_args(config: dict, backbone_type: str):
+    """Return `backbone_args` from `config`, falling back to legacy WaveNet keys."""
+    explicit_args = config.get('backbone_args')
+    if explicit_args is not None:
+        return explicit_args
+    if backbone_type != 'wavenet':
+        # Only WaveNet has legacy flat keys to fall back on.
+        return None
+    # Map each WaveNet argument name to the legacy config key that holds its value.
+    legacy_keys = {'num_layers': 'residual_layers', 'num_channels': 'residual_channels',
+                   'dilation_cycle_length': 'dilation_cycle_length'}
+    return {arg: config.get(key) for arg, key in legacy_keys.items()}
diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py
index ace58ff41..77ebb8331 100644
--- a/modules/fastspeech/param_adaptor.py
+++ b/modules/fastspeech/param_adaptor.py
@@ -2,6 +2,7 @@
 
 import torch
 
+import modules.compat as compat
 from modules.core.ddpm import MultiVarianceDiffusion
 from utils import filter_kwargs
 from utils.hparams import hparams
@@ -68,14 +69,8 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
             f'Total number of repeat bins must be divisible by number of ' \
             f'variance parameters ({len(self.variance_prediction_list)}).'
         repeat_bins = total_repeat_bins // len(self.variance_prediction_list)
-        backbone_type = variances_hparams.get('backbone_type', 
-                                variances_hparams.get('backbone_type', 
-                                variances_hparams.get('diff_decoder_type', 'wavenet')))
-        backbone_args = variances_hparams.get('backbone_args', {
-                'num_layers': variances_hparams.get('residual_layers'),
-                'num_channels': variances_hparams.get('residual_channels'),
-                'dilation_cycle_length': variances_hparams.get('dilation_cycle_length'),
-        } if backbone_type == 'wavenet' else None)
+        backbone_type = compat.get_backbone_type(hparams, nested_config=variances_hparams)
+        backbone_args = compat.get_backbone_args(variances_hparams, backbone_type=backbone_type)
         kwargs = filter_kwargs(
             {
                 'ranges': ranges,
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 9ec16bec1..99f73d541 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -5,6 +5,7 @@
 import torch.nn.functional as F
 from torch import Tensor
 
+import modules.compat as compat
 from basics.base_module import CategorizedModule
 from modules.aux_decoder import AuxDecoderAdaptor
 from modules.commons.common_layers import (
@@ -53,12 +54,8 @@ def __init__(self, vocab_size, out_dims):
                 aux_decoder_args=self.shallow_args['aux_decoder_args']
             )
         self.diffusion_type = hparams.get('diffusion_type', 'ddpm')
-        self.backbone_type = hparams.get('backbone_type', hparams.get('diff_decoder_type', 'wavenet'))
-        self.backbone_args = hparams.get('backbone_args', {
-                'num_layers': hparams.get('residual_layers'),
-                'num_channels': hparams.get('residual_channels'),
-                'dilation_cycle_length': hparams.get('dilation_cycle_length'),
-        } if self.backbone_type == 'wavenet' else None)
+        self.backbone_type = compat.get_backbone_type(hparams)
+        self.backbone_args = compat.get_backbone_args(hparams, self.backbone_type)
         if self.diffusion_type == 'ddpm':
             self.diffusion = GaussianDiffusion(
                 out_dims=out_dims,
@@ -155,14 +152,8 @@ def __init__(self, vocab_size):
 
             self.pitch_retake_embed = Embedding(2, hparams['hidden_size'])
             pitch_hparams = hparams['pitch_prediction_args']
-            self.pitch_backbone_type = pitch_hparams.get('backbone_type',
-                                    hparams.get('backbone_type', 
-                                    hparams.get('diff_decoder_type', 'wavenet')))
-            self.pitch_backbone_args = pitch_hparams.get('backbone_args', {
-                'num_layers': pitch_hparams.get('residual_layers'),
-                'num_channels': pitch_hparams.get('residual_channels'),
-                'dilation_cycle_length': pitch_hparams.get('dilation_cycle_length'),
-            } if self.pitch_backbone_type == 'wavenet' else None)
+            self.pitch_backbone_type = compat.get_backbone_type(hparams, nested_config=pitch_hparams)
+            self.pitch_backbone_args = compat.get_backbone_args(pitch_hparams, backbone_type=self.pitch_backbone_type)
             if self.diffusion_type == 'ddpm':
                 self.pitch_predictor = PitchDiffusion(
                     vmin=pitch_hparams['pitd_norm_min'],
diff --git a/scripts/export.py b/scripts/export.py
index 613b3ee45..d666175d6 100644
--- a/scripts/export.py
+++ b/scripts/export.py
@@ -15,13 +15,10 @@
 
 
 def check_pytorch_version():
-    version = torch.__version__
-    print(f"PyTorch version: {version}")
-    major, minor, _ = version.split('.')
-    if major != '1' and minor != '13':
-        raise RuntimeError(f"Unsupported PyTorch Version: {version}. need 1.13.x.")
-    else:
-        pass
+    # Require PyTorch version to be exactly 1.13.x
+    if torch.__version__.startswith('1.13.'):
+        return
+    raise RuntimeError('This script requires PyTorch 1.13.x. Please install the correct version.')
 
 
 def find_exp(exp):

From 8dd53f773cb34cb514bf1604318de36aeaddac1e Mon Sep 17 00:00:00 2001
From: yqzhishen <yangqian_1015@icloud.com>
Date: Fri, 15 Nov 2024 23:55:32 +0800
Subject: [PATCH 10/10] Finish configuration schemas

---
 docs/ConfigurationSchemas.md | 123 ++++++++++++-----------------------
 1 file changed, 43 insertions(+), 80 deletions(-)

diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md
index d7297b1f5..b0e8dee66 100644
--- a/docs/ConfigurationSchemas.md
+++ b/docs/ConfigurationSchemas.md
@@ -201,6 +201,24 @@ Scale ratio of random time stretching augmentation.
 <tr><td align="center"><b>default</b></td><td>0.75</td>
 </tbody></table>
 
+### backbone_args
+
+Keyword arguments for the backbone of the main decoder module.
+
+<table><tbody>
+<tr><td align="center"><b>visibility</b></td><td>acoustic, variance</td>
+<tr><td align="center"><b>scope</b></td><td>nn</td>
+<tr><td align="center"><b>type</b></td><td>dict</td>
+</tbody></table>
+
+Some available arguments are listed below.
+
+|     argument name     | for backbone type |                                                 description                                                 |
+|:---------------------:|:-----------------:|:-----------------------------------------------------------------------------------------------------------:|
+|      num_layers       |  wavenet/lynxnet  |                               Number of layer blocks, or depth of the network                               |
+|     num_channels      |  wavenet/lynxnet  |                                 Number of channels, or width of the network                                 |
+| dilation_cycle_length |      wavenet      | Length k of the cycle $2^0, 2^1, \ldots, 2^{k-1}$ of convolution dilation factors through WaveNet residual blocks. |
+
 ### backbone_type
 
 Backbone type of the main decoder/predictor module.
@@ -208,9 +226,10 @@ Backbone type of the main decoder/predictor module.
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>acoustic, variance</td>
 <tr><td align="center"><b>scope</b></td><td>nn</td>
-<tr><td align="center"><b>customizability</b></td><td>reserved</td>
+<tr><td align="center"><b>customizability</b></td><td>normal</td>
 <tr><td align="center"><b>type</b></td><td>str</td>
-<tr><td align="center"><b>default</b></td><td>wavenet</td>
+<tr><td align="center"><b>default</b></td><td>lynxnet</td>
+<tr><td align="center"><b>constraints</b></td><td>Choose from 'wavenet', 'lynxnet'.</td>
 </tbody></table>
 
 ### base_config
@@ -418,18 +437,6 @@ The type of ODE-based generative model algorithm. The following models are curre
 <tr><td align="center"><b>constraints</b></td><td>Choose from 'ddpm', 'reflow'.</td>
 </tbody></table>
 
-### dilation_cycle_length
-
-Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
-<tr><td align="center"><b>scope</b></td><td>nn</td>
-<tr><td align="center"><b>customizability</b></td><td>not recommended</td>
-<tr><td align="center"><b>type</b></td><td>int</td>
-<tr><td align="center"><b>default</b></td><td>4</td>
-</tbody></table>
-
 ### dropout
 
 Dropout rate in some FastSpeech2 modules.
@@ -1273,13 +1280,21 @@ Arguments for pitch prediction.
 <tr><td align="center"><b>type</b></td><td>dict</td>
 </tbody></table>
 
-### pitch_prediction_args.dilation_cycle_length
+### pitch_prediction_args.backbone_args
 
-Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the pitch predictor model.
+Equivalent to [backbone_args](#backbone_args) but only for the pitch predictor model. If not set, use the root backbone type.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>5</td>
+</tbody></table>
+
+### pitch_prediction_args.backbone_type
+
+Equivalent to [backbone_type](#backbone_type) but only for the pitch predictor model. If not set, use the root backbone type.
+
+<table><tbody>
+<tr><td align="center"><b>visibility</b></td><td>variance</td>
+<tr><td align="center"><b>default</b></td><td>wavenet</td>
 </tbody></table>
 
 ### pitch_prediction_args.pitd_clip_max
@@ -1340,24 +1355,6 @@ Number of repeating bins in the pitch predictor.
 <tr><td align="center"><b>default</b></td><td>64</td>
 </tbody></table>
 
-### pitch_prediction_args.residual_channels
-
-Equivalent to [residual_channels](#residual_channels) but only for the pitch predictor.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>256</td>
-</tbody></table>
-
-### pitch_prediction_args.residual_layers
-
-Equivalent to [residual_layers](#residual_layers) but only for the pitch predictor.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>20</td>
-</tbody></table>
-
 ### pl_trainer_accelerator
 
 Type of Lightning trainer hardware accelerator.
@@ -1525,30 +1522,6 @@ Whether to use relative positional encoding in FastSpeech2 module.
 <tr><td align="center"><b>default</b></td><td>true</td>
 </tbody></table>
 
-### residual_channels
-
-Number of dilated convolution channels in residual blocks in WaveNet.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
-<tr><td align="center"><b>scope</b></td><td>nn</td>
-<tr><td align="center"><b>customizability</b></td><td>normal</td>
-<tr><td align="center"><b>type</b></td><td>int</td>
-<tr><td align="center"><b>default</b></td><td>512</td>
-</tbody></table>
-
-### residual_layers
-
-Number of residual blocks in WaveNet.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
-<tr><td align="center"><b>scope</b></td><td>nn</td>
-<tr><td align="center"><b>customizability</b></td><td>normal</td>
-<tr><td align="center"><b>type</b></td><td>int</td>
-<tr><td align="center"><b>default</b></td><td>20</td>
-</tbody></table>
-
 ### sampler_frame_count_grid
 
 The batch sampler applies an algorithm called _sorting by similar length_ when collecting batches. Data samples are first grouped by their approximate lengths before they get shuffled within each group. Assume this value is set to $L_{grid}$, the approximate length of a data sample with length $L_{real}$ can be calculated through the following expression:
@@ -2034,43 +2007,33 @@ Arguments for prediction of variance parameters other than pitch, like energy, b
 <tr><td align="center"><b>type</b></td><td>dict</td>
 </tbody></table>
 
-### variances_prediction_args.dilation_cycle_length
+### variances_prediction_args.backbone_args
 
-Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the multi-variance predictor model.
+Equivalent to [backbone_args](#backbone_args) but only for the multi-variance predictor.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>4</td>
 </tbody></table>
 
-### variances_prediction_args.total_repeat_bins
+### variances_prediction_args.backbone_type
 
-Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter.
+Equivalent to [backbone_type](#backbone_type) but only for the multi-variance predictor model. If not set, use the root backbone type.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>scope</b></td><td>nn, inference</td>
-<tr><td align="center"><b>customizability</b></td><td>recommended</td>
-<tr><td align="center"><b>type</b></td><td>int</td>
-<tr><td align="center"><b>default</b></td><td>48</td>
-</tbody></table>
-
-### variances_prediction_args.residual_channels
-
-Equivalent to [residual_channels](#residual_channels) but only for the multi-variance predictor.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>192</td>
+<tr><td align="center"><b>default</b></td><td>wavenet</td>
 </tbody></table>
 
-### variances_prediction_args.residual_layers
+### variances_prediction_args.total_repeat_bins
 
-Equivalent to [residual_layers](#residual_layers) but only for the multi-variance predictor.
+Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>10</td>
+<tr><td align="center"><b>scope</b></td><td>nn, inference</td>
+<tr><td align="center"><b>customizability</b></td><td>recommended</td>
+<tr><td align="center"><b>type</b></td><td>int</td>
+<tr><td align="center"><b>default</b></td><td>48</td>
 </tbody></table>
 
 ### vocoder