openvpi · yqzhishen · Nov 15, 2024 · Aug 28, 2024 · Sep 6, 2024 · Oct 20, 2024
diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
@@ -73,10 +73,12 @@ sampling_steps: 20
 diff_accelerator: ddim
 diff_speedup: 10
 hidden_size: 256
-residual_layers: 20
-residual_channels: 512
-dilation_cycle_length: 4  # *
-backbone_type: 'wavenet'
+backbone_type: 'lynxnet'
+backbone_args:
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
 main_loss_type: l2
 main_loss_log_norm: false
 schedule_type: 'linear'

diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml
@@ -51,16 +51,24 @@ augmentation_args:
     range: [0.5, 2.]
     scale: 0.75
 
-residual_channels: 512
-residual_layers: 20
-
-# shallow diffusion
+# diffusion and shallow diffusion
 diffusion_type: reflow
 use_shallow_diffusion: true
 T_start: 0.4
 T_start_infer: 0.4
 K_step: 300
 K_step_infer: 300
+backbone_type: 'lynxnet'
+backbone_args:
+  num_channels: 1024
+  num_layers: 6
+  kernel_size: 31
+  dropout_rate: 0.0
+#backbone_type: 'wavenet'
+#backbone_args:
+#  num_channels: 512
+#  num_layers: 20
+#  dilation_cycle_length: 4
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true

diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml
@@ -78,15 +78,27 @@ pitch_prediction_args:
   pitd_clip_min: -12.0
   pitd_clip_max: 12.0
   repeat_bins: 64
-  residual_layers: 20
-  residual_channels: 256
-  dilation_cycle_length: 5  # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5
+# backbone_type: 'lynxnet'
+# backbone_args:
+#   num_layers: 6
+#   num_channels: 512
 
 variances_prediction_args:
   total_repeat_bins: 48
-  residual_layers: 10
-  residual_channels: 192
-  dilation_cycle_length: 4  # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 10
+    num_channels: 192
+    dilation_cycle_length: 4
+# backbone_type: 'lynxnet'
+# backbone_args:
+#   num_layers: 6
+#   num_channels: 384
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0

diff --git a/configs/variance.yaml b/configs/variance.yaml
@@ -68,9 +68,11 @@ pitch_prediction_args:
   pitd_clip_min: -12.0
   pitd_clip_max: 12.0
   repeat_bins: 64
-  residual_layers: 20
-  residual_channels: 256
-  dilation_cycle_length: 5  # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 20
+    num_channels: 256
+    dilation_cycle_length: 5 
 
 energy_db_min: -96.0
 energy_db_max: -12.0
@@ -89,9 +91,11 @@ tension_smooth_width: 0.12
 
 variances_prediction_args:
   total_repeat_bins: 48
-  residual_layers: 10
-  residual_channels: 192
-  dilation_cycle_length: 4  # *
+  backbone_type: 'wavenet'
+  backbone_args:
+    num_layers: 10
+    num_channels: 192
+    dilation_cycle_length: 4
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
@@ -103,7 +107,6 @@ schedule_type: 'linear'
 K_step: 1000
 timesteps: 1000
 max_beta: 0.02
-backbone_type: 'wavenet'
 main_loss_type: l2
 main_loss_log_norm: true
 sampling_algorithm: euler

diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py
@@ -31,12 +31,8 @@ def __init__(self, vocab_size, out_dims):
                 num_feats=1,
                 timesteps=hparams['timesteps'],
                 k_step=hparams['K_step'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': hparams['residual_layers'],
-                    'n_chans': hparams['residual_channels'],
-                    'n_dilates': hparams['dilation_cycle_length'],
-                },
+                backbone_type=self.backbone_type,
+                backbone_args=self.backbone_args,
                 spec_min=hparams['spec_min'],
                 spec_max=hparams['spec_max']
             )
@@ -46,12 +42,8 @@ def __init__(self, vocab_size, out_dims):
                 num_feats=1,
                 t_start=hparams['T_start'],
                 time_scale_factor=hparams['time_scale_factor'],
-                backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                backbone_args={
-                    'n_layers': hparams['residual_layers'],
-                    'n_chans': hparams['residual_channels'],
-                    'n_dilates': hparams['dilation_cycle_length'],
-                },
+                backbone_type=self.backbone_type,
+                backbone_args=self.backbone_args,
                 spec_min=hparams['spec_min'],
                 spec_max=hparams['spec_max']
             )
@@ -155,12 +147,8 @@ def __init__(self, vocab_size):
                     repeat_bins=pitch_hparams['repeat_bins'],
                     timesteps=hparams['timesteps'],
                     k_step=hparams['K_step'],
-                    backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                    backbone_args={
-                        'n_layers': pitch_hparams['residual_layers'],
-                        'n_chans': pitch_hparams['residual_channels'],
-                        'n_dilates': pitch_hparams['dilation_cycle_length'],
-                    }
+                    backbone_type=self.pitch_backbone_type,
+                    backbone_args=self.pitch_backbone_args
                 )
             elif self.diffusion_type == 'reflow':
                 self.pitch_predictor = PitchRectifiedFlowONNX(
@@ -170,12 +158,8 @@ def __init__(self, vocab_size):
                     cmax=pitch_hparams['pitd_clip_max'],
                     repeat_bins=pitch_hparams['repeat_bins'],
                     time_scale_factor=hparams['time_scale_factor'],
-                    backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
-                    backbone_args={
-                        'n_layers': pitch_hparams['residual_layers'],
-                        'n_chans': pitch_hparams['residual_channels'],
-                        'n_dilates': pitch_hparams['dilation_cycle_length'],
-                    }
+                    backbone_type=self.pitch_backbone_type,
+                    backbone_args=self.pitch_backbone_args
                 )
             else:
                 raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")

diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md
@@ -201,16 +201,35 @@ Scale ratio of random time stretching augmentation.
 <tr><td align="center"><b>default</b></td><td>0.75</td>
 </tbody></table>
 
+### backbone_args
+
+Keyword arguments for the backbone of the main decoder/predictor module.
+
+<table><tbody>
+<tr><td align="center"><b>visibility</b></td><td>acoustic, variance</td>
+<tr><td align="center"><b>scope</b></td><td>nn</td>
+<tr><td align="center"><b>type</b></td><td>dict</td>
+</tbody></table>
+
+Some available arguments are listed below.
+
+|     argument name     | for backbone type |                                                 description                                                 |
+|:---------------------:|:-----------------:|:-----------------------------------------------------------------------------------------------------------:|
+|      num_layers       |  wavenet/lynxnet  |                               Number of layer blocks, or depth of the network                               |
+|     num_channels      |  wavenet/lynxnet  |                                 Number of channels, or width of the network                                 |
+| dilation_cycle_length |      wavenet      | Length k of the cycle $2^0, 2^1, ..., 2^k$ of convolution dilation factors through WaveNet residual blocks. |
+
 ### backbone_type
 
 Backbone type of the main decoder/predictor module.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>acoustic, variance</td>
 <tr><td align="center"><b>scope</b></td><td>nn</td>
-<tr><td align="center"><b>customizability</b></td><td>reserved</td>
+<tr><td align="center"><b>customizability</b></td><td>normal</td>
 <tr><td align="center"><b>type</b></td><td>str</td>
-<tr><td align="center"><b>default</b></td><td>wavenet</td>
+<tr><td align="center"><b>default</b></td><td>lynxnet</td>
+<tr><td align="center"><b>constraints</b></td><td>Choose from 'wavenet', 'lynxnet'.</td>
 </tbody></table>
 
 ### base_config
@@ -418,18 +437,6 @@ The type of ODE-based generative model algorithm. The following models are curre
 <tr><td align="center"><b>constraints</b></td><td>Choose from 'ddpm', 'reflow'.</td>
 </tbody></table>
 
-### dilation_cycle_length
-
-Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
-<tr><td align="center"><b>scope</b></td><td>nn</td>
-<tr><td align="center"><b>customizability</b></td><td>not recommended</td>
-<tr><td align="center"><b>type</b></td><td>int</td>
-<tr><td align="center"><b>default</b></td><td>4</td>
-</tbody></table>
-
 ### dropout
 
 Dropout rate in some FastSpeech2 modules.
@@ -1273,13 +1280,21 @@ Arguments for pitch prediction.
 <tr><td align="center"><b>type</b></td><td>dict</td>
 </tbody></table>
 
-### pitch_prediction_args.dilation_cycle_length
+### pitch_prediction_args.backbone_args
 
-Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the pitch predictor model.
+Equivalent to [backbone_args](#backbone_args) but only for the pitch predictor model. If not set, the root backbone type is used.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>5</td>
+</tbody></table>
+
+### pitch_prediction_args.backbone_type
+
+Equivalent to [backbone_type](#backbone_type) but only for the pitch predictor model.
+
+<table><tbody>
+<tr><td align="center"><b>visibility</b></td><td>variance</td>
+<tr><td align="center"><b>default</b></td><td>wavenet</td>
 </tbody></table>
 
 ### pitch_prediction_args.pitd_clip_max
@@ -1340,24 +1355,6 @@ Number of repeating bins in the pitch predictor.
 <tr><td align="center"><b>default</b></td><td>64</td>
 </tbody></table>
 
-### pitch_prediction_args.residual_channels
-
-Equivalent to [residual_channels](#residual_channels) but only for the pitch predictor.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>256</td>
-</tbody></table>
-
-### pitch_prediction_args.residual_layers
-
-Equivalent to [residual_layers](#residual_layers) but only for the pitch predictor.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>20</td>
-</tbody></table>
-
 ### pl_trainer_accelerator
 
 Type of Lightning trainer hardware accelerator.
@@ -1525,30 +1522,6 @@ Whether to use relative positional encoding in FastSpeech2 module.
 <tr><td align="center"><b>default</b></td><td>true</td>
 </tbody></table>
 
-### residual_channels
-
-Number of dilated convolution channels in residual blocks in WaveNet.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
-<tr><td align="center"><b>scope</b></td><td>nn</td>
-<tr><td align="center"><b>customizability</b></td><td>normal</td>
-<tr><td align="center"><b>type</b></td><td>int</td>
-<tr><td align="center"><b>default</b></td><td>512</td>
-</tbody></table>
-
-### residual_layers
-
-Number of residual blocks in WaveNet.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
-<tr><td align="center"><b>scope</b></td><td>nn</td>
-<tr><td align="center"><b>customizability</b></td><td>normal</td>
-<tr><td align="center"><b>type</b></td><td>int</td>
-<tr><td align="center"><b>default</b></td><td>20</td>
-</tbody></table>
-
 ### sampler_frame_count_grid
 
 The batch sampler applies an algorithm called _sorting by similar length_ when collecting batches. Data samples are first grouped by their approximate lengths before they get shuffled within each group. Assume this value is set to $L_{grid}$, the approximate length of a data sample with length $L_{real}$ can be calculated through the following expression:
@@ -2034,43 +2007,33 @@ Arguments for prediction of variance parameters other than pitch, like energy, b
 <tr><td align="center"><b>type</b></td><td>dict</td>
 </tbody></table>
 
-### variances_prediction_args.dilation_cycle_length
+### variances_prediction_args.backbone_args
 
-Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the multi-variance predictor model.
+Equivalent to [backbone_args](#backbone_args) but only for the multi-variance predictor.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>4</td>
 </tbody></table>
 
-### variances_prediction_args.total_repeat_bins
+### variances_prediction_args.backbone_type
 
-Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter.
+Equivalent to [backbone_type](#backbone_type) but only for the multi-variance predictor model. If not set, use the root backbone type.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>scope</b></td><td>nn, inference</td>
-<tr><td align="center"><b>customizability</b></td><td>recommended</td>
-<tr><td align="center"><b>type</b></td><td>int</td>
-<tr><td align="center"><b>default</b></td><td>48</td>
-</tbody></table>
-
-### variances_prediction_args.residual_channels
-
-Equivalent to [residual_channels](#residual_channels) but only for the multi-variance predictor.
-
-<table><tbody>
-<tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>192</td>
+<tr><td align="center"><b>default</b></td><td>wavenet</td>
 </tbody></table>
 
-### variances_prediction_args.residual_layers
+### variances_prediction_args.total_repeat_bins
 
-Equivalent to [residual_layers](#residual_layers) but only for the multi-variance predictor.
+Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter.
 
 <table><tbody>
 <tr><td align="center"><b>visibility</b></td><td>variance</td>
-<tr><td align="center"><b>default</b></td><td>10</td>
+<tr><td align="center"><b>scope</b></td><td>nn, inference</td>
+<tr><td align="center"><b>customizability</b></td><td>recommended</td>
+<tr><td align="center"><b>type</b></td><td>int</td>
+<tr><td align="center"><b>default</b></td><td>48</td>
 </tbody></table>
 
 ### vocoder

diff --git a/modules/backbones/__init__.py b/modules/backbones/__init__.py
@@ -1,5 +1,31 @@
+import torch.nn
 from modules.backbones.wavenet import WaveNet
+from modules.backbones.lynxnet import LYNXNet
+from utils import filter_kwargs
 
 BACKBONES = {
-    'wavenet': WaveNet
+    'wavenet': WaveNet,
+    'lynxnet': LYNXNet
 }
+
+
+def build_backbone(
+        out_dims: int, num_feats: int,
+        backbone_type: str, backbone_args: dict
+) -> torch.nn.Module:
+    """
+    Instantiate the backbone registered under ``backbone_type``.
+
+    :param out_dims: forwarded as the first positional constructor argument
+    :param num_feats: forwarded as the second positional constructor argument
+    :param backbone_type: key into ``BACKBONES`` ('wavenet' or 'lynxnet')
+    :param backbone_args: candidate keyword arguments; passed through
+        ``filter_kwargs`` before being handed to the constructor
+    :return: the constructed backbone module
+    :raises KeyError: if ``backbone_type`` is not a key of ``BACKBONES``
+    """
+    backbone = BACKBONES[backbone_type]
+    # presumably drops keys the backbone constructor does not accept -- see utils.filter_kwargs
+    kwargs = filter_kwargs(backbone_args, backbone)
+    # reuse the class resolved above rather than looking it up a second time
+    return backbone(out_dims, num_feats, **kwargs)