Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions configs/acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,12 @@ sampling_steps: 20
diff_accelerator: ddim
diff_speedup: 10
hidden_size: 256
residual_layers: 20
residual_channels: 512
dilation_cycle_length: 4 # *
backbone_type: 'wavenet'
backbone_type: 'lynxnet'
backbone_args:
num_channels: 1024
num_layers: 6
kernel_size: 31
dropout_rate: 0.0
main_loss_type: l2
main_loss_log_norm: false
schedule_type: 'linear'
Expand Down
16 changes: 12 additions & 4 deletions configs/templates/config_acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,24 @@ augmentation_args:
range: [0.5, 2.]
scale: 0.75

residual_channels: 512
residual_layers: 20

# shallow diffusion
# diffusion and shallow diffusion
diffusion_type: reflow
use_shallow_diffusion: true
T_start: 0.4
T_start_infer: 0.4
K_step: 300
K_step_infer: 300
backbone_type: 'lynxnet'
backbone_args:
num_channels: 1024
num_layers: 6
kernel_size: 31
dropout_rate: 0.0
#backbone_type: 'wavenet'
#backbone_args:
# num_channels: 512
# num_layers: 20
# dilation_cycle_length: 4
shallow_diffusion_args:
train_aux_decoder: true
train_diffusion: true
Expand Down
24 changes: 18 additions & 6 deletions configs/templates/config_variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,27 @@ pitch_prediction_args:
pitd_clip_min: -12.0
pitd_clip_max: 12.0
repeat_bins: 64
residual_layers: 20
residual_channels: 256
dilation_cycle_length: 5 # *
backbone_type: 'wavenet'
backbone_args:
num_layers: 20
num_channels: 256
dilation_cycle_length: 5
# backbone_type: 'lynxnet'
# backbone_args:
# num_layers: 6
# num_channels: 512

variances_prediction_args:
total_repeat_bins: 48
residual_layers: 10
residual_channels: 192
dilation_cycle_length: 4 # *
backbone_type: 'wavenet'
backbone_args:
num_layers: 10
num_channels: 192
dilation_cycle_length: 4
# backbone_type: 'lynxnet'
# backbone_args:
# num_layers: 6
# num_channels: 384

lambda_dur_loss: 1.0
lambda_pitch_loss: 1.0
Expand Down
17 changes: 10 additions & 7 deletions configs/variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,11 @@ pitch_prediction_args:
pitd_clip_min: -12.0
pitd_clip_max: 12.0
repeat_bins: 64
residual_layers: 20
residual_channels: 256
dilation_cycle_length: 5 # *
backbone_type: 'wavenet'
backbone_args:
num_layers: 20
num_channels: 256
dilation_cycle_length: 5

energy_db_min: -96.0
energy_db_max: -12.0
Expand All @@ -89,9 +91,11 @@ tension_smooth_width: 0.12

variances_prediction_args:
total_repeat_bins: 48
residual_layers: 10
residual_channels: 192
dilation_cycle_length: 4 # *
backbone_type: 'wavenet'
backbone_args:
num_layers: 10
num_channels: 192
dilation_cycle_length: 4

lambda_dur_loss: 1.0
lambda_pitch_loss: 1.0
Expand All @@ -103,7 +107,6 @@ schedule_type: 'linear'
K_step: 1000
timesteps: 1000
max_beta: 0.02
backbone_type: 'wavenet'
main_loss_type: l2
main_loss_log_norm: true
sampling_algorithm: euler
Expand Down
32 changes: 8 additions & 24 deletions deployment/modules/toplevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,8 @@ def __init__(self, vocab_size, out_dims):
num_feats=1,
timesteps=hparams['timesteps'],
k_step=hparams['K_step'],
backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
backbone_args={
'n_layers': hparams['residual_layers'],
'n_chans': hparams['residual_channels'],
'n_dilates': hparams['dilation_cycle_length'],
},
backbone_type=self.backbone_type,
backbone_args=self.backbone_args,
spec_min=hparams['spec_min'],
spec_max=hparams['spec_max']
)
Expand All @@ -46,12 +42,8 @@ def __init__(self, vocab_size, out_dims):
num_feats=1,
t_start=hparams['T_start'],
time_scale_factor=hparams['time_scale_factor'],
backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
backbone_args={
'n_layers': hparams['residual_layers'],
'n_chans': hparams['residual_channels'],
'n_dilates': hparams['dilation_cycle_length'],
},
backbone_type=self.backbone_type,
backbone_args=self.backbone_args,
spec_min=hparams['spec_min'],
spec_max=hparams['spec_max']
)
Expand Down Expand Up @@ -155,12 +147,8 @@ def __init__(self, vocab_size):
repeat_bins=pitch_hparams['repeat_bins'],
timesteps=hparams['timesteps'],
k_step=hparams['K_step'],
backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
backbone_args={
'n_layers': pitch_hparams['residual_layers'],
'n_chans': pitch_hparams['residual_channels'],
'n_dilates': pitch_hparams['dilation_cycle_length'],
}
backbone_type=self.pitch_backbone_type,
backbone_args=self.pitch_backbone_args
)
elif self.diffusion_type == 'reflow':
self.pitch_predictor = PitchRectifiedFlowONNX(
Expand All @@ -170,12 +158,8 @@ def __init__(self, vocab_size):
cmax=pitch_hparams['pitd_clip_max'],
repeat_bins=pitch_hparams['repeat_bins'],
time_scale_factor=hparams['time_scale_factor'],
backbone_type=hparams.get('backbone_type', hparams.get('diff_decoder_type')),
backbone_args={
'n_layers': pitch_hparams['residual_layers'],
'n_chans': pitch_hparams['residual_channels'],
'n_dilates': pitch_hparams['dilation_cycle_length'],
}
backbone_type=self.pitch_backbone_type,
backbone_args=self.pitch_backbone_args
)
else:
raise ValueError(f"Invalid diffusion type: {self.diffusion_type}")
Expand Down
123 changes: 43 additions & 80 deletions docs/ConfigurationSchemas.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,16 +201,35 @@ Scale ratio of random time stretching augmentation.
<tr><td align="center"><b>default</b></td><td>0.75</td>
</tbody></table>

### backbone_args

Keyword arguments for the backbone of the main decoder module.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>acoustic, variance</td>
<tr><td align="center"><b>scope</b></td><td>nn</td>
<tr><td align="center"><b>type</b></td><td>dict</td>
</tbody></table>

Some available arguments are listed below.

| argument name | for backbone type | description |
|:---------------------:|:-----------------:|:-----------------------------------------------------------------------------------------------------------:|
| num_layers | wavenet/lynxnet | Number of layer blocks, or depth of the network |
| num_channels | wavenet/lynxnet | Number of channels, or width of the network |
| dilation_cycle_length | wavenet | Length k of the cycle $2^0, 2^1, \ldots, 2^k$ of convolution dilation factors through WaveNet residual blocks. |

### backbone_type

Backbone type of the main decoder/predictor module.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>acoustic, variance</td>
<tr><td align="center"><b>scope</b></td><td>nn</td>
<tr><td align="center"><b>customizability</b></td><td>reserved</td>
<tr><td align="center"><b>customizability</b></td><td>normal</td>
<tr><td align="center"><b>type</b></td><td>str</td>
<tr><td align="center"><b>default</b></td><td>wavenet</td>
<tr><td align="center"><b>default</b></td><td>lynxnet</td>
<tr><td align="center"><b>constraints</b></td><td>Choose from 'wavenet', 'lynxnet'.</td>
</tbody></table>

### base_config
Expand Down Expand Up @@ -418,18 +437,6 @@ The type of ODE-based generative model algorithm. The following models are curre
<tr><td align="center"><b>constraints</b></td><td>Choose from 'ddpm', 'reflow'.</td>
</tbody></table>

### dilation_cycle_length

Length k of the cycle $2^0, 2^1, \ldots, 2^k$ of convolution dilation factors through WaveNet residual blocks.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
<tr><td align="center"><b>scope</b></td><td>nn</td>
<tr><td align="center"><b>customizability</b></td><td>not recommended</td>
<tr><td align="center"><b>type</b></td><td>int</td>
<tr><td align="center"><b>default</b></td><td>4</td>
</tbody></table>

### dropout

Dropout rate in some FastSpeech2 modules.
Expand Down Expand Up @@ -1273,13 +1280,21 @@ Arguments for pitch prediction.
<tr><td align="center"><b>type</b></td><td>dict</td>
</tbody></table>

### pitch_prediction_args.dilation_cycle_length
### pitch_prediction_args.backbone_args

Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the pitch predictor model.
Equivalent to [backbone_args](#backbone_args) but only for the pitch predictor model. If not set, use the root backbone type.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>variance</td>
<tr><td align="center"><b>default</b></td><td>5</td>
</tbody></table>

### pitch_prediction_args.backbone_type

Equivalent to [backbone_type](#backbone_type) but only for the pitch predictor model.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>variance</td>
<tr><td align="center"><b>default</b></td><td>wavenet</td>
</tbody></table>

### pitch_prediction_args.pitd_clip_max
Expand Down Expand Up @@ -1340,24 +1355,6 @@ Number of repeating bins in the pitch predictor.
<tr><td align="center"><b>default</b></td><td>64</td>
</tbody></table>

### pitch_prediction_args.residual_channels

Equivalent to [residual_channels](#residual_channels) but only for the pitch predictor.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>variance</td>
<tr><td align="center"><b>default</b></td><td>256</td>
</tbody></table>

### pitch_prediction_args.residual_layers

Equivalent to [residual_layers](#residual_layers) but only for the pitch predictor.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>variance</td>
<tr><td align="center"><b>default</b></td><td>20</td>
</tbody></table>

### pl_trainer_accelerator

Type of Lightning trainer hardware accelerator.
Expand Down Expand Up @@ -1525,30 +1522,6 @@ Whether to use relative positional encoding in FastSpeech2 module.
<tr><td align="center"><b>default</b></td><td>true</td>
</tbody></table>

### residual_channels

Number of dilated convolution channels in residual blocks in WaveNet.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
<tr><td align="center"><b>scope</b></td><td>nn</td>
<tr><td align="center"><b>customizability</b></td><td>normal</td>
<tr><td align="center"><b>type</b></td><td>int</td>
<tr><td align="center"><b>default</b></td><td>512</td>
</tbody></table>

### residual_layers

Number of residual blocks in WaveNet.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>acoustic</td>
<tr><td align="center"><b>scope</b></td><td>nn</td>
<tr><td align="center"><b>customizability</b></td><td>normal</td>
<tr><td align="center"><b>type</b></td><td>int</td>
<tr><td align="center"><b>default</b></td><td>20</td>
</tbody></table>

### sampler_frame_count_grid

The batch sampler applies an algorithm called _sorting by similar length_ when collecting batches. Data samples are first grouped by their approximate lengths before they get shuffled within each group. Assume this value is set to $L_{grid}$, the approximate length of a data sample with length $L_{real}$ can be calculated through the following expression:
Expand Down Expand Up @@ -2034,43 +2007,33 @@ Arguments for prediction of variance parameters other than pitch, like energy, b
<tr><td align="center"><b>type</b></td><td>dict</td>
</tbody></table>

### variances_prediction_args.dilation_cycle_length
### variances_prediction_args.backbone_args

Equivalent to [dilation_cycle_length](#dilation_cycle_length) but only for the multi-variance predictor model.
Equivalent to [backbone_args](#backbone_args) but only for the multi-variance predictor.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>variance</td>
<tr><td align="center"><b>default</b></td><td>4</td>
</tbody></table>

### variances_prediction_args.total_repeat_bins
### variances_prediction_args.backbone_type

Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter.
Equivalent to [backbone_type](#backbone_type) but only for the multi-variance predictor model. If not set, the root backbone type is used.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>variance</td>
<tr><td align="center"><b>scope</b></td><td>nn, inference</td>
<tr><td align="center"><b>customizability</b></td><td>recommended</td>
<tr><td align="center"><b>type</b></td><td>int</td>
<tr><td align="center"><b>default</b></td><td>48</td>
</tbody></table>

### variances_prediction_args.residual_channels

Equivalent to [residual_channels](#residual_channels) but only for the multi-variance predictor.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>variance</td>
<tr><td align="center"><b>default</b></td><td>192</td>
<tr><td align="center"><b>default</b></td><td>wavenet</td>
</tbody></table>

### variances_prediction_args.residual_layers
### variances_prediction_args.total_repeat_bins

Equivalent to [residual_layers](#residual_layers) but only for the multi-variance predictor.
Total number of repeating bins in the multi-variance predictor. Repeating bins are distributed evenly to each variance parameter.

<table><tbody>
<tr><td align="center"><b>visibility</b></td><td>variance</td>
<tr><td align="center"><b>default</b></td><td>10</td>
<tr><td align="center"><b>scope</b></td><td>nn, inference</td>
<tr><td align="center"><b>customizability</b></td><td>recommended</td>
<tr><td align="center"><b>type</b></td><td>int</td>
<tr><td align="center"><b>default</b></td><td>48</td>
</tbody></table>

### vocoder
Expand Down
15 changes: 14 additions & 1 deletion modules/backbones/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
import torch.nn
from modules.backbones.wavenet import WaveNet
from modules.backbones.lynxnet import LYNXNet
from utils import filter_kwargs

BACKBONES = {
'wavenet': WaveNet
'wavenet': WaveNet,
'lynxnet': LYNXNet
}


def build_backbone(
        out_dims: int, num_feats: int,
        backbone_type: str, backbone_args: dict
) -> torch.nn.Module:
    """Instantiate the backbone registered under ``backbone_type``.

    Args:
        out_dims: Passed as the first positional argument to the backbone
            constructor.
        num_feats: Passed as the second positional argument to the backbone
            constructor.
        backbone_type: Key used to select the backbone class from
            ``BACKBONES``.
        backbone_args: Candidate keyword arguments for the constructor. They
            are run through ``filter_kwargs`` before being forwarded.

    Returns:
        The constructed backbone module.

    Raises:
        KeyError: If ``backbone_type`` is not a key of ``BACKBONES``.
    """
    # Resolve the class once and reuse it for both filtering and construction.
    backbone_cls = BACKBONES[backbone_type]
    # NOTE(review): presumably filter_kwargs keeps only the keys accepted by
    # the constructor's signature -- confirm against utils.filter_kwargs.
    kwargs = filter_kwargs(backbone_args, backbone_cls)
    return backbone_cls(out_dims, num_feats, **kwargs)
Loading