From 7da2d65d36fc3aafcce6e225e2dc9af4beb488eb Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Sun, 6 Aug 2023 19:22:24 +0800
Subject: [PATCH 01/33] Add shallow diffusion API

---
 configs/acoustic.yaml     | 2 ++
 modules/diffusion/ddpm.py | 9 +++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 88ae1b12b..ec98f065a 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -66,6 +66,8 @@ K_step: 1000
 timesteps: 1000
 max_beta: 0.02
 rel_pos: true
+use_shallow_diffusion: false
+diff_depth: 400
 diff_accelerator: ddim
 pndm_speedup: 10
 hidden_size: 256
diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 46c3eaccb..d17070cb8 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -216,7 +216,10 @@ def p_losses(self, x_start, t, cond, noise=None):
 
         return x_recon, noise
 
-    def inference(self, cond, b=1, device=None):
+    def inference(self, cond, b=1, src_spec=None, device=None):
+        depth = hparams.get('diff_depth', self.k_step)
+        # TODO: implement shallow diffusion
+        t = self.k_step
         shape = (b, self.num_feats, self.out_dims, cond.shape[2])
         x = torch.randn(shape, device=device)
@@ -329,7 +332,7 @@ def wrapped(x, t, **kwargs):
         x = x.transpose(2, 3).squeeze(1)  # [B, F, M, T] => [B, T, M] or [B, F, T, M]
         return x
 
-    def forward(self, condition, gt_spec=None, infer=True):
+    def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
         """
             conditioning diffusion, use fastspeech2 encoder output as the condition
         """
@@ -344,6 +347,8 @@ def forward(self, condition, gt_spec=None, infer=True):
             t = torch.randint(0, self.k_step, (b,), device=device).long()
             return self.p_losses(spec, t, cond=cond)
         else:
+            # src_spec: [B, T, M]
+            # TODO: implement shallow diffusion
             x = self.inference(cond, b=b, device=device)
             return self.denorm_spec(x)

From 4f3d765d2a9f6861d46a640256ff81e3158e9f58 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Sun, 6 Aug 2023 21:32:27 +0800
Subject: [PATCH 02/33] Support aux decoder training

---
 configs/acoustic.yaml     |  9 +++++++
 modules/toplevel.py       | 41 ++++++++++++++++++++++++++++---
 training/acoustic_task.py | 51 +++++++++++++++++++++++++++------------
 3 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index ec98f065a..9913429d8 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -78,6 +78,15 @@ diff_decoder_type: 'wavenet'
 diff_loss_type: l2
 schedule_type: 'linear'
 
+shallow_diffusion_args:
+  train_aux_decoder: true
+  train_diffusion: true
+  shared_encoder: true
+  aux_decoder_arch: ps
+  aux_decoder_args:
+    arch: ps
+    # kernel_size: xxx
+
 # train and eval
 num_sanity_val_steps: 1
 optimizer_args:
diff --git a/modules/toplevel.py b/modules/toplevel.py
index a93ed1e34..41bdee28b 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -20,6 +20,22 @@
 from utils.hparams import hparams
 
 
+class ShallowDiffusionOutput:
+    def __init__(self, *, aux_out=None, diff_out=None):
+        self.aux_out = aux_out
+        self.diff_out = diff_out
+
+
+# TODO: replace the following placeholder with real modules
+class ExampleAuxDecoder(nn.Module):
+    def __init__(self, out_dims):
+        super().__init__()
+        self.out_dims = out_dims
+
+    def forward(self, condition, infer=True):
+        return torch.randn(condition.shape[0], condition.shape[1], self.out_dims, device=condition.device)
+
+
 class DiffSingerAcoustic(ParameterAdaptorModule, CategorizedModule):
     @property
     def category(self):
         return 'acoustic'
@@ -31,6 +47,13 @@ def __init__(self, vocab_size, out_dims):
             vocab_size=vocab_size
         )
+        self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
+        if self.use_shallow_diffusion:
+            # TODO: replace the following placeholder with real modules
+            self.aux_decoder = ExampleAuxDecoder(
+                out_dims=out_dims
+            )
+
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
             num_feats=1,
@@ -49,19 +72,29 @@ def __init__(self, vocab_size, out_dims):
     def forward(
             self, txt_tokens, mel2ph, f0, key_shift=None, speed=None,
             spk_embed_id=None, gt_mel=None, infer=True, **kwargs
-    ):
+    ) -> ShallowDiffusionOutput:
         condition = self.fs2(
             txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed,
             spk_embed_id=spk_embed_id, **kwargs
         )
         if infer:
-            mel_pred = self.diffusion(condition, infer=True)
+            if self.use_shallow_diffusion:
+                aux_mel_pred = self.aux_decoder(condition, infer=True)
+                aux_mel_pred *= ((mel2ph > 0).float()[:, :, None])
+            else:
+                aux_mel_pred = None
+            mel_pred = self.diffusion(condition, src_spec=aux_mel_pred, infer=True)
             mel_pred *= ((mel2ph > 0).float()[:, :, None])
-            return mel_pred
+            return ShallowDiffusionOutput(aux_out=aux_mel_pred, diff_out=mel_pred)
         else:
+            if self.use_shallow_diffusion:
+                # TODO: replace the following placeholder with real calling code
+                aux_out = self.aux_decoder(condition, infer=False)
+            else:
+                aux_out = None
             x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
-            return x_recon, noise
+            return ShallowDiffusionOutput(aux_out=aux_out, diff_out=(x_recon, noise))
 
 
 class DiffSingerVariance(ParameterAdaptorModule, CategorizedModule):
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index b0723912b..e34c774be 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -10,7 +10,7 @@
 from basics.base_task import BaseTask
 from basics.base_vocoder import BaseVocoder
 from modules.losses.diff_loss import DiffusionNoiseLoss
-from modules.toplevel import DiffSingerAcoustic
+from modules.toplevel import DiffSingerAcoustic, ShallowDiffusionOutput
 from modules.vocoders.registry import get_vocoder_cls
 from utils.hparams import hparams
 from utils.plot import spec_to_figure, curve_to_figure
@@ -60,6 +60,7 @@ class AcousticTask(BaseTask):
     def __init__(self):
         super().__init__()
         self.dataset_cls = AcousticDataset
+        self.use_shallow_diffusion = hparams['use_shallow_diffusion']
         self.use_vocoder = hparams['infer'] or hparams['val_with_vocoder']
         if self.use_vocoder:
             self.vocoder: BaseVocoder = get_vocoder_cls(hparams)()
@@ -78,6 +79,9 @@ def build_model(self):
 
     # noinspection PyAttributeOutsideInit
     def build_losses_and_metrics(self):
+        if self.use_shallow_diffusion:
+            # TODO: replace the following placeholder with real loss creation
+            self.aux_mel_loss = torch.nn.L1Loss()
         self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type'])
 
     def run_model(self, sample, infer=False):
@@ -96,20 +100,24 @@ def run_model(self, sample, infer=False):
             spk_embed_id = sample['spk_ids']
         else:
             spk_embed_id = None
-        output = self.model(
+        output: ShallowDiffusionOutput = self.model(
             txt_tokens, mel2ph=mel2ph, f0=f0, **variances,
             key_shift=key_shift, speed=speed,
             spk_embed_id=spk_embed_id, gt_mel=target, infer=infer
         )
 
         if infer:
-            return output  # mel_pred
+            return output
         else:
-            x_recon, x_noise = output
+            losses = {}
+            if self.use_shallow_diffusion:
+                aux_out = output.aux_out
+                # TODO: replace the following placeholder with real loss calculation
+                aux_mel_loss = self.aux_mel_loss(aux_out, target)
+                losses['aux_mel_loss'] = aux_mel_loss
+            x_recon, x_noise = output.diff_out
             mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
-            losses = {
-                'mel_loss': mel_loss
-            }
+            losses['mel_loss'] = mel_loss
 
             return losses
 
@@ -126,29 +134,42 @@ def _validation_step(self, sample, batch_idx):
 
         if batch_idx < hparams['num_valid_plots'] \
                 and (self.trainer.distributed_sampler_kwargs or {}).get('rank', 0) == 0:
-            mel_pred = self.run_model(sample, infer=True)
+            mel_out: ShallowDiffusionOutput = self.run_model(sample, infer=True)
             if self.use_vocoder:
-                self.plot_wav(batch_idx, sample['mel'], mel_pred, f0=sample['f0'])
-            self.plot_mel(batch_idx, sample['mel'], mel_pred, name=f'diffmel_{batch_idx}')
+                self.plot_wav(
+                    batch_idx, gt_mel=sample['mel'],
+                    aux_mel=mel_out.aux_out, diff_mel=mel_out.diff_out,
+                    f0=sample['f0']
+                )
+            self.plot_mel(batch_idx, sample['mel'], mel_out.aux_out, name=f'auxmel_{batch_idx}')
+            self.plot_mel(batch_idx, sample['mel'], mel_out.diff_out, name=f'diffmel_{batch_idx}')
 
         return losses, sample['size']
 
     ############
     # validation plots
     ############
-    def plot_wav(self, batch_idx, gt_mel, pred_mel, f0=None):
+    def plot_wav(self, batch_idx, gt_mel, aux_mel=None, diff_mel=None, f0=None):
         gt_mel = gt_mel[0].cpu().numpy()
-        pred_mel = pred_mel[0].cpu().numpy()
+        if aux_mel is not None:
+            aux_mel = aux_mel[0].cpu().numpy()
+        if diff_mel is not None:
+            diff_mel = diff_mel[0].cpu().numpy()
         f0 = f0[0].cpu().numpy()
         if batch_idx not in self.logged_gt_wav:
             gt_wav = self.vocoder.spec2wav(gt_mel, f0=f0)
             self.logger.experiment.add_audio(f'gt_{batch_idx}', gt_wav, sample_rate=hparams['audio_sample_rate'],
                                              global_step=self.global_step)
             self.logged_gt_wav.add(batch_idx)
-        pred_wav = self.vocoder.spec2wav(pred_mel, f0=f0)
-        self.logger.experiment.add_audio(f'pred_{batch_idx}', pred_wav, sample_rate=hparams['audio_sample_rate'],
-                                         global_step=self.global_step)
+        if aux_mel is not None:
+            aux_wav = self.vocoder.spec2wav(aux_mel, f0=f0)
+            self.logger.experiment.add_audio(f'aux_{batch_idx}', aux_wav, sample_rate=hparams['audio_sample_rate'],
+                                             global_step=self.global_step)
+        if diff_mel is not None:
+            diff_wav = self.vocoder.spec2wav(diff_mel, f0=f0)
+            self.logger.experiment.add_audio(f'diff_{batch_idx}', diff_wav, sample_rate=hparams['audio_sample_rate'],
+                                             global_step=self.global_step)
 
     def plot_mel(self, batch_idx, spec, spec_out, name=None):
         name = f'mel_{batch_idx}' if name is None else name

From 2380f88642e8fe53d230cd55c770030f6c046944 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Sun, 6 Aug 2023 21:51:51 +0800
Subject: [PATCH 03/33] Support shallow diffusion inference

---
 inference/ds_acoustic.py | 6 +++---
 scripts/infer.py         | 9 ++++++++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py
index b37727dad..b3254046e 100644
--- a/inference/ds_acoustic.py
+++ b/inference/ds_acoustic.py
@@ -11,7 +11,7 @@
 from basics.base_svs_infer import BaseSVSInfer
 from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST
 from modules.fastspeech.tts_modules import LengthRegulator
-from modules.toplevel import DiffSingerAcoustic
+from modules.toplevel import DiffSingerAcoustic, ShallowDiffusionOutput
 from modules.vocoders.registry import VOCODERS
 from utils import load_ckpt
 from utils.hparams import hparams
@@ -170,12 +170,12 @@ def forward_model(self, sample):
             )  # => [B, T, H]
         else:
             spk_mix_embed = None
-        mel_pred = self.model(
+        mel_pred: ShallowDiffusionOutput = self.model(
             txt_tokens, mel2ph=sample['mel2ph'], f0=sample['f0'], **variances,
             key_shift=sample.get('key_shift'), speed=sample.get('speed'),
             spk_mix_embed=spk_mix_embed, infer=True
         )
-        return mel_pred
+        return mel_pred.diff_out
 
     @torch.no_grad()
     def run_vocoder(self, spec, **kwargs):
diff --git a/scripts/infer.py b/scripts/infer.py
index 00389c22b..c53c7f81f 100644
--- a/scripts/infer.py
+++ b/scripts/infer.py
@@ -47,6 +47,7 @@ def main():
 @click.option('--key', type=int, required=False, default=0, help='Key transition of pitch')
 @click.option('--gender', type=float, required=False, help='Formant shifting (gender control)')
 @click.option('--seed', type=int, required=False, default=-1, help='Random seed of the inference')
+@click.option('--depth', type=int, required=False, default=-1, help='Shallow diffusion depth')
 @click.option('--speedup', type=int, required=False, default=0, help='Diffusion acceleration ratio')
 @click.option('--mel', is_flag=True, help='Save intermediate mel format instead of waveform')
 def acoustic(
@@ -60,6 +61,7 @@ def acoustic(
         key: int,
         gender: float,
         seed: int,
+        depth: int,
         speedup: int,
         mel: bool
 ):
@@ -107,8 +109,13 @@ def acoustic(
             f'Vocoder ckpt \'{hparams["vocoder_ckpt"]}\' not found. ' \
             f'Please put it to the checkpoints directory to run inference.'
 
+    if depth >= 0:
+        assert depth <= hparams['K_step'], f'Diffusion depth should not be larger than K_step {hparams["K_step"]}.'
+    else:
+        depth = hparams['K_step']  # gaussian start (full depth diffusion)
+
     if speedup > 0:
-        assert hparams['K_step'] % speedup == 0, f'Acceleration ratio must be factor of K_step {hparams["K_step"]}.'
+        assert depth % speedup == 0, f'Acceleration ratio must be factor of diffusion depth {depth}.'
         hparams['pndm_speedup'] = speedup
 
     spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None

From e3863485a4541f4ecae6e94c62a1d1c844d0e75e Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 11:38:35 +0800
Subject: [PATCH 04/33] add shallow framework

---
 configs/acoustic.yaml                   |  7 ++-
 modules/shallow/__init__.py             |  0
 modules/shallow/fast_speech2_decoder.py | 82 +++++++++++++++++++++++++
 modules/shallow/shallow_adapter.py      | 53 ++++++++++++++++
 modules/toplevel.py                     |  7 +--
 training/acoustic_task.py               |  4 +-
 6 files changed, 144 insertions(+), 9 deletions(-)
 create mode 100644 modules/shallow/__init__.py
 create mode 100644 modules/shallow/fast_speech2_decoder.py
 create mode 100644 modules/shallow/shallow_adapter.py

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 9913429d8..e9d333b07 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -81,10 +81,11 @@ schedule_type: 'linear'
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
-  shared_encoder: true
-  aux_decoder_arch: ps
+  aux_decoder_arch: fs2
+  aux_decode_strict_hparams: true
   aux_decoder_args:
-    arch: ps
+    shared_encoder: true
+#    arch: ps
     # kernel_size: xxx
 
 # train and eval
diff --git a/modules/shallow/__init__.py b/modules/shallow/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
new file mode 100644
index 000000000..5bd20c1f6
--- /dev/null
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -0,0 +1,82 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+
+
+
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+        drop_path (float, optional): Drop path rate (currently kept as nn.Identity). Defaults to 0.0.
+        drop_out (float, optional): Dropout rate applied after the pointwise convolutions. Defaults to 0.0.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            intermediate_dim: int,
+            layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0
+    ):
+        super().__init__()
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
+
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value is not None and layer_scale_init_value > 0
+            else None
+        )
+        # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.drop_path = nn.Identity()
+        self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity()
+
+    def forward(self, x: torch.Tensor, ) -> torch.Tensor:
+        residual = x
+        x = self.dwconv(x)
+        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
+
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)
+        x = self.dropout(x)
+
+        x = residual + self.drop_path(x)
+        return x
+
+
+class fs2_decode(nn.Module):
+    def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers):
+        super().__init__()
+        self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
+        self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)])
+        self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
+
+    def losses(self,x,gt):
+        return nn.L1Loss()(x,gt)
+
+    def forward(self, x):
+        x=x.transpose(1, 2)
+        x=self.inconv(x)
+        for i in self.conv:
+            x=i(x)
+        x=self.outconv(x).transpose(1, 2)
+        return x
+    pass
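[editor's note — illustrative sketch, not part of the patch series. The decoder above maps the FastSpeech2 encoder output directly to a coarse mel spectrogram; a minimal usage sketch of its shape contract, where all hyper-parameter values (256, 128, 512, ...) are made up for illustration:

    import torch
    decoder = fs2_decode(encoder_hidden=256, out_dims=128, n_chans=512,
                         kernel_size=3, dropout_rate=0.1, n_layers=4)
    condition = torch.randn(2, 100, 256)   # [B, T, H] encoder output
    coarse_mel = decoder(condition)        # [B, T, M] first-stage mel prediction
    assert coarse_mel.shape == (2, 100, 128)
]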
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
new file mode 100644
index 000000000..0e3a5526d
--- /dev/null
+++ b/modules/shallow/shallow_adapter.py
@@ -0,0 +1,53 @@
+import torch
+import torch.nn as nn
+
+cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode'}
+
+
+def build_object_from_class_name(cls_str, parent_cls, strict, *args, **kwargs):
+    import importlib
+
+    pkg = ".".join(cls_str.split(".")[:-1])
+    cls_name = cls_str.split(".")[-1]
+    cls_type = getattr(importlib.import_module(pkg), cls_name)
+    if parent_cls is not None:
+        assert issubclass(cls_type, parent_cls), f'| {cls_type} is not subclass of {parent_cls}.'
+    if strict:
+        return cls_type(*args, **kwargs)
+    return cls_type(*args, **filter_kwargs(kwargs, cls_type))
+
+
+def filter_kwargs(dict_to_filter, kwarg_obj):
+    import inspect
+
+    sig = inspect.signature(kwarg_obj)
+    filter_keys = [param.name for param in sig.parameters.values() if param.kind == param.POSITIONAL_OR_KEYWORD]
+    filtered_dict = {filter_key: dict_to_filter[filter_key] for filter_key in filter_keys if
+                     filter_key in dict_to_filter}
+    return filtered_dict
+
+
+class shallow_adapt(nn.Module):
+    def __init__(self, parame, out_dims):
+        super().__init__()
+        self.parame = parame
+
+        decodeparame=parame['shallow_diffusion_args']['aux_decoder_args']
+        decodeparame[ 'encoder_hidden'] = parame['hidden_size']
+        decodeparame['out_dims'] = out_dims
+
+        self.model = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']],
+                                                  nn.Module,
+                                                  parame['shallow_diffusion_args']['aux_decode_strict_hparams'],
+                                                  **decodeparame)
+        pass
+
+    def forward(self, condition,gt_spec =None, infer=False):
+        if infer:
+            return self.model(condition)
+        else:
+            return self.model.losses(self.model(condition),gt_spec)
+
+        pass
+
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 41bdee28b..fd7af5c6c 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -17,6 +17,7 @@
 from modules.fastspeech.param_adaptor import ParameterAdaptorModule
 from modules.fastspeech.tts_modules import RhythmRegulator, LengthRegulator
 from modules.fastspeech.variance_encoder import FastSpeech2Variance
+from modules.shallow.shallow_adapter import shallow_adapt
 from utils.hparams import hparams
 
 
@@ -50,9 +51,7 @@ def __init__(self, vocab_size, out_dims):
         self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
         if self.use_shallow_diffusion:
             # TODO: replace the following placeholder with real modules
-            self.aux_decoder = ExampleAuxDecoder(
-                out_dims=out_dims
-            )
+            self.aux_decoder = shallow_adapt(hparams, out_dims)
 
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
@@ -89,7 +89,7 @@ def forward(
         else:
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
-                aux_out = self.aux_decoder(condition, infer=False)
+                aux_out = self.aux_decoder(condition, gt_spec=gt_mel, infer=False)
             else:
                 aux_out = None
             x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index e34c774be..11ee5514c 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -113,8 +113,8 @@ def run_model(self, sample, infer=False):
             if self.use_shallow_diffusion:
                 aux_out = output.aux_out
                 # TODO: replace the following placeholder with real loss calculation
-                aux_mel_loss = self.aux_mel_loss(aux_out, target)
-                losses['aux_mel_loss'] = aux_mel_loss
+                # aux_mel_loss = self.aux_mel_loss(aux_out, target)
+                losses['aux_mel_loss'] = aux_out
             x_recon, x_noise = output.diff_out
             mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
             losses['mel_loss'] = mel_loss

From 6ad8fd2d740218cc6945a2d4e7336368e6e1354a Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 12:33:14 +0800
Subject: [PATCH 05/33] add shallow framework

---
 modules/shallow/fast_speech2_decoder.py |  6 ++++--
 modules/shallow/shallow_adapter.py      | 13 ++++++-------
 modules/toplevel.py                     |  2 +-
 training/acoustic_task.py               |  7 ++++---
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
index 5bd20c1f6..ab45cb4ce 100644
--- a/modules/shallow/fast_speech2_decoder.py
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -69,8 +69,10 @@ def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_lay
         self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)])
         self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
 
-    def losses(self,x,gt):
-        return nn.L1Loss()(x,gt)
+
+
+    def build_loss(self):
+        return nn.L1Loss()
 
     def forward(self, x):
         x=x.transpose(1, 2)
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
index 0e3a5526d..d98623894 100644
--- a/modules/shallow/shallow_adapter.py
+++ b/modules/shallow/shallow_adapter.py
@@ -40,14 +40,13 @@ def __init__(self, parame, out_dims):
                                                   nn.Module,
                                                   parame['shallow_diffusion_args']['aux_decode_strict_hparams'],
                                                   **decodeparame)
-        pass
 
-    def forward(self, condition,gt_spec =None, infer=False):
-        if infer:
-            return self.model(condition)
-        else:
-            return self.model.losses(self.model(condition),gt_spec)
 
-        pass
+    def forward(self, condition, infer=False):
+
+        return self.model(condition)
+
+    def get_loss(self):
+        return self.model.build_loss()
diff --git a/modules/toplevel.py b/modules/toplevel.py
index fd7af5c6c..f582d8b7a 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -89,7 +89,7 @@ def forward(
         else:
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
-                aux_out = self.aux_decoder(condition, gt_spec=gt_mel, infer=False)
+                aux_out = self.aux_decoder(condition, infer=False)
             else:
                 aux_out = None
             x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index 11ee5514c..384fe147b 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -81,7 +81,8 @@ def build_model(self):
     def build_losses_and_metrics(self):
         if self.use_shallow_diffusion:
             # TODO: replace the following placeholder with real loss creation
-            self.aux_mel_loss = torch.nn.L1Loss()
+            self.aux_mel_loss =self.model.aux_decoder.get_loss()
+
         self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type'])
 
@@ -113,8 +114,8 @@ def run_model(self, sample, infer=False):
         if self.use_shallow_diffusion:
             aux_out = output.aux_out
             # TODO: replace the following placeholder with real loss calculation
-            # aux_mel_loss = self.aux_mel_loss(aux_out, target)
-            losses['aux_mel_loss'] = aux_out
+            aux_mel_loss = self.aux_mel_loss(aux_out, target)
+            losses['aux_mel_loss'] = aux_mel_loss
             x_recon, x_noise = output.diff_out
             mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
             losses['mel_loss'] = mel_loss

From 6d936106271a7add818a5ce45c3c4c778d4cfae8 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Mon, 7 Aug 2023 14:52:44 +0800
Subject: [PATCH 06/33] Support lambda for aux mel loss

---
 configs/acoustic.yaml     | 5 ++---
 training/acoustic_task.py | 5 +++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index e9d333b07..70b2b7943 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -78,15 +78,14 @@ diff_decoder_type: 'wavenet'
 diff_loss_type: l2
 schedule_type: 'linear'
 
+# shallow diffusion
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
   aux_decoder_arch: fs2
   aux_decode_strict_hparams: true
   aux_decoder_args:
-    shared_encoder: true
-#    arch: ps
-    # kernel_size: xxx
+lambda_aux_mel_loss: 1.0
 
 # train and eval
 num_sanity_val_steps: 1
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index 384fe147b..e1116d4d0 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -81,7 +81,8 @@ def build_model(self):
     def build_losses_and_metrics(self):
         if self.use_shallow_diffusion:
             # TODO: replace the following placeholder with real loss creation
-            self.aux_mel_loss =self.model.aux_decoder.get_loss()
+            self.aux_mel_loss = self.model.aux_decoder.get_loss()
+            self.lambda_aux_mel_loss = hparams['lambda_aux_mel_loss']
         self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type'])
 
@@ -114,7 +115,7 @@ def run_model(self, sample, infer=False):
         if self.use_shallow_diffusion:
             aux_out = output.aux_out
             # TODO: replace the following placeholder with real loss calculation
-            aux_mel_loss = self.aux_mel_loss(aux_out, target)
+            aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target)
             losses['aux_mel_loss'] = aux_mel_loss

From b16c066c4364c6a18d9fffc1587ddc09a13efc53 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Mon, 7 Aug 2023 14:56:36 +0800
Subject: [PATCH 07/33] Move config key

---
 configs/acoustic.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 70b2b7943..64ca4ad0f 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -66,8 +66,6 @@ K_step: 1000
 timesteps: 1000
 max_beta: 0.02
 rel_pos: true
-use_shallow_diffusion: false
-diff_depth: 400
 diff_accelerator: ddim
 pndm_speedup: 10
 hidden_size: 256
@@ -79,6 +77,8 @@ diff_loss_type: l2
 schedule_type: 'linear'
 
 # shallow diffusion
+use_shallow_diffusion: false
+diff_depth: 400
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true

From 8f3a6228c07506aa06d0589d967f1119dff3604f Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 15:33:04 +0800
Subject: [PATCH 08/33] add shallow framework

---
 training/acoustic_task.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index 384fe147b..c5ae633ef 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -114,7 +114,8 @@ def run_model(self, sample, infer=False):
         if self.use_shallow_diffusion:
             aux_out = output.aux_out
             # TODO: replace the following placeholder with real loss calculation
-            aux_mel_loss = self.aux_mel_loss(aux_out, target)
+
+            aux_mel_loss = self.aux_mel_loss(aux_out, (target - (-5)) / (0 - (-5)) * 2 - 1)
             losses['aux_mel_loss'] = aux_mel_loss

From ac29eeb231ba4ceec390a24b4d80d91118563b94 Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 20:01:39 +0800
Subject: [PATCH 09/33] add shallow framework

---
 configs/acoustic.yaml                   |  2 +-
 modules/shallow/fast_speech2_decoder.py | 16 ++++++++++++++--
 modules/shallow/shallow_adapter.py      |  2 +-
 training/acoustic_task.py               |  2 +-
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 64ca4ad0f..c08b35468 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -84,7 +84,7 @@ shallow_diffusion_args:
   train_diffusion: true
   aux_decoder_arch: fs2
   aux_decode_strict_hparams: true
-  aux_decoder_args:
+  aux_decoder_args: {}
 lambda_aux_mel_loss: 1.0
 
 # train and eval
diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
index ab45cb4ce..3c9e15350 100644
--- a/modules/shallow/fast_speech2_decoder.py
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -62,6 +62,15 @@ def forward(self, x: torch.Tensor, ) -> torch.Tensor:
         return x
 
 
+class fs2_loss(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self,y, x):
+        x=(x - (-5)) / (0 - (-5)) * 2 - 1
+        return nn.L1Loss()(y,x)
+
+
 class fs2_decode(nn.Module):
     def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers):
         super().__init__()
@@ -72,13 +81,16 @@ def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_lay
 
 
     def build_loss(self):
-        return nn.L1Loss()
 
-    def forward(self, x):
+        return fs2_loss()
+
+    def forward(self, x,infer):
         x=x.transpose(1, 2)
         x=self.inconv(x)
         for i in self.conv:
             x=i(x)
         x=self.outconv(x).transpose(1, 2)
+        if infer:
+            (x + 1) / 2 * (0 - (-5)) + (-5)
         return x
     pass
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
index d98623894..981e1d517 100644
--- a/modules/shallow/shallow_adapter.py
+++ b/modules/shallow/shallow_adapter.py
@@ -44,7 +44,7 @@ def __init__(self, parame, out_dims):
 
     def forward(self, condition, infer=False):
 
-        return self.model(condition)
+        return self.model(condition,infer)
 
     def get_loss(self):
         return self.model.build_loss()
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index f3742a79b..ae4815d0a 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -116,7 +116,7 @@ def run_model(self, sample, infer=False):
             aux_out = output.aux_out
             # TODO: replace the following placeholder with real loss calculation
 
-            aux_mel_loss = self.aux_mel_loss(aux_out, (target - (-5)) / (0 - (-5)) * 2 - 1)
+            aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target )
             losses['aux_mel_loss'] = aux_mel_loss
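[editor's note — the hard-coded constants above are the mel normalization range (spec_min = -5, spec_max = 0), matching the norm_spec/denorm_spec convention in ddpm.py. fs2_loss maps the target from [-5, 0] into [-1, 1] so it is compared in the domain the decoder emits, and forward() inverts the mapping at inference time. A worked check of the round trip:

    x_norm = (x - (-5)) / (0 - (-5)) * 2 - 1      # -5 -> -1, -2.5 -> 0, 0 -> +1
    x_back = (x_norm + 1) / 2 * (0 - (-5)) + (-5) # recovers x for any x in [-5, 0]
]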
From 5c687ebb7470288cd4b7844363536f2aefffc3e1 Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 20:16:00 +0800
Subject: [PATCH 10/33] Add denorm

---
 modules/shallow/fast_speech2_decoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
index 3c9e15350..539232150 100644
--- a/modules/shallow/fast_speech2_decoder.py
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -91,6 +91,6 @@ def forward(self, x,infer):
             x=i(x)
         x=self.outconv(x).transpose(1, 2)
         if infer:
-            (x + 1) / 2 * (0 - (-5)) + (-5)
+            x=(x + 1) / 2 * (0 - (-5)) + (-5)
         return x
     pass

From a47f9ae4979a904c94be1dabfd0b1076d22f2f55 Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 22:21:57 +0800
Subject: [PATCH 11/33] add shallow model training switch

---
 modules/shallow/fast_speech2_decoder.py |  2 +-
 modules/shallow/shallow_adapter.py      |  1 +
 modules/toplevel.py                     | 18 +++++++++++++++---
 training/acoustic_task.py               | 20 ++++++++++++++++----
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
index 539232150..50774e6da 100644
--- a/modules/shallow/fast_speech2_decoder.py
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -72,7 +72,7 @@ def forward(self,y, x):
 
 
 class fs2_decode(nn.Module):
-    def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers):
+    def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame):
         super().__init__()
         self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
         self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)])
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
index 981e1d517..48bb13c84 100644
--- a/modules/shallow/shallow_adapter.py
+++ b/modules/shallow/shallow_adapter.py
@@ -35,6 +35,7 @@ def __init__(self, parame, out_dims):
         decodeparame=parame['shallow_diffusion_args']['aux_decoder_args']
         decodeparame[ 'encoder_hidden'] = parame['hidden_size']
         decodeparame['out_dims'] = out_dims
+        decodeparame['parame'] = parame
 
         self.model = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']],
                                                   nn.Module,
diff --git a/modules/toplevel.py b/modules/toplevel.py
index f582d8b7a..46dec47f8 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -52,6 +52,8 @@ def __init__(self, vocab_size, out_dims):
         if self.use_shallow_diffusion:
             # TODO: replace the following placeholder with real modules
             self.aux_decoder = shallow_adapt(hparams, out_dims)
+            self.train_aux_decoder=hparams['shallow_diffusion_args']['train_aux_decoder']
+            self.train_diffusion = hparams['shallow_diffusion_args']['train_diffusion']
 
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
@@ -89,11 +91,21 @@ def forward(
         else:
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
-                aux_out = self.aux_decoder(condition, infer=False)
+                if self.train_aux_decoder:
+                    aux_out = self.aux_decoder(condition, infer=False)
+                else:
+                    aux_out = None
+                if self.train_diffusion:
+                    x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
+                    diff_out=(x_recon, noise)
+                else:
+                    diff_out=None
+                return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
+
             else:
                 aux_out = None
-            x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
-            return ShallowDiffusionOutput(aux_out=aux_out, diff_out=(x_recon, noise))
+                x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
+                return ShallowDiffusionOutput(aux_out=aux_out, diff_out=(x_recon, noise))
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index ae4815d0a..771a5607c 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -61,6 +61,10 @@ def __init__(self):
         super().__init__()
         self.dataset_cls = AcousticDataset
         self.use_shallow_diffusion = hparams['use_shallow_diffusion']
+        if self.use_shallow_diffusion:
+            self.train_aux_decoder = hparams['shallow_diffusion_args']['train_aux_decoder']
+            self.train_diffusion = hparams['shallow_diffusion_args']['train_diffusion']
+
         self.use_vocoder = hparams['infer'] or hparams['val_with_vocoder']
         if self.use_vocoder:
             self.vocoder: BaseVocoder = get_vocoder_cls(hparams)()
@@ -112,12 +116,20 @@ def run_model(self, sample, infer=False):
             return output
         else:
             losses = {}
+
             if self.use_shallow_diffusion:
-                aux_out = output.aux_out
-                # TODO: replace the following placeholder with real loss calculation
+                if self.train_aux_decoder:
+                    aux_out = output.aux_out
+                    # TODO: replace the following placeholder with real loss calculation
 
-                aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target )
-                losses['aux_mel_loss'] = aux_mel_loss
+                    aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target)
+                    losses['aux_mel_loss'] = aux_mel_loss
+                if self.train_diffusion :
+                    x_recon, x_noise = output.diff_out
+                    mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
+                    losses['mel_loss'] = mel_loss
+                return losses
+
             x_recon, x_noise = output.diff_out
             mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
             losses['mel_loss'] = mel_loss

From 47086928f0484b1a8f8c13b5f5f703a1e7cd639c Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Mon, 7 Aug 2023 22:29:56 +0800
Subject: [PATCH 12/33] Limit gradient from aux decoder

---
 configs/acoustic.yaml |  1 +
 modules/toplevel.py   | 14 ++++++++------
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index c08b35468..ba6204163 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -82,6 +82,7 @@ diff_depth: 400
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
+  aux_decoder_grad: 0.1
   aux_decoder_arch: fs2
   aux_decode_strict_hparams: true
   aux_decoder_args: {}
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 46dec47f8..950d7d74f 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -50,10 +50,11 @@ def __init__(self, vocab_size, out_dims):
 
         self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
         if self.use_shallow_diffusion:
-            # TODO: replace the following placeholder with real modules
+            shallow_args = hparams['shallow_diffusion_args']
+            self.train_aux_decoder = shallow_args['train_aux_decoder']
+            self.train_diffusion = shallow_args['train_diffusion']
+            self.aux_decoder_grad = shallow_args['aux_decoder_grad']
             self.aux_decoder = shallow_adapt(hparams, out_dims)
-            self.train_aux_decoder=hparams['shallow_diffusion_args']['train_aux_decoder']
-            self.train_diffusion = hparams['shallow_diffusion_args']['train_diffusion']
 
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
@@ -92,14 +93,15 @@ def forward(
         else:
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
                 if self.train_aux_decoder:
-                    aux_out = self.aux_decoder(condition, infer=False)
+                    aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad)
+                    aux_out = self.aux_decoder(aux_cond, infer=False)
                 else:
                     aux_out = None
                 if self.train_diffusion:
                     x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
-                    diff_out=(x_recon, noise)
+                    diff_out = (x_recon, noise)
                 else:
-                    diff_out=None
+                    diff_out = None
                 return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
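[editor's note — illustrative sketch, not part of the patch series. The aux_cond expression above is a standard gradient-scaling trick: its forward value equals condition exactly, but only a fraction g of the gradient flows back into the shared encoder. A self-contained check:

    import torch
    g = 0.1
    condition = torch.ones(3, requires_grad=True)
    aux_cond = condition * g + condition.detach() * (1 - g)  # value unchanged
    aux_cond.sum().backward()
    print(condition.grad)  # tensor([0.1, 0.1, 0.1]) instead of all ones
]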
From f449e04b02547cc8555232c2677789053a8159a7 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Mon, 7 Aug 2023 22:37:32 +0800
Subject: [PATCH 13/33] Improve loss calculation control flow

---
 training/acoustic_task.py | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index 771a5607c..6969b5f5d 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -62,8 +62,9 @@ def __init__(self):
         self.dataset_cls = AcousticDataset
         self.use_shallow_diffusion = hparams['use_shallow_diffusion']
         if self.use_shallow_diffusion:
-            self.train_aux_decoder = hparams['shallow_diffusion_args']['train_aux_decoder']
-            self.train_diffusion = hparams['shallow_diffusion_args']['train_diffusion']
+            shallow_args = hparams['shallow_diffusion_args']
+            self.train_aux_decoder = shallow_args['train_aux_decoder']
+            self.train_diffusion = shallow_args['train_diffusion']
 
         self.use_vocoder = hparams['infer'] or hparams['val_with_vocoder']
         if self.use_vocoder:
@@ -84,10 +85,8 @@ def build_model(self):
     # noinspection PyAttributeOutsideInit
     def build_losses_and_metrics(self):
         if self.use_shallow_diffusion:
-            # TODO: replace the following placeholder with real loss creation
             self.aux_mel_loss = self.model.aux_decoder.get_loss()
             self.lambda_aux_mel_loss = hparams['lambda_aux_mel_loss']
-
         self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type'])
 
     def run_model(self, sample, infer=False):
@@ -117,22 +116,15 @@ def run_model(self, sample, infer=False):
         else:
             losses = {}
 
-            if self.use_shallow_diffusion:
-                if self.train_aux_decoder:
-                    aux_out = output.aux_out
-                    # TODO: replace the following placeholder with real loss calculation
-
-                    aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target)
-                    losses['aux_mel_loss'] = aux_mel_loss
-                if self.train_diffusion :
-                    x_recon, x_noise = output.diff_out
-                    mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
-                    losses['mel_loss'] = mel_loss
-                return losses
+            if output.aux_out is not None:
+                aux_out = output.aux_out
+                aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target)
+                losses['aux_mel_loss'] = aux_mel_loss
 
-            x_recon, x_noise = output.diff_out
-            mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
-            losses['mel_loss'] = mel_loss
+            if output.diff_out is not None:
+                x_recon, x_noise = output.diff_out
+                mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
+                losses['mel_loss'] = mel_loss
 
             return losses
 
@@ -157,8 +149,10 @@ def _validation_step(self, sample, batch_idx):
                     aux_mel=mel_out.aux_out, diff_mel=mel_out.diff_out,
                     f0=sample['f0']
                 )
-            self.plot_mel(batch_idx, sample['mel'], mel_out.aux_out, name=f'auxmel_{batch_idx}')
-            self.plot_mel(batch_idx, sample['mel'], mel_out.diff_out, name=f'diffmel_{batch_idx}')
+            if mel_out.aux_out is not None:
+                self.plot_mel(batch_idx, sample['mel'], mel_out.aux_out, name=f'auxmel_{batch_idx}')
+            if mel_out.diff_out is not None:
+                self.plot_mel(batch_idx, sample['mel'], mel_out.diff_out, name=f'diffmel_{batch_idx}')
 
         return losses, sample['size']

From 28a67ae9769d3daa6c5450060e57b5c00eaa2d46 Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Tue, 8 Aug 2023 12:30:47 +0800
Subject: [PATCH 14/33] add independent encoder in shallow

---
 configs/acoustic.yaml              |  6 ++++-
 modules/shallow/shallow_adapter.py | 38 ++++++++++++++++++++++++------
 modules/toplevel.py                |  9 ++++---
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index ba6204163..29052088a 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -82,9 +82,13 @@ diff_depth: 400
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
+  aux_share_encoder: true
+  aux_encoder_strict_hparams: false
+  aux_encoder_arch: fs2
+  aux_encoder_args: {}
   aux_decoder_grad: 0.1
   aux_decoder_arch: fs2
-  aux_decode_strict_hparams: true
+  aux_decoder_strict_hparams: true
   aux_decoder_args: {}
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
index 48bb13c84..f2811423d 100644
--- a/modules/shallow/shallow_adapter.py
+++ b/modules/shallow/shallow_adapter.py
@@ -2,7 +2,7 @@
 import torch.nn as nn
 
 cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode'}
-
+encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'}
 
 def build_object_from_class_name(cls_str, parent_cls, strict, *args, **kwargs):
     import importlib
@@ -28,7 +28,7 @@ def filter_kwargs(dict_to_filter, kwarg_obj):
 
 
 class shallow_adapt(nn.Module):
-    def __init__(self, parame, out_dims):
+    def __init__(self, parame, out_dims,vocab_size):
         super().__init__()
         self.parame = parame
 
@@ -37,17 +37,41 @@ def __init__(self, parame, out_dims):
         decodeparame['out_dims'] = out_dims
         decodeparame['parame'] = parame
 
-        self.model = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']],
+        encoderparame=parame['shallow_diffusion_args']['aux_encoder_args']
+        encoderparame['parame'] = parame
+        encoderparame['vocab_size'] = vocab_size
+        self.decoder = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']],
                                                   nn.Module,
-                                                  parame['shallow_diffusion_args']['aux_decode_strict_hparams'],
+                                                  parame['shallow_diffusion_args']['aux_decoder_strict_hparams'],
                                                   **decodeparame)
 
+        if not parame['shallow_diffusion_args']['aux_share_encoder']:
+            # todo
+            self.use_encoder=True
+            self.encoder=build_object_from_class_name(encoder_cls_map[parame['shallow_diffusion_args']['aux_encoder_arch']],
+                                                  nn.Module,
+                                                  parame['shallow_diffusion_args']['aux_encoder_strict_hparams'],
+                                                  **encoderparame)
+        else:
+            self.use_encoder = False
 
-    def forward(self, condition, infer=False):
 
-        return self.model(condition,infer)
+    def forward(self, condition, infer=False,txt_tokens=None, mel2ph=None, f0=None,
+                key_shift=None, speed=None,
+                spk_embed_id=None, **kwargs):
+
+        if self.use_encoder:
+            condition=self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0,
+                                   key_shift=key_shift, speed=speed,
+                                   spk_embed_id=spk_embed_id, **kwargs)
+
+        return self.decoder(condition,infer)
 
     def get_loss(self):
-        return self.model.build_loss()
+        return self.decoder.build_loss()
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 950d7d74f..0fd577add 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -54,7 +54,7 @@ def __init__(self, vocab_size, out_dims):
             self.train_aux_decoder = shallow_args['train_aux_decoder']
             self.train_diffusion = shallow_args['train_diffusion']
             self.aux_decoder_grad = shallow_args['aux_decoder_grad']
-            self.aux_decoder = shallow_adapt(hparams, out_dims)
+            self.aux_decoder = shallow_adapt(hparams, out_dims,vocab_size)
 
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
@@ -82,7 +82,9 @@ def forward(
 
         if infer:
             if self.use_shallow_diffusion:
-                aux_mel_pred = self.aux_decoder(condition, infer=True)
+                aux_mel_pred = self.aux_decoder(condition, infer=True,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0,
+                                                key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs)
+
                 aux_mel_pred *= ((mel2ph > 0).float()[:, :, None])
             else:
                 aux_mel_pred = None
@@ -94,7 +96,8 @@ def forward(
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
                 if self.train_aux_decoder:
                     aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad)
-                    aux_out = self.aux_decoder(aux_cond, infer=False)
+                    aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0,
+                                               key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs)
                 else:
                     aux_out = None

From 144c7760b3a696dbd4a995c391473e34b2dc53d6 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 12:42:28 +0800
Subject: [PATCH 15/33] Adjust lambda

---
 configs/acoustic.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 29052088a..82f2163a8 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -90,7 +90,7 @@ shallow_diffusion_args:
   aux_decoder_arch: fs2
   aux_decoder_strict_hparams: true
   aux_decoder_args: {}
-lambda_aux_mel_loss: 1.0
+lambda_aux_mel_loss: 0.2
 
 # train and eval
 num_sanity_val_steps: 1
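[editor's note — the next patch realizes the shallow start. Instead of sampling from pure Gaussian noise, the aux decoder's mel is pushed forward to step t_max - 1 with the standard DDPM forward process and denoising starts from there:

    x_t = sqrt(alphas_cumprod[t]) * x_start + sqrt(1 - alphas_cumprod[t]) * noise

which is what the q_sample() call below computes; only the last t_max (rather than the full K_step) reverse steps then need to be run.]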
From e269708e3c32891de6beadfa5a73e2e62e1fd371 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 12:43:17 +0800
Subject: [PATCH 16/33] Implement shallow diffusion

There are some issues to resolve in DPM-Solver++ and UniPC

---
 modules/diffusion/ddpm.py | 44 ++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index d17070cb8..9b7d11eaf 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -216,15 +216,24 @@ def p_losses(self, x_start, t, cond, noise=None):
 
         return x_recon, noise
 
-    def inference(self, cond, b=1, src_spec=None, device=None):
+    def inference(self, cond, b=1, x_start=None, device=None):
         depth = hparams.get('diff_depth', self.k_step)
-        # TODO: implement shallow diffusion
-        t = self.k_step
-        shape = (b, self.num_feats, self.out_dims, cond.shape[2])
-        x = torch.randn(shape, device=device)
+        noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device)
+        if x_start is None or depth >= self.k_step:
+            t_max = self.k_step
+            x = noise
+        elif depth > 0:
+            t_max = depth
+            x = self.q_sample(
+                x_start, torch.full((b,), t_max - 1, device=device, dtype=torch.long), noise
+            )
+        else:
+            t_max = 0
+            x = x_start
+
         if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1:
             algorithm = hparams.get('diff_accelerator', 'ddim')
+            algorithm = 'pndm'
             if algorithm == 'dpm-solver':
                 from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
                 # 1. Define the noise schedule.
@@ -254,7 +263,7 @@ def wrapped(x, t, **kwargs):
                 # costs and the sample quality.
                 dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
 
-                steps = t // hparams["pndm_speedup"]
+                steps = t_max // hparams["pndm_speedup"]
                 self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False)
                 x = dpm_solver.sample(
                     x,
@@ -292,7 +301,7 @@ def wrapped(x, t, **kwargs):
                 # costs and the sample quality.
                 uni_pc = UniPC(model_fn, noise_schedule, variant='bh2')
 
-                steps = t // hparams["pndm_speedup"]
+                steps = t_max // hparams["pndm_speedup"]
                 self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False)
                 x = uni_pc.sample(
                     x,
@@ -306,8 +315,8 @@ def wrapped(x, t, **kwargs):
                 self.noise_list = deque(maxlen=4)
                 iteration_interval = hparams['pndm_speedup']
                 for i in tqdm(
-                        reversed(range(0, t, iteration_interval)), desc='sample time step',
-                        total=t // iteration_interval, disable=not hparams['infer'], leave=False
+                        reversed(range(0, t_max, iteration_interval)), desc='sample time step',
+                        total=t_max // iteration_interval, disable=not hparams['infer'], leave=False
                 ):
                     x = self.p_sample_plms(
                         x, torch.full((b,), i, device=device, dtype=torch.long),
@@ -316,8 +325,8 @@ def wrapped(x, t, **kwargs):
             elif algorithm == 'ddim':
                 iteration_interval = hparams['pndm_speedup']
                 for i in tqdm(
-                        reversed(range(0, t, iteration_interval)), desc='sample time step',
-                        total=t // iteration_interval, disable=not hparams['infer'], leave=False
+                        reversed(range(0, t_max, iteration_interval)), desc='sample time step',
+                        total=t_max // iteration_interval, disable=not hparams['infer'], leave=False
                 ):
                     x = self.p_sample_ddim(
                         x, torch.full((b,), i, device=device, dtype=torch.long),
@@ -326,7 +335,7 @@ def wrapped(x, t, **kwargs):
             else:
                 raise NotImplementedError(algorithm)
         else:
-            for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t,
+            for i in tqdm(reversed(range(0, t_max)), desc='sample time step', total=t_max,
                           disable=not hparams['infer'], leave=False):
                 x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
         x = x.transpose(2, 3).squeeze(1)  # [B, F, M, T] => [B, T, M] or [B, F, T, M]
@@ -347,9 +356,14 @@ def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
             t = torch.randint(0, self.k_step, (b,), device=device).long()
             return self.p_losses(spec, t, cond=cond)
         else:
-            # src_spec: [B, T, M]
-            # TODO: implement shallow diffusion
-            x = self.inference(cond, b=b, device=device)
+            # src_spec: [B, T, M] or [B, F, T, M]
+            if src_spec is not None:
+                spec = self.norm_spec(src_spec).transpose(-2, -1)
+                if self.num_feats == 1:
+                    spec = spec[:, None, :, :]
+            else:
+                spec = None
+            x = self.inference(cond, b=b, x_start=spec, device=device)
             return self.denorm_spec(x)

From 52b3125f5cd41cfad81c24e8ffe5b922a940191b Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 12:43:38 +0800
Subject: [PATCH 17/33] Fix missing depth assignment

---
 scripts/infer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/infer.py b/scripts/infer.py
index c53c7f81f..d73b6268c 100644
--- a/scripts/infer.py
+++ b/scripts/infer.py
@@ -111,6 +111,7 @@ def acoustic(
 
     if depth >= 0:
         assert depth <= hparams['K_step'], f'Diffusion depth should not be larger than K_step {hparams["K_step"]}.'
+        hparams['diff_depth'] = depth
     else:
         depth = hparams['K_step']  # gaussian start (full depth diffusion)
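[editor's note — hypothetical invocation, for illustration only; the project file and experiment name below are made up. With the two patches above applied, shallow inference could be requested from the command line along the lines of:

    python scripts/infer.py acoustic my_song.ds --exp my_experiment --depth 300 --speedup 10

where --depth must satisfy depth <= K_step and depth % speedup == 0 per the assertions in scripts/infer.py, and the assignment added above is what actually propagates --depth into hparams['diff_depth'] read by GaussianDiffusion.inference().]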
From 030223be14a88afec44e8d3d9c4a8e87c0d0dfff Mon Sep 17 00:00:00 2001
From: "llc1995@sina.com"
Date: Tue, 8 Aug 2023 14:05:41 +0800
Subject: [PATCH 18/33] fix bugs of shallow diffusion inference

---
 modules/diffusion/ddpm.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 9b7d11eaf..76bde96bf 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -219,7 +219,7 @@ def p_losses(self, x_start, t, cond, noise=None):
     def inference(self, cond, b=1, x_start=None, device=None):
         depth = hparams.get('diff_depth', self.k_step)
         noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device)
-        if x_start is None or depth >= self.k_step:
+        if x_start is None or depth > self.k_step:
             t_max = self.k_step
             x = noise
         elif depth > 0:
@@ -231,13 +231,13 @@ def inference(self, cond, b=1, x_start=None, device=None):
             t_max = 0
             x = x_start
 
-        if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1:
+        if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and tmax > 0:
             algorithm = hparams.get('diff_accelerator', 'ddim')
             algorithm = 'pndm'
             if algorithm == 'dpm-solver':
                 from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
                 # 1. Define the noise schedule.
-                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas)
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:tmax])
@@ -276,7 +276,7 @@ def wrapped(x, t, **kwargs):
             elif algorithm == 'unipc':
                 from inference.uni_pc import NoiseScheduleVP, model_wrapper, UniPC
                 # 1. Define the noise schedule.
-                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas)
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:tmax])

From 39bdcb85e4f78858239c6664ec1b1b58639899af Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 14:29:59 +0800
Subject: [PATCH 19/33] Fix errors and remove debug code

---
 modules/diffusion/ddpm.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 76bde96bf..6dadbf443 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -219,7 +219,7 @@ def p_losses(self, x_start, t, cond, noise=None):
     def inference(self, cond, b=1, x_start=None, device=None):
         depth = hparams.get('diff_depth', self.k_step)
         noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device)
-        if x_start is None or depth > self.k_step:
+        if x_start is None or depth >= self.k_step:
             t_max = self.k_step
             x = noise
         elif depth > 0:
@@ -231,13 +231,12 @@ def inference(self, cond, b=1, x_start=None, device=None):
             t_max = 0
             x = x_start
 
-        if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and tmax > 0:
+        if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and t_max > 0:
            algorithm = hparams.get('diff_accelerator', 'ddim')
-            algorithm = 'pndm'
            if algorithm == 'dpm-solver':
                from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
                # 1. Define the noise schedule.
-                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:tmax])
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t_max])
@@ -276,7 +275,7 @@ def wrapped(x, t, **kwargs):
             elif algorithm == 'unipc':
                 from inference.uni_pc import NoiseScheduleVP, model_wrapper, UniPC
                 # 1. Define the noise schedule.
-                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:tmax])
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t_max])
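[editor's note — betas[:t_max] above truncates the discrete noise schedule so that DPM-Solver++ and UniPC integrate only over the shallow segment of steps 0 .. t_max-1 instead of the full schedule; the commit message of PATCH 16 flagged these two samplers as the remaining issue, and this truncation (together with the t_max > 0 guard) is the corresponding fix.]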
From eb114d6a2e9bad7b4f134fc70692f10c9853b199 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 14:46:01 +0800
Subject: [PATCH 20/33] Support K_step < timesteps (shallow-only diffusion)

---
 modules/diffusion/ddpm.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 6dadbf443..62f6d5bb5 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -81,6 +81,12 @@ def __init__(self, out_dims, num_feats=1, timesteps=1000, k_step=1000,
         alphas_cumprod = np.cumprod(alphas, axis=0)
         alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
 
+        self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
+        if self.use_shallow_diffusion:
+            assert k_step <= timesteps, 'K_step should not be larger than timesteps.'
+        else:
+            assert k_step == timesteps, 'K_step must equal timesteps if use_shallow_diffusion is False.'
+
         self.timesteps = timesteps
         self.k_step = k_step
         self.noise_list = deque(maxlen=4)
@@ -219,16 +225,19 @@ def p_losses(self, x_start, t, cond, noise=None):
     def inference(self, cond, b=1, x_start=None, device=None):
         depth = hparams.get('diff_depth', self.k_step)
         noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device)
-        if x_start is None or depth >= self.k_step:
+        if self.use_shallow_diffusion:
+            t_max = min(depth, self.k_step)
+        else:
             t_max = self.k_step
+
+        if t_max >= self.timesteps:
             x = noise
-        elif depth > 0:
-            t_max = depth
+        elif t_max > 0:
+            assert x_start is not None, 'Missing shallow diffusion source.'
             x = self.q_sample(
                 x_start, torch.full((b,), t_max - 1, device=device, dtype=torch.long), noise
             )
         else:
-            t_max = 0
             x = x_start

From 348e7cc84f900f4dd3eb991db9743e62d43a8f41 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 15:59:04 +0800
Subject: [PATCH 21/33] Fix argument passing

---
 modules/toplevel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/toplevel.py b/modules/toplevel.py
index 0fd577add..e915f9dc9 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -228,7 +228,7 @@ def forward(
             ]
             condition += torch.stack(variance_embeds, dim=-1).sum(-1)
 
-        variance_outputs = self.variance_predictor(condition, variance_inputs, infer)
+        variance_outputs = self.variance_predictor(condition, variance_inputs, infer=infer)
 
         if infer:
             variances_pred_out = self.collect_variance_outputs(variance_outputs)

From 554c4ac692e1c15e1dd4ab459442495f4c6a3497 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Wed, 9 Aug 2023 14:41:56 +0800
Subject: [PATCH 22/33] Add missing checks

---
 modules/diffusion/ddpm.py | 1 +
 scripts/infer.py          | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 62f6d5bb5..7c4215bf1 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -238,6 +238,7 @@ def inference(self, cond, b=1, x_start=None, device=None):
                 x_start, torch.full((b,), t_max - 1, device=device, dtype=torch.long), noise
             )
         else:
+            assert x_start is not None, 'Missing shallow diffusion source.'
             x = x_start
 
diff --git a/scripts/infer.py b/scripts/infer.py
index d73b6268c..0d6b8f5eb 100644
--- a/scripts/infer.py
+++ b/scripts/infer.py
@@ -112,6 +112,8 @@ def acoustic(
     if depth >= 0:
         assert depth <= hparams['K_step'], f'Diffusion depth should not be larger than K_step {hparams["K_step"]}.'
         hparams['diff_depth'] = depth
+    elif hparams.get('use_shallow_diffusion', False):
+        depth = hparams['diff_depth']
     else:
         depth = hparams['K_step']  # gaussian start (full depth diffusion)
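[editor's summary sketch — not part of the patch series. The boundary logic that patches 16/19/20/22 converge on, condensed for reading; names mirror GaussianDiffusion.inference():

    t_max = min(depth, self.k_step) if self.use_shallow_diffusion else self.k_step
    if t_max >= self.timesteps:
        x = noise                                # gaussian start: full-depth diffusion
    elif t_max > 0:
        assert x_start is not None, 'Missing shallow diffusion source.'
        t = torch.full((b,), t_max - 1, device=device, dtype=torch.long)
        x = self.q_sample(x_start, t, noise)     # shallow start from the aux mel
    else:
        assert x_start is not None, 'Missing shallow diffusion source.'
        x = x_start                              # depth 0: pass the aux mel through
    # ...then run t_max reverse steps (optionally accelerated) starting from x
]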
hparams['diff_depth'] = depth + elif hparams.get('use_shallow_diffusion', False): + depth = hparams['diff_depth'] else: depth = hparams['K_step'] # gaussian start (full depth diffusion) From b04b0391e98c55e85a2b969197e765d346ecbf9c Mon Sep 17 00:00:00 2001 From: autumn <2> Date: Thu, 17 Aug 2023 19:09:23 +0800 Subject: [PATCH 23/33] add glow decoder --- modules/shallow/fast_speech2_decoder.py | 2 +- modules/shallow/fs2_decoder.py | 300 +++++++ modules/shallow/glow.py | 1000 +++++++++++++++++++++++ modules/shallow/light_decoder.py | 109 +++ modules/shallow/noise_decoder.py | 100 +++ modules/shallow/shallow_adapter.py | 53 +- modules/toplevel.py | 2 +- 7 files changed, 1537 insertions(+), 29 deletions(-) create mode 100644 modules/shallow/fs2_decoder.py create mode 100644 modules/shallow/glow.py create mode 100644 modules/shallow/light_decoder.py create mode 100644 modules/shallow/noise_decoder.py diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py index 50774e6da..ec264f3ce 100644 --- a/modules/shallow/fast_speech2_decoder.py +++ b/modules/shallow/fast_speech2_decoder.py @@ -84,7 +84,7 @@ def build_loss(self): return fs2_loss() - def forward(self, x,infer): + def forward(self, x,infer,**kwargs): x=x.transpose(1, 2) x=self.inconv(x) for i in self.conv: diff --git a/modules/shallow/fs2_decoder.py b/modules/shallow/fs2_decoder.py new file mode 100644 index 000000000..073819dd1 --- /dev/null +++ b/modules/shallow/fs2_decoder.py @@ -0,0 +1,300 @@ +import math + +import torch +from torch import nn +import torch.nn.functional as F + + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape +class RelativeFFTBlock(nn.Module): + """ FFT Block with Relative Multi-Head Attention """ + + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=None, block_length=None): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, + window_size=window_size, p_dropout=p_dropout, block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN( + hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask=None): + + if x_mask is not None: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + else: + attn_mask = None + + for i in range(self.n_layers): + if x_mask is not None: + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + if x_mask is not None: + x = x * x_mask + return x + + +class RelativeSelfAttention(nn.Module): + """ Relative Multi-Head Attention """ + + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., block_length=None, proximal_bias=False, 
proximal_init=False): + super(RelativeSelfAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, + t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, + t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1) + ) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position( + rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." 
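+            # Add a proximity bias of -log(1 + |i - j|) to the logits so that
+            # each frame attends more strongly to its neighbours.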
+ scores = scores + \ + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like( + scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4*(1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position( + p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s) + output = output + \ + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view( + b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:, + slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [0, length-1]])) + + # Reshape and slice out the padded elements. + x_final = x_flat.view( + [batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] + return x_final + + def _absolute_position_to_relative_position(self, x): + batch, heads, length, _ = x.size() + # padd along column + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, length-1]])) + x_flat = x.view([batch, heads, length**2 + length*(length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2*length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """ + Bias for self-attention to encourage attention to close positions. 
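+        Returns a [1, 1, length, length] tensor of -log(1 + |i - j|) values.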
+ """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-4): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + n_dims = len(x.shape) + mean = torch.mean(x, 1, keepdim=True) + variance = torch.mean((x - mean)**2, 1, keepdim=True) + + x = (x - mean) * torch.rsqrt(variance + self.eps) + + shape = [1, -1] + [1] * (n_dims - 2) + x = x * self.gamma.view(*shape) + self.beta.view(*shape) + return x + + +class FFN(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + + self.conv = nn.Conv1d( + in_channels, out_channels, kernel_size, padding=kernel_size//2) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask=None): + if x_mask is not None: + x = self.conv(x * x_mask) + else: + x = self.conv(x ) + + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + if x_mask is not None: + x=x * x_mask + return x + + +class fs2_loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self,y, x): + x=(x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y,x) + + +class attention_fs2_decoder(nn.Module): + def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,n_heads,attention_ffn_kernel_size,parame): + super().__init__() + self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv = RelativeFFTBlock(hidden_channels=n_chans,filter_channels=n_chans*4, n_heads=n_heads, n_layers=n_layers, kernel_size=attention_ffn_kernel_size, p_dropout=dropout_rate) + self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + + + + def build_loss(self): + + return fs2_loss() + + def forward(self, x,infer,**kwargs): + x=x.transpose(1, 2) + x=self.inconv(x) + + + x=self.conv(x) + x=self.outconv(x).transpose(1, 2) + if infer: + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + pass diff --git a/modules/shallow/glow.py b/modules/shallow/glow.py new file mode 100644 index 000000000..c4be2b6ee --- /dev/null +++ b/modules/shallow/glow.py @@ -0,0 +1,1000 @@ +import math +from typing import Optional + +import torch +import torch.nn as nn + +import torch.nn.functional as F + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +class RelativeFFTBlock(nn.Module): + """ FFT Block with Relative Multi-Head Attention """ + + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., + window_size=None, block_length=None): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = 
nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, + window_size=window_size, p_dropout=p_dropout, + block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN( + hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask=None): + + if x_mask is not None: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + else: + attn_mask = None + + for i in range(self.n_layers): + if x_mask is not None: + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + if x_mask is not None: + x = x * x_mask + return x + + +class RelativeSelfAttention(nn.Module): + """ Relative Multi-Head Attention """ + + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., + block_length=None, proximal_bias=False, proximal_init=False): + super(RelativeSelfAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, + t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, + t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1) + ) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." 
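+            # Add content-to-position scores from learned embeddings of the
+            # relative offsets in [-window_size, window_size].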
+ key_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position( + rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + \ + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like( + scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position( + p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s) + output = output + \ + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view( + b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:, + slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [0, length - 1]])) + + # Reshape and slice out the padded elements. + x_final = x_flat.view( + [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:] + return x_final + + def _absolute_position_to_relative_position(self, x): + batch, heads, length, _ = x.size() + # padd along column + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, length - 1]])) + x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """ + Bias for self-attention to encourage attention to close positions. 
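+        Returns a [1, 1, length, length] bias of -log(1 + |i - j|).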
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-4):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class FFN(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask=None):
+        if x_mask is not None:
+            x = self.conv(x * x_mask)
+        else:
+            x = self.conv(x)
+
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+Conv1dModel = nn.Conv1d  # ugly mutable global switch; should be removed
+
+
+class Depthwise_Separable_Conv1D(nn.Module):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            bias=True,
+            padding_mode='zeros',  # TODO: refine this type
+            device=None,
+            dtype=None
+    ):
+        super().__init__()
+        self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
+                                    groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias,
+                                    padding_mode=padding_mode, device=device, dtype=dtype)
+        self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias,
+                                    device=device, dtype=dtype)
+
+    def forward(self, input):
+        return self.point_conv(self.depth_conv(input))
+
+
+def set_Conv1dModel(use_depthwise_conv):
+    global Conv1dModel
+    Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+@torch.jit.script
+def add_and_GRU(input_a, input_b):
+    in_act = input_a + input_b
+    x1, x2 = in_act.chunk(2, dim=1)
+    t_act = torch.tanh(x2)
+    s_act = torch.sigmoid(x1)
+    acts = t_act * s_act
+    return acts
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels  # for the conditioning input
+        self.p_dropout = p_dropout
+
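+        # Each layer pairs a dilated conv (via Conv1dModel) with a 1x1
+        # conditioning projection and a 1x1 res/skip projection, mirroring
+        # the non-causal WaveNet stack; the gated tanh-sigmoid unit joins
+        # the two halves of each 2 * hidden_channels activation.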
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        self.condition_layers = torch.nn.ModuleList()
+
+        # if gin_channels != 0:
+        #     cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+        #     # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
+        #     self.cond_layer=cond_layer
+
+        for i in range(n_layers):
+
+            if gin_channels != 0:
+                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
+                # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
+                # self.cond_layer = cond_layer
+            else:
+                cond_layer = nn.Identity()
+            self.condition_layers.append(cond_layer)
+
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
+                                   dilation=dilation, padding=padding)
+            # in_layer = weight_norm_modules(in_layer, name='weight')
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            # res_skip_layer = weight_norm_modules(res_skip_layer, name='weight')
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask=None, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        # if g is not None:
+        #     g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+
+            if g is not None:
+                condition = self.condition_layers[i](g)
+            else:
+                condition = torch.zeros_like(x_in)
+
+            # acts = fused_add_tanh_sigmoid_multiply(  # gated tanh-sigmoid unit, as in WaveNet
+            #     x_in,
+            #     condition,
+            #     n_channels_tensor)
+            acts = add_and_GRU(  # gated tanh-sigmoid unit, as in WaveNet
+                x_in,
+                condition,
+            )
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                if x_mask is not None:
+                    x = (x + res_acts) * x_mask
+                else:
+                    x = x + res_acts
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+
+        if x_mask is not None:
+            out = output * x_mask
+        else:
+            out = output
+        return out
+
+    # def remove_weight_norm(self):
+    #     if self.gin_channels != 0:
+    #         remove_weight_norm_modules(self.cond_layer)
+    #     for l in self.in_layers:
+    #         remove_weight_norm_modules(l)
+    #     for l in self.res_skip_layers:
+    #         remove_weight_norm_modules(l)
+
+
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 p_dropout=0,
+                 gin_channels=0,
+                 mean_only=False,
+                 wn_sharing_parameter=None  # WN module shared across flows (see ResidualCouplingBlock)
+                 ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
+                      gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        if x_mask is not None:
+            h = self.pre(x0) * x_mask
+        else:
+            h = self.pre(x0)
+        h = self.enc(h, x_mask, g=g)
+
+        if x_mask is not None:
+            stats = self.post(h) * x_mask
+        else:
+            stats = self.post(h)
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            if x_mask is not None:
+                x1 = m + x1 * torch.exp(logs) * x_mask
+            else:
+                x1 = m + x1 * torch.exp(logs)
+            # x1 = m + x1 * torch.exp(logs) * x_mask  # inverse process
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            if x_mask is not None:
+                x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            else:
+                x1 = (x1 - m) * torch.exp(-logs)
+            # x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
+
+
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0,
+                 share_parameter=False
+                 ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+
+        self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
+                     gin_channels=gin_channels) if share_parameter else None
+
+        for i in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
+            self.flows.append(Flip())
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        if not reverse:
+            logdet_tot = 0
+            for flow in self.flows:
+                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
+                logdet_tot += logdet
+        else:
+            logdet_tot = None
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x, logdet_tot
+
+
+# class TextEncoder(nn.Module):
+#     def __init__(self,
+#                  out_channels,
+#                  hidden_channels,
+#                  kernel_size,
+#                  n_layers,
+#                  gin_channels=0,
+#                  filter_channels=None,
+#                  n_heads=None,
+#                  p_dropout=None):
+#         super().__init__()
+#         self.out_channels = out_channels
+#         self.hidden_channels = hidden_channels
+#         self.kernel_size = kernel_size
+#         self.n_layers = n_layers
+#         self.gin_channels = gin_channels
+#         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+#         self.f0_emb = nn.Embedding(256, hidden_channels)
+#
+#         self.enc_ = attentions.Encoder(
+#             hidden_channels,
+#             filter_channels,
+#             n_heads,
+#             n_layers,
+#             kernel_size,
+#             p_dropout)
+#
+#     def forward(self, x, x_mask, f0=None, noice_scale=1):
+#         x = x + self.f0_emb(f0).transpose(1, 2)
+#         x = self.enc_(x * x_mask, x_mask)
+#         stats = self.proj(x) * x_mask
+#         m, logs = torch.split(stats, self.out_channels, dim=1)
+#         z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
+#
+#         return z, m, logs, x_mask
+
+
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
+ None means non-conditional LayerNorm. Defaults to None. + """ + + def __init__( + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 + + ): + super().__init__() + self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + + self.norm = nn.LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(intermediate_dim, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + + def forward(self, x: torch.Tensor, ) -> torch.Tensor: + residual = x + x = self.dwconv(x) + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x = self.dropout(x) + + x = residual + self.drop_path(x) + return x + + +class condition_latent_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_latent_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = 
nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + + return stats + + +class condition_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + + return stats + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + + condition_in_chans, + + condition_encoder_hidden_channels, + condition_encoder_n_heads, + condition_encoder_n_layers, + condition_encoder_kernel_size, + condition_encoder_dropout_rate, + + inter_channels, + hidden_channels, + + condition_channels, + + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, + ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', + + **kwargs): + + super().__init__() + self.inter_channels = inter_channels + self.ues_condition = ues_condition + + self.use_latent = use_latent + + if use_latent_encoder and use_latent: + if latent_encoder_type == 'attention': + self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=latent_encoder_n_heads, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=latent_encoder_kernel_size, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + elif latent_encoder_type == 'convnext': + self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=None, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=None, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + else: + raise RuntimeError("unsupport_latent_encoder") + + elif ((not use_latent_encoder) and use_latent): + self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) + + if ues_condition_encoder and ues_condition: + if condition_encoder_type == 'attention': + self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=condition_encoder_n_heads, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + 
filter_channels=condition_encoder_filter_channels) + elif condition_encoder_type == 'convnext': + self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=None, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + else: + raise RuntimeError("unsupport__encoder") + elif ((not ues_condition_encoder) and ues_condition): + self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) + + self.use_depthwise_conv = use_depthwise_conv + + # self.enc_p = TextEncoder( + # inter_channels, + # hidden_channels, + # filter_channels=filter_channels, + # n_heads=n_heads, + # n_layers=n_layers, + # kernel_size=kernel_size, + # p_dropout=p_dropout + # ) + + set_Conv1dModel(self.use_depthwise_conv) + + if ues_condition: + condition_channelsw = condition_channels + else: + condition_channelsw = 0 + + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, + gin_channels=condition_channelsw, share_parameter=flow_share_parameter) + + def forward(self, c, mel, x_mask=None): + + # vol proj + + # f0 predict + + # encoder + if self.use_latent: + z_ptemp, m_p, logs_p = self.latent_encoder(c) + else: + m_p, logs_p = None, None + # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) + + # flow + if self.ues_condition: + condition = self.condition_encoder(c) + z_p, logdet = self.flow(mel, x_mask, g=condition) + else: + z_p, logdet = self.flow(mel, x_mask, g=None) + + return x_mask, (z_p, m_p, logs_p), logdet, + + @torch.no_grad() + def infer(self, c, noice_scale=0.35, seed=None, ): + if seed is not None: + + if c.device == torch.device("cuda"): + torch.cuda.manual_seed_all(seed) + else: + torch.manual_seed(seed) + + if self.use_latent: + z_p, m_p, logs_p = self.latent_encoder(c) + else: + z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale + + # vol proj + + # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) + # o, _ = self.flow(z_p, g=g, reverse=True) + + if self.ues_condition: + condition = self.condition_encoder(c) + # z_p, logdet = self.flow(mel, x_mask, g=condition) + o, _ = self.flow(z_p, g=condition, reverse=True) + else: + o, _ = self.flow(z_p, g=None, reverse=True) + + return o + + +class glow_loss_L(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, pack_loss,target): + + z, m, logs, logdet, mask = pack_loss + # z, m, logs, logdet, mask = None + + l = torch.sum(logs) + 0.5 * torch.sum( + torch.exp(-2 * logs) * ((z - m) ** 2)) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + if mask is not None: + l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes + else: + l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l + + +class glow_decoder(nn.Module): + def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, + condition_encoder_kernel_size, 
condition_encoder_dropout_rate, flow_hidden_channels, + flow_condition_channels, parame,flow_infer_seed=None,flow_infer_scale=0.35, + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, + use_latent=True, + ues_condition_encoder=False, ues_condition=False, + condition_encoder_type='attention'): + super().__init__() + self.use_latent=use_latent + self.flow_infer_seed=flow_infer_seed + self.flow_infer_scale=flow_infer_scale + self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, + latent_encoder_n_heads=latent_encoder_n_heads, + latent_encoder_n_layers=latent_encoder_n_layers, + latent_encoder_kernel_size=latent_encoder_kernel_size, + latent_encoder_dropout_rate=latent_encoder_dropout_rate, + + condition_in_chans=encoder_hidden, + + condition_encoder_hidden_channels=condition_encoder_hidden_channels, + condition_encoder_n_heads=condition_encoder_n_heads, + condition_encoder_n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + condition_encoder_dropout_rate=condition_encoder_dropout_rate, + + inter_channels=out_dims, + hidden_channels=flow_hidden_channels, + + condition_channels=flow_condition_channels, + + condition_encoder_filter_channels=condition_encoder_filter_channels, + + latent_encoder_filter_channels=latent_encoder_filter_channels, + + use_depthwise_conv=use_depthwise_conv, + + flow_share_parameter=flow_share_parameter, + n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, + use_latent_encoder=use_latent_encoder, + use_latent=use_latent, + ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, + condition_encoder_type=condition_encoder_type) + + def build_loss(self): + if self.use_latent: + + return glow_loss_L() + + def forward(self, x, infer, x_gt): + + if infer: + out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + return out + else: + + + x = x.transpose(1, 2) + x_gt=x_gt.transpose(1, 2) + + x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt) + pack_loss=(z_p, m_p, logs_p, logdet, x_mask ) + return pack_loss + + + + + pass diff --git a/modules/shallow/light_decoder.py b/modules/shallow/light_decoder.py new file mode 100644 index 000000000..bb2624765 --- /dev/null +++ b/modules/shallow/light_decoder.py @@ -0,0 +1,109 @@ +from typing import Optional + +import torch +import torch.nn as nn + +class GLU(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + out, gate = x.chunk(2, dim=self.dim) + return out * gate.sigmoid() + + +class ConvNeXtBlock(nn.Module): + """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. + + Args: + dim (int): Number of input channels. + intermediate_dim (int): Dimensionality of the intermediate layer. + layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. + Defaults to None. + adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. + None means non-conditional LayerNorm. Defaults to None. 
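+
+    This variant gates the pointwise expansion with a GLU, so the second
+    linear layer projects from intermediate_dim // 2 back to dim.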
+ """ + + def __init__( + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 + + ): + super().__init__() + self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + + + + self.norm = nn.LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.act2=GLU(2) + self.pwconv2 = nn.Linear(intermediate_dim//2, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.dropout=nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + + + + def forward(self, x: torch.Tensor, ) -> torch.Tensor: + residual = x + x=self.act(x) + x = self.dwconv(x) + + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act2(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x=self.dropout(x) + + x = residual + self.drop_path (x) + return x + + +class fs2_loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self,y, x): + x=(x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y,x) + + +class noise_decoder(nn.Module): + def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): + super().__init__() + self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) + self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + + + + def build_loss(self): + + return fs2_loss() + + def forward(self, x,infer,**kwargs): + x=x.transpose(1, 2) + x=self.inconv(x) + + for i in self.conv: + x=i(x) + x=self.outconv(x).transpose(1, 2) + if infer: + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + pass diff --git a/modules/shallow/noise_decoder.py b/modules/shallow/noise_decoder.py new file mode 100644 index 000000000..862caf911 --- /dev/null +++ b/modules/shallow/noise_decoder.py @@ -0,0 +1,100 @@ +from typing import Optional + +import torch +import torch.nn as nn + + + + +class ConvNeXtBlock(nn.Module): + """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. + + Args: + dim (int): Number of input channels. + intermediate_dim (int): Dimensionality of the intermediate layer. + layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. + Defaults to None. + adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. + None means non-conditional LayerNorm. Defaults to None. 
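+
+    This variant adds a conditioning tensor onto the depthwise output through
+    an extra 1x1 convolution, letting the stack refine a noise input towards
+    the conditioned spectrogram.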
+ """ + + def __init__( + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 + + ): + super().__init__() + self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + + + + self.norm = nn.LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(intermediate_dim, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.dropout=nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + self.con = nn.Conv1d(dim, dim, kernel_size=1, ) + + + def forward(self, x: torch.Tensor,y ) -> torch.Tensor: + residual = x + x = self.dwconv(x) + x=x+self.con(y) + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x=self.dropout(x) + + x = residual + self.drop_path (x) + return x + + +class fs2_loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self,y, x): + x=(x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y,x) + + +class noise_decoder(nn.Module): + def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): + super().__init__() + self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) + self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + + + + def build_loss(self): + + return fs2_loss() + + def forward(self, x,infer,**kwargs): + x=x.transpose(1, 2) + x=self.inconv(x) + y=torch.randn_like(x) + for i in self.conv: + y=i(y,x) + x=self.outconv(y).transpose(1, 2) + if infer: + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + pass diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py index f2811423d..11cf5d29f 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/shallow/shallow_adapter.py @@ -1,9 +1,13 @@ import torch import torch.nn as nn -cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode'} +cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode', + 'ns': 'modules.shallow.noise_decoder.noise_decoder', 'ld': 'modules.shallow.light_decoder.noise_decoder','att_fs2':'modules.shallow.fs2_decoder.attention_fs2_decoder' + ,'glow':'modules.shallow.glow.glow_decoder' + } encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} + def build_object_from_class_name(cls_str, parent_cls, strict, *args, **kwargs): import importlib @@ -28,50 +32,45 @@ def filter_kwargs(dict_to_filter, kwarg_obj): class shallow_adapt(nn.Module): - def __init__(self, parame, out_dims,vocab_size): + def __init__(self, parame, out_dims, vocab_size): super().__init__() self.parame = parame - decodeparame=parame['shallow_diffusion_args']['aux_decoder_args'] - decodeparame[ 'encoder_hidden'] = parame['hidden_size'] + decodeparame = parame['shallow_diffusion_args']['aux_decoder_args'] + if decodeparame.get('encoder_hidden') is None: + 
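+            # Fall back to the main model's hidden size when the aux decoder
+            # config does not set encoder_hidden explicitly.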
decodeparame['encoder_hidden'] = parame['hidden_size'] decodeparame['out_dims'] = out_dims decodeparame['parame'] = parame - encoderparame=parame['shallow_diffusion_args']['aux_encoder_args'] + encoderparame = parame['shallow_diffusion_args']['aux_encoder_args'] encoderparame['parame'] = parame encoderparame['vocab_size'] = vocab_size self.decoder = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']], - nn.Module, - parame['shallow_diffusion_args']['aux_decoder_strict_hparams'], - **decodeparame) - + nn.Module, + parame['shallow_diffusion_args']['aux_decoder_strict_hparams'], + **decodeparame) if not parame['shallow_diffusion_args']['aux_share_encoder']: # todo - self.use_encoder=True - self.encoder=build_object_from_class_name(encoder_cls_map[parame['shallow_diffusion_args']['aux_encoder_arch']], - nn.Module, - parame['shallow_diffusion_args']['aux_encoder_strict_hparams'], - **encoderparame) + self.use_encoder = True + self.encoder = build_object_from_class_name( + encoder_cls_map[parame['shallow_diffusion_args']['aux_encoder_arch']], + nn.Module, + parame['shallow_diffusion_args']['aux_encoder_strict_hparams'], + **encoderparame) else: self.use_encoder = False - - - - - def forward(self, condition, infer=False,txt_tokens=None, mel2ph=None, f0=None, - key_shift=None, speed=None, - spk_embed_id=None, **kwargs): + def forward(self, condition, infer=False, txt_tokens=None, mel2ph=None, f0=None, + key_shift=None, speed=None, + spk_embed_id=None,gt_mel=None, **kwargs): if self.use_encoder: - condition=self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed, - spk_embed_id=spk_embed_id, **kwargs) + condition = self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, + key_shift=key_shift, speed=speed, + spk_embed_id=spk_embed_id, **kwargs) - return self.decoder(condition,infer) + return self.decoder(condition, infer,gt_mel) def get_loss(self): return self.decoder.build_loss() - - diff --git a/modules/toplevel.py b/modules/toplevel.py index 0fd577add..41a1fe939 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -97,7 +97,7 @@ def forward( if self.train_aux_decoder: aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad) aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs) + key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id,gt_mel=gt_mel, **kwargs) else: aux_out = None if self.train_diffusion: From 3a4e77a059d994ed4311cfd543a5f2676d0759f5 Mon Sep 17 00:00:00 2001 From: autumn <2> Date: Fri, 18 Aug 2023 00:02:07 +0800 Subject: [PATCH 24/33] add glow decoder --- modules/shallow/glow.py | 44 ++++++++++++++++++++++++++---- modules/shallow/shallow_adapter.py | 4 +-- modules/toplevel.py | 2 +- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/modules/shallow/glow.py b/modules/shallow/glow.py index c4be2b6ee..65db536ad 100644 --- a/modules/shallow/glow.py +++ b/modules/shallow/glow.py @@ -759,7 +759,7 @@ def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, inter_channels, hidden_channels, - condition_channels, + condition_channels,flow_wavenet_lay=4, condition_encoder_filter_channels=None, @@ -847,7 +847,7 @@ def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, else: condition_channelsw = 0 - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, + self.flow = 
ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, gin_channels=condition_channelsw, share_parameter=flow_share_parameter) def forward(self, c, mel, x_mask=None): @@ -921,12 +921,17 @@ def forward(self, pack_loss,target): return l + + + + + class glow_decoder(nn.Module): def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,flow_infer_seed=None,flow_infer_scale=0.35, + flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, condition_encoder_filter_channels=None, latent_encoder_filter_channels=None, @@ -957,6 +962,7 @@ def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, lat condition_encoder_dropout_rate=condition_encoder_dropout_rate, inter_channels=out_dims, + flow_wavenet_lay=flow_wavenet_lay, hidden_channels=flow_hidden_channels, condition_channels=flow_condition_channels, @@ -974,24 +980,50 @@ def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, lat ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, condition_encoder_type=condition_encoder_type) + self.use_mask=use_mask + self.use_norm=use_norm + + def norm(self,x): + x = (x - (-5)) / (0 - (-5)) * 2 - 1 + return x + + def denorm(self,x): + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + def build_loss(self): + + if self.use_latent: return glow_loss_L() - def forward(self, x, infer, x_gt): + def forward(self, x, infer, x_gt,mask): + if not self.use_mask or infer: + mask=None + else: + mask=mask.transpose(1, 2) + + + if infer: out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + if self.use_norm: + out = self.denorm(out) return out else: + if self.use_norm: + x_gt = self.norm(x_gt) x = x.transpose(1, 2) x_gt=x_gt.transpose(1, 2) - x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt) - pack_loss=(z_p, m_p, logs_p, logdet, x_mask ) + x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) + + + pack_loss = (z_p, m_p, logs_p, logdet, x_mask) return pack_loss diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py index 11cf5d29f..27a52ce6a 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/shallow/shallow_adapter.py @@ -63,14 +63,14 @@ def __init__(self, parame, out_dims, vocab_size): def forward(self, condition, infer=False, txt_tokens=None, mel2ph=None, f0=None, key_shift=None, speed=None, - spk_embed_id=None,gt_mel=None, **kwargs): + spk_embed_id=None,gt_mel=None,mask=None, **kwargs): if self.use_encoder: condition = self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, **kwargs) - return self.decoder(condition, infer,gt_mel) + return self.decoder(condition, infer,gt_mel,mask) def get_loss(self): return self.decoder.build_loss() diff --git a/modules/toplevel.py b/modules/toplevel.py index 41a1fe939..0d4567599 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -97,7 +97,7 @@ def forward( if self.train_aux_decoder: aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad) 
aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id,gt_mel=gt_mel, **kwargs) + key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id,gt_mel=gt_mel,mask=((mel2ph > 0).float()[:, :, None]), **kwargs) else: aux_out = None if self.train_diffusion: From 4f6e50fa6422e7290e6a91cc0b7c161f5f12a8fb Mon Sep 17 00:00:00 2001 From: autumn <2> Date: Fri, 18 Aug 2023 12:16:39 +0800 Subject: [PATCH 25/33] add convnext glow decoder --- modules/shallow/convnext_glow.py | 1116 ++++++++++++++++++++++++++++ modules/shallow/shallow_adapter.py | 2 +- 2 files changed, 1117 insertions(+), 1 deletion(-) create mode 100644 modules/shallow/convnext_glow.py diff --git a/modules/shallow/convnext_glow.py b/modules/shallow/convnext_glow.py new file mode 100644 index 000000000..0420b0100 --- /dev/null +++ b/modules/shallow/convnext_glow.py @@ -0,0 +1,1116 @@ +import math +from typing import Optional + +import torch +import torch.nn as nn + +import torch.nn.functional as F + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +class RelativeFFTBlock(nn.Module): + """ FFT Block with Relative Multi-Head Attention """ + + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., + window_size=None, block_length=None): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, + window_size=window_size, p_dropout=p_dropout, + block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN( + hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask=None): + + if x_mask is not None: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + else: + attn_mask = None + + for i in range(self.n_layers): + if x_mask is not None: + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + if x_mask is not None: + x = x * x_mask + return x + + +class RelativeSelfAttention(nn.Module): + """ Relative Multi-Head Attention """ + + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., + block_length=None, proximal_bias=False, proximal_init=False): + super(RelativeSelfAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + 
self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, + t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, + t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1) + ) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position( + rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + \ + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like( + scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position( + p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s) + output = output + \ + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view( + b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. 
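+        # For a sequence of length L, 2L-1 relative offsets [-(L-1), ..., L-1]
+        # are needed, while the learned table only covers [-window_size, window_size]
+        # (2 * window_size + 1 rows). Pad the table symmetrically when
+        # L - 1 > window_size, otherwise slice out the centre rows that are needed.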
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = F.pad(
+                relative_embeddings,
+                convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+        return used_relative_embeddings
+
+    def _relative_position_to_absolute_position(self, x):
+        batch, heads, length, _ = x.size()
+        # Concat columns of pad to shift from relative to absolute indexing.
+        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+        # Concat extra elements so that it adds up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
+
+        # Reshape and slice out the padded elements.
+        x_final = x_flat.view(
+            [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        batch, heads, length, _ = x.size()
+        # Pad along the column dimension.
+        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
+        # Add zeros at the beginning that will skew the elements after the reshape.
+        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        """
+        Bias for self-attention to encourage attention to close positions.
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-4):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class FFN(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask=None):
+        if x_mask is not None:
+            x = self.conv(x * x_mask)
+        else:
+            x = self.conv(x)
+
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+# FIXME: hacky mutable global alias (reassigned by set_Conv1dModel below); should be removed.
+Conv1dModel = nn.Conv1d
+
+
+class Depthwise_Separable_Conv1D(nn.Module):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            bias=True,
+            padding_mode='zeros',  # TODO: refine this type
+            device=None,
+            dtype=None
+    ):
+        super().__init__()
+        self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
+                                    groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias,
+                                    padding_mode=padding_mode, device=device, dtype=dtype)
+        self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias,
+                                    device=device, dtype=dtype)
+
+    def forward(self, input):
+        return self.point_conv(self.depth_conv(input))
+
+
+def set_Conv1dModel(use_depthwise_conv):
+    global Conv1dModel
+    Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+@torch.jit.script
+def add_and_GRU(input_a, input_b):
+    in_act = input_a + input_b
+    x1, x2 = in_act.chunk(2, dim=1)
+    t_act = torch.tanh(x2)
+    s_act = torch.sigmoid(x1)
+    acts = t_act * s_act
+    return acts
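+# A note on the two jitted helpers above: both implement the WaveNet-style
+# gated activation. The (2*C)-channel pre-activation is split into halves,
+# one passed through tanh and one through sigmoid, and the two are multiplied.
+# A minimal shape sketch (hidden size 192 is an illustrative value only):
+#
+#     a = torch.randn(1, 2 * 192, 100)   # layer output, 2 * hidden channels
+#     b = torch.zeros_like(a)            # condition term (zeros when g is None)
+#     out = add_and_GRU(a, b)            # -> shape (1, 192, 100)
+#
+# Despite the name, this is a gating nonlinearity, not a GRU cell.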
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels  # used for the condition input
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        self.condition_layers = torch.nn.ModuleList()
+
+        for i in range(n_layers):
+            # one condition projection per layer (instead of a single shared cond_layer)
+            if gin_channels != 0:
+                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
+            else:
+                cond_layer = nn.Identity()
+            self.condition_layers.append(cond_layer)
+
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
+                                   dilation=dilation, padding=padding)
+            self.in_layers.append(in_layer)
+
+            # the last layer does not need a residual branch
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask=None, g=None, **kwargs):
+        output = torch.zeros_like(x)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+
+            if g is not None:
+                condition = self.condition_layers[i](g)
+            else:
+                condition = torch.zeros_like(x_in)
+
+            # WaveNet-style gated activation (see add_and_GRU above)
+            acts = add_and_GRU(x_in, condition)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                if x_mask is not None:
+                    x = (x + res_acts) * x_mask
+                else:
+                    x = x + res_acts
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+
+        if x_mask is not None:
+            out = output * x_mask
+        else:
+            out = output
+        return out
+
+
+class ConvNeXtBlock_condition(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal,
+    with an optional condition added to the intermediate (pointwise) activation.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        dilation (int): Dilation of the depthwise conv.
+        padding (int): Padding of the depthwise conv.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+        condione (int): Number of condition channels; 0 disables conditioning.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            intermediate_dim: int, dilation, padding,
+            layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0,
+            condione: int = 0
+    ):
+        super().__init__()
+        if condione != 0:
+            self.cond_layer = torch.nn.Conv1d(condione, intermediate_dim, 1)
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=padding, groups=dim,
+                                dilation=dilation)  # depthwise conv
+
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value is not None and layer_scale_init_value > 0
+            else None
+        )
+        # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.drop_path = nn.Identity()
+        self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity()
+
+    def forward(self, x: torch.Tensor, condition=None) -> torch.Tensor:
+        residual = x
+        x = self.dwconv(x)
+        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
+
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        if condition is not None:
+            condition = self.cond_layer(condition)
+        else:
+            condition = torch.zeros_like(x.transpose(1, 2))
+        x = x + condition.transpose(1, 2)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)
+        x = self.dropout(x)
+
+        x = residual + self.drop_path(x)
+        return x
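+# The conditioning above is additive at the widest point of the block: the
+# condition (B, condione, T) is projected to intermediate_dim by a 1x1 conv
+# and added to the pointwise activation before the GELU. A minimal sketch
+# (channel sizes are illustrative assumptions, not values from the config):
+#
+#     blk = ConvNeXtBlock_condition(dim=192, intermediate_dim=576, dilation=1,
+#                                   padding=3, layer_scale_init_value=1e-6, condione=256)
+#     x = torch.randn(1, 192, 100)       # (B, C, T)
+#     g = torch.randn(1, 256, 100)       # condition, same time axis
+#     y = blk(x, condition=g)            # -> (1, 192, 100), residual preserved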
+class CONVnext_flow(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0, innx=3):
+        super().__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels  # used for the condition input
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        self.condition_layers = torch.nn.ModuleList()
+
+        for i in range(n_layers):
+            kernel_size = 7  # the ConvNeXt blocks use a fixed kernel size of 7
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+
+            in_layer = ConvNeXtBlock_condition(dim=hidden_channels, intermediate_dim=innx * hidden_channels,
+                                               drop_out=p_dropout, dilation=dilation, padding=padding,
+                                               layer_scale_init_value=1e-6, condione=gin_channels)
+            self.in_layers.append(in_layer)
+
+    def forward(self, x, x_mask=None, g=None, **kwargs):
+        for i in range(self.n_layers):
+            x = self.in_layers[i](x, g)
+            if x_mask is not None:
+                x = x * x_mask
+        return x
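+# The coupling layer below follows the Glow/VITS residual coupling scheme.
+# Channels are split into halves (x0, x1); the inner network predicts a mean m
+# (and, unless mean_only, a log-scale logs) for x1 from x0:
+#
+#     forward:  y1 = m + x1 * exp(logs),    log|det J| = sum(logs)
+#     reverse:  x1 = (y1 - m) * exp(-logs)
+#
+# Because x0 passes through unchanged, the transform is exactly invertible,
+# which is what lets infer() run the same stack in reverse.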
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 p_dropout=0,
+                 gin_channels=0,
+                 mean_only=False,
+                 wn_sharing_parameter=None  # optional inner network shared across flow steps
+                 ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = CONVnext_flow(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
+                                 gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        if x_mask is not None:
+            h = self.pre(x0) * x_mask
+        else:
+            h = self.pre(x0)
+        h = self.enc(h, x_mask, g=g)
+
+        if x_mask is not None:
+            stats = self.post(h) * x_mask
+        else:
+            stats = self.post(h)
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            if x_mask is not None:
+                x1 = m + x1 * torch.exp(logs) * x_mask
+            else:
+                x1 = m + x1 * torch.exp(logs)
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            # inverse process
+            if x_mask is not None:
+                x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            else:
+                x1 = (x1 - m) * torch.exp(-logs)
+            x = torch.cat([x0, x1], 1)
+            return x
+
+
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0,
+                 share_parameter=False
+                 ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+
+        self.wn = CONVnext_flow(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
+                                gin_channels=gin_channels) if share_parameter else None
+
+        for i in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
+            self.flows.append(Flip())
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        if not reverse:
+            logdet_tot = 0
+            for flow in self.flows:
+                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
+                logdet_tot += logdet
+        else:
+            logdet_tot = None
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x, logdet_tot
+
+
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            intermediate_dim: int,
+            layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0
+    ):
+        super().__init__()
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
+
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value is not None and layer_scale_init_value > 0
+            else None
+        )
+        # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.drop_path = nn.Identity()
+        self.dropout = nn.Dropout(drop_out) if drop_out > 0.
else nn.Identity() + + def forward(self, x: torch.Tensor, ) -> torch.Tensor: + residual = x + x = self.dwconv(x) + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x = self.dropout(x) + + x = residual + self.drop_path(x) + return x + + +class condition_latent_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_latent_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + + return stats + + +class condition_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels, 
kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + + return stats + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + + condition_in_chans, + + condition_encoder_hidden_channels, + condition_encoder_n_heads, + condition_encoder_n_layers, + condition_encoder_kernel_size, + condition_encoder_dropout_rate, + + inter_channels, + hidden_channels, + + condition_channels,flow_wavenet_lay=4, + + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, + ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', + + **kwargs): + + super().__init__() + self.inter_channels = inter_channels + self.ues_condition = ues_condition + + self.use_latent = use_latent + + if use_latent_encoder and use_latent: + if latent_encoder_type == 'attention': + self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=latent_encoder_n_heads, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=latent_encoder_kernel_size, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + elif latent_encoder_type == 'convnext': + self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=None, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=None, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + else: + raise RuntimeError("unsupport_latent_encoder") + + elif ((not use_latent_encoder) and use_latent): + self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) + + if ues_condition_encoder and ues_condition: + if condition_encoder_type == 'attention': + self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=condition_encoder_n_heads, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + elif condition_encoder_type == 'convnext': + self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=None, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + else: + raise RuntimeError("unsupport__encoder") + elif ((not ues_condition_encoder) and ues_condition): + self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) + + self.use_depthwise_conv = use_depthwise_conv + + # self.enc_p = TextEncoder( + # inter_channels, + # hidden_channels, + # filter_channels=filter_channels, + # 
n_heads=n_heads, + # n_layers=n_layers, + # kernel_size=kernel_size, + # p_dropout=p_dropout + # ) + + set_Conv1dModel(self.use_depthwise_conv) + + if ues_condition: + condition_channelsw = condition_channels + else: + condition_channelsw = 0 + + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, + gin_channels=condition_channelsw, share_parameter=flow_share_parameter) + + def forward(self, c, mel, x_mask=None): + + # vol proj + + # f0 predict + + # encoder + if self.use_latent: + z_ptemp, m_p, logs_p = self.latent_encoder(c) + else: + m_p, logs_p = None, None + # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) + + # flow + if self.ues_condition: + condition = self.condition_encoder(c) + z_p, logdet = self.flow(mel, x_mask, g=condition) + else: + z_p, logdet = self.flow(mel, x_mask, g=None) + + return x_mask, (z_p, m_p, logs_p), logdet, + + @torch.no_grad() + def infer(self, c, noice_scale=0.35, seed=None, ): + if seed is not None: + + if c.device == torch.device("cuda"): + torch.cuda.manual_seed_all(seed) + else: + torch.manual_seed(seed) + + if self.use_latent: + z_p, m_p, logs_p = self.latent_encoder(c) + else: + z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale + + # vol proj + + # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) + # o, _ = self.flow(z_p, g=g, reverse=True) + + if self.ues_condition: + condition = self.condition_encoder(c) + # z_p, logdet = self.flow(mel, x_mask, g=condition) + o, _ = self.flow(z_p, g=condition, reverse=True) + else: + o, _ = self.flow(z_p, g=None, reverse=True) + + return o + + +class glow_loss_L(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, pack_loss,target): + + z, m, logs, logdet, mask = pack_loss + # z, m, logs, logdet, mask = None + + l = torch.sum(logs) + 0.5 * torch.sum( + torch.exp(-2 * logs) * ((z - m) ** 2)) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + if mask is not None: + l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes + else: + l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l + + + + + + + +class glow_decoder_convnext(nn.Module): + def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, + condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, + flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, + use_latent=True, + ues_condition_encoder=False, ues_condition=False, + condition_encoder_type='attention'): + super().__init__() + self.use_latent=use_latent + self.flow_infer_seed=flow_infer_seed + self.flow_infer_scale=flow_infer_scale + self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, + latent_encoder_n_heads=latent_encoder_n_heads, + 
latent_encoder_n_layers=latent_encoder_n_layers, + latent_encoder_kernel_size=latent_encoder_kernel_size, + latent_encoder_dropout_rate=latent_encoder_dropout_rate, + + condition_in_chans=encoder_hidden, + + condition_encoder_hidden_channels=condition_encoder_hidden_channels, + condition_encoder_n_heads=condition_encoder_n_heads, + condition_encoder_n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + condition_encoder_dropout_rate=condition_encoder_dropout_rate, + + inter_channels=out_dims, + flow_wavenet_lay=flow_wavenet_lay, + hidden_channels=flow_hidden_channels, + + condition_channels=flow_condition_channels, + + condition_encoder_filter_channels=condition_encoder_filter_channels, + + latent_encoder_filter_channels=latent_encoder_filter_channels, + + use_depthwise_conv=use_depthwise_conv, + + flow_share_parameter=flow_share_parameter, + n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, + use_latent_encoder=use_latent_encoder, + use_latent=use_latent, + ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, + condition_encoder_type=condition_encoder_type) + + self.use_mask=use_mask + self.use_norm=use_norm + + def norm(self,x): + x = (x - (-5)) / (0 - (-5)) * 2 - 1 + return x + + def denorm(self,x): + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + + def build_loss(self): + + + if self.use_latent: + + return glow_loss_L() + + def forward(self, x, infer, x_gt,mask): + if not self.use_mask or infer: + mask=None + else: + mask=mask.transpose(1, 2) + + + + + if infer: + out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + if self.use_norm: + out = self.denorm(out) + return out + else: + if self.use_norm: + x_gt = self.norm(x_gt) + + + x = x.transpose(1, 2) + x_gt=x_gt.transpose(1, 2) + + x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) + + + pack_loss = (z_p, m_p, logs_p, logdet, x_mask) + return pack_loss + + + + + pass diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py index 27a52ce6a..403c32f38 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/shallow/shallow_adapter.py @@ -3,7 +3,7 @@ cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode', 'ns': 'modules.shallow.noise_decoder.noise_decoder', 'ld': 'modules.shallow.light_decoder.noise_decoder','att_fs2':'modules.shallow.fs2_decoder.attention_fs2_decoder' - ,'glow':'modules.shallow.glow.glow_decoder' + ,'glow':'modules.shallow.glow.glow_decoder','glow_convnext':'modules.shallow.convnext_glow.glow_decoder_convnext' } encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} From bf1d62c7f1f0b077ec942ebba2d8a9f1f51c08ba Mon Sep 17 00:00:00 2001 From: autumn <2> Date: Sun, 20 Aug 2023 13:12:18 +0800 Subject: [PATCH 26/33] fix fs2 --- modules/shallow/convnext_glow.py | 32 +- modules/shallow/fast_speech2_decoder.py | 2 +- modules/shallow/fs2_decoder.py | 2 +- modules/shallow/gglow.py | 1033 +++++++++++++++++++++++ modules/shallow/shallow_adapter.py | 2 +- 5 files changed, 1061 insertions(+), 10 deletions(-) create mode 100644 modules/shallow/gglow.py diff --git a/modules/shallow/convnext_glow.py b/modules/shallow/convnext_glow.py index 0420b0100..f9e8dbd6b 100644 --- a/modules/shallow/convnext_glow.py +++ b/modules/shallow/convnext_glow.py @@ -956,7 +956,7 @@ def forward(self, c, mel, x_mask=None): return x_mask, (z_p, m_p, logs_p), logdet, - @torch.no_grad() + def infer(self, 
c, noice_scale=0.35, seed=None, ): if seed is not None: @@ -984,6 +984,14 @@ def infer(self, c, noice_scale=0.35, seed=None, ): return o +class fs2_loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self,y, x): + x=(x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y,x) + class glow_loss_L(nn.Module): def __init__(self): @@ -1015,7 +1023,7 @@ def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, lat latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, + flow_condition_channels, parame,ft_flow=False,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, condition_encoder_filter_channels=None, latent_encoder_filter_channels=None, @@ -1031,6 +1039,7 @@ def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, lat self.use_latent=use_latent self.flow_infer_seed=flow_infer_seed self.flow_infer_scale=flow_infer_scale + self.ft_flow=ft_flow self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, latent_encoder_n_heads=latent_encoder_n_heads, latent_encoder_n_layers=latent_encoder_n_layers, @@ -1076,26 +1085,35 @@ def denorm(self,x): return x def build_loss(self): - + if self.ft_flow: + return fs2_loss() if self.use_latent: return glow_loss_L() + + def forward(self, x, infer, x_gt,mask): if not self.use_mask or infer: mask=None else: mask=mask.transpose(1, 2) + if self.ft_flow and not infer: + out = self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, + seed=self.flow_infer_seed).transpose(1, 2) + return out + if infer: - out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) - if self.use_norm: - out = self.denorm(out) - return out + with torch.no_grad(): + out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + if self.use_norm: + out = self.denorm(out) + return out else: if self.use_norm: x_gt = self.norm(x_gt) diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py index ec264f3ce..61dc04860 100644 --- a/modules/shallow/fast_speech2_decoder.py +++ b/modules/shallow/fast_speech2_decoder.py @@ -84,7 +84,7 @@ def build_loss(self): return fs2_loss() - def forward(self, x,infer,**kwargs): + def forward(self, x,infer,*args,**kwargs): x=x.transpose(1, 2) x=self.inconv(x) for i in self.conv: diff --git a/modules/shallow/fs2_decoder.py b/modules/shallow/fs2_decoder.py index 073819dd1..acb3408be 100644 --- a/modules/shallow/fs2_decoder.py +++ b/modules/shallow/fs2_decoder.py @@ -287,7 +287,7 @@ def build_loss(self): return fs2_loss() - def forward(self, x,infer,**kwargs): + def forward(self, x,infer,*args,**kwargs): x=x.transpose(1, 2) x=self.inconv(x) diff --git a/modules/shallow/gglow.py b/modules/shallow/gglow.py new file mode 100644 index 000000000..817005579 --- /dev/null +++ b/modules/shallow/gglow.py @@ -0,0 +1,1033 @@ +import math +from typing import Optional + +import torch +import torch.nn as nn + +import torch.nn.functional as F + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l 
for item in sublist] + return pad_shape + + +class RelativeFFTBlock(nn.Module): + """ FFT Block with Relative Multi-Head Attention """ + + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., + window_size=None, block_length=None): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, + window_size=window_size, p_dropout=p_dropout, + block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN( + hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask=None): + + if x_mask is not None: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + else: + attn_mask = None + + for i in range(self.n_layers): + if x_mask is not None: + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + if x_mask is not None: + x = x * x_mask + return x + + +class RelativeSelfAttention(nn.Module): + """ Relative Multi-Head Attention """ + + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., + block_length=None, proximal_bias=False, proximal_init=False): + super(RelativeSelfAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, + 
t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, + t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1) + ) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position( + rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + \ + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like( + scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position( + p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s) + output = output + \ + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view( + b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:, + slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [0, length - 1]])) + + # Reshape and slice out the padded elements. 
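+        # (A worked shape example: for length 3, the (3, 5) relative-logit
+        # matrix is padded to (3, 6), flattened to 18 elements, padded by
+        # length - 1 = 2, viewed as (4, 5); the block at rows :3, columns 2:
+        # then holds the absolute-position scores.)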
+        x_final = x_flat.view(
+            [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        batch, heads, length, _ = x.size()
+        # Pad along the column dimension.
+        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
+        # Add zeros at the beginning that will skew the elements after the reshape.
+        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        """
+        Bias for self-attention to encourage attention to close positions.
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-4):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class FFN(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask=None):
+        if x_mask is not None:
+            x = self.conv(x * x_mask)
+        else:
+            x = self.conv(x)
+
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+# FIXME: hacky mutable global alias (reassigned by set_Conv1dModel below); should be removed.
+Conv1dModel = nn.Conv1d
+
+
+class Depthwise_Separable_Conv1D(nn.Module):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            bias=True,
+            padding_mode='zeros',  # TODO: refine this type
+            device=None,
+            dtype=None
+    ):
+        super().__init__()
+        self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
+                                    groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias,
+                                    padding_mode=padding_mode, device=device, dtype=dtype)
+        self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias,
+                                    device=device, dtype=dtype)
+
+    def forward(self, input):
+        return self.point_conv(self.depth_conv(input))
+
+
+def set_Conv1dModel(use_depthwise_conv):
+    global Conv1dModel
+    Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+@torch.jit.script
+def add_and_GRU(input_a, input_b):
+    # WaveNet-style gated activation (tanh/sigmoid gating), not an actual GRU
+    in_act = input_a + input_b
+    x1, x2 = in_act.chunk(2, dim=1)
+    t_act = torch.tanh(x2)
+    s_act = torch.sigmoid(x1)
+    acts = t_act * s_act
+    return acts
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels  # used for the condition input
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        self.condition_layers = torch.nn.ModuleList()
+
+        for i in range(n_layers):
+            # one condition projection per layer (instead of a single shared cond_layer)
+            if gin_channels != 0:
+                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
+            else:
+                cond_layer = nn.Identity()
+            self.condition_layers.append(cond_layer)
+
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
+                                   dilation=dilation, padding=padding)
+            self.in_layers.append(in_layer)
+
+            # the last layer does not need a residual branch
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask=None, g=None, **kwargs):
+        output = torch.zeros_like(x)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+
+            if g is not None:
+                condition = self.condition_layers[i](g)
+            else:
+                condition = torch.zeros_like(x_in)
+
+            acts = add_and_GRU(x_in, condition)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                if x_mask is not None:
+                    x = (x + res_acts) * x_mask
+                else:
+                    x = x + res_acts
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+
+        if x_mask is not None:
+            out = output * x_mask
+        else:
+            out = output
+        return out
+
+    # def remove_weight_norm(self):
+    #     if self.gin_channels != 0:
+    #         remove_weight_norm_modules(self.cond_layer)
+    #     for l in self.in_layers:
+    #         remove_weight_norm_modules(l)
+    #     for l in self.res_skip_layers:
+    #         remove_weight_norm_modules(l)
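+# Design note: the upstream WN this was adapted from projects the condition
+# once with a single Conv1d producing 2 * hidden_channels * n_layers channels
+# and slices it per layer; this variant instead keeps one 1x1 projection per
+# layer in self.condition_layers. The capacity is identical, just split into
+# n_layers narrow convolutions instead of one wide one.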
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 p_dropout=0,
+                 gin_channels=0,
+                 mean_only=False,
+                 wn_sharing_parameter=None  # optional inner network shared across flow steps
+                 ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
+                      gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        if x_mask is not None:
+            h = self.pre(x0) * x_mask
+        else:
+            h = self.pre(x0)
+        h = self.enc(h, x_mask, g=g)
+
+        if x_mask is not None:
+            stats = self.post(h) * x_mask
+        else:
+            stats = self.post(h)
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            if x_mask is not None:
+                x1 = m + x1 * torch.exp(logs) * x_mask
+            else:
+                x1 = m + x1 * torch.exp(logs)
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            # inverse process
+            if x_mask is not None:
+                x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            else:
+                x1 = (x1 - m) * torch.exp(-logs)
+            x = torch.cat([x0, x1], 1)
+            return x
+
+
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0,
+                 share_parameter=False
+                 ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+
+        self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
+                     gin_channels=gin_channels) if share_parameter else None
+
+        for i in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
+            self.flows.append(Flip())
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        if not reverse:
+            logdet_tot = 0
+            for flow in self.flows:
+                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
+                logdet_tot += logdet
+        else:
+            logdet_tot = None
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x, logdet_tot
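+# How the flow is used (a minimal sketch; channel sizes are illustrative
+# assumptions): during training the mel is pushed through the flow in the
+# forward direction, giving a latent z and a log-determinant that feed the
+# Gaussian likelihood in glow_loss_L; at inference a latent is drawn (or
+# produced by the latent encoder) and run through the same flow in reverse.
+#
+#     flow = ResidualCouplingBlock(channels=128, hidden_channels=192,
+#                                  kernel_size=5, dilation_rate=1, n_layers=4)
+#     mel = torch.randn(1, 128, 100)         # (B, out_dims, T)
+#     z, logdet = flow(mel)                  # forward: latent + log|det J|
+#     mel_rec, _ = flow(z, reverse=True)     # reverse: exact inverse of forward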
self.f0_emb(f0).transpose(1, 2) +# x = self.enc_(x * x_mask, x_mask) +# stats = self.proj(x) * x_mask +# m, logs = torch.split(stats, self.out_channels, dim=1) +# z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask +# +# return z, m, logs, x_mask + + +class ConvNeXtBlock(nn.Module): + """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. + + Args: + dim (int): Number of input channels. + intermediate_dim (int): Dimensionality of the intermediate layer. + layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. + Defaults to None. + adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. + None means non-conditional LayerNorm. Defaults to None. + """ + + def __init__( + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 + + ): + super().__init__() + self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + + self.norm = nn.LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(intermediate_dim, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + + def forward(self, x: torch.Tensor, ) -> torch.Tensor: + residual = x + x = self.dwconv(x) + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x = self.dropout(x) + + x = residual + self.drop_path(x) + return x + + +class condition_latent_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_latent_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 
1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + + return stats + + +class condition_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + + return stats + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + + condition_in_chans, + + condition_encoder_hidden_channels, + condition_encoder_n_heads, + condition_encoder_n_layers, + condition_encoder_kernel_size, + condition_encoder_dropout_rate, + + inter_channels, + hidden_channels, + + condition_channels,flow_wavenet_lay=4, + + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, + ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', + + **kwargs): + + super().__init__() + self.inter_channels = inter_channels + self.ues_condition = ues_condition + + self.use_latent = use_latent + + if use_latent_encoder and use_latent: + if latent_encoder_type == 'attention': + self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=latent_encoder_n_heads, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=latent_encoder_kernel_size, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + elif latent_encoder_type == 'convnext': + self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=None, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=None, + 
dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + else: + raise RuntimeError("unsupport_latent_encoder") + + elif ((not use_latent_encoder) and use_latent): + self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) + + if ues_condition_encoder and ues_condition: + if condition_encoder_type == 'attention': + self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=condition_encoder_n_heads, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + elif condition_encoder_type == 'convnext': + self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=None, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + else: + raise RuntimeError("unsupport__encoder") + elif ((not ues_condition_encoder) and ues_condition): + self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) + + self.use_depthwise_conv = use_depthwise_conv + + # self.enc_p = TextEncoder( + # inter_channels, + # hidden_channels, + # filter_channels=filter_channels, + # n_heads=n_heads, + # n_layers=n_layers, + # kernel_size=kernel_size, + # p_dropout=p_dropout + # ) + + set_Conv1dModel(self.use_depthwise_conv) + + if ues_condition: + condition_channelsw = condition_channels + else: + condition_channelsw = 0 + + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, + gin_channels=condition_channelsw, share_parameter=flow_share_parameter) + + def forward(self, c, mel, x_mask=None): + + # vol proj + + # f0 predict + + # encoder + if self.use_latent: + z_ptemp, m_p, logs_p = self.latent_encoder(c) + else: + m_p, logs_p = None, None + # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) + + # flow + if self.ues_condition: + condition = self.condition_encoder(c) + z_p, logdet = self.flow(mel, x_mask, g=condition) + else: + z_p, logdet = self.flow(mel, x_mask, g=None) + + return x_mask, (z_p, m_p, logs_p), logdet, + + @torch.no_grad() + def infer(self, c, noice_scale=0.35, seed=None, ): + if seed is not None: + + if c.device == torch.device("cuda"): + torch.cuda.manual_seed_all(seed) + else: + torch.manual_seed(seed) + + if self.use_latent: + z_p, m_p, logs_p = self.latent_encoder(c) + else: + z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale + + z_p=z_p.cuda() + + # vol proj + + # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) + # o, _ = self.flow(z_p, g=g, reverse=True) + + if self.ues_condition: + condition = self.condition_encoder(c) + # z_p, logdet = self.flow(mel, x_mask, g=condition) + o, _ = self.flow(z_p, g=condition, reverse=True) + else: + o, _ = self.flow(z_p, g=None, reverse=True) + + return o + + +class glow_loss_L(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, pack_loss,target): + + z, m, logs, logdet, mask = pack_loss + # z, m, logs, logdet, mask = None + + l = 0.5 * torch.sum( + torch.exp(-2 * 
logdet) * ((z ) ** 2)) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + if mask is not None: + l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes + else: + l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l + + + + + +class glow_decoder(nn.Module): + def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, + condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, + flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, + use_latent=True, + ues_condition_encoder=False, ues_condition=False, + condition_encoder_type='attention'): + super().__init__() + self.use_latent=use_latent + self.flow_infer_seed=flow_infer_seed + self.flow_infer_scale=flow_infer_scale + self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, + latent_encoder_n_heads=latent_encoder_n_heads, + latent_encoder_n_layers=latent_encoder_n_layers, + latent_encoder_kernel_size=latent_encoder_kernel_size, + latent_encoder_dropout_rate=latent_encoder_dropout_rate, + + condition_in_chans=encoder_hidden, + + condition_encoder_hidden_channels=condition_encoder_hidden_channels, + condition_encoder_n_heads=condition_encoder_n_heads, + condition_encoder_n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + condition_encoder_dropout_rate=condition_encoder_dropout_rate, + + inter_channels=out_dims, + flow_wavenet_lay=flow_wavenet_lay, + hidden_channels=flow_hidden_channels, + + condition_channels=flow_condition_channels, + + condition_encoder_filter_channels=condition_encoder_filter_channels, + + latent_encoder_filter_channels=latent_encoder_filter_channels, + + use_depthwise_conv=use_depthwise_conv, + + flow_share_parameter=flow_share_parameter, + n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, + use_latent_encoder=use_latent_encoder, + use_latent=use_latent, + ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, + condition_encoder_type=condition_encoder_type) + + self.use_mask=use_mask + self.use_norm=use_norm + + def norm(self,x): + x = (x - (-5)) / (0 - (-5)) * 2 - 1 + return x + + def denorm(self,x): + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + + def build_loss(self): + + + if self.use_latent: + + return glow_loss_L() + + return glow_loss_L() + def forward(self, x, infer, x_gt,mask): + if not self.use_mask or infer: + mask=None + else: + mask=mask.transpose(1, 2) + + + + + if infer: + out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + if self.use_norm: + out = self.denorm(out) + return out + else: + if self.use_norm: + x_gt = self.norm(x_gt) + + + x = x.transpose(1, 2) + x_gt=x_gt.transpose(1, 2) + + x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) + + + pack_loss 
= (z_p, m_p, logs_p, logdet, x_mask) + return pack_loss + + + + + pass diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py index 403c32f38..1a35c8507 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/shallow/shallow_adapter.py @@ -3,7 +3,7 @@ cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode', 'ns': 'modules.shallow.noise_decoder.noise_decoder', 'ld': 'modules.shallow.light_decoder.noise_decoder','att_fs2':'modules.shallow.fs2_decoder.attention_fs2_decoder' - ,'glow':'modules.shallow.glow.glow_decoder','glow_convnext':'modules.shallow.convnext_glow.glow_decoder_convnext' + ,'glow':'modules.shallow.glow.glow_decoder','glow_convnext':'modules.shallow.convnext_glow.glow_decoder_convnext','gglow':'modules.shallow.gglow.glow_decoder','fast_speech2_decoders':'modules.shallow.fast_speech2_decoders.fs2_decode' } encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} From 6bbdf6c549a33af62a1d7278001cc1661ebbd97e Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Mon, 21 Aug 2023 18:16:26 +0800 Subject: [PATCH 27/33] Support using gt mel as source during validation --- configs/acoustic.yaml | 1 + modules/toplevel.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 82f2163a8..84ef7d721 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -82,6 +82,7 @@ diff_depth: 400 shallow_diffusion_args: train_aux_decoder: true train_diffusion: true + val_gt_start: false aux_share_encoder: true aux_encoder_strict_hparams: false aux_encoder_arch: fs2 diff --git a/modules/toplevel.py b/modules/toplevel.py index 211727d0e..2145489f2 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -49,11 +49,11 @@ def __init__(self, vocab_size, out_dims): ) self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False) + self.shallow_args = hparams['shallow_diffusion_args'] if self.use_shallow_diffusion: - shallow_args = hparams['shallow_diffusion_args'] - self.train_aux_decoder = shallow_args['train_aux_decoder'] - self.train_diffusion = shallow_args['train_diffusion'] - self.aux_decoder_grad = shallow_args['aux_decoder_grad'] + self.train_aux_decoder = self.shallow_args['train_aux_decoder'] + self.train_diffusion = self.shallow_args['train_diffusion'] + self.aux_decoder_grad = self.shallow_args['aux_decoder_grad'] self.aux_decoder = shallow_adapt(hparams, out_dims,vocab_size) self.diffusion = GaussianDiffusion( @@ -79,21 +79,22 @@ def forward( txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, **kwargs ) - if infer: if self.use_shallow_diffusion: aux_mel_pred = self.aux_decoder(condition, infer=True,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs) - aux_mel_pred *= ((mel2ph > 0).float()[:, :, None]) + if gt_mel is not None and self.shallow_args['val_gt_start']: + src_mel = gt_mel + else: + src_mel = aux_mel_pred else: - aux_mel_pred = None - mel_pred = self.diffusion(condition, src_spec=aux_mel_pred, infer=True) + aux_mel_pred = src_mel = None + mel_pred = self.diffusion(condition, src_spec=src_mel, infer=True) mel_pred *= ((mel2ph > 0).float()[:, :, None]) return ShallowDiffusionOutput(aux_out=aux_mel_pred, diff_out=mel_pred) else: if self.use_shallow_diffusion: - # TODO: replace the following placeholder with real calling code if self.train_aux_decoder: aux_cond = condition * self.aux_decoder_grad + condition.detach() * 
(1 - self.aux_decoder_grad) aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, From f1cc641bf3b0b5a481822269427839696565d782 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 21 Sep 2023 14:59:54 +0800 Subject: [PATCH 28/33] Clean files and configs --- configs/acoustic.yaml | 15 +- modules/{shallow => aux_decoder}/__init__.py | 0 .../fast_speech2_decoder.py | 48 +- .../{shallow => aux_decoder}/fs2_decoder.py | 0 .../shallow_adapter.py | 6 +- modules/shallow/convnext_glow.py | 1134 ----------------- modules/shallow/gglow.py | 1033 --------------- modules/shallow/glow.py | 1032 --------------- modules/shallow/light_decoder.py | 109 -- modules/shallow/noise_decoder.py | 100 -- modules/toplevel.py | 2 +- 11 files changed, 35 insertions(+), 3444 deletions(-) rename modules/{shallow => aux_decoder}/__init__.py (100%) rename modules/{shallow => aux_decoder}/fast_speech2_decoder.py (62%) rename modules/{shallow => aux_decoder}/fs2_decoder.py (100%) rename modules/{shallow => aux_decoder}/shallow_adapter.py (85%) delete mode 100644 modules/shallow/convnext_glow.py delete mode 100644 modules/shallow/gglow.py delete mode 100644 modules/shallow/glow.py delete mode 100644 modules/shallow/light_decoder.py delete mode 100644 modules/shallow/noise_decoder.py diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 84ef7d721..e892018d8 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -79,18 +79,23 @@ schedule_type: 'linear' # shallow diffusion use_shallow_diffusion: false diff_depth: 400 + shallow_diffusion_args: train_aux_decoder: true train_diffusion: true + shared_encoder: true val_gt_start: false aux_share_encoder: true aux_encoder_strict_hparams: false - aux_encoder_arch: fs2 - aux_encoder_args: {} - aux_decoder_grad: 0.1 - aux_decoder_arch: fs2 + aux_decoder_arch: convnext + aux_decoder_args: + num_channels: 512 + num_layers: 6 + kernel_size: 7 + dropout_rate: 0.1 aux_decoder_strict_hparams: true - aux_decoder_args: {} + aux_decoder_grad: 0.1 + lambda_aux_mel_loss: 0.2 # train and eval diff --git a/modules/shallow/__init__.py b/modules/aux_decoder/__init__.py similarity index 100% rename from modules/shallow/__init__.py rename to modules/aux_decoder/__init__.py diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/aux_decoder/fast_speech2_decoder.py similarity index 62% rename from modules/shallow/fast_speech2_decoder.py rename to modules/aux_decoder/fast_speech2_decoder.py index 61dc04860..1ccb76fcc 100644 --- a/modules/shallow/fast_speech2_decoder.py +++ b/modules/aux_decoder/fast_speech2_decoder.py @@ -4,8 +4,6 @@ import torch.nn as nn - - class ConvNeXtBlock(nn.Module): """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. @@ -19,17 +17,15 @@ class ConvNeXtBlock(nn.Module): """ def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 ): super().__init__() self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers self.act = nn.GELU() @@ -41,14 +37,13 @@ def __init__( ) # self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() self.drop_path = nn.Identity() - self.dropout=nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() def forward(self, x: torch.Tensor, ) -> torch.Tensor: residual = x x = self.dwconv(x) x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - x = self.norm(x) x = self.pwconv1(x) x = self.act(x) @@ -56,9 +51,9 @@ def forward(self, x: torch.Tensor, ) -> torch.Tensor: if self.gamma is not None: x = self.gamma * x x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x=self.dropout(x) + x = self.dropout(x) - x = residual + self.drop_path (x) + x = residual + self.drop_path(x) return x @@ -66,31 +61,30 @@ class fs2_loss(nn.Module): def __init__(self): super().__init__() - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) + def forward(self, y, x): + x = (x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y, x) class fs2_decode(nn.Module): - def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): + def __init__(self, encoder_hidden, out_dims, n_chans, kernel_size, dropout_rate, n_layers, parame): super().__init__() - self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) - self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - + self.inconv = nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans, intermediate_dim=n_chans * 4, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + self.outconv = nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) def build_loss(self): return fs2_loss() - def forward(self, x,infer,*args,**kwargs): - x=x.transpose(1, 2) - x=self.inconv(x) + def forward(self, x, infer, *args, **kwargs): + x = x.transpose(1, 2) + x = self.inconv(x) for i in self.conv: - x=i(x) - x=self.outconv(x).transpose(1, 2) + x = i(x) + x = self.outconv(x).transpose(1, 2) if infer: - x=(x + 1) / 2 * (0 - (-5)) + (-5) + x = (x + 1) / 2 * (0 - (-5)) + (-5) return x pass diff --git a/modules/shallow/fs2_decoder.py b/modules/aux_decoder/fs2_decoder.py similarity index 100% rename from modules/shallow/fs2_decoder.py rename to modules/aux_decoder/fs2_decoder.py diff --git a/modules/shallow/shallow_adapter.py b/modules/aux_decoder/shallow_adapter.py similarity index 85% rename from modules/shallow/shallow_adapter.py rename to modules/aux_decoder/shallow_adapter.py index 1a35c8507..53d47c56a 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/aux_decoder/shallow_adapter.py @@ -1,9 +1,9 @@ import torch import torch.nn as nn -cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode', - 'ns': 'modules.shallow.noise_decoder.noise_decoder', 'ld': 'modules.shallow.light_decoder.noise_decoder','att_fs2':'modules.shallow.fs2_decoder.attention_fs2_decoder' - ,'glow':'modules.shallow.glow.glow_decoder','glow_convnext':'modules.shallow.convnext_glow.glow_decoder_convnext','gglow':'modules.shallow.gglow.glow_decoder','fast_speech2_decoders':'modules.shallow.fast_speech2_decoders.fs2_decode' +cls_map = {'fs2': 'modules.aux_decoder.fast_speech2_decoder.fs2_decode', + 'ns': 'modules.aux_decoder.noise_decoder.noise_decoder', 'ld': 
'modules.aux_decoder.light_decoder.noise_decoder','att_fs2':'modules.aux_decoder.fs2_decoder.attention_fs2_decoder' + ,'glow':'modules.aux_decoder.glow.glow_decoder','glow_convnext':'modules.aux_decoder.convnext_glow.glow_decoder_convnext','gglow':'modules.aux_decoder.gglow.glow_decoder','fast_speech2_decoders':'modules.aux_decoder.fast_speech2_decoders.fs2_decode' } encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} diff --git a/modules/shallow/convnext_glow.py b/modules/shallow/convnext_glow.py deleted file mode 100644 index f9e8dbd6b..000000000 --- a/modules/shallow/convnext_glow.py +++ /dev/null @@ -1,1134 +0,0 @@ -import math -from typing import Optional - -import torch -import torch.nn as nn - -import torch.nn.functional as F - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -class RelativeFFTBlock(nn.Module): - """ FFT Block with Relative Multi-Head Attention """ - - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., - window_size=None, block_length=None): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.block_length = block_length - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, - window_size=window_size, p_dropout=p_dropout, - block_length=block_length)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN( - hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask=None): - - if x_mask is not None: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - else: - attn_mask = None - - for i in range(self.n_layers): - if x_mask is not None: - x = x * x_mask - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - if x_mask is not None: - x = x * x_mask - return x - - -class RelativeSelfAttention(nn.Module): - """ Relative Multi-Head Attention """ - - def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., - block_length=None, proximal_bias=False, proximal_init=False): - super(RelativeSelfAttention, self).__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.p_dropout = p_dropout - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels ** -0.5 - self.emb_rel_k = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - 
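-            # One learned embedding per relative offset in [-window_size, window_size]
-            # (2 * window_size + 1 rows, shared across heads when heads_share is True):
-            # emb_rel_k above biases the attention logits, emb_rel_v below re-weights
-            # the attention-averaged values.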
self.emb_rel_v = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - if proximal_init: - self.conv_k.weight.data.copy_(self.conv_q.weight.data) - self.conv_k.bias.data.copy_(self.conv_q.bias.data) - nn.init.xavier_uniform_(self.conv_v.weight) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, - t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, - t_s).transpose(2, 3) - - scores = torch.matmul(query, key.transpose(-2, -1) - ) / math.sqrt(self.k_channels) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query, key_relative_embeddings) - rel_logits = self._relative_position_to_absolute_position( - rel_logits) - scores_local = rel_logits / math.sqrt(self.k_channels) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + \ - self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - block_mask = torch.ones_like( - scores).triu(-self.block_length).tril(self.block_length) - scores = scores * block_mask + -1e4 * (1 - block_mask) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position( - p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s) - output = output + \ - self._matmul_with_relative_values( - relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view( - b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. 
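-        # For sequence length L and window W the table has 2W + 1 rows; it is padded
-        # up to at least 2L - 1 rows and then sliced to exactly the 2L - 1 relative
-        # offsets in use, keeping the op free of data-dependent control flow.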
- pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, - slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [0, length - 1]])) - - # Reshape and slice out the padded elements. - x_final = x_flat.view( - [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:] - return x_final - - def _absolute_position_to_relative_position(self, x): - batch, heads, length, _ = x.size() - # padd along column - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, length - 1]])) - x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """ - Bias for self-attention to encourage attention to close positions. - """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-4): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - n_dims = len(x.shape) - mean = torch.mean(x, 1, keepdim=True) - variance = torch.mean((x - mean) ** 2, 1, keepdim=True) - - x = (x - mean) * torch.rsqrt(variance + self.eps) - - shape = [1, -1] + [1] * (n_dims - 2) - x = x * self.gamma.view(*shape) + self.beta.view(*shape) - return x - - -class FFN(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - - self.conv = nn.Conv1d( - in_channels, out_channels, kernel_size, padding=kernel_size // 2) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask=None): - if x_mask is not None: - x = self.conv(x * x_mask) - else: - x = self.conv(x) - - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - if x_mask is not None: - x = x * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -Conv1dModel = nn.Conv1d # 有毒 删 - - -class Depthwise_Separable_Conv1D(nn.Module): - def 
__init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - bias=True, - padding_mode='zeros', # TODO: refine this type - device=None, - dtype=None - ): - super().__init__() - self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, - groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias, - padding_mode=padding_mode, device=device, dtype=dtype) - self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, - device=device, dtype=dtype) - - def forward(self, input): - return self.point_conv(self.depth_conv(input)) - - -def set_Conv1dModel(use_depthwise_conv): - global Conv1dModel - Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -@torch.jit.script -def add_and_GRU(input_a, input_b): - in_act = input_a + input_b - x1, x2 = in_act.chunk(2, dim=1) - t_act = torch.tanh(x2) - s_act = torch.sigmoid(x1) - acts = t_act * s_act - return acts - - -class WN(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() - assert (kernel_size % 2 == 1) - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels # condition用的 - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - self.condition_layers = torch.nn.ModuleList() - - # if gin_channels != 0: - # cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) - # # self.cond_layer = weight_norm_modules(cond_layer, name='weight') - # self.cond_layer=cond_layer - - for i in range(n_layers): - - if gin_channels != 0: - cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1) - # self.cond_layer = weight_norm_modules(cond_layer, name='weight') - # self.cond_layer = cond_layer - else: - cond_layer = nn.Identity() - self.condition_layers.append(cond_layer) - - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size, - dilation=dilation, padding=padding) - # in_layer = weight_norm_modules(in_layer, name='weight') - self.in_layers.append(in_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - # res_skip_layer = weight_norm_modules(res_skip_layer, name='weight') - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask=None, g=None, **kwargs): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - # if g is not None: - # g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - - if g is not None: - - condition = self.condition_layers[i](g) - else: - condition = torch.zeros_like(x_in) - - # acts = fused_add_tanh_sigmoid_multiply( # GRU 这不就是wavnet的那个 GRU - # x_in, - # 
condition, - # n_channels_tensor) - acts = add_and_GRU( # GRU 这不就是wavnet的那个 GRU - x_in, - condition, - ) - acts = self.drop(acts) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:, :self.hidden_channels, :] - if x_mask is not None: - x = (x + res_acts) * x_mask - else: - x = x + res_acts - output = output + res_skip_acts[:, self.hidden_channels:, :] - else: - output = output + res_skip_acts - - if x_mask is not None: - out = output * x_mask - else: - out = output - return out -pass -class ConvNeXtBlock_condition(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, dilation, padding, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0,condione: int=0 - - ): - super().__init__() - if condione!=0: - self.cond_layer = torch.nn.Conv1d(condione, intermediate_dim, 1) - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=padding, groups=dim,dilation=dilation) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout = nn.Dropout(drop_out) if drop_out > 0. 
else nn.Identity() - - def forward(self, x: torch.Tensor,condition=None ) -> torch.Tensor: - - - - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - x = self.norm(x) - x = self.pwconv1(x) - if condition is not None: - - condition = self.cond_layer(condition) - else: - condition = torch.zeros_like(x.transpose(1, 2)) - - x=x+condition.transpose(1, 2) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x = self.dropout(x) - - x = residual + self.drop_path(x) - return x - -pass - - -class CONVnext_flow(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0,innx=3): - super().__init__() - assert (kernel_size % 2 == 1) - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels # condition用的 - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - self.condition_layers = torch.nn.ModuleList() - - # if gin_channels != 0: - # cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) - # # self.cond_layer = weight_norm_modules(cond_layer, name='weight') - # self.cond_layer=cond_layer - - for i in range(n_layers): - kernel_size=7 - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - - - - in_layer = ConvNeXtBlock_condition(dim=hidden_channels, intermediate_dim=innx * hidden_channels, drop_out=p_dropout, - dilation=dilation, padding=padding,layer_scale_init_value=1e-6,condione=gin_channels) - # in_layer = weight_norm_modules(in_layer, name='weight') - self.in_layers.append(in_layer) - - # last one is not necessary - - def forward(self, x, x_mask=None, g=None, **kwargs): - - - # if g is not None: - # g = self.cond_layer(g) - - for i in range(self.n_layers): - - - x = self.in_layers[i](x,g) - - if x_mask is not None: - x = x * x_mask - else: - x = x - - - - - - - - return x - - - - -class ResidualCouplingLayer(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False, - wn_sharing_parameter=None # 不明的共享权重 - ): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = CONVnext_flow(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, - gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask=None, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - if x_mask is not None: - h = self.pre(x0) * x_mask - else: - h = self.pre(x0) - h = self.enc(h, x_mask, g=g) - - if x_mask is not None: - stats = self.post(h) * x_mask - else: - stats = self.post(h) - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = 
torch.zeros_like(m) - - if not reverse: - if x_mask is not None: - x1 = m + x1 * torch.exp(logs) * x_mask - else: - x1 = m + x1 * torch.exp(logs) - # x1 = m + x1 * torch.exp(logs) * x_mask # 逆过程 - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1, 2]) - return x, logdet - else: - if x_mask is not None: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - else: - x1 = (x1 - m) * torch.exp(-logs) - # x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x - - -class ResidualCouplingBlock(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0, - share_parameter=False - ): - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - - self.wn = CONVnext_flow(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, - gin_channels=gin_channels) if share_parameter else None - - for i in range(n_flows): - self.flows.append( - ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, - gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn)) - self.flows.append(Flip()) - - def forward(self, x, x_mask=None, g=None, reverse=False): - if not reverse: - logdet_tot = 0 - for flow in self.flows: - x, logdet = flow(x, x_mask, g=g, reverse=reverse) - logdet_tot += logdet - else: - logdet_tot = None - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x, logdet_tot - - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout = nn.Dropout(drop_out) if drop_out > 0. 
else nn.Identity() - - def forward(self, x: torch.Tensor, ) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x = self.dropout(x) - - x = residual + self.drop_path(x) - return x - - -class condition_latent_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_latent_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - - return stats - - -class condition_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels, 
kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - - return stats - - -class SynthesizerTrn(nn.Module): - """ - Synthesizer for Training - """ - - def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - - condition_in_chans, - - condition_encoder_hidden_channels, - condition_encoder_n_heads, - condition_encoder_n_layers, - condition_encoder_kernel_size, - condition_encoder_dropout_rate, - - inter_channels, - hidden_channels, - - condition_channels,flow_wavenet_lay=4, - - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, - ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', - - **kwargs): - - super().__init__() - self.inter_channels = inter_channels - self.ues_condition = ues_condition - - self.use_latent = use_latent - - if use_latent_encoder and use_latent: - if latent_encoder_type == 'attention': - self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=latent_encoder_n_heads, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=latent_encoder_kernel_size, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - elif latent_encoder_type == 'convnext': - self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=None, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=None, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - else: - raise RuntimeError("unsupport_latent_encoder") - - elif ((not use_latent_encoder) and use_latent): - self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) - - if ues_condition_encoder and ues_condition: - if condition_encoder_type == 'attention': - self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=condition_encoder_n_heads, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - elif condition_encoder_type == 'convnext': - self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=None, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - else: - raise RuntimeError("unsupport__encoder") - elif ((not ues_condition_encoder) and ues_condition): - self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) - - self.use_depthwise_conv = use_depthwise_conv - - # self.enc_p = TextEncoder( - # inter_channels, - # hidden_channels, - # filter_channels=filter_channels, - # 
n_heads=n_heads, - # n_layers=n_layers, - # kernel_size=kernel_size, - # p_dropout=p_dropout - # ) - - set_Conv1dModel(self.use_depthwise_conv) - - if ues_condition: - condition_channelsw = condition_channels - else: - condition_channelsw = 0 - - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, - gin_channels=condition_channelsw, share_parameter=flow_share_parameter) - - def forward(self, c, mel, x_mask=None): - - # vol proj - - # f0 predict - - # encoder - if self.use_latent: - z_ptemp, m_p, logs_p = self.latent_encoder(c) - else: - m_p, logs_p = None, None - # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) - - # flow - if self.ues_condition: - condition = self.condition_encoder(c) - z_p, logdet = self.flow(mel, x_mask, g=condition) - else: - z_p, logdet = self.flow(mel, x_mask, g=None) - - return x_mask, (z_p, m_p, logs_p), logdet, - - - def infer(self, c, noice_scale=0.35, seed=None, ): - if seed is not None: - - if c.device == torch.device("cuda"): - torch.cuda.manual_seed_all(seed) - else: - torch.manual_seed(seed) - - if self.use_latent: - z_p, m_p, logs_p = self.latent_encoder(c) - else: - z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale - - # vol proj - - # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) - # o, _ = self.flow(z_p, g=g, reverse=True) - - if self.ues_condition: - condition = self.condition_encoder(c) - # z_p, logdet = self.flow(mel, x_mask, g=condition) - o, _ = self.flow(z_p, g=condition, reverse=True) - else: - o, _ = self.flow(z_p, g=None, reverse=True) - - return o - -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) - - -class glow_loss_L(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, pack_loss,target): - - z, m, logs, logdet, mask = pack_loss - # z, m, logs, logdet, mask = None - - l = torch.sum(logs) + 0.5 * torch.sum( - torch.exp(-2 * logs) * ((z - m) ** 2)) # neg normal likelihood w/o the constant term - l = l - torch.sum(logdet) # log jacobian determinant - if mask is not None: - l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes - else: - l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes - l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term - return l - - - - - - - -class glow_decoder_convnext(nn.Module): - def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, - condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,ft_flow=False,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, - use_latent=True, - ues_condition_encoder=False, ues_condition=False, - condition_encoder_type='attention'): - super().__init__() - self.use_latent=use_latent - self.flow_infer_seed=flow_infer_seed - self.flow_infer_scale=flow_infer_scale - 
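-        # ft_flow switches the module into a fine-tuning mode: during training the
-        # flow is still run in the generation direction (via infer()) and its output
-        # is supervised with the L1 loss from fs2_loss instead of the flow likelihood
-        # (see build_loss() and forward() below).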
self.ft_flow=ft_flow - self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, - latent_encoder_n_heads=latent_encoder_n_heads, - latent_encoder_n_layers=latent_encoder_n_layers, - latent_encoder_kernel_size=latent_encoder_kernel_size, - latent_encoder_dropout_rate=latent_encoder_dropout_rate, - - condition_in_chans=encoder_hidden, - - condition_encoder_hidden_channels=condition_encoder_hidden_channels, - condition_encoder_n_heads=condition_encoder_n_heads, - condition_encoder_n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - condition_encoder_dropout_rate=condition_encoder_dropout_rate, - - inter_channels=out_dims, - flow_wavenet_lay=flow_wavenet_lay, - hidden_channels=flow_hidden_channels, - - condition_channels=flow_condition_channels, - - condition_encoder_filter_channels=condition_encoder_filter_channels, - - latent_encoder_filter_channels=latent_encoder_filter_channels, - - use_depthwise_conv=use_depthwise_conv, - - flow_share_parameter=flow_share_parameter, - n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, - use_latent_encoder=use_latent_encoder, - use_latent=use_latent, - ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, - condition_encoder_type=condition_encoder_type) - - self.use_mask=use_mask - self.use_norm=use_norm - - def norm(self,x): - x = (x - (-5)) / (0 - (-5)) * 2 - 1 - return x - - def denorm(self,x): - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - - def build_loss(self): - if self.ft_flow: - return fs2_loss() - - if self.use_latent: - - return glow_loss_L() - - - - def forward(self, x, infer, x_gt,mask): - if not self.use_mask or infer: - mask=None - else: - mask=mask.transpose(1, 2) - - if self.ft_flow and not infer: - out = self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, - seed=self.flow_infer_seed).transpose(1, 2) - return out - - - - - if infer: - with torch.no_grad(): - out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) - if self.use_norm: - out = self.denorm(out) - return out - else: - if self.use_norm: - x_gt = self.norm(x_gt) - - - x = x.transpose(1, 2) - x_gt=x_gt.transpose(1, 2) - - x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) - - - pack_loss = (z_p, m_p, logs_p, logdet, x_mask) - return pack_loss - - - - - pass diff --git a/modules/shallow/gglow.py b/modules/shallow/gglow.py deleted file mode 100644 index 817005579..000000000 --- a/modules/shallow/gglow.py +++ /dev/null @@ -1,1033 +0,0 @@ -import math -from typing import Optional - -import torch -import torch.nn as nn - -import torch.nn.functional as F - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -class RelativeFFTBlock(nn.Module): - """ FFT Block with Relative Multi-Head Attention """ - - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., - window_size=None, block_length=None): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.block_length = block_length - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = 
nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, - window_size=window_size, p_dropout=p_dropout, - block_length=block_length)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN( - hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask=None): - - if x_mask is not None: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - else: - attn_mask = None - - for i in range(self.n_layers): - if x_mask is not None: - x = x * x_mask - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - if x_mask is not None: - x = x * x_mask - return x - - -class RelativeSelfAttention(nn.Module): - """ Relative Multi-Head Attention """ - - def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., - block_length=None, proximal_bias=False, proximal_init=False): - super(RelativeSelfAttention, self).__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.p_dropout = p_dropout - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels ** -0.5 - self.emb_rel_k = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - if proximal_init: - self.conv_k.weight.data.copy_(self.conv_q.weight.data) - self.conv_k.bias.data.copy_(self.conv_q.bias.data) - nn.init.xavier_uniform_(self.conv_v.weight) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, - t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, - t_s).transpose(2, 3) - - scores = torch.matmul(query, key.transpose(-2, -1) - ) / math.sqrt(self.k_channels) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." 
- key_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query, key_relative_embeddings) - rel_logits = self._relative_position_to_absolute_position( - rel_logits) - scores_local = rel_logits / math.sqrt(self.k_channels) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + \ - self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - block_mask = torch.ones_like( - scores).triu(-self.block_length).tril(self.block_length) - scores = scores * block_mask + -1e4 * (1 - block_mask) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position( - p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s) - output = output + \ - self._matmul_with_relative_values( - relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view( - b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, - slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [0, length - 1]])) - - # Reshape and slice out the padded elements. - x_final = x_flat.view( - [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:] - return x_final - - def _absolute_position_to_relative_position(self, x): - batch, heads, length, _ = x.size() - # padd along column - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, length - 1]])) - x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """ - Bias for self-attention to encourage attention to close positions. 
- """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-4): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - n_dims = len(x.shape) - mean = torch.mean(x, 1, keepdim=True) - variance = torch.mean((x - mean) ** 2, 1, keepdim=True) - - x = (x - mean) * torch.rsqrt(variance + self.eps) - - shape = [1, -1] + [1] * (n_dims - 2) - x = x * self.gamma.view(*shape) + self.beta.view(*shape) - return x - - -class FFN(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - - self.conv = nn.Conv1d( - in_channels, out_channels, kernel_size, padding=kernel_size // 2) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask=None): - if x_mask is not None: - x = self.conv(x * x_mask) - else: - x = self.conv(x) - - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - if x_mask is not None: - x = x * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -Conv1dModel = nn.Conv1d # 有毒 删 - - -class Depthwise_Separable_Conv1D(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - bias=True, - padding_mode='zeros', # TODO: refine this type - device=None, - dtype=None - ): - super().__init__() - self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, - groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias, - padding_mode=padding_mode, device=device, dtype=dtype) - self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, - device=device, dtype=dtype) - - def forward(self, input): - return self.point_conv(self.depth_conv(input)) - - -def set_Conv1dModel(use_depthwise_conv): - global Conv1dModel - Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -@torch.jit.script -def add_and_GRU(input_a, input_b): - in_act = input_a + input_b - x1, x2 = in_act.chunk(2, dim=1) - t_act = torch.tanh(x2) - s_act = torch.sigmoid(x1) - acts = t_act * s_act - return acts - - -class WN(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() - assert (kernel_size % 2 == 1) - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels # condition用的 - self.p_dropout = p_dropout - - 
-class WN(torch.nn.Module):
-    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
-        super(WN, self).__init__()
-        assert (kernel_size % 2 == 1)
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels  # used for the condition
-        self.p_dropout = p_dropout
-
-        self.in_layers = torch.nn.ModuleList()
-        self.res_skip_layers = torch.nn.ModuleList()
-        self.drop = nn.Dropout(p_dropout)
-        self.condition_layers = torch.nn.ModuleList()
-
-        # if gin_channels != 0:
-        #     cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-        #     # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
-        #     self.cond_layer=cond_layer
-
-        for i in range(n_layers):
-
-            if gin_channels != 0:
-                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
-                # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
-                # self.cond_layer = cond_layer
-            else:
-                cond_layer = nn.Identity()
-            self.condition_layers.append(cond_layer)
-
-            dilation = dilation_rate ** i
-            padding = int((kernel_size * dilation - dilation) / 2)
-            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
-                                   dilation=dilation, padding=padding)
-            # in_layer = weight_norm_modules(in_layer, name='weight')
-            self.in_layers.append(in_layer)
-
-            # last one is not necessary
-            if i < n_layers - 1:
-                res_skip_channels = 2 * hidden_channels
-            else:
-                res_skip_channels = hidden_channels
-
-            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
-            # res_skip_layer = weight_norm_modules(res_skip_layer, name='weight')
-            self.res_skip_layers.append(res_skip_layer)
-
-    def forward(self, x, x_mask=None, g=None, **kwargs):
-        output = torch.zeros_like(x)
-        n_channels_tensor = torch.IntTensor([self.hidden_channels])
-
-        # if g is not None:
-        #     g = self.cond_layer(g)
-
-        for i in range(self.n_layers):
-            x_in = self.in_layers[i](x)
-
-            if g is not None:
-
-                condition = self.condition_layers[i](g)
-            else:
-                condition = torch.zeros_like(x_in)
-
-            # acts = fused_add_tanh_sigmoid_multiply(  # "GRU" -- this is just the WaveNet gated unit
-            #     x_in,
-            #     condition,
-            #     n_channels_tensor)
-            acts = add_and_GRU(  # "GRU" -- this is just the WaveNet gated unit
-                x_in,
-                condition,
-            )
-            acts = self.drop(acts)
-
-            res_skip_acts = self.res_skip_layers[i](acts)
-            if i < self.n_layers - 1:
-                res_acts = res_skip_acts[:, :self.hidden_channels, :]
-                if x_mask is not None:
-                    x = (x + res_acts) * x_mask
-                else:
-                    x = x + res_acts
-                output = output + res_skip_acts[:, self.hidden_channels:, :]
-            else:
-                output = output + res_skip_acts
-
-        if x_mask is not None:
-            out = output * x_mask
-        else:
-            out = output
-        return out
-
-    # def remove_weight_norm(self):
-    #     if self.gin_channels != 0:
-    #         remove_weight_norm_modules(self.cond_layer)
-    #     for l in self.in_layers:
-    #         remove_weight_norm_modules(l)
-    #     for l in self.res_skip_layers:
-    #         remove_weight_norm_modules(l)
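The coupling layer below transforms only half of the channels, using shift/scale statistics predicted from the untouched half, so the mapping is invertible in closed form and its log-determinant is simply the sum of the predicted log-scales. A self-contained sketch of the forward/inverse pair (random tensors stand in for the WN-predicted `m` and `logs`):

import torch

x0 = torch.randn(1, 40, 100)          # untouched half (drives the WN in the real layer)
x1 = torch.randn(1, 40, 100)          # half that gets transformed
m = torch.randn(1, 40, 100)           # shift, stand-in for the post-conv output
logs = 0.1 * torch.randn(1, 40, 100)  # log-scale, stand-in for the post-conv output

# forward: y1 = m + x1 * exp(logs), log|det J| = sum(logs)
y1 = m + x1 * torch.exp(logs)
logdet = torch.sum(logs, dim=[1, 2])

# inverse recovers x1 exactly, no iterative solve needed
x1_rec = (y1 - m) * torch.exp(-logs)
assert torch.allclose(x1, x1_rec, atol=1e-5)

Note also that zero-initializing `self.post` makes every coupling start out as the identity map, a common trick to stabilize early flow training.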
-
-
-class ResidualCouplingLayer(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 p_dropout=0,
-                 gin_channels=0,
-                 mean_only=False,
-                 wn_sharing_parameter=None  # shared weights; purpose unclear
-                 ):
-        assert channels % 2 == 0, "channels should be divisible by 2"
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.half_channels = channels // 2
-        self.mean_only = mean_only
-
-        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
-                      gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
-        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
-        self.post.weight.data.zero_()
-        self.post.bias.data.zero_()
-
-    def forward(self, x, x_mask=None, g=None, reverse=False):
-        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
-        if x_mask is not None:
-            h = self.pre(x0) * x_mask
-        else:
-            h = self.pre(x0)
-        h = self.enc(h, x_mask, g=g)
-
-        if x_mask is not None:
-            stats = self.post(h) * x_mask
-        else:
-            stats = self.post(h)
-        if not self.mean_only:
-            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
-        else:
-            m = stats
-            logs = torch.zeros_like(m)
-
-        if not reverse:
-            if x_mask is not None:
-                x1 = m + x1 * torch.exp(logs) * x_mask
-            else:
-                x1 = m + x1 * torch.exp(logs)
-            # x1 = m + x1 * torch.exp(logs) * x_mask  # inverse process
-            x = torch.cat([x0, x1], 1)
-            logdet = torch.sum(logs, [1, 2])
-            return x, logdet
-        else:
-            if x_mask is not None:
-                x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            else:
-                x1 = (x1 - m) * torch.exp(-logs)
-            # x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            x = torch.cat([x0, x1], 1)
-            return x
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 n_flows=4,
-                 gin_channels=0,
-                 share_parameter=False
-                 ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-
-        self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
-                     gin_channels=gin_channels) if share_parameter else None
-
-        for i in range(n_flows):
-            self.flows.append(
-                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
-                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
-            self.flows.append(Flip())
-
-    def forward(self, x, x_mask=None, g=None, reverse=False):
-        if not reverse:
-            logdet_tot = 0
-            for flow in self.flows:
-                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
-                logdet_tot += logdet
-        else:
-            logdet_tot = None
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x, logdet_tot
-
-
-# class TextEncoder(nn.Module):
-#     def __init__(self,
-#                  out_channels,
-#                  hidden_channels,
-#                  kernel_size,
-#                  n_layers,
-#                  gin_channels=0,
-#                  filter_channels=None,
-#                  n_heads=None,
-#                  p_dropout=None):
-#         super().__init__()
-#         self.out_channels = out_channels
-#         self.hidden_channels = hidden_channels
-#         self.kernel_size = kernel_size
-#         self.n_layers = n_layers
-#         self.gin_channels = gin_channels
-#         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-#         self.f0_emb = nn.Embedding(256, hidden_channels)
-#
-#         self.enc_ = attentions.Encoder(
-#             hidden_channels,
-#             filter_channels,
-#             n_heads,
-#             n_layers,
-#             kernel_size,
-#             p_dropout)
-#
-#     def forward(self, x, x_mask, f0=None, noice_scale=1):
-#         x = x + self.f0_emb(f0).transpose(1, 2)
-#         x = self.enc_(x * x_mask, x_mask)
-#         stats = self.proj(x) * x_mask
-#         m, logs = torch.split(stats, self.out_channels, dim=1)
-#         z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
-#
-#         return z, m, logs, x_mask
-
-
-class ConvNeXtBlock(nn.Module):
-    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
-
-    Args:
-        dim (int): Number of input channels.
-        intermediate_dim (int): Dimensionality of the intermediate layer.
-        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
-            Defaults to None.
-        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
- None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() - - def forward(self, x: torch.Tensor, ) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x = self.dropout(x) - - x = residual + self.drop_path(x) - return x - - -class condition_latent_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_latent_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = 
nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - - return stats - - -class condition_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - - return stats - - -class SynthesizerTrn(nn.Module): - """ - Synthesizer for Training - """ - - def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - - condition_in_chans, - - condition_encoder_hidden_channels, - condition_encoder_n_heads, - condition_encoder_n_layers, - condition_encoder_kernel_size, - condition_encoder_dropout_rate, - - inter_channels, - hidden_channels, - - condition_channels,flow_wavenet_lay=4, - - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, - ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', - - **kwargs): - - super().__init__() - self.inter_channels = inter_channels - self.ues_condition = ues_condition - - self.use_latent = use_latent - - if use_latent_encoder and use_latent: - if latent_encoder_type == 'attention': - self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=latent_encoder_n_heads, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=latent_encoder_kernel_size, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - elif latent_encoder_type == 'convnext': - self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=None, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=None, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - else: - raise RuntimeError("unsupport_latent_encoder") - - elif ((not use_latent_encoder) and use_latent): - self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) - - if ues_condition_encoder and ues_condition: - if condition_encoder_type == 'attention': - self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=condition_encoder_n_heads, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - 
dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - elif condition_encoder_type == 'convnext': - self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=None, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - else: - raise RuntimeError("unsupport__encoder") - elif ((not ues_condition_encoder) and ues_condition): - self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) - - self.use_depthwise_conv = use_depthwise_conv - - # self.enc_p = TextEncoder( - # inter_channels, - # hidden_channels, - # filter_channels=filter_channels, - # n_heads=n_heads, - # n_layers=n_layers, - # kernel_size=kernel_size, - # p_dropout=p_dropout - # ) - - set_Conv1dModel(self.use_depthwise_conv) - - if ues_condition: - condition_channelsw = condition_channels - else: - condition_channelsw = 0 - - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, - gin_channels=condition_channelsw, share_parameter=flow_share_parameter) - - def forward(self, c, mel, x_mask=None): - - # vol proj - - # f0 predict - - # encoder - if self.use_latent: - z_ptemp, m_p, logs_p = self.latent_encoder(c) - else: - m_p, logs_p = None, None - # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) - - # flow - if self.ues_condition: - condition = self.condition_encoder(c) - z_p, logdet = self.flow(mel, x_mask, g=condition) - else: - z_p, logdet = self.flow(mel, x_mask, g=None) - - return x_mask, (z_p, m_p, logs_p), logdet, - - @torch.no_grad() - def infer(self, c, noice_scale=0.35, seed=None, ): - if seed is not None: - - if c.device == torch.device("cuda"): - torch.cuda.manual_seed_all(seed) - else: - torch.manual_seed(seed) - - if self.use_latent: - z_p, m_p, logs_p = self.latent_encoder(c) - else: - z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale - - z_p=z_p.cuda() - - # vol proj - - # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) - # o, _ = self.flow(z_p, g=g, reverse=True) - - if self.ues_condition: - condition = self.condition_encoder(c) - # z_p, logdet = self.flow(mel, x_mask, g=condition) - o, _ = self.flow(z_p, g=condition, reverse=True) - else: - o, _ = self.flow(z_p, g=None, reverse=True) - - return o - - -class glow_loss_L(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, pack_loss,target): - - z, m, logs, logdet, mask = pack_loss - # z, m, logs, logdet, mask = None - - l = 0.5 * torch.sum( - torch.exp(-2 * logdet) * ((z ) ** 2)) # neg normal likelihood w/o the constant term - l = l - torch.sum(logdet) # log jacobian determinant - if mask is not None: - l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes - else: - l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes - l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term - return l - - - - - -class glow_decoder(nn.Module): - def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - condition_encoder_hidden_channels, 
condition_encoder_n_heads, condition_encoder_n_layers, - condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, - use_latent=True, - ues_condition_encoder=False, ues_condition=False, - condition_encoder_type='attention'): - super().__init__() - self.use_latent=use_latent - self.flow_infer_seed=flow_infer_seed - self.flow_infer_scale=flow_infer_scale - self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, - latent_encoder_n_heads=latent_encoder_n_heads, - latent_encoder_n_layers=latent_encoder_n_layers, - latent_encoder_kernel_size=latent_encoder_kernel_size, - latent_encoder_dropout_rate=latent_encoder_dropout_rate, - - condition_in_chans=encoder_hidden, - - condition_encoder_hidden_channels=condition_encoder_hidden_channels, - condition_encoder_n_heads=condition_encoder_n_heads, - condition_encoder_n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - condition_encoder_dropout_rate=condition_encoder_dropout_rate, - - inter_channels=out_dims, - flow_wavenet_lay=flow_wavenet_lay, - hidden_channels=flow_hidden_channels, - - condition_channels=flow_condition_channels, - - condition_encoder_filter_channels=condition_encoder_filter_channels, - - latent_encoder_filter_channels=latent_encoder_filter_channels, - - use_depthwise_conv=use_depthwise_conv, - - flow_share_parameter=flow_share_parameter, - n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, - use_latent_encoder=use_latent_encoder, - use_latent=use_latent, - ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, - condition_encoder_type=condition_encoder_type) - - self.use_mask=use_mask - self.use_norm=use_norm - - def norm(self,x): - x = (x - (-5)) / (0 - (-5)) * 2 - 1 - return x - - def denorm(self,x): - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - - def build_loss(self): - - - if self.use_latent: - - return glow_loss_L() - - return glow_loss_L() - def forward(self, x, infer, x_gt,mask): - if not self.use_mask or infer: - mask=None - else: - mask=mask.transpose(1, 2) - - - - - if infer: - out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) - if self.use_norm: - out = self.denorm(out) - return out - else: - if self.use_norm: - x_gt = self.norm(x_gt) - - - x = x.transpose(1, 2) - x_gt=x_gt.transpose(1, 2) - - x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) - - - pack_loss = (z_p, m_p, logs_p, logdet, x_mask) - return pack_loss - - - - - pass diff --git a/modules/shallow/glow.py b/modules/shallow/glow.py deleted file mode 100644 index 65db536ad..000000000 --- a/modules/shallow/glow.py +++ /dev/null @@ -1,1032 +0,0 @@ -import math -from typing import Optional - -import torch -import torch.nn as nn - -import torch.nn.functional as F - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -class RelativeFFTBlock(nn.Module): - """ FFT Block with Relative Multi-Head Attention """ - - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, 
kernel_size=1, p_dropout=0., - window_size=None, block_length=None): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.block_length = block_length - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, - window_size=window_size, p_dropout=p_dropout, - block_length=block_length)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN( - hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask=None): - - if x_mask is not None: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - else: - attn_mask = None - - for i in range(self.n_layers): - if x_mask is not None: - x = x * x_mask - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - if x_mask is not None: - x = x * x_mask - return x - - -class RelativeSelfAttention(nn.Module): - """ Relative Multi-Head Attention """ - - def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., - block_length=None, proximal_bias=False, proximal_init=False): - super(RelativeSelfAttention, self).__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.p_dropout = p_dropout - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels ** -0.5 - self.emb_rel_k = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - if proximal_init: - self.conv_k.weight.data.copy_(self.conv_q.weight.data) - self.conv_k.bias.data.copy_(self.conv_q.bias.data) - nn.init.xavier_uniform_(self.conv_v.weight) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, - t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, - t_s).transpose(2, 3) - - scores = torch.matmul(query, key.transpose(-2, -1) 
- ) / math.sqrt(self.k_channels) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query, key_relative_embeddings) - rel_logits = self._relative_position_to_absolute_position( - rel_logits) - scores_local = rel_logits / math.sqrt(self.k_channels) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + \ - self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - block_mask = torch.ones_like( - scores).triu(-self.block_length).tril(self.block_length) - scores = scores * block_mask + -1e4 * (1 - block_mask) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position( - p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s) - output = output + \ - self._matmul_with_relative_values( - relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view( - b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, - slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [0, length - 1]])) - - # Reshape and slice out the padded elements. 
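        # (Concretely: element (i, k) of the padded buffer sits at flat index
        # 2L*i + k; re-viewing the buffer with row width 2L-1 shifts row i by
        # i columns, so relative entry (i, k) lands at absolute column i + k,
        # and slicing columns L-1 onward yields a[i, j] = r[i, j - i + L - 1].)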
-        x_final = x_flat.view(
-            [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
-        return x_final
-
-    def _absolute_position_to_relative_position(self, x):
-        batch, heads, length, _ = x.size()
-        # pad along column
-        x = F.pad(x, convert_pad_shape(
-            [[0, 0], [0, 0], [0, 0], [0, length - 1]]))
-        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
-        # add 0's in the beginning that will skew the elements after reshape
-        x_flat = F.pad(x_flat, convert_pad_shape(
-            [[0, 0], [0, 0], [length, 0]]))
-        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
-        return x_final
-
-    def _attention_bias_proximal(self, length):
-        """
-        Bias for self-attention to encourage attention to close positions.
-        """
-        r = torch.arange(length, dtype=torch.float32)
-        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
-        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
-
-
-class LayerNorm(nn.Module):
-    def __init__(self, channels, eps=1e-4):
-        super().__init__()
-        self.channels = channels
-        self.eps = eps
-
-        self.gamma = nn.Parameter(torch.ones(channels))
-        self.beta = nn.Parameter(torch.zeros(channels))
-
-    def forward(self, x):
-        n_dims = len(x.shape)
-        mean = torch.mean(x, 1, keepdim=True)
-        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
-
-        x = (x - mean) * torch.rsqrt(variance + self.eps)
-
-        shape = [1, -1] + [1] * (n_dims - 2)
-        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
-        return x
-
-
-class FFN(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.activation = activation
-
-        self.conv = nn.Conv1d(
-            in_channels, out_channels, kernel_size, padding=kernel_size // 2)
-        self.drop = nn.Dropout(p_dropout)
-
-    def forward(self, x, x_mask=None):
-        if x_mask is not None:
-            x = self.conv(x * x_mask)
-        else:
-            x = self.conv(x)
-
-        if self.activation == "gelu":
-            x = x * torch.sigmoid(1.702 * x)
-        else:
-            x = torch.relu(x)
-        x = self.drop(x)
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-
-class Flip(nn.Module):
-    def forward(self, x, *args, reverse=False, **kwargs):
-        x = torch.flip(x, [1])
-        if not reverse:
-            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
-            return x, logdet
-        else:
-            return x
-
-
-Conv1dModel = nn.Conv1d  # hacky; should be removed
-
-
-class Depthwise_Separable_Conv1D(nn.Module):
-    def __init__(
-            self,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=1,
-            padding=0,
-            dilation=1,
-            bias=True,
-            padding_mode='zeros',  # TODO: refine this type
-            device=None,
-            dtype=None
-    ):
-        super().__init__()
-        self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
-                                    groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias,
-                                    padding_mode=padding_mode, device=device, dtype=dtype)
-        self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias,
-                                    device=device, dtype=dtype)
-
-    def forward(self, input):
-        return self.point_conv(self.depth_conv(input))
-
-
-def set_Conv1dModel(use_depthwise_conv):
-    global Conv1dModel
-    Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d
-
-
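`Depthwise_Separable_Conv1D` factorizes a dense Conv1d into a per-channel (grouped) convolution plus a 1x1 pointwise mix, trading a small accuracy risk for roughly a k-fold parameter and FLOP reduction. A quick standalone comparison (hypothetical channel count and kernel size):

import torch.nn as nn

c_in, c_out, k = 192, 192, 5
dense = nn.Conv1d(c_in, c_out, k, padding=k // 2)
depth = nn.Conv1d(c_in, c_in, k, padding=k // 2, groups=c_in)  # spatial mixing only
point = nn.Conv1d(c_in, c_out, 1)                              # channel mixing only

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(dense))                 # 184512
print(count(depth) + count(point))  # 38208, ~4.8x fewer parameters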
-@torch.jit.script
-def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
-    n_channels_int = n_channels[0]
-    in_act = input_a + input_b
-    t_act = torch.tanh(in_act[:, :n_channels_int, :])
-    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
-    acts = t_act * s_act
-    return acts
-
-
-@torch.jit.script
-def add_and_GRU(input_a, input_b):
-    in_act = input_a + input_b
-    x1, x2 = in_act.chunk(2, dim=1)
-    t_act = torch.tanh(x2)
-    s_act = torch.sigmoid(x1)
-    acts = t_act * s_act
-    return acts
-
-
-class WN(torch.nn.Module):
-    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
-        super(WN, self).__init__()
-        assert (kernel_size % 2 == 1)
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels  # used for the condition
-        self.p_dropout = p_dropout
-
-        self.in_layers = torch.nn.ModuleList()
-        self.res_skip_layers = torch.nn.ModuleList()
-        self.drop = nn.Dropout(p_dropout)
-        self.condition_layers = torch.nn.ModuleList()
-
-        # if gin_channels != 0:
-        #     cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-        #     # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
-        #     self.cond_layer=cond_layer
-
-        for i in range(n_layers):
-
-            if gin_channels != 0:
-                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
-                # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
-                # self.cond_layer = cond_layer
-            else:
-                cond_layer = nn.Identity()
-            self.condition_layers.append(cond_layer)
-
-            dilation = dilation_rate ** i
-            padding = int((kernel_size * dilation - dilation) / 2)
-            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
-                                   dilation=dilation, padding=padding)
-            # in_layer = weight_norm_modules(in_layer, name='weight')
-            self.in_layers.append(in_layer)
-
-            # last one is not necessary
-            if i < n_layers - 1:
-                res_skip_channels = 2 * hidden_channels
-            else:
-                res_skip_channels = hidden_channels
-
-            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
-            # res_skip_layer = weight_norm_modules(res_skip_layer, name='weight')
-            self.res_skip_layers.append(res_skip_layer)
-
-    def forward(self, x, x_mask=None, g=None, **kwargs):
-        output = torch.zeros_like(x)
-        n_channels_tensor = torch.IntTensor([self.hidden_channels])
-
-        # if g is not None:
-        #     g = self.cond_layer(g)
-
-        for i in range(self.n_layers):
-            x_in = self.in_layers[i](x)
-
-            if g is not None:
-
-                condition = self.condition_layers[i](g)
-            else:
-                condition = torch.zeros_like(x_in)
-
-            # acts = fused_add_tanh_sigmoid_multiply(  # "GRU" -- this is just the WaveNet gated unit
-            #     x_in,
-            #     condition,
-            #     n_channels_tensor)
-            acts = add_and_GRU(  # "GRU" -- this is just the WaveNet gated unit
-                x_in,
-                condition,
-            )
-            acts = self.drop(acts)
-
-            res_skip_acts = self.res_skip_layers[i](acts)
-            if i < self.n_layers - 1:
-                res_acts = res_skip_acts[:, :self.hidden_channels, :]
-                if x_mask is not None:
-                    x = (x + res_acts) * x_mask
-                else:
-                    x = x + res_acts
-                output = output + res_skip_acts[:, self.hidden_channels:, :]
-            else:
-                output = output + res_skip_acts
-
-        if x_mask is not None:
-            out = output * x_mask
-        else:
-            out = output
-        return out
-
-    # def remove_weight_norm(self):
-    #     if self.gin_channels != 0:
-    #         remove_weight_norm_modules(self.cond_layer)
-    #     for l in self.in_layers:
-    #         remove_weight_norm_modules(l)
-    #     for l in self.res_skip_layers:
-    #         remove_weight_norm_modules(l)
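The residual/skip bookkeeping in `WN.forward` follows the usual WaveNet pattern: every layer but the last emits twice the hidden channels, half feeding the next layer through a residual connection and half accumulating into the output. A standalone sketch of just that accumulation loop (toy sizes, a tanh standing in for the gated dilated convolution):

import torch
import torch.nn as nn

hidden, n_layers, T = 8, 3, 20
x = torch.randn(1, hidden, T)
output = torch.zeros_like(x)

# every layer but the last emits 2*hidden channels: a residual half and a skip half
res_skip = nn.ModuleList([nn.Conv1d(hidden, 2 * hidden, 1) for _ in range(n_layers - 1)])
res_skip.append(nn.Conv1d(hidden, hidden, 1))  # last layer: skip channels only

for i, layer in enumerate(res_skip):
    acts = torch.tanh(x)  # stand-in for the gated dilated convolution
    out_i = layer(acts)
    if i < n_layers - 1:
        x = x + out_i[:, :hidden, :]            # residual path feeds the next layer
        output = output + out_i[:, hidden:, :]  # skip path accumulates the output
    else:
        output = output + out_i
print(output.shape)  # torch.Size([1, 8, 20])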
-
-
-class ResidualCouplingLayer(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 p_dropout=0,
-                 gin_channels=0,
-                 mean_only=False,
-                 wn_sharing_parameter=None  # shared weights; purpose unclear
-                 ):
-        assert channels % 2 == 0, "channels should be divisible by 2"
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.half_channels = channels // 2
-        self.mean_only = mean_only
-
-        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
-                      gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
-        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
-        self.post.weight.data.zero_()
-        self.post.bias.data.zero_()
-
-    def forward(self, x, x_mask=None, g=None, reverse=False):
-        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
-        if x_mask is not None:
-            h = self.pre(x0) * x_mask
-        else:
-            h = self.pre(x0)
-        h = self.enc(h, x_mask, g=g)
-
-        if x_mask is not None:
-            stats = self.post(h) * x_mask
-        else:
-            stats = self.post(h)
-        if not self.mean_only:
-            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
-        else:
-            m = stats
-            logs = torch.zeros_like(m)
-
-        if not reverse:
-            if x_mask is not None:
-                x1 = m + x1 * torch.exp(logs) * x_mask
-            else:
-                x1 = m + x1 * torch.exp(logs)
-            # x1 = m + x1 * torch.exp(logs) * x_mask  # inverse process
-            x = torch.cat([x0, x1], 1)
-            logdet = torch.sum(logs, [1, 2])
-            return x, logdet
-        else:
-            if x_mask is not None:
-                x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            else:
-                x1 = (x1 - m) * torch.exp(-logs)
-            # x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            x = torch.cat([x0, x1], 1)
-            return x
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 n_flows=4,
-                 gin_channels=0,
-                 share_parameter=False
-                 ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-
-        self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
-                     gin_channels=gin_channels) if share_parameter else None
-
-        for i in range(n_flows):
-            self.flows.append(
-                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
-                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
-            self.flows.append(Flip())
-
-    def forward(self, x, x_mask=None, g=None, reverse=False):
-        if not reverse:
-            logdet_tot = 0
-            for flow in self.flows:
-                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
-                logdet_tot += logdet
-        else:
-            logdet_tot = None
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x, logdet_tot
-
-
-# class TextEncoder(nn.Module):
-#     def __init__(self,
-#                  out_channels,
-#                  hidden_channels,
-#                  kernel_size,
-#                  n_layers,
-#                  gin_channels=0,
-#                  filter_channels=None,
-#                  n_heads=None,
-#                  p_dropout=None):
-#         super().__init__()
-#         self.out_channels = out_channels
-#         self.hidden_channels = hidden_channels
-#         self.kernel_size = kernel_size
-#         self.n_layers = n_layers
-#         self.gin_channels = gin_channels
-#         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-#         self.f0_emb = nn.Embedding(256, hidden_channels)
-#
-#         self.enc_ = attentions.Encoder(
-#             hidden_channels,
-#             filter_channels,
-#             n_heads,
-#             n_layers,
-#             kernel_size,
-#             p_dropout)
-#
-#     def forward(self, x, x_mask, f0=None, noice_scale=1):
-#         x = x +
self.f0_emb(f0).transpose(1, 2) -# x = self.enc_(x * x_mask, x_mask) -# stats = self.proj(x) * x_mask -# m, logs = torch.split(stats, self.out_channels, dim=1) -# z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask -# -# return z, m, logs, x_mask - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() - - def forward(self, x: torch.Tensor, ) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x = self.dropout(x) - - x = residual + self.drop_path(x) - return x - - -class condition_latent_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_latent_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 
1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - - return stats - - -class condition_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - - return stats - - -class SynthesizerTrn(nn.Module): - """ - Synthesizer for Training - """ - - def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - - condition_in_chans, - - condition_encoder_hidden_channels, - condition_encoder_n_heads, - condition_encoder_n_layers, - condition_encoder_kernel_size, - condition_encoder_dropout_rate, - - inter_channels, - hidden_channels, - - condition_channels,flow_wavenet_lay=4, - - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, - ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', - - **kwargs): - - super().__init__() - self.inter_channels = inter_channels - self.ues_condition = ues_condition - - self.use_latent = use_latent - - if use_latent_encoder and use_latent: - if latent_encoder_type == 'attention': - self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=latent_encoder_n_heads, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=latent_encoder_kernel_size, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - elif latent_encoder_type == 'convnext': - self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=None, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=None, - 
dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - else: - raise RuntimeError("unsupport_latent_encoder") - - elif ((not use_latent_encoder) and use_latent): - self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) - - if ues_condition_encoder and ues_condition: - if condition_encoder_type == 'attention': - self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=condition_encoder_n_heads, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - elif condition_encoder_type == 'convnext': - self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=None, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - else: - raise RuntimeError("unsupport__encoder") - elif ((not ues_condition_encoder) and ues_condition): - self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) - - self.use_depthwise_conv = use_depthwise_conv - - # self.enc_p = TextEncoder( - # inter_channels, - # hidden_channels, - # filter_channels=filter_channels, - # n_heads=n_heads, - # n_layers=n_layers, - # kernel_size=kernel_size, - # p_dropout=p_dropout - # ) - - set_Conv1dModel(self.use_depthwise_conv) - - if ues_condition: - condition_channelsw = condition_channels - else: - condition_channelsw = 0 - - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, - gin_channels=condition_channelsw, share_parameter=flow_share_parameter) - - def forward(self, c, mel, x_mask=None): - - # vol proj - - # f0 predict - - # encoder - if self.use_latent: - z_ptemp, m_p, logs_p = self.latent_encoder(c) - else: - m_p, logs_p = None, None - # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) - - # flow - if self.ues_condition: - condition = self.condition_encoder(c) - z_p, logdet = self.flow(mel, x_mask, g=condition) - else: - z_p, logdet = self.flow(mel, x_mask, g=None) - - return x_mask, (z_p, m_p, logs_p), logdet, - - @torch.no_grad() - def infer(self, c, noice_scale=0.35, seed=None, ): - if seed is not None: - - if c.device == torch.device("cuda"): - torch.cuda.manual_seed_all(seed) - else: - torch.manual_seed(seed) - - if self.use_latent: - z_p, m_p, logs_p = self.latent_encoder(c) - else: - z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale - - # vol proj - - # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) - # o, _ = self.flow(z_p, g=g, reverse=True) - - if self.ues_condition: - condition = self.condition_encoder(c) - # z_p, logdet = self.flow(mel, x_mask, g=condition) - o, _ = self.flow(z_p, g=condition, reverse=True) - else: - o, _ = self.flow(z_p, g=None, reverse=True) - - return o - - -class glow_loss_L(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, pack_loss,target): - - z, m, logs, logdet, mask = pack_loss - # z, m, logs, logdet, mask = None - - l = torch.sum(logs) + 0.5 * torch.sum( - torch.exp(-2 * logs) * 
((z - m) ** 2)) # neg normal likelihood w/o the constant term - l = l - torch.sum(logdet) # log jacobian determinant - if mask is not None: - l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes - else: - l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes - l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term - return l - - - - - - - -class glow_decoder(nn.Module): - def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, - condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, - use_latent=True, - ues_condition_encoder=False, ues_condition=False, - condition_encoder_type='attention'): - super().__init__() - self.use_latent=use_latent - self.flow_infer_seed=flow_infer_seed - self.flow_infer_scale=flow_infer_scale - self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, - latent_encoder_n_heads=latent_encoder_n_heads, - latent_encoder_n_layers=latent_encoder_n_layers, - latent_encoder_kernel_size=latent_encoder_kernel_size, - latent_encoder_dropout_rate=latent_encoder_dropout_rate, - - condition_in_chans=encoder_hidden, - - condition_encoder_hidden_channels=condition_encoder_hidden_channels, - condition_encoder_n_heads=condition_encoder_n_heads, - condition_encoder_n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - condition_encoder_dropout_rate=condition_encoder_dropout_rate, - - inter_channels=out_dims, - flow_wavenet_lay=flow_wavenet_lay, - hidden_channels=flow_hidden_channels, - - condition_channels=flow_condition_channels, - - condition_encoder_filter_channels=condition_encoder_filter_channels, - - latent_encoder_filter_channels=latent_encoder_filter_channels, - - use_depthwise_conv=use_depthwise_conv, - - flow_share_parameter=flow_share_parameter, - n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, - use_latent_encoder=use_latent_encoder, - use_latent=use_latent, - ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, - condition_encoder_type=condition_encoder_type) - - self.use_mask=use_mask - self.use_norm=use_norm - - def norm(self,x): - x = (x - (-5)) / (0 - (-5)) * 2 - 1 - return x - - def denorm(self,x): - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - - def build_loss(self): - - - if self.use_latent: - - return glow_loss_L() - - def forward(self, x, infer, x_gt,mask): - if not self.use_mask or infer: - mask=None - else: - mask=mask.transpose(1, 2) - - - - - if infer: - out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) - if self.use_norm: - out = self.denorm(out) - return out - else: - if self.use_norm: - x_gt = self.norm(x_gt) - - - x = x.transpose(1, 2) - x_gt=x_gt.transpose(1, 2) - - x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) - - - pack_loss = (z_p, m_p, logs_p, 
logdet, x_mask) - return pack_loss - - - - - pass diff --git a/modules/shallow/light_decoder.py b/modules/shallow/light_decoder.py deleted file mode 100644 index bb2624765..000000000 --- a/modules/shallow/light_decoder.py +++ /dev/null @@ -1,109 +0,0 @@ -from typing import Optional - -import torch -import torch.nn as nn - -class GLU(nn.Module): - def __init__(self, dim): - super().__init__() - self.dim = dim - - def forward(self, x): - out, gate = x.chunk(2, dim=self.dim) - return out * gate.sigmoid() - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.act2=GLU(2) - self.pwconv2 = nn.Linear(intermediate_dim//2, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout=nn.Dropout(drop_out) if drop_out > 0. 
else nn.Identity() - - - - def forward(self, x: torch.Tensor, ) -> torch.Tensor: - residual = x - x=self.act(x) - x = self.dwconv(x) - - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act2(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x=self.dropout(x) - - x = residual + self.drop_path (x) - return x - - -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) - - -class noise_decoder(nn.Module): - def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): - super().__init__() - self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) - self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - - - def build_loss(self): - - return fs2_loss() - - def forward(self, x,infer,**kwargs): - x=x.transpose(1, 2) - x=self.inconv(x) - - for i in self.conv: - x=i(x) - x=self.outconv(x).transpose(1, 2) - if infer: - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - pass diff --git a/modules/shallow/noise_decoder.py b/modules/shallow/noise_decoder.py deleted file mode 100644 index 862caf911..000000000 --- a/modules/shallow/noise_decoder.py +++ /dev/null @@ -1,100 +0,0 @@ -from typing import Optional - -import torch -import torch.nn as nn - - - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout=nn.Dropout(drop_out) if drop_out > 0. 
else nn.Identity() - self.con = nn.Conv1d(dim, dim, kernel_size=1, ) - - - def forward(self, x: torch.Tensor,y ) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x=x+self.con(y) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x=self.dropout(x) - - x = residual + self.drop_path (x) - return x - - -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) - - -class noise_decoder(nn.Module): - def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): - super().__init__() - self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) - self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - - - def build_loss(self): - - return fs2_loss() - - def forward(self, x,infer,**kwargs): - x=x.transpose(1, 2) - x=self.inconv(x) - y=torch.randn_like(x) - for i in self.conv: - y=i(y,x) - x=self.outconv(y).transpose(1, 2) - if infer: - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - pass diff --git a/modules/toplevel.py b/modules/toplevel.py index 09b85d6ba..7c00561be 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -17,7 +17,7 @@ from modules.fastspeech.param_adaptor import ParameterAdaptorModule from modules.fastspeech.tts_modules import RhythmRegulator, LengthRegulator from modules.fastspeech.variance_encoder import FastSpeech2Variance -from modules.shallow.shallow_adapter import shallow_adapt +from modules.aux_decoder.shallow_adapter import shallow_adapt from utils.hparams import hparams From 20d5bb5b63c24b60f2f4f9a1ee5fb158095d302a Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 21 Sep 2023 20:06:55 +0800 Subject: [PATCH 29/33] Clean and refactor aux decoder --- configs/acoustic.yaml | 4 - configs/templates/config_acoustic.yaml | 17 + modules/aux_decoder/__init__.py | 66 ++++ .../{fast_speech2_decoder.py => convnext.py} | 55 ++-- modules/aux_decoder/fs2_decoder.py | 300 ------------------ modules/aux_decoder/shallow_adapter.py | 76 ----- modules/toplevel.py | 25 +- training/acoustic_task.py | 12 +- 8 files changed, 125 insertions(+), 430 deletions(-) rename modules/aux_decoder/{fast_speech2_decoder.py => convnext.py} (61%) delete mode 100644 modules/aux_decoder/fs2_decoder.py delete mode 100644 modules/aux_decoder/shallow_adapter.py diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index e892018d8..174cd9943 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -83,17 +83,13 @@ diff_depth: 400 shallow_diffusion_args: train_aux_decoder: true train_diffusion: true - shared_encoder: true val_gt_start: false - aux_share_encoder: true - aux_encoder_strict_hparams: false aux_decoder_arch: convnext aux_decoder_args: num_channels: 512 num_layers: 6 kernel_size: 7 dropout_rate: 0.1 - aux_decoder_strict_hparams: true aux_decoder_grad: 0.1 lambda_aux_mel_loss: 0.2 diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 0291177a1..12d0b1dba 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -40,9 +40,26 @@ 
augmentation_args: domain: log # or linear scale: 1.0 +K_step: 1000 residual_channels: 512 residual_layers: 20 +# shallow diffusion +use_shallow_diffusion: false +diff_depth: 400 +shallow_diffusion_args: + train_aux_decoder: true + train_diffusion: true + val_gt_start: false + aux_decoder_arch: convnext + aux_decoder_args: + num_channels: 512 + num_layers: 6 + kernel_size: 7 + dropout_rate: 0.1 + aux_decoder_grad: 0.1 +lambda_aux_mel_loss: 0.2 + optimizer_args: lr: 0.0004 lr_scheduler_args: diff --git a/modules/aux_decoder/__init__.py b/modules/aux_decoder/__init__.py index e69de29bb..b408e4b7b 100644 --- a/modules/aux_decoder/__init__.py +++ b/modules/aux_decoder/__init__.py @@ -0,0 +1,66 @@ +import torch.nn +from torch import nn + +from .convnext import ConvNeXtDecoder +from utils import filter_kwargs + +AUX_DECODERS = { + 'convnext': ConvNeXtDecoder +} +AUX_LOSSES = { + 'convnext': nn.L1Loss +} + + +def build_aux_decoder( + in_dims: int, out_dims: int, + aux_decoder_arch: str, aux_decoder_args: dict +) -> torch.nn.Module: + decoder_cls = AUX_DECODERS[aux_decoder_arch] + kwargs = filter_kwargs(aux_decoder_args, decoder_cls) + return AUX_DECODERS[aux_decoder_arch](in_dims, out_dims, **kwargs) + + +def build_aux_loss(aux_decoder_arch): + return AUX_LOSSES[aux_decoder_arch]() + + +class AuxDecoderAdaptor(nn.Module): + def __init__(self, in_dims: int, out_dims: int, num_feats: int, + spec_min: list, spec_max: list, + aux_decoder_arch: str, aux_decoder_args: dict): + super().__init__() + self.decoder = build_aux_decoder( + in_dims=in_dims, out_dims=out_dims * num_feats, + aux_decoder_arch=aux_decoder_arch, + aux_decoder_args=aux_decoder_args + ) + self.out_dims = out_dims + self.n_feats = num_feats + if spec_min is not None and spec_max is not None: + # spec: [B, T, M] or [B, F, T, M] + # spec_min and spec_max: [1, 1, M] or [1, 1, F, M] => transpose(-3, -2) => [1, 1, M] or [1, F, 1, M] + spec_min = torch.FloatTensor(spec_min)[None, None, :].transpose(-3, -2) + spec_max = torch.FloatTensor(spec_max)[None, None, :].transpose(-3, -2) + self.register_buffer('spec_min', spec_min, persistent=False) + self.register_buffer('spec_max', spec_max, persistent=False) + + def norm_spec(self, x): + return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 + + def denorm_spec(self, x): + return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min + + def forward(self, condition, infer=False): + x = self.decoder(condition, infer=infer) # [B, T, F x C] + + if self.n_feats > 1: + # This is the temporary solution since PyTorch 1.13 + # does not support exporting aten::unflatten to ONNX + # x = x.unflatten(dim=2, sizes=(self.n_feats, self.in_dims)) + x = x.reshape(-1, x.shape[1], self.n_feats, self.out_dims) # [B, T, F, C] + x = x.transpose(1, 2) # [B, F, T, C] + if infer: + x = self.denorm_spec(x) + + return x # [B, T, C] or [B, F, T, C] diff --git a/modules/aux_decoder/fast_speech2_decoder.py b/modules/aux_decoder/convnext.py similarity index 61% rename from modules/aux_decoder/fast_speech2_decoder.py rename to modules/aux_decoder/convnext.py index 1ccb76fcc..a03959ddf 100644 --- a/modules/aux_decoder/fast_speech2_decoder.py +++ b/modules/aux_decoder/convnext.py @@ -12,15 +12,13 @@ class ConvNeXtBlock(nn.Module): intermediate_dim (int): Dimensionality of the intermediate layer. layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. 
- None means non-conditional LayerNorm. Defaults to None. """ def __init__( self, dim: int, intermediate_dim: int, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 + layer_scale_init_value: Optional[float] = None, drop_out: float = 0.0 ): super().__init__() @@ -57,34 +55,33 @@ def forward(self, x: torch.Tensor, ) -> torch.Tensor: return x -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, y, x): - x = (x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y, x) - - -class fs2_decode(nn.Module): - def __init__(self, encoder_hidden, out_dims, n_chans, kernel_size, dropout_rate, n_layers, parame): +class ConvNeXtDecoder(nn.Module): + def __init__( + self, in_dims, out_dims, /, *, + num_channels=512, num_layers=6, kernel_size=7, dropout_rate=0.1 + ): super().__init__() - self.inconv = nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans, intermediate_dim=n_chans * 4, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - self.outconv = nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - def build_loss(self): - - return fs2_loss() + self.inconv = nn.Conv1d( + in_dims, num_channels, kernel_size, + stride=1, padding=(kernel_size - 1) // 2 + ) + self.conv = nn.ModuleList( + ConvNeXtBlock( + dim=num_channels, intermediate_dim=num_channels * 4, + layer_scale_init_value=1e-6, drop_out=dropout_rate + ) for _ in range(num_layers) + ) + self.outconv = nn.Conv1d( + num_channels, out_dims, kernel_size, + stride=1, padding=(kernel_size - 1) // 2 + ) - def forward(self, x, infer, *args, **kwargs): + # noinspection PyUnusedLocal + def forward(self, x, infer=False): x = x.transpose(1, 2) x = self.inconv(x) - for i in self.conv: - x = i(x) - x = self.outconv(x).transpose(1, 2) - if infer: - x = (x + 1) / 2 * (0 - (-5)) + (-5) + for conv in self.conv: + x = conv(x) + x = self.outconv(x) + x = x.transpose(1, 2) return x - pass diff --git a/modules/aux_decoder/fs2_decoder.py b/modules/aux_decoder/fs2_decoder.py deleted file mode 100644 index acb3408be..000000000 --- a/modules/aux_decoder/fs2_decoder.py +++ /dev/null @@ -1,300 +0,0 @@ -import math - -import torch -from torch import nn -import torch.nn.functional as F - - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape -class RelativeFFTBlock(nn.Module): - """ FFT Block with Relative Multi-Head Attention """ - - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=None, block_length=None): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.block_length = block_length - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, - window_size=window_size, p_dropout=p_dropout, block_length=block_length)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN( - hidden_channels, hidden_channels, 
kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask=None): - - if x_mask is not None: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - else: - attn_mask = None - - for i in range(self.n_layers): - if x_mask is not None: - x = x * x_mask - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - if x_mask is not None: - x = x * x_mask - return x - - -class RelativeSelfAttention(nn.Module): - """ Relative Multi-Head Attention """ - - def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., block_length=None, proximal_bias=False, proximal_init=False): - super(RelativeSelfAttention, self).__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.p_dropout = p_dropout - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - if proximal_init: - self.conv_k.weight.data.copy_(self.conv_q.weight.data) - self.conv_k.bias.data.copy_(self.conv_q.bias.data) - nn.init.xavier_uniform_(self.conv_v.weight) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, - t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, - t_s).transpose(2, 3) - - scores = torch.matmul(query, key.transpose(-2, -1) - ) / math.sqrt(self.k_channels) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query, key_relative_embeddings) - rel_logits = self._relative_position_to_absolute_position( - rel_logits) - scores_local = rel_logits / math.sqrt(self.k_channels) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." 
- scores = scores + \ - self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - block_mask = torch.ones_like( - scores).triu(-self.block_length).tril(self.block_length) - scores = scores * block_mask + -1e4*(1 - block_mask) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position( - p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s) - output = output + \ - self._matmul_with_relative_values( - relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view( - b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, - slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [0, length-1]])) - - # Reshape and slice out the padded elements. - x_final = x_flat.view( - [batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] - return x_final - - def _absolute_position_to_relative_position(self, x): - batch, heads, length, _ = x.size() - # padd along column - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, length-1]])) - x_flat = x.view([batch, heads, length**2 + length*(length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2*length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """ - Bias for self-attention to encourage attention to close positions. 
- """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-4): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - n_dims = len(x.shape) - mean = torch.mean(x, 1, keepdim=True) - variance = torch.mean((x - mean)**2, 1, keepdim=True) - - x = (x - mean) * torch.rsqrt(variance + self.eps) - - shape = [1, -1] + [1] * (n_dims - 2) - x = x * self.gamma.view(*shape) + self.beta.view(*shape) - return x - - -class FFN(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - - self.conv = nn.Conv1d( - in_channels, out_channels, kernel_size, padding=kernel_size//2) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask=None): - if x_mask is not None: - x = self.conv(x * x_mask) - else: - x = self.conv(x ) - - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - if x_mask is not None: - x=x * x_mask - return x - - -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) - - -class attention_fs2_decoder(nn.Module): - def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,n_heads,attention_ffn_kernel_size,parame): - super().__init__() - self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = RelativeFFTBlock(hidden_channels=n_chans,filter_channels=n_chans*4, n_heads=n_heads, n_layers=n_layers, kernel_size=attention_ffn_kernel_size, p_dropout=dropout_rate) - self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - - - def build_loss(self): - - return fs2_loss() - - def forward(self, x,infer,*args,**kwargs): - x=x.transpose(1, 2) - x=self.inconv(x) - - - x=self.conv(x) - x=self.outconv(x).transpose(1, 2) - if infer: - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - pass diff --git a/modules/aux_decoder/shallow_adapter.py b/modules/aux_decoder/shallow_adapter.py deleted file mode 100644 index 53d47c56a..000000000 --- a/modules/aux_decoder/shallow_adapter.py +++ /dev/null @@ -1,76 +0,0 @@ -import torch -import torch.nn as nn - -cls_map = {'fs2': 'modules.aux_decoder.fast_speech2_decoder.fs2_decode', - 'ns': 'modules.aux_decoder.noise_decoder.noise_decoder', 'ld': 'modules.aux_decoder.light_decoder.noise_decoder','att_fs2':'modules.aux_decoder.fs2_decoder.attention_fs2_decoder' - ,'glow':'modules.aux_decoder.glow.glow_decoder','glow_convnext':'modules.aux_decoder.convnext_glow.glow_decoder_convnext','gglow':'modules.aux_decoder.gglow.glow_decoder','fast_speech2_decoders':'modules.aux_decoder.fast_speech2_decoders.fs2_decode' - } -encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} - - -def build_object_from_class_name(cls_str, parent_cls, strict, *args, **kwargs): - import importlib - - pkg = ".".join(cls_str.split(".")[:-1]) - cls_name = cls_str.split(".")[-1] - cls_type = 
getattr(importlib.import_module(pkg), cls_name) - if parent_cls is not None: - assert issubclass(cls_type, parent_cls), f'| {cls_type} is not subclass of {parent_cls}.' - if strict: - return cls_type(*args, **kwargs) - return cls_type(*args, **filter_kwargs(kwargs, cls_type)) - - -def filter_kwargs(dict_to_filter, kwarg_obj): - import inspect - - sig = inspect.signature(kwarg_obj) - filter_keys = [param.name for param in sig.parameters.values() if param.kind == param.POSITIONAL_OR_KEYWORD] - filtered_dict = {filter_key: dict_to_filter[filter_key] for filter_key in filter_keys if - filter_key in dict_to_filter} - return filtered_dict - - -class shallow_adapt(nn.Module): - def __init__(self, parame, out_dims, vocab_size): - super().__init__() - self.parame = parame - - decodeparame = parame['shallow_diffusion_args']['aux_decoder_args'] - if decodeparame.get('encoder_hidden') is None: - decodeparame['encoder_hidden'] = parame['hidden_size'] - decodeparame['out_dims'] = out_dims - decodeparame['parame'] = parame - - encoderparame = parame['shallow_diffusion_args']['aux_encoder_args'] - encoderparame['parame'] = parame - encoderparame['vocab_size'] = vocab_size - self.decoder = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']], - nn.Module, - parame['shallow_diffusion_args']['aux_decoder_strict_hparams'], - **decodeparame) - - if not parame['shallow_diffusion_args']['aux_share_encoder']: - # todo - self.use_encoder = True - self.encoder = build_object_from_class_name( - encoder_cls_map[parame['shallow_diffusion_args']['aux_encoder_arch']], - nn.Module, - parame['shallow_diffusion_args']['aux_encoder_strict_hparams'], - **encoderparame) - else: - self.use_encoder = False - - def forward(self, condition, infer=False, txt_tokens=None, mel2ph=None, f0=None, - key_shift=None, speed=None, - spk_embed_id=None,gt_mel=None,mask=None, **kwargs): - - if self.use_encoder: - condition = self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed, - spk_embed_id=spk_embed_id, **kwargs) - - return self.decoder(condition, infer,gt_mel,mask) - - def get_loss(self): - return self.decoder.build_loss() diff --git a/modules/toplevel.py b/modules/toplevel.py index 7c00561be..1b917c585 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -6,6 +6,7 @@ from torch import Tensor from basics.base_module import CategorizedModule +from modules.aux_decoder import AuxDecoderAdaptor from modules.commons.common_layers import ( XavierUniformInitLinear as Linear, NormalInitEmbedding as Embedding @@ -17,7 +18,6 @@ from modules.fastspeech.param_adaptor import ParameterAdaptorModule from modules.fastspeech.tts_modules import RhythmRegulator, LengthRegulator from modules.fastspeech.variance_encoder import FastSpeech2Variance -from modules.aux_decoder.shallow_adapter import shallow_adapt from utils.hparams import hparams @@ -27,16 +27,6 @@ def __init__(self, *, aux_out=None, diff_out=None): self.diff_out = diff_out -# TODO: replace the following placeholder with real modules -class ExampleAuxDecoder(nn.Module): - def __init__(self, out_dims): - super().__init__() - self.out_dims = out_dims - - def forward(self, condition, infer=True): - return torch.randn(condition.shape[0], condition.shape[1], self.out_dims, device=condition.device) - - class DiffSingerAcoustic(ParameterAdaptorModule, CategorizedModule): @property def category(self): @@ -54,7 +44,12 @@ def __init__(self, vocab_size, out_dims): self.train_aux_decoder = 
self.shallow_args['train_aux_decoder'] self.train_diffusion = self.shallow_args['train_diffusion'] self.aux_decoder_grad = self.shallow_args['aux_decoder_grad'] - self.aux_decoder = shallow_adapt(hparams, out_dims,vocab_size) + self.aux_decoder = AuxDecoderAdaptor( + in_dims=hparams['hidden_size'], out_dims=out_dims, num_feats=1, + spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], + aux_decoder_arch=self.shallow_args['aux_decoder_arch'], + aux_decoder_args=self.shallow_args['aux_decoder_args'] + ) self.diffusion = GaussianDiffusion( out_dims=out_dims, @@ -81,8 +76,7 @@ def forward( ) if infer: if self.use_shallow_diffusion: - aux_mel_pred = self.aux_decoder(condition, infer=True,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs) + aux_mel_pred = self.aux_decoder(condition, infer=True) aux_mel_pred *= ((mel2ph > 0).float()[:, :, None]) if gt_mel is not None and self.shallow_args['val_gt_start']: src_mel = gt_mel @@ -97,8 +91,7 @@ def forward( if self.use_shallow_diffusion: if self.train_aux_decoder: aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad) - aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id,gt_mel=gt_mel,mask=((mel2ph > 0).float()[:, :, None]), **kwargs) + aux_out = self.aux_decoder(aux_cond, infer=False) else: aux_out = None if self.train_diffusion: diff --git a/training/acoustic_task.py b/training/acoustic_task.py index 6969b5f5d..04dedb65c 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -9,6 +9,7 @@ from basics.base_dataset import BaseDataset from basics.base_task import BaseTask from basics.base_vocoder import BaseVocoder +from modules.aux_decoder import build_aux_loss from modules.losses.diff_loss import DiffusionNoiseLoss from modules.toplevel import DiffSingerAcoustic, ShallowDiffusionOutput from modules.vocoders.registry import get_vocoder_cls @@ -62,9 +63,9 @@ def __init__(self): self.dataset_cls = AcousticDataset self.use_shallow_diffusion = hparams['use_shallow_diffusion'] if self.use_shallow_diffusion: - shallow_args = hparams['shallow_diffusion_args'] - self.train_aux_decoder = shallow_args['train_aux_decoder'] - self.train_diffusion = shallow_args['train_diffusion'] + self.shallow_args = hparams['shallow_diffusion_args'] + self.train_aux_decoder = self.shallow_args['train_aux_decoder'] + self.train_diffusion = self.shallow_args['train_diffusion'] self.use_vocoder = hparams['infer'] or hparams['val_with_vocoder'] if self.use_vocoder: @@ -85,7 +86,7 @@ def build_model(self): # noinspection PyAttributeOutsideInit def build_losses_and_metrics(self): if self.use_shallow_diffusion: - self.aux_mel_loss = self.model.aux_decoder.get_loss() + self.aux_mel_loss = build_aux_loss(self.shallow_args['aux_decoder_arch']) self.lambda_aux_mel_loss = hparams['lambda_aux_mel_loss'] self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type']) @@ -118,7 +119,8 @@ def run_model(self, sample, infer=False): if output.aux_out is not None: aux_out = output.aux_out - aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target) + norm_gt = self.model.aux_decoder.norm_spec(target) + aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, norm_gt) losses['aux_mel_loss'] = aux_mel_loss if output.diff_out is not None: From ef87664e441f1844ae9f1d000751ef57ab603f85 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: 
Thu, 21 Sep 2023 23:54:10 +0800 Subject: [PATCH 30/33] Fix KeyError --- modules/toplevel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/toplevel.py b/modules/toplevel.py index 1b917c585..38cdbe7c9 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -39,7 +39,7 @@ def __init__(self, vocab_size, out_dims): ) self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False) - self.shallow_args = hparams['shallow_diffusion_args'] + self.shallow_args = hparams.get('shallow_diffusion_args', {}) if self.use_shallow_diffusion: self.train_aux_decoder = self.shallow_args['train_aux_decoder'] self.train_diffusion = self.shallow_args['train_diffusion'] From 2986c888084f2e63074e6984fc0257dd047705fe Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 21 Sep 2023 23:54:47 +0800 Subject: [PATCH 31/33] Support exporting shallow diffusion to ONNX --- deployment/exporters/acoustic_exporter.py | 88 ++++++++++++++++------- deployment/modules/diffusion.py | 37 ++++++++-- deployment/modules/toplevel.py | 49 +++++++------ modules/aux_decoder/__init__.py | 8 ++- utils/onnx_helper.py | 42 ++++++----- 5 files changed, 148 insertions(+), 76 deletions(-) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 34cf2a016..ebfd75a10 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -33,12 +33,22 @@ def __init__( self.spk_map: dict = self.build_spk_map() self.vocab = TokenTextEncoder(vocab_list=build_phoneme_list()) self.model = self.build_model() - self.fs2_cache_path = self.cache_dir / 'fs2.onnx' + self.fs2_aux_cache_path = self.cache_dir / ( + 'fs2_aux.onnx' if self.model.use_shallow_diffusion else 'fs2.onnx' + ) self.diffusion_cache_path = self.cache_dir / 'diffusion.onnx' # Attributes for logging self.model_class_name = remove_suffix(self.model.__class__.__name__, 'ONNX') - self.fs2_class_name = remove_suffix(self.model.fs2.__class__.__name__, 'ONNX') + fs2_aux_cls_logging = [remove_suffix(self.model.fs2.__class__.__name__, 'ONNX')] + if self.model.use_shallow_diffusion: + fs2_aux_cls_logging.append(remove_suffix( + self.model.aux_decoder.decoder.__class__.__name__, 'ONNX' + )) + self.fs2_aux_class_name = ', '.join(fs2_aux_cls_logging) + self.aux_decoder_class_name = remove_suffix( + self.model.aux_decoder.decoder.__class__.__name__, 'ONNX' + ) if self.model.use_shallow_diffusion else None self.denoiser_class_name = remove_suffix(self.model.diffusion.denoise_fn.__class__.__name__, 'ONNX') self.diffusion_class_name = remove_suffix(self.model.diffusion.__class__.__name__, 'ONNX') @@ -86,11 +96,11 @@ def export(self, path: Path): def export_model(self, path: Path): self._torch_export_model() - fs2_onnx = self._optimize_fs2_graph(onnx.load(self.fs2_cache_path)) + fs2_aux_onnx = self._optimize_fs2_aux_graph(onnx.load(self.fs2_aux_cache_path)) diffusion_onnx = self._optimize_diffusion_graph(onnx.load(self.diffusion_cache_path)) - model_onnx = self._merge_fs2_diffusion_graphs(fs2_onnx, diffusion_onnx) + model_onnx = self._merge_fs2_aux_diffusion_graphs(fs2_aux_onnx, diffusion_onnx) onnx.save(model_onnx, path) - self.fs2_cache_path.unlink() + self.fs2_aux_cache_path.unlink() self.diffusion_cache_path.unlink() print(f'| export model => {path}') @@ -105,7 +115,7 @@ def export_attachments(self, path: Path): @torch.no_grad() def _torch_export_model(self): - # Prepare inputs for FastSpeech2 tracing + # Prepare inputs for FastSpeech2 and aux decoder tracing n_frames = 10 tokens = 
torch.LongTensor([[1]]).to(self.device) durations = torch.LongTensor([[n_frames]]).to(self.device) @@ -161,22 +171,30 @@ def _torch_export_model(self): 1: 'n_frames' } - # PyTorch ONNX export for FastSpeech2 - print(f'Exporting {self.fs2_class_name}...') + # PyTorch ONNX export for FastSpeech2 and aux decoder + output_names = ['condition'] + if self.model.use_shallow_diffusion: + output_names.append('aux_mel') + dynamix_axes['aux_mel'] = { + 1: 'n_frames' + } + print(f'Exporting {self.fs2_aux_class_name}...') torch.onnx.export( - self.model.view_as_fs2(), + self.model.view_as_fs2_aux(), arguments, - self.fs2_cache_path, + self.fs2_aux_cache_path, input_names=input_names, - output_names=['condition'], + output_names=output_names, dynamic_axes=dynamix_axes, opset_version=15 ) + condition = torch.rand((1, n_frames, hparams['hidden_size']), device=self.device) + # Prepare inputs for denoiser tracing and GaussianDiffusion scripting shape = (1, 1, hparams['audio_num_mel_bins'], n_frames) noise = torch.randn(shape, device=self.device) - condition = torch.rand((1, hparams['hidden_size'], n_frames), device=self.device) + x_start = torch.randn((1, n_frames, hparams['audio_num_mel_bins']),device=self.device) step = (torch.rand((1,), device=self.device) * hparams['K_step']).long() print(f'Tracing {self.denoiser_class_name} denoiser...') @@ -186,20 +204,24 @@ def _torch_export_model(self): ( noise, step, - condition + condition.transpose(1, 2) ) ) print(f'Scripting {self.diffusion_class_name}...') + diffusion_inputs = [ + condition, + *([x_start, 100] if self.model.use_shallow_diffusion else []) + ] diffusion = torch.jit.script( diffusion, example_inputs=[ ( - condition.transpose(1, 2), + *diffusion_inputs, 1 # p_sample branch ), ( - condition.transpose(1, 2), + *diffusion_inputs, 200 # p_sample_plms branch ) ] @@ -210,12 +232,14 @@ def _torch_export_model(self): torch.onnx.export( diffusion, ( - condition.transpose(1, 2), + *diffusion_inputs, 200 ), self.diffusion_cache_path, input_names=[ - 'condition', 'speedup' + 'condition', + *(['x_start', 'depth'] if self.model.use_shallow_diffusion else []), + 'speedup' ], output_names=[ 'mel' @@ -224,6 +248,7 @@ def _torch_export_model(self): 'condition': { 1: 'n_frames' }, + **({'x_start': {1: 'n_frames'}} if self.model.use_shallow_diffusion else {}), 'mel': { 1: 'n_frames' } @@ -252,11 +277,11 @@ def _perform_spk_mix(self, spk_mix: Dict[str, float]): ) # => [1, H] return spk_mix_embed - def _optimize_fs2_graph(self, fs2: onnx.ModelProto) -> onnx.ModelProto: - print(f'Running ONNX Simplifier on {self.fs2_class_name}...') + def _optimize_fs2_aux_graph(self, fs2: onnx.ModelProto) -> onnx.ModelProto: + print(f'Running ONNX Simplifier on {self.fs2_aux_class_name}...') fs2, check = onnxsim.simplify(fs2, include_subgraph=True) assert check, 'Simplified ONNX model could not be validated' - print(f'| optimize graph: {self.fs2_class_name}') + print(f'| optimize graph: {self.fs2_aux_class_name}') return fs2 def _optimize_diffusion_graph(self, diffusion: onnx.ModelProto) -> onnx.ModelProto: @@ -282,18 +307,33 @@ def _optimize_diffusion_graph(self, diffusion: onnx.ModelProto) -> onnx.ModelPro print(f'| optimize graph: {self.diffusion_class_name}') return diffusion - def _merge_fs2_diffusion_graphs(self, fs2: onnx.ModelProto, diffusion: onnx.ModelProto) -> onnx.ModelProto: - onnx_helper.model_add_prefixes(fs2, dim_prefix='fs2.', ignored_pattern=r'(n_tokens)|(n_frames)') + def _merge_fs2_aux_diffusion_graphs(self, fs2: onnx.ModelProto, diffusion: onnx.ModelProto) -> 
onnx.ModelProto: + onnx_helper.model_add_prefixes( + fs2, dim_prefix=('fs2aux.' if self.model.use_shallow_diffusion else 'fs2.'), + ignored_pattern=r'(n_tokens)|(n_frames)' + ) onnx_helper.model_add_prefixes(diffusion, dim_prefix='diffusion.', ignored_pattern='n_frames') - print(f'Merging {self.fs2_class_name} and {self.diffusion_class_name} ' + print(f'Merging {self.fs2_aux_class_name} and {self.diffusion_class_name} ' f'back into {self.model_class_name}...') merged = onnx.compose.merge_models( - fs2, diffusion, io_map=[('condition', 'condition')], + fs2, diffusion, io_map=[ + ('condition', 'condition'), + *([('aux_mel', 'x_start')] if self.model.use_shallow_diffusion else []), + ], prefix1='', prefix2='', doc_string='', producer_name=fs2.producer_name, producer_version=fs2.producer_version, domain=fs2.domain, model_version=fs2.model_version ) merged.graph.name = fs2.graph.name + + print(f'Running ONNX Simplifier on {self.model_class_name}...') + merged, check = onnxsim.simplify( + merged, + include_subgraph=True + ) + assert check, 'Simplified ONNX model could not be validated' + print(f'| optimize graph: {self.model_class_name}') + return merged # noinspection PyMethodMayBeStatic diff --git a/deployment/modules/diffusion.py b/deployment/modules/diffusion.py index 8905bebda..3c139f649 100644 --- a/deployment/modules/diffusion.py +++ b/deployment/modules/diffusion.py @@ -16,6 +16,12 @@ def extract(a, t): # noinspection PyMethodOverriding class GaussianDiffusionONNX(GaussianDiffusion): + def q_sample(self, x_start, t, noise): + return ( + extract(self.sqrt_alphas_cumprod, t) * x_start + + extract(self.sqrt_one_minus_alphas_cumprod, t) * noise + ) + def p_sample(self, x, t, cond): x_pred = self.denoise_fn(x, t, cond) x_recon = ( @@ -74,18 +80,37 @@ def p_sample_plms(self, x_prev, t, interval: int, cond, noise_list: List[Tensor] x_prev = self.plms_get_x_pred(x_prev, noise_pred_prime, t, t_prev) return noise_pred, x_prev + def norm_spec(self, x): + k = (self.spec_max - self.spec_min) / 2. + b = (self.spec_max + self.spec_min) / 2. + return (x - b) / k + def denorm_spec(self, x): - d = (self.spec_max - self.spec_min) / 2. - m = (self.spec_max + self.spec_min) / 2. - return x * d + m + k = (self.spec_max - self.spec_min) / 2. + b = (self.spec_max + self.spec_min) / 2. 
+ return x * k + b - def forward(self, condition, speedup: int): + def forward(self, condition, x_start=None, depth: int = 1000, speedup: int = 1): condition = condition.transpose(1, 2) # [1, T, H] => [1, H, T] device = condition.device n_frames = condition.shape[2] - step_range = torch.arange(0, self.k_step, speedup, dtype=torch.long, device=device).flip(0)[:, None] - x = torch.randn((1, self.num_feats, self.out_dims, n_frames), device=device) + noise = torch.randn((1, self.num_feats, self.out_dims, n_frames), device=device) + if x_start is None: + step_range = torch.arange(0, self.k_step, speedup, dtype=torch.long, device=device).flip(0)[:, None] + x = noise + else: + depth = min(depth, self.k_step) + step_range = torch.arange(0, depth, speedup, dtype=torch.long, device=device).flip(0)[:, None] + x_start = self.norm_spec(x_start).transpose(-2, -1) + if self.num_feats == 1: + x_start = x_start[:, None, :, :] + if depth > 0: + x = self.q_sample( + x_start, torch.full((1,), depth - 1, device=device, dtype=torch.long), noise + ) + else: + x = x_start if speedup > 1: for t in step_range: diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py index 2cbbda8fb..027997218 100644 --- a/deployment/modules/toplevel.py +++ b/deployment/modules/toplevel.py @@ -1,6 +1,6 @@ -import numpy as np import copy +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -37,7 +37,7 @@ def __init__(self, vocab_size, out_dims): spec_max=hparams['spec_max'] ) - def forward_fs2( + def forward_fs2_aux( self, tokens: Tensor, durations: Tensor, @@ -46,41 +46,40 @@ def forward_fs2( gender: Tensor = None, velocity: Tensor = None, spk_embed: Tensor = None - ) -> Tensor: - return self.fs2( + ): + condition = self.fs2( tokens, durations, f0, variances=variances, gender=gender, velocity=velocity, spk_embed=spk_embed ) + if self.use_shallow_diffusion: + aux_mel_pred = self.aux_decoder(condition, infer=True) + return condition, aux_mel_pred + else: + return condition + + def forward_shallow_diffusion( + self, condition: Tensor, x_start: Tensor, + depth: int, speedup: int + ) -> Tensor: + return self.diffusion(condition, x_start=x_start, depth=depth, speedup=speedup) - def forward_diffusion(self, condition: Tensor, speedup: int) -> Tensor: - return self.diffusion(condition, speedup) + def forward_diffusion(self, condition: Tensor, speedup: int): + return self.diffusion(condition, speedup=speedup) - def view_as_fs2(self) -> nn.Module: + def view_as_fs2_aux(self) -> nn.Module: model = copy.deepcopy(self) - try: - del model.variance_embeds - del model.variance_adaptor - except AttributeError: - pass del model.diffusion - model.forward = model.forward_fs2 + model.forward = model.forward_fs2_aux return model - def view_as_adaptor(self) -> nn.Module: - model = copy.deepcopy(self) - del model.fs2 - del model.diffusion - raise NotImplementedError() - def view_as_diffusion(self) -> nn.Module: model = copy.deepcopy(self) del model.fs2 - try: - del model.variance_embeds - del model.variance_adaptor - except AttributeError: - pass - model.forward = model.forward_diffusion + if self.use_shallow_diffusion: + del model.aux_decoder + model.forward = model.forward_shallow_diffusion + else: + model.forward = model.forward_diffusion return model diff --git a/modules/aux_decoder/__init__.py b/modules/aux_decoder/__init__.py index b408e4b7b..54ceb2113 100644 --- a/modules/aux_decoder/__init__.py +++ b/modules/aux_decoder/__init__.py @@ -46,10 +46,14 @@ def __init__(self, in_dims: int, out_dims: 
int, num_feats: int, self.register_buffer('spec_max', spec_max, persistent=False) def norm_spec(self, x): - return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 + k = (self.spec_max - self.spec_min) / 2. + b = (self.spec_max + self.spec_min) / 2. + return (x - b) / k def denorm_spec(self, x): - return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min + k = (self.spec_max - self.spec_min) / 2. + b = (self.spec_max + self.spec_min) / 2. + return x * k + b def forward(self, condition, infer=False): x = self.decoder(condition, infer=infer) # [B, T, F x C] diff --git a/utils/onnx_helper.py b/utils/onnx_helper.py index 9fc3f6fad..bebe97565 100644 --- a/utils/onnx_helper.py +++ b/utils/onnx_helper.py @@ -277,27 +277,31 @@ def _extract_conv_nodes_recursive(subgraph: GraphProto): to_be_removed.append(sub_node) [subgraph.node.remove(_n) for _n in to_be_removed] + toplevel_if_idx = toplevel_if_node = None + # Find the **last** If node in toplevel graph for i, n in enumerate(graph.node): if n.op_type == 'If': - for a in n.attribute: - b = onnx.helper.get_attribute_value(a) - _extract_conv_nodes_recursive(b) - # Insert the extracted nodes before the first 'If' node which carries the main denoising loop. - for key in reversed(node_dict): - alias, node = node_dict[key] - # Rename output of the node. - out_name = node.output[0] - node.output.remove(node.output[0]) - node.output.insert(0, alias) - # Insert node into the main graph. - graph.node.insert(i, node) - # Rename value info of the output. - for v in graph.value_info: - if v.name == out_name: - v.name = alias - break - _verbose(f'| extract conditioner projection: \'{node.name}\'') - break + toplevel_if_idx = i + toplevel_if_node = n + if toplevel_if_node is not None: + for a in toplevel_if_node.attribute: + b = onnx.helper.get_attribute_value(a) + _extract_conv_nodes_recursive(b) + # Insert the extracted nodes before the first 'If' node which carries the main denoising loop. + for key in reversed(node_dict): + alias, node = node_dict[key] + # Rename output of the node. + out_name = node.output[0] + node.output.remove(node.output[0]) + node.output.insert(0, alias) + # Insert node into the main graph. + graph.node.insert(toplevel_if_idx, node) + # Rename value info of the output. 
+ for v in graph.value_info: + if v.name == out_name: + v.name = alias + break + _verbose(f'| extract conditioner projection: \'{node.name}\'') def graph_remove_unused_values(graph: GraphProto): From 1a8fb72db3db0983c1a0e0dfa0be2ecebcdde9e5 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 22 Sep 2023 01:19:32 +0800 Subject: [PATCH 32/33] Add missing logic to ONNX --- deployment/modules/diffusion.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deployment/modules/diffusion.py b/deployment/modules/diffusion.py index 3c139f649..c8a03fe5a 100644 --- a/deployment/modules/diffusion.py +++ b/deployment/modules/diffusion.py @@ -105,7 +105,9 @@ def forward(self, condition, x_start=None, depth: int = 1000, speedup: int = 1): x_start = self.norm_spec(x_start).transpose(-2, -1) if self.num_feats == 1: x_start = x_start[:, None, :, :] - if depth > 0: + if depth >= self.timesteps: + x = noise + elif depth > 0: x = self.q_sample( x_start, torch.full((1,), depth - 1, device=device, dtype=torch.long), noise ) From acf00e483fdc45e080d7b9d4c42f979964da1a4f Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 22 Sep 2023 21:43:03 +0800 Subject: [PATCH 33/33] Rename `diff_depth` to `K_step_infer` --- configs/acoustic.yaml | 2 +- configs/templates/config_acoustic.yaml | 2 +- modules/diffusion/ddpm.py | 2 +- scripts/infer.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 174cd9943..92c7aa7f9 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -78,7 +78,7 @@ schedule_type: 'linear' # shallow diffusion use_shallow_diffusion: false -diff_depth: 400 +K_step_infer: 400 shallow_diffusion_args: train_aux_decoder: true diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 12d0b1dba..72e3c2dfd 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -46,7 +46,7 @@ residual_layers: 20 # shallow diffusion use_shallow_diffusion: false -diff_depth: 400 +K_step_infer: 400 shallow_diffusion_args: train_aux_decoder: true train_diffusion: true diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py index 7c4215bf1..d8bdc4442 100644 --- a/modules/diffusion/ddpm.py +++ b/modules/diffusion/ddpm.py @@ -223,7 +223,7 @@ def p_losses(self, x_start, t, cond, noise=None): return x_recon, noise def inference(self, cond, b=1, x_start=None, device=None): - depth = hparams.get('diff_depth', self.k_step) + depth = hparams.get('K_step_infer', self.k_step) noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device) if self.use_shallow_diffusion: t_max = min(depth, self.k_step) diff --git a/scripts/infer.py b/scripts/infer.py index 9108ff353..8c6e6e835 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -111,9 +111,9 @@ def acoustic( if depth >= 0: assert depth <= hparams['K_step'], f'Diffusion depth should not be larger than K_step {hparams["K_step"]}.' - hparams['diff_depth'] = depth + hparams['K_step_infer'] = depth elif hparams.get('use_shallow_diffusion', False): - depth = hparams['diff_depth'] + depth = hparams['K_step_infer'] else: depth = hparams['K_step'] # gaussian start (full depth diffusion)
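
Taken together, the shallow diffusion patches change how inference picks its starting point: instead of always denoising from pure Gaussian noise, the aux decoder's rough mel prediction can be diffused forward by the configured depth (K_step_infer, renamed from diff_depth in PATCH 33) and handed to the reverse process. Below is a minimal sketch of that branching, following the q_sample and forward logic added to deployment/modules/diffusion.py; the wrapper name is hypothetical, and normalization plus axis reshaping of the aux mel are omitted for brevity:

    import torch

    def choose_diffusion_start(diffusion, aux_mel, depth: int):
        # aux_mel: aux decoder output, already normalized to the model's
        # spec range and laid out as [B, 1, M, T] like the denoiser input.
        noise = torch.randn_like(aux_mel)
        if depth >= diffusion.timesteps:
            # Full-depth diffusion degenerates to the classic Gaussian start.
            return noise
        if depth <= 0:
            # Depth 0 keeps the aux prediction untouched.
            return aux_mel
        # Diffuse the aux prediction forward to step (depth - 1); the reverse
        # process then only needs to denoise the remaining `depth` steps.
        t = torch.full((aux_mel.shape[0],), depth - 1,
                       dtype=torch.long, device=aux_mel.device)
        return diffusion.q_sample(aux_mel, t, noise)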
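
The refactor in PATCH 29 replaces the string-keyed class map of shallow_adapter.py with an explicit registry plus a normalization-aware adaptor. Here is a hypothetical usage of that API; AuxDecoderAdaptor and the aux_decoder_args keys come from modules/aux_decoder/__init__.py and the config template, while the hidden size, mel dimension, sequence length, and spec range below are illustrative assumptions:

    import torch
    from modules.aux_decoder import AuxDecoderAdaptor

    adaptor = AuxDecoderAdaptor(
        in_dims=256, out_dims=128, num_feats=1,       # assumed hidden/mel sizes
        spec_min=[-5.0] * 128, spec_max=[0.0] * 128,  # assumed mel value range
        aux_decoder_arch='convnext',
        aux_decoder_args={
            'num_channels': 512,   # defaults from config_acoustic.yaml
            'num_layers': 6,
            'kernel_size': 7,
            'dropout_rate': 0.1,
        },
    )
    condition = torch.randn(1, 100, 256)      # [B, T, H] encoder output
    aux_mel = adaptor(condition, infer=True)  # denormalized mel, [B, T, M]

During training the adaptor skips denorm_spec, which is why acoustic_task.py now normalizes the ground truth with norm_spec(target) before computing the L1 aux loss.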
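
The k/b normalization introduced in these patches appears in AuxDecoderAdaptor, in GaussianDiffusionONNX, and implicitly in the training loss; keeping the three in sync is what lets the aux mel seed q_sample directly. A quick worked check that the form is exactly invertible (the range values here are examples):

    import torch

    spec_min, spec_max = -5.0, 0.0  # example range; real values come from hparams
    k = (spec_max - spec_min) / 2.  # 2.5
    b = (spec_max + spec_min) / 2.  # -2.5
    x = torch.rand(4) * (spec_max - spec_min) + spec_min
    normed = (x - b) / k                      # norm_spec: [spec_min, spec_max] -> [-1, 1]
    assert torch.allclose(normed * k + b, x)  # denorm_spec restores the original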
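
Finally, a rough sketch of the graph merge performed by _merge_fs2_aux_diffusion_graphs when shallow diffusion is enabled. The real exporter first prefixes dimension names via onnx_helper.model_add_prefixes and passes extra metadata kwargs; both are omitted here, and the file names are placeholders:

    import onnx
    import onnx.compose

    fs2_aux = onnx.load('fs2_aux.onnx')      # outputs: condition, aux_mel
    diffusion = onnx.load('diffusion.onnx')  # inputs: condition, x_start, speedup, ...

    merged = onnx.compose.merge_models(
        fs2_aux, diffusion,
        io_map=[
            ('condition', 'condition'),  # encoder condition feeds the denoiser
            ('aux_mel', 'x_start'),      # aux mel seeds the shallow start
        ],
        prefix1='', prefix2='',
    )
    onnx.save(merged, 'acoustic.onnx')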