From 7da2d65d36fc3aafcce6e225e2dc9af4beb488eb Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Sun, 6 Aug 2023 19:22:24 +0800
Subject: [PATCH 01/33] Add shallow diffusion API

---
 configs/acoustic.yaml     | 2 ++
 modules/diffusion/ddpm.py | 9 +++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 88ae1b12b..ec98f065a 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -66,6 +66,8 @@ K_step: 1000
 timesteps: 1000
 max_beta: 0.02
 rel_pos: true
+use_shallow_diffusion: false
+diff_depth: 400
 diff_accelerator: ddim
 pndm_speedup: 10
 hidden_size: 256
diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 46c3eaccb..d17070cb8 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -216,7 +216,10 @@ def p_losses(self, x_start, t, cond, noise=None):
 
         return x_recon, noise
 
-    def inference(self, cond, b=1, device=None):
+    def inference(self, cond, b=1, src_spec=None, device=None):
+        depth = hparams.get('diff_depth', self.k_step)
+        # TODO: implement shallow diffusion
+        t = self.k_step
         shape = (b, self.num_feats, self.out_dims, cond.shape[2])
         x = torch.randn(shape, device=device)
@@ -329,7 +332,7 @@ def wrapped(x, t, **kwargs):
         x = x.transpose(2, 3).squeeze(1)  # [B, F, M, T] => [B, T, M] or [B, F, T, M]
         return x
 
-    def forward(self, condition, gt_spec=None, infer=True):
+    def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
         """
             conditioning diffusion, use fastspeech2 encoder output as the condition
         """
@@ -344,6 +347,8 @@ def forward(self, condition, gt_spec=None, infer=True):
             t = torch.randint(0, self.k_step, (b,), device=device).long()
             return self.p_losses(spec, t, cond=cond)
         else:
+            # src_spec: [B, T, M]
+            # TODO: implement shallow diffusion
             x = self.inference(cond, b=b, device=device)
             return self.denorm_spec(x)

From 4f3d765d2a9f6861d46a640256ff81e3158e9f58 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Sun, 6 Aug 2023 21:32:27 +0800
Subject: [PATCH 02/33] Support aux decoder training

---
 configs/acoustic.yaml     |  9 +++++++
 modules/toplevel.py       | 41 ++++++++++++++++++++++++++++---
 training/acoustic_task.py | 51 +++++++++++++++++++++++++++------------
 3 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index ec98f065a..9913429d8 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -78,6 +78,15 @@ diff_decoder_type: 'wavenet'
 diff_loss_type: l2
 schedule_type: 'linear'
 
+shallow_diffusion_args:
+  train_aux_decoder: true
+  train_diffusion: true
+  shared_encoder: true
+  aux_decoder_arch: ps
+  aux_decoder_args:
+    arch: ps
+    # kernel_size: xxx
+
 # train and eval
 num_sanity_val_steps: 1
 optimizer_args:
diff --git a/modules/toplevel.py b/modules/toplevel.py
index a93ed1e34..41bdee28b 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -20,6 +20,22 @@
 from utils.hparams import hparams
 
 
+class ShallowDiffusionOutput:
+    def __init__(self, *, aux_out=None, diff_out=None):
+        self.aux_out = aux_out
+        self.diff_out = diff_out
+
+
+# TODO: replace the following placeholder with real modules
+class ExampleAuxDecoder(nn.Module):
+    def __init__(self, out_dims):
+        super().__init__()
+        self.out_dims = out_dims
+
+    def forward(self, condition, infer=True):
+        return torch.randn(condition.shape[0], condition.shape[1], self.out_dims, device=condition.device)
+
+
 class DiffSingerAcoustic(ParameterAdaptorModule, CategorizedModule):
     @property
     def category(self):
         return 'acoustic'
@@ -31,6 +47,13 @@ def __init__(self, vocab_size, out_dims):
             vocab_size=vocab_size
         )
+        self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
+        if self.use_shallow_diffusion:
+            # TODO: replace the following placeholder with real modules
+            self.aux_decoder = ExampleAuxDecoder(
+                out_dims=out_dims
+            )
+
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
             num_feats=1,
@@ -49,19 +72,29 @@ def __init__(self, vocab_size, out_dims):
     def forward(
             self, txt_tokens, mel2ph, f0, key_shift=None, speed=None,
             spk_embed_id=None, gt_mel=None, infer=True, **kwargs
-    ):
+    ) -> ShallowDiffusionOutput:
         condition = self.fs2(
             txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed,
             spk_embed_id=spk_embed_id, **kwargs
         )
         if infer:
-            mel_pred = self.diffusion(condition, infer=True)
+            if self.use_shallow_diffusion:
+                aux_mel_pred = self.aux_decoder(condition, infer=True)
+                aux_mel_pred *= ((mel2ph > 0).float()[:, :, None])
+            else:
+                aux_mel_pred = None
+            mel_pred = self.diffusion(condition, src_spec=aux_mel_pred, infer=True)
             mel_pred *= ((mel2ph > 0).float()[:, :, None])
-            return mel_pred
+            return ShallowDiffusionOutput(aux_out=aux_mel_pred, diff_out=mel_pred)
         else:
+            if self.use_shallow_diffusion:
+                # TODO: replace the following placeholder with real calling code
+                aux_out = self.aux_decoder(condition, infer=False)
+            else:
+                aux_out = None
             x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
-            return x_recon, noise
+            return ShallowDiffusionOutput(aux_out=aux_out, diff_out=(x_recon, noise))
 
 
 class DiffSingerVariance(ParameterAdaptorModule, CategorizedModule):
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index b0723912b..e34c774be 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -10,7 +10,7 @@
 from basics.base_task import BaseTask
 from basics.base_vocoder import BaseVocoder
 from modules.losses.diff_loss import DiffusionNoiseLoss
-from modules.toplevel import DiffSingerAcoustic
+from modules.toplevel import DiffSingerAcoustic, ShallowDiffusionOutput
 from modules.vocoders.registry import get_vocoder_cls
 from utils.hparams import hparams
 from utils.plot import spec_to_figure, curve_to_figure
@@ -60,6 +60,7 @@ class AcousticTask(BaseTask):
     def __init__(self):
         super().__init__()
         self.dataset_cls = AcousticDataset
+        self.use_shallow_diffusion = hparams['use_shallow_diffusion']
         self.use_vocoder = hparams['infer'] or hparams['val_with_vocoder']
         if self.use_vocoder:
             self.vocoder: BaseVocoder = get_vocoder_cls(hparams)()
@@ -78,6 +79,9 @@ def build_model(self):
 
     # noinspection PyAttributeOutsideInit
     def build_losses_and_metrics(self):
+        if self.use_shallow_diffusion:
+            # TODO: replace the following placeholder with real loss creation
+            self.aux_mel_loss = torch.nn.L1Loss()
         self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type'])
 
     def run_model(self, sample, infer=False):
@@ -96,20 +100,24 @@ def run_model(self, sample, infer=False):
             spk_embed_id = sample['spk_ids']
         else:
             spk_embed_id = None
-        output = self.model(
+        output: ShallowDiffusionOutput = self.model(
             txt_tokens, mel2ph=mel2ph, f0=f0, **variances,
             key_shift=key_shift, speed=speed,
             spk_embed_id=spk_embed_id, gt_mel=target, infer=infer
         )
 
         if infer:
-            return output  # mel_pred
+            return output
         else:
-            x_recon, x_noise = output
+            losses = {}
+            if self.use_shallow_diffusion:
+                aux_out = output.aux_out
+                # TODO: replace the following placeholder with real loss calculation
+                aux_mel_loss = self.aux_mel_loss(aux_out, target)
+                losses['aux_mel_loss'] = aux_mel_loss
+            x_recon, x_noise = output.diff_out
             mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
-            losses = {
-                'mel_loss': mel_loss
-            }
+            losses['mel_loss'] = mel_loss
 
             return losses
 
@@ -126,29 +134,42 @@ def _validation_step(self, sample, batch_idx):
 
         if batch_idx < hparams['num_valid_plots'] \
                 and (self.trainer.distributed_sampler_kwargs or {}).get('rank', 0) == 0:
-            mel_pred = self.run_model(sample, infer=True)
+            mel_out: ShallowDiffusionOutput = self.run_model(sample, infer=True)
             if self.use_vocoder:
-                self.plot_wav(batch_idx, sample['mel'], mel_pred, f0=sample['f0'])
-            self.plot_mel(batch_idx, sample['mel'], mel_pred, name=f'diffmel_{batch_idx}')
+                self.plot_wav(
+                    batch_idx, gt_mel=sample['mel'],
+                    aux_mel=mel_out.aux_out, diff_mel=mel_out.diff_out,
+                    f0=sample['f0']
+                )
+            self.plot_mel(batch_idx, sample['mel'], mel_out.aux_out, name=f'auxmel_{batch_idx}')
+            self.plot_mel(batch_idx, sample['mel'], mel_out.diff_out, name=f'diffmel_{batch_idx}')
 
         return losses, sample['size']
 
     ############
     # validation plots
     ############
-    def plot_wav(self, batch_idx, gt_mel, pred_mel, f0=None):
+    def plot_wav(self, batch_idx, gt_mel, aux_mel=None, diff_mel=None, f0=None):
         gt_mel = gt_mel[0].cpu().numpy()
-        pred_mel = pred_mel[0].cpu().numpy()
+        if aux_mel is not None:
+            aux_mel = aux_mel[0].cpu().numpy()
+        if diff_mel is not None:
+            diff_mel = diff_mel[0].cpu().numpy()
         f0 = f0[0].cpu().numpy()
         if batch_idx not in self.logged_gt_wav:
             gt_wav = self.vocoder.spec2wav(gt_mel, f0=f0)
             self.logger.experiment.add_audio(f'gt_{batch_idx}', gt_wav, sample_rate=hparams['audio_sample_rate'],
                                              global_step=self.global_step)
             self.logged_gt_wav.add(batch_idx)
-        pred_wav = self.vocoder.spec2wav(pred_mel, f0=f0)
-        self.logger.experiment.add_audio(f'pred_{batch_idx}', pred_wav, sample_rate=hparams['audio_sample_rate'],
-                                         global_step=self.global_step)
+        if aux_mel is not None:
+            aux_wav = self.vocoder.spec2wav(aux_mel, f0=f0)
+            self.logger.experiment.add_audio(f'aux_{batch_idx}', aux_wav, sample_rate=hparams['audio_sample_rate'],
+                                             global_step=self.global_step)
+        if diff_mel is not None:
+            diff_wav = self.vocoder.spec2wav(diff_mel, f0=f0)
+            self.logger.experiment.add_audio(f'diff_{batch_idx}', diff_wav, sample_rate=hparams['audio_sample_rate'],
+                                             global_step=self.global_step)
 
     def plot_mel(self, batch_idx, spec, spec_out, name=None):
         name = f'mel_{batch_idx}' if name is None else name

From 2380f88642e8fe53d230cd55c770030f6c046944 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Sun, 6 Aug 2023 21:51:51 +0800
Subject: [PATCH 03/33] Support shallow diffusion inference

---
 inference/ds_acoustic.py | 6 +++---
 scripts/infer.py         | 9 ++++++++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py
index b37727dad..b3254046e 100644
--- a/inference/ds_acoustic.py
+++ b/inference/ds_acoustic.py
@@ -11,7 +11,7 @@
 from basics.base_svs_infer import BaseSVSInfer
 from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST
 from modules.fastspeech.tts_modules import LengthRegulator
-from modules.toplevel import DiffSingerAcoustic
+from modules.toplevel import DiffSingerAcoustic, ShallowDiffusionOutput
 from modules.vocoders.registry import VOCODERS
 from utils import load_ckpt
 from utils.hparams import hparams
@@ -170,12 +170,12 @@ def forward_model(self, sample):
             )  # => [B, T, H]
         else:
             spk_mix_embed = None
-        mel_pred = self.model(
+        mel_pred: ShallowDiffusionOutput = self.model(
             txt_tokens, mel2ph=sample['mel2ph'], f0=sample['f0'], **variances,
             key_shift=sample.get('key_shift'), speed=sample.get('speed'),
             spk_mix_embed=spk_mix_embed, infer=True
         )
-        return mel_pred
+        return mel_pred.diff_out
 
     @torch.no_grad()
     def run_vocoder(self, spec, **kwargs):
diff --git a/scripts/infer.py b/scripts/infer.py
index 00389c22b..c53c7f81f 100644
--- a/scripts/infer.py
+++ b/scripts/infer.py
@@ -47,6 +47,7 @@ def main():
 @click.option('--key', type=int, required=False, default=0, help='Key transition of pitch')
 @click.option('--gender', type=float, required=False, help='Formant shifting (gender control)')
 @click.option('--seed', type=int, required=False, default=-1, help='Random seed of the inference')
+@click.option('--depth', type=int, required=False, default=-1, help='Shallow diffusion depth')
 @click.option('--speedup', type=int, required=False, default=0, help='Diffusion acceleration ratio')
 @click.option('--mel', is_flag=True, help='Save intermediate mel format instead of waveform')
 def acoustic(
@@ -60,6 +61,7 @@ def acoustic(
         key: int,
         gender: float,
         seed: int,
+        depth: int,
         speedup: int,
         mel: bool
 ):
@@ -107,8 +109,13 @@ def acoustic(
             f'Vocoder ckpt \'{hparams["vocoder_ckpt"]}\' not found. ' \
             f'Please put it to the checkpoints directory to run inference.'
 
+    if depth >= 0:
+        assert depth <= hparams['K_step'], f'Diffusion depth should not be larger than K_step {hparams["K_step"]}.'
+    else:
+        depth = hparams['K_step']  # gaussian start (full depth diffusion)
+
     if speedup > 0:
-        assert hparams['K_step'] % speedup == 0, f'Acceleration ratio must be factor of K_step {hparams["K_step"]}.'
+        assert depth % speedup == 0, f'Acceleration ratio must be factor of diffusion depth {depth}.'
         hparams['pndm_speedup'] = speedup
 
     spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None

From e3863485a4541f4ecae6e94c62a1d1c844d0e75e Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 11:38:35 +0800
Subject: [PATCH 04/33] add shallow framework

---
 configs/acoustic.yaml                   |  7 ++-
 modules/shallow/__init__.py             |  0
 modules/shallow/fast_speech2_decoder.py | 82 +++++++++++++++++++++++++
 modules/shallow/shallow_adapter.py      | 53 ++++++++++++++++
 modules/toplevel.py                     |  7 +--
 training/acoustic_task.py               |  4 +-
 6 files changed, 144 insertions(+), 9 deletions(-)
 create mode 100644 modules/shallow/__init__.py
 create mode 100644 modules/shallow/fast_speech2_decoder.py
 create mode 100644 modules/shallow/shallow_adapter.py

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 9913429d8..e9d333b07 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -81,10 +81,11 @@ schedule_type: 'linear'
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
-  shared_encoder: true
-  aux_decoder_arch: ps
+  aux_decoder_arch: fs2
+  aux_decode_strict_hparams: true
   aux_decoder_args:
-    arch: ps
+    shared_encoder: true
+#    arch: ps
     # kernel_size: xxx
 
 # train and eval
diff --git a/modules/shallow/__init__.py b/modules/shallow/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
new file mode 100644
index 000000000..5bd20c1f6
--- /dev/null
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -0,0 +1,82 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+
+
+
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+        drop_path (float, optional): Drop path rate (currently kept as nn.Identity). Defaults to 0.0.
+        drop_out (float, optional): Dropout rate applied after the pointwise convolutions. Defaults to 0.0.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            intermediate_dim: int,
+            layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0
+    ):
+        super().__init__()
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
+
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value is not None and layer_scale_init_value > 0
+            else None
+        )
+        # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.drop_path = nn.Identity()
+        self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity()
+
+    def forward(self, x: torch.Tensor, ) -> torch.Tensor:
+        residual = x
+        x = self.dwconv(x)
+        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
+
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)
+        x = self.dropout(x)
+
+        x = residual + self.drop_path(x)
+        return x
+
+
+class fs2_decode(nn.Module):
+    def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers):
+        super().__init__()
+        self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
+        self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)])
+        self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
+
+    def losses(self,x,gt):
+        return nn.L1Loss()(x,gt)
+
+    def forward(self, x):
+        x=x.transpose(1, 2)
+        x=self.inconv(x)
+        for i in self.conv:
+            x=i(x)
+        x=self.outconv(x).transpose(1, 2)
+        return x
+    pass
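[editor's note — illustrative sketch, not part of the patch series. The decoder above maps the FastSpeech2 encoder output directly to a coarse mel spectrogram; a minimal usage sketch of its shape contract, where all hyper-parameter values (256, 128, 512, ...) are made up for illustration:

    import torch
    decoder = fs2_decode(encoder_hidden=256, out_dims=128, n_chans=512,
                         kernel_size=3, dropout_rate=0.1, n_layers=4)
    condition = torch.randn(2, 100, 256)   # [B, T, H] encoder output
    coarse_mel = decoder(condition)        # [B, T, M] first-stage mel prediction
    assert coarse_mel.shape == (2, 100, 128)
]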
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
new file mode 100644
index 000000000..0e3a5526d
--- /dev/null
+++ b/modules/shallow/shallow_adapter.py
@@ -0,0 +1,53 @@
+import torch
+import torch.nn as nn
+
+cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode'}
+
+
+def build_object_from_class_name(cls_str, parent_cls, strict, *args, **kwargs):
+    import importlib
+
+    pkg = ".".join(cls_str.split(".")[:-1])
+    cls_name = cls_str.split(".")[-1]
+    cls_type = getattr(importlib.import_module(pkg), cls_name)
+    if parent_cls is not None:
+        assert issubclass(cls_type, parent_cls), f'| {cls_type} is not subclass of {parent_cls}.'
+    if strict:
+        return cls_type(*args, **kwargs)
+    return cls_type(*args, **filter_kwargs(kwargs, cls_type))
+
+
+def filter_kwargs(dict_to_filter, kwarg_obj):
+    import inspect
+
+    sig = inspect.signature(kwarg_obj)
+    filter_keys = [param.name for param in sig.parameters.values() if param.kind == param.POSITIONAL_OR_KEYWORD]
+    filtered_dict = {filter_key: dict_to_filter[filter_key] for filter_key in filter_keys if
+                     filter_key in dict_to_filter}
+    return filtered_dict
+
+
+class shallow_adapt(nn.Module):
+    def __init__(self, parame, out_dims):
+        super().__init__()
+        self.parame = parame
+
+        decodeparame=parame['shallow_diffusion_args']['aux_decoder_args']
+        decodeparame[ 'encoder_hidden'] = parame['hidden_size']
+        decodeparame['out_dims'] = out_dims
+
+        self.model = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']],
+                                                  nn.Module,
+                                                  parame['shallow_diffusion_args']['aux_decode_strict_hparams'],
+                                                  **decodeparame)
+        pass
+
+    def forward(self, condition,gt_spec =None, infer=False):
+        if infer:
+            return self.model(condition)
+        else:
+            return self.model.losses(self.model(condition),gt_spec)
+
+        pass
+
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 41bdee28b..fd7af5c6c 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -17,6 +17,7 @@
 from modules.fastspeech.param_adaptor import ParameterAdaptorModule
 from modules.fastspeech.tts_modules import RhythmRegulator, LengthRegulator
 from modules.fastspeech.variance_encoder import FastSpeech2Variance
+from modules.shallow.shallow_adapter import shallow_adapt
 from utils.hparams import hparams
 
 
@@ -50,9 +51,7 @@ def __init__(self, vocab_size, out_dims):
         self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
         if self.use_shallow_diffusion:
             # TODO: replace the following placeholder with real modules
-            self.aux_decoder = ExampleAuxDecoder(
-                out_dims=out_dims
-            )
+            self.aux_decoder = shallow_adapt(hparams, out_dims)
 
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
@@ -89,7 +89,7 @@ def forward(
         else:
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
-                aux_out = self.aux_decoder(condition, infer=False)
+                aux_out = self.aux_decoder(condition, gt_spec=gt_mel, infer=False)
             else:
                 aux_out = None
             x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index e34c774be..11ee5514c 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -113,8 +113,8 @@ def run_model(self, sample, infer=False):
             if self.use_shallow_diffusion:
                 aux_out = output.aux_out
                 # TODO: replace the following placeholder with real loss calculation
-                aux_mel_loss = self.aux_mel_loss(aux_out, target)
-                losses['aux_mel_loss'] = aux_mel_loss
+                # aux_mel_loss = self.aux_mel_loss(aux_out, target)
+                losses['aux_mel_loss'] = aux_out
             x_recon, x_noise = output.diff_out
             mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
             losses['mel_loss'] = mel_loss

From 6ad8fd2d740218cc6945a2d4e7336368e6e1354a Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 12:33:14 +0800
Subject: [PATCH 05/33] add shallow framework

---
 modules/shallow/fast_speech2_decoder.py |  6 ++++--
 modules/shallow/shallow_adapter.py      | 13 ++++++-------
 modules/toplevel.py                     |  2 +-
 training/acoustic_task.py               |  7 ++++---
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
index 5bd20c1f6..ab45cb4ce 100644
--- a/modules/shallow/fast_speech2_decoder.py
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -69,8 +69,10 @@ def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_lay
         self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)])
         self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
 
-    def losses(self,x,gt):
-        return nn.L1Loss()(x,gt)
+
+
+    def build_loss(self):
+        return nn.L1Loss()
 
     def forward(self, x):
         x=x.transpose(1, 2)
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
index 0e3a5526d..d98623894 100644
--- a/modules/shallow/shallow_adapter.py
+++ b/modules/shallow/shallow_adapter.py
@@ -40,14 +40,13 @@ def __init__(self, parame, out_dims):
                                                   nn.Module,
                                                   parame['shallow_diffusion_args']['aux_decode_strict_hparams'],
                                                   **decodeparame)
-        pass
 
-    def forward(self, condition,gt_spec =None, infer=False):
-        if infer:
-            return self.model(condition)
-        else:
-            return self.model.losses(self.model(condition),gt_spec)
 
-        pass
+    def forward(self, condition, infer=False):
+
+        return self.model(condition)
+
+    def get_loss(self):
+        return self.model.build_loss()
diff --git a/modules/toplevel.py b/modules/toplevel.py
index fd7af5c6c..f582d8b7a 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -89,7 +89,7 @@ def forward(
         else:
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
-                aux_out = self.aux_decoder(condition, gt_spec=gt_mel, infer=False)
+                aux_out = self.aux_decoder(condition, infer=False)
             else:
                 aux_out = None
             x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index 11ee5514c..384fe147b 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -81,7 +81,8 @@ def build_model(self):
     def build_losses_and_metrics(self):
         if self.use_shallow_diffusion:
             # TODO: replace the following placeholder with real loss creation
-            self.aux_mel_loss = torch.nn.L1Loss()
+            self.aux_mel_loss =self.model.aux_decoder.get_loss()
+
         self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type'])
 
@@ -113,8 +114,8 @@ def run_model(self, sample, infer=False):
         if self.use_shallow_diffusion:
             aux_out = output.aux_out
             # TODO: replace the following placeholder with real loss calculation
-            # aux_mel_loss = self.aux_mel_loss(aux_out, target)
-            losses['aux_mel_loss'] = aux_out
+            aux_mel_loss = self.aux_mel_loss(aux_out, target)
+            losses['aux_mel_loss'] = aux_mel_loss
             x_recon, x_noise = output.diff_out
             mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
             losses['mel_loss'] = mel_loss

From 6d936106271a7add818a5ce45c3c4c778d4cfae8 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Mon, 7 Aug 2023 14:52:44 +0800
Subject: [PATCH 06/33] Support lambda for aux mel loss

---
 configs/acoustic.yaml     | 5 ++---
 training/acoustic_task.py | 5 +++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index e9d333b07..70b2b7943 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -78,15 +78,14 @@ diff_decoder_type: 'wavenet'
 diff_loss_type: l2
 schedule_type: 'linear'
 
+# shallow diffusion
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
   aux_decoder_arch: fs2
   aux_decode_strict_hparams: true
   aux_decoder_args:
-    shared_encoder: true
-#    arch: ps
-    # kernel_size: xxx
+lambda_aux_mel_loss: 1.0
 
 # train and eval
 num_sanity_val_steps: 1
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index 384fe147b..e1116d4d0 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -81,7 +81,8 @@ def build_model(self):
     def build_losses_and_metrics(self):
         if self.use_shallow_diffusion:
             # TODO: replace the following placeholder with real loss creation
-            self.aux_mel_loss =self.model.aux_decoder.get_loss()
+            self.aux_mel_loss = self.model.aux_decoder.get_loss()
+            self.lambda_aux_mel_loss = hparams['lambda_aux_mel_loss']
         self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type'])
 
@@ -114,7 +115,7 @@ def run_model(self, sample, infer=False):
         if self.use_shallow_diffusion:
             aux_out = output.aux_out
             # TODO: replace the following placeholder with real loss calculation
-            aux_mel_loss = self.aux_mel_loss(aux_out, target)
+            aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target)
             losses['aux_mel_loss'] = aux_mel_loss

From b16c066c4364c6a18d9fffc1587ddc09a13efc53 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Mon, 7 Aug 2023 14:56:36 +0800
Subject: [PATCH 07/33] Move config key

---
 configs/acoustic.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 70b2b7943..64ca4ad0f 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -66,8 +66,6 @@ K_step: 1000
 timesteps: 1000
 max_beta: 0.02
 rel_pos: true
-use_shallow_diffusion: false
-diff_depth: 400
 diff_accelerator: ddim
 pndm_speedup: 10
 hidden_size: 256
@@ -79,6 +77,8 @@ diff_loss_type: l2
 schedule_type: 'linear'
 
 # shallow diffusion
+use_shallow_diffusion: false
+diff_depth: 400
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true

From 8f3a6228c07506aa06d0589d967f1119dff3604f Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 15:33:04 +0800
Subject: [PATCH 08/33] add shallow framework

---
 training/acoustic_task.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index 384fe147b..c5ae633ef 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -114,7 +114,8 @@ def run_model(self, sample, infer=False):
         if self.use_shallow_diffusion:
             aux_out = output.aux_out
             # TODO: replace the following placeholder with real loss calculation
-            aux_mel_loss = self.aux_mel_loss(aux_out, target)
+
+            aux_mel_loss = self.aux_mel_loss(aux_out, (target - (-5)) / (0 - (-5)) * 2 - 1)
             losses['aux_mel_loss'] = aux_mel_loss

From ac29eeb231ba4ceec390a24b4d80d91118563b94 Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 20:01:39 +0800
Subject: [PATCH 09/33] add shallow framework

---
 configs/acoustic.yaml                   |  2 +-
 modules/shallow/fast_speech2_decoder.py | 16 ++++++++++++++--
 modules/shallow/shallow_adapter.py      |  2 +-
 training/acoustic_task.py               |  2 +-
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 64ca4ad0f..c08b35468 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -84,7 +84,7 @@ shallow_diffusion_args:
   train_diffusion: true
   aux_decoder_arch: fs2
   aux_decode_strict_hparams: true
-  aux_decoder_args:
+  aux_decoder_args: {}
 lambda_aux_mel_loss: 1.0
 
 # train and eval
diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
index ab45cb4ce..3c9e15350 100644
--- a/modules/shallow/fast_speech2_decoder.py
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -62,6 +62,15 @@ def forward(self, x: torch.Tensor, ) -> torch.Tensor:
         return x
 
 
+class fs2_loss(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self,y, x):
+        x=(x - (-5)) / (0 - (-5)) * 2 - 1
+        return nn.L1Loss()(y,x)
+
+
 class fs2_decode(nn.Module):
     def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers):
         super().__init__()
@@ -72,13 +81,16 @@ def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_lay
 
 
     def build_loss(self):
-        return nn.L1Loss()
 
-    def forward(self, x):
+        return fs2_loss()
+
+    def forward(self, x,infer):
         x=x.transpose(1, 2)
         x=self.inconv(x)
         for i in self.conv:
             x=i(x)
         x=self.outconv(x).transpose(1, 2)
+        if infer:
+            (x + 1) / 2 * (0 - (-5)) + (-5)
         return x
     pass
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
index d98623894..981e1d517 100644
--- a/modules/shallow/shallow_adapter.py
+++ b/modules/shallow/shallow_adapter.py
@@ -44,7 +44,7 @@ def __init__(self, parame, out_dims):
 
     def forward(self, condition, infer=False):
 
-        return self.model(condition)
+        return self.model(condition,infer)
 
     def get_loss(self):
         return self.model.build_loss()
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index f3742a79b..ae4815d0a 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -116,7 +116,7 @@ def run_model(self, sample, infer=False):
             aux_out = output.aux_out
             # TODO: replace the following placeholder with real loss calculation
 
-            aux_mel_loss = self.aux_mel_loss(aux_out, (target - (-5)) / (0 - (-5)) * 2 - 1)
+            aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target )
             losses['aux_mel_loss'] = aux_mel_loss
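[editor's note — the hard-coded constants above are the mel normalization range (spec_min = -5, spec_max = 0), matching the norm_spec/denorm_spec convention in ddpm.py. fs2_loss maps the target from [-5, 0] into [-1, 1] so it is compared in the domain the decoder emits, and forward() inverts the mapping at inference time. A worked check of the round trip:

    x_norm = (x - (-5)) / (0 - (-5)) * 2 - 1      # -5 -> -1, -2.5 -> 0, 0 -> +1
    x_back = (x_norm + 1) / 2 * (0 - (-5)) + (-5) # recovers x for any x in [-5, 0]
]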
From 5c687ebb7470288cd4b7844363536f2aefffc3e1 Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 20:16:00 +0800
Subject: [PATCH 10/33] Add denorm

---
 modules/shallow/fast_speech2_decoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
index 3c9e15350..539232150 100644
--- a/modules/shallow/fast_speech2_decoder.py
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -91,6 +91,6 @@ def forward(self, x,infer):
             x=i(x)
         x=self.outconv(x).transpose(1, 2)
         if infer:
-            (x + 1) / 2 * (0 - (-5)) + (-5)
+            x=(x + 1) / 2 * (0 - (-5)) + (-5)
         return x
     pass

From a47f9ae4979a904c94be1dabfd0b1076d22f2f55 Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Mon, 7 Aug 2023 22:21:57 +0800
Subject: [PATCH 11/33] add shallow model training switch

---
 modules/shallow/fast_speech2_decoder.py |  2 +-
 modules/shallow/shallow_adapter.py      |  1 +
 modules/toplevel.py                     | 18 +++++++++++++++---
 training/acoustic_task.py               | 20 ++++++++++++++++----
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py
index 539232150..50774e6da 100644
--- a/modules/shallow/fast_speech2_decoder.py
+++ b/modules/shallow/fast_speech2_decoder.py
@@ -72,7 +72,7 @@ def forward(self,y, x):
 
 
 class fs2_decode(nn.Module):
-    def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers):
+    def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame):
         super().__init__()
         self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
         self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)])
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
index 981e1d517..48bb13c84 100644
--- a/modules/shallow/shallow_adapter.py
+++ b/modules/shallow/shallow_adapter.py
@@ -35,6 +35,7 @@ def __init__(self, parame, out_dims):
         decodeparame=parame['shallow_diffusion_args']['aux_decoder_args']
         decodeparame[ 'encoder_hidden'] = parame['hidden_size']
         decodeparame['out_dims'] = out_dims
+        decodeparame['parame'] = parame
 
         self.model = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']],
                                                   nn.Module,
diff --git a/modules/toplevel.py b/modules/toplevel.py
index f582d8b7a..46dec47f8 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -52,6 +52,8 @@ def __init__(self, vocab_size, out_dims):
         if self.use_shallow_diffusion:
             # TODO: replace the following placeholder with real modules
             self.aux_decoder = shallow_adapt(hparams, out_dims)
+            self.train_aux_decoder=hparams['shallow_diffusion_args']['train_aux_decoder']
+            self.train_diffusion = hparams['shallow_diffusion_args']['train_diffusion']
 
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
@@ -89,11 +91,21 @@ def forward(
         else:
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
-                aux_out = self.aux_decoder(condition, infer=False)
+                if self.train_aux_decoder:
+                    aux_out = self.aux_decoder(condition, infer=False)
+                else:
+                    aux_out = None
+                if self.train_diffusion:
+                    x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
+                    diff_out=(x_recon, noise)
+                else:
+                    diff_out=None
+                return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
+
             else:
                 aux_out = None
-            x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
-            return ShallowDiffusionOutput(aux_out=aux_out, diff_out=(x_recon, noise))
+                x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
+                return ShallowDiffusionOutput(aux_out=aux_out, diff_out=(x_recon, noise))
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index ae4815d0a..771a5607c 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -61,6 +61,10 @@ def __init__(self):
         super().__init__()
         self.dataset_cls = AcousticDataset
         self.use_shallow_diffusion = hparams['use_shallow_diffusion']
+        if self.use_shallow_diffusion:
+            self.train_aux_decoder = hparams['shallow_diffusion_args']['train_aux_decoder']
+            self.train_diffusion = hparams['shallow_diffusion_args']['train_diffusion']
+
         self.use_vocoder = hparams['infer'] or hparams['val_with_vocoder']
         if self.use_vocoder:
             self.vocoder: BaseVocoder = get_vocoder_cls(hparams)()
@@ -112,12 +116,20 @@ def run_model(self, sample, infer=False):
             return output
         else:
             losses = {}
+
             if self.use_shallow_diffusion:
-                aux_out = output.aux_out
-                # TODO: replace the following placeholder with real loss calculation
+                if self.train_aux_decoder:
+                    aux_out = output.aux_out
+                    # TODO: replace the following placeholder with real loss calculation
 
-                aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target )
-                losses['aux_mel_loss'] = aux_mel_loss
+                    aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target)
+                    losses['aux_mel_loss'] = aux_mel_loss
+                if self.train_diffusion :
+                    x_recon, x_noise = output.diff_out
+                    mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
+                    losses['mel_loss'] = mel_loss
+                return losses
+
             x_recon, x_noise = output.diff_out
             mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
             losses['mel_loss'] = mel_loss

From 47086928f0484b1a8f8c13b5f5f703a1e7cd639c Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Mon, 7 Aug 2023 22:29:56 +0800
Subject: [PATCH 12/33] Limit gradient from aux decoder

---
 configs/acoustic.yaml |  1 +
 modules/toplevel.py   | 14 ++++++++------
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index c08b35468..ba6204163 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -82,6 +82,7 @@ diff_depth: 400
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
+  aux_decoder_grad: 0.1
   aux_decoder_arch: fs2
   aux_decode_strict_hparams: true
   aux_decoder_args: {}
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 46dec47f8..950d7d74f 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -50,10 +50,11 @@ def __init__(self, vocab_size, out_dims):
 
         self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
         if self.use_shallow_diffusion:
-            # TODO: replace the following placeholder with real modules
+            shallow_args = hparams['shallow_diffusion_args']
+            self.train_aux_decoder = shallow_args['train_aux_decoder']
+            self.train_diffusion = shallow_args['train_diffusion']
+            self.aux_decoder_grad = shallow_args['aux_decoder_grad']
             self.aux_decoder = shallow_adapt(hparams, out_dims)
-            self.train_aux_decoder=hparams['shallow_diffusion_args']['train_aux_decoder']
-            self.train_diffusion = hparams['shallow_diffusion_args']['train_diffusion']
 
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
@@ -92,14 +93,15 @@ def forward(
         else:
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
                 if self.train_aux_decoder:
-                    aux_out = self.aux_decoder(condition, infer=False)
+                    aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad)
+                    aux_out = self.aux_decoder(aux_cond, infer=False)
                 else:
                     aux_out = None
                 if self.train_diffusion:
                     x_recon, noise = self.diffusion(condition, gt_spec=gt_mel, infer=False)
-                    diff_out=(x_recon, noise)
+                    diff_out = (x_recon, noise)
                 else:
-                    diff_out=None
+                    diff_out = None
                 return ShallowDiffusionOutput(aux_out=aux_out, diff_out=diff_out)
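[editor's note — illustrative sketch, not part of the patch series. The aux_cond expression above is a standard gradient-scaling trick: its forward value equals condition exactly, but only a fraction g of the gradient flows back into the shared encoder. A self-contained check:

    import torch
    g = 0.1
    condition = torch.ones(3, requires_grad=True)
    aux_cond = condition * g + condition.detach() * (1 - g)  # value unchanged
    aux_cond.sum().backward()
    print(condition.grad)  # tensor([0.1, 0.1, 0.1]) instead of all ones
]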
From f449e04b02547cc8555232c2677789053a8159a7 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Mon, 7 Aug 2023 22:37:32 +0800
Subject: [PATCH 13/33] Improve loss calculation control flow

---
 training/acoustic_task.py | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index 771a5607c..6969b5f5d 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -62,8 +62,9 @@ def __init__(self):
         self.dataset_cls = AcousticDataset
         self.use_shallow_diffusion = hparams['use_shallow_diffusion']
         if self.use_shallow_diffusion:
-            self.train_aux_decoder = hparams['shallow_diffusion_args']['train_aux_decoder']
-            self.train_diffusion = hparams['shallow_diffusion_args']['train_diffusion']
+            shallow_args = hparams['shallow_diffusion_args']
+            self.train_aux_decoder = shallow_args['train_aux_decoder']
+            self.train_diffusion = shallow_args['train_diffusion']
 
         self.use_vocoder = hparams['infer'] or hparams['val_with_vocoder']
         if self.use_vocoder:
@@ -84,10 +85,8 @@ def build_model(self):
     # noinspection PyAttributeOutsideInit
     def build_losses_and_metrics(self):
         if self.use_shallow_diffusion:
-            # TODO: replace the following placeholder with real loss creation
             self.aux_mel_loss = self.model.aux_decoder.get_loss()
             self.lambda_aux_mel_loss = hparams['lambda_aux_mel_loss']
-
         self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type'])
 
     def run_model(self, sample, infer=False):
@@ -117,22 +116,15 @@ def run_model(self, sample, infer=False):
         else:
             losses = {}
 
-            if self.use_shallow_diffusion:
-                if self.train_aux_decoder:
-                    aux_out = output.aux_out
-                    # TODO: replace the following placeholder with real loss calculation
-
-                    aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target)
-                    losses['aux_mel_loss'] = aux_mel_loss
-                if self.train_diffusion :
-                    x_recon, x_noise = output.diff_out
-                    mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
-                    losses['mel_loss'] = mel_loss
-                return losses
+            if output.aux_out is not None:
+                aux_out = output.aux_out
+                aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target)
+                losses['aux_mel_loss'] = aux_mel_loss
 
-            x_recon, x_noise = output.diff_out
-            mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
-            losses['mel_loss'] = mel_loss
+            if output.diff_out is not None:
+                x_recon, x_noise = output.diff_out
+                mel_loss = self.mel_loss(x_recon, x_noise, nonpadding=(mel2ph > 0).unsqueeze(-1).float())
+                losses['mel_loss'] = mel_loss
 
             return losses
 
@@ -157,8 +149,10 @@ def _validation_step(self, sample, batch_idx):
                     aux_mel=mel_out.aux_out, diff_mel=mel_out.diff_out,
                     f0=sample['f0']
                 )
-            self.plot_mel(batch_idx, sample['mel'], mel_out.aux_out, name=f'auxmel_{batch_idx}')
-            self.plot_mel(batch_idx, sample['mel'], mel_out.diff_out, name=f'diffmel_{batch_idx}')
+            if mel_out.aux_out is not None:
+                self.plot_mel(batch_idx, sample['mel'], mel_out.aux_out, name=f'auxmel_{batch_idx}')
+            if mel_out.diff_out is not None:
+                self.plot_mel(batch_idx, sample['mel'], mel_out.diff_out, name=f'diffmel_{batch_idx}')
 
         return losses, sample['size']

From 28a67ae9769d3daa6c5450060e57b5c00eaa2d46 Mon Sep 17 00:00:00 2001
From: autumn <2>
Date: Tue, 8 Aug 2023 12:30:47 +0800
Subject: [PATCH 14/33] add independent encoder in shallow

---
 configs/acoustic.yaml              |  6 ++++-
 modules/shallow/shallow_adapter.py | 38 ++++++++++++++++++++++++------
 modules/toplevel.py                |  9 ++++---
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index ba6204163..29052088a 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -82,9 +82,13 @@ diff_depth: 400
 shallow_diffusion_args:
   train_aux_decoder: true
   train_diffusion: true
+  aux_share_encoder: true
+  aux_encoder_strict_hparams: false
+  aux_encoder_arch: fs2
+  aux_encoder_args: {}
   aux_decoder_grad: 0.1
   aux_decoder_arch: fs2
-  aux_decode_strict_hparams: true
+  aux_decoder_strict_hparams: true
   aux_decoder_args: {}
diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py
index 48bb13c84..f2811423d 100644
--- a/modules/shallow/shallow_adapter.py
+++ b/modules/shallow/shallow_adapter.py
@@ -2,7 +2,7 @@
 import torch.nn as nn
 
 cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode'}
-
+encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'}
 
 def build_object_from_class_name(cls_str, parent_cls, strict, *args, **kwargs):
     import importlib
@@ -28,7 +28,7 @@ def filter_kwargs(dict_to_filter, kwarg_obj):
 
 
 class shallow_adapt(nn.Module):
-    def __init__(self, parame, out_dims):
+    def __init__(self, parame, out_dims,vocab_size):
         super().__init__()
         self.parame = parame
 
@@ -37,17 +37,41 @@ def __init__(self, parame, out_dims):
         decodeparame['out_dims'] = out_dims
         decodeparame['parame'] = parame
 
-        self.model = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']],
+        encoderparame=parame['shallow_diffusion_args']['aux_encoder_args']
+        encoderparame['parame'] = parame
+        encoderparame['vocab_size'] = vocab_size
+        self.decoder = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']],
                                                   nn.Module,
-                                                  parame['shallow_diffusion_args']['aux_decode_strict_hparams'],
+                                                  parame['shallow_diffusion_args']['aux_decoder_strict_hparams'],
                                                   **decodeparame)
 
+        if not parame['shallow_diffusion_args']['aux_share_encoder']:
+            # todo
+            self.use_encoder=True
+            self.encoder=build_object_from_class_name(encoder_cls_map[parame['shallow_diffusion_args']['aux_encoder_arch']],
+                                                  nn.Module,
+                                                  parame['shallow_diffusion_args']['aux_encoder_strict_hparams'],
+                                                  **encoderparame)
+        else:
+            self.use_encoder = False
 
-    def forward(self, condition, infer=False):
 
-        return self.model(condition,infer)
+    def forward(self, condition, infer=False,txt_tokens=None, mel2ph=None, f0=None,
+                key_shift=None, speed=None,
+                spk_embed_id=None, **kwargs):
+
+        if self.use_encoder:
+            condition=self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0,
+                                   key_shift=key_shift, speed=speed,
+                                   spk_embed_id=spk_embed_id, **kwargs)
+
+        return self.decoder(condition,infer)
 
     def get_loss(self):
-        return self.model.build_loss()
+        return self.decoder.build_loss()
diff --git a/modules/toplevel.py b/modules/toplevel.py
index 950d7d74f..0fd577add 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -54,7 +54,7 @@ def __init__(self, vocab_size, out_dims):
             self.train_aux_decoder = shallow_args['train_aux_decoder']
             self.train_diffusion = shallow_args['train_diffusion']
             self.aux_decoder_grad = shallow_args['aux_decoder_grad']
-            self.aux_decoder = shallow_adapt(hparams, out_dims)
+            self.aux_decoder = shallow_adapt(hparams, out_dims,vocab_size)
 
         self.diffusion = GaussianDiffusion(
             out_dims=out_dims,
@@ -82,7 +82,9 @@ def forward(
 
         if infer:
             if self.use_shallow_diffusion:
-                aux_mel_pred = self.aux_decoder(condition, infer=True)
+                aux_mel_pred = self.aux_decoder(condition, infer=True,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0,
+                                                key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs)
+
                 aux_mel_pred *= ((mel2ph > 0).float()[:, :, None])
             else:
                 aux_mel_pred = None
@@ -94,7 +96,8 @@ def forward(
             if self.use_shallow_diffusion:
                 # TODO: replace the following placeholder with real calling code
                 if self.train_aux_decoder:
                     aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad)
-                    aux_out = self.aux_decoder(aux_cond, infer=False)
+                    aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0,
+                                               key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs)
                 else:
                     aux_out = None

From 144c7760b3a696dbd4a995c391473e34b2dc53d6 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 12:42:28 +0800
Subject: [PATCH 15/33] Adjust lambda

---
 configs/acoustic.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 29052088a..82f2163a8 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -90,7 +90,7 @@ shallow_diffusion_args:
   aux_decoder_arch: fs2
   aux_decoder_strict_hparams: true
   aux_decoder_args: {}
-lambda_aux_mel_loss: 1.0
+lambda_aux_mel_loss: 0.2
 
 # train and eval
 num_sanity_val_steps: 1
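[editor's note — the next patch realizes the shallow start. Instead of sampling from pure Gaussian noise, the aux decoder's mel is pushed forward to step t_max - 1 with the standard DDPM forward process and denoising starts from there:

    x_t = sqrt(alphas_cumprod[t]) * x_start + sqrt(1 - alphas_cumprod[t]) * noise

which is what the q_sample() call below computes; only the last t_max (rather than the full K_step) reverse steps then need to be run.]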
From e269708e3c32891de6beadfa5a73e2e62e1fd371 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 12:43:17 +0800
Subject: [PATCH 16/33] Implement shallow diffusion

There are some issues to resolve in DPM-Solver++ and UniPC

---
 modules/diffusion/ddpm.py | 44 ++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index d17070cb8..9b7d11eaf 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -216,15 +216,24 @@ def p_losses(self, x_start, t, cond, noise=None):
 
         return x_recon, noise
 
-    def inference(self, cond, b=1, src_spec=None, device=None):
+    def inference(self, cond, b=1, x_start=None, device=None):
         depth = hparams.get('diff_depth', self.k_step)
-        # TODO: implement shallow diffusion
-        t = self.k_step
-        shape = (b, self.num_feats, self.out_dims, cond.shape[2])
-        x = torch.randn(shape, device=device)
+        noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device)
+        if x_start is None or depth >= self.k_step:
+            t_max = self.k_step
+            x = noise
+        elif depth > 0:
+            t_max = depth
+            x = self.q_sample(
+                x_start, torch.full((b,), t_max - 1, device=device, dtype=torch.long), noise
+            )
+        else:
+            t_max = 0
+            x = x_start
+
         if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1:
             algorithm = hparams.get('diff_accelerator', 'ddim')
+            algorithm = 'pndm'
             if algorithm == 'dpm-solver':
                 from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
                 # 1. Define the noise schedule.
@@ -254,7 +263,7 @@ def wrapped(x, t, **kwargs):
                 # costs and the sample quality.
                 dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
 
-                steps = t // hparams["pndm_speedup"]
+                steps = t_max // hparams["pndm_speedup"]
                 self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False)
                 x = dpm_solver.sample(
                     x,
@@ -292,7 +301,7 @@ def wrapped(x, t, **kwargs):
                 # costs and the sample quality.
                 uni_pc = UniPC(model_fn, noise_schedule, variant='bh2')
 
-                steps = t // hparams["pndm_speedup"]
+                steps = t_max // hparams["pndm_speedup"]
                 self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False)
                 x = uni_pc.sample(
                     x,
@@ -306,8 +315,8 @@ def wrapped(x, t, **kwargs):
                 self.noise_list = deque(maxlen=4)
                 iteration_interval = hparams['pndm_speedup']
                 for i in tqdm(
-                        reversed(range(0, t, iteration_interval)), desc='sample time step',
-                        total=t // iteration_interval, disable=not hparams['infer'], leave=False
+                        reversed(range(0, t_max, iteration_interval)), desc='sample time step',
+                        total=t_max // iteration_interval, disable=not hparams['infer'], leave=False
                 ):
                     x = self.p_sample_plms(
                         x, torch.full((b,), i, device=device, dtype=torch.long),
@@ -316,8 +325,8 @@ def wrapped(x, t, **kwargs):
             elif algorithm == 'ddim':
                 iteration_interval = hparams['pndm_speedup']
                 for i in tqdm(
-                        reversed(range(0, t, iteration_interval)), desc='sample time step',
-                        total=t // iteration_interval, disable=not hparams['infer'], leave=False
+                        reversed(range(0, t_max, iteration_interval)), desc='sample time step',
+                        total=t_max // iteration_interval, disable=not hparams['infer'], leave=False
                 ):
                     x = self.p_sample_ddim(
                         x, torch.full((b,), i, device=device, dtype=torch.long),
@@ -326,7 +335,7 @@ def wrapped(x, t, **kwargs):
             else:
                 raise NotImplementedError(algorithm)
         else:
-            for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t,
+            for i in tqdm(reversed(range(0, t_max)), desc='sample time step', total=t_max,
                           disable=not hparams['infer'], leave=False):
                 x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
         x = x.transpose(2, 3).squeeze(1)  # [B, F, M, T] => [B, T, M] or [B, F, T, M]
@@ -347,9 +356,14 @@ def forward(self, condition, gt_spec=None, src_spec=None, infer=True):
             t = torch.randint(0, self.k_step, (b,), device=device).long()
             return self.p_losses(spec, t, cond=cond)
         else:
-            # src_spec: [B, T, M]
-            # TODO: implement shallow diffusion
-            x = self.inference(cond, b=b, device=device)
+            # src_spec: [B, T, M] or [B, F, T, M]
+            if src_spec is not None:
+                spec = self.norm_spec(src_spec).transpose(-2, -1)
+                if self.num_feats == 1:
+                    spec = spec[:, None, :, :]
+            else:
+                spec = None
+            x = self.inference(cond, b=b, x_start=spec, device=device)
             return self.denorm_spec(x)

From 52b3125f5cd41cfad81c24e8ffe5b922a940191b Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 12:43:38 +0800
Subject: [PATCH 17/33] Fix missing depth assignment

---
 scripts/infer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/infer.py b/scripts/infer.py
index c53c7f81f..d73b6268c 100644
--- a/scripts/infer.py
+++ b/scripts/infer.py
@@ -111,6 +111,7 @@ def acoustic(
 
     if depth >= 0:
         assert depth <= hparams['K_step'], f'Diffusion depth should not be larger than K_step {hparams["K_step"]}.'
+        hparams['diff_depth'] = depth
     else:
         depth = hparams['K_step']  # gaussian start (full depth diffusion)
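[editor's note — hypothetical invocation, for illustration only; the project file and experiment name below are made up. With the two patches above applied, shallow inference could be requested from the command line along the lines of:

    python scripts/infer.py acoustic my_song.ds --exp my_experiment --depth 300 --speedup 10

where --depth must satisfy depth <= K_step and depth % speedup == 0 per the assertions in scripts/infer.py, and the assignment added above is what actually propagates --depth into hparams['diff_depth'] read by GaussianDiffusion.inference().]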
From 030223be14a88afec44e8d3d9c4a8e87c0d0dfff Mon Sep 17 00:00:00 2001
From: "llc1995@sina.com"
Date: Tue, 8 Aug 2023 14:05:41 +0800
Subject: [PATCH 18/33] fix bugs of shallow diffusion inference

---
 modules/diffusion/ddpm.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 9b7d11eaf..76bde96bf 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -219,7 +219,7 @@ def p_losses(self, x_start, t, cond, noise=None):
     def inference(self, cond, b=1, x_start=None, device=None):
         depth = hparams.get('diff_depth', self.k_step)
         noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device)
-        if x_start is None or depth >= self.k_step:
+        if x_start is None or depth > self.k_step:
             t_max = self.k_step
             x = noise
         elif depth > 0:
@@ -231,13 +231,13 @@ def inference(self, cond, b=1, x_start=None, device=None):
             t_max = 0
             x = x_start
 
-        if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1:
+        if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and tmax > 0:
             algorithm = hparams.get('diff_accelerator', 'ddim')
             algorithm = 'pndm'
             if algorithm == 'dpm-solver':
                 from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
                 # 1. Define the noise schedule.
-                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas)
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:tmax])
@@ -276,7 +276,7 @@ def wrapped(x, t, **kwargs):
             elif algorithm == 'unipc':
                 from inference.uni_pc import NoiseScheduleVP, model_wrapper, UniPC
                 # 1. Define the noise schedule.
-                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas)
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:tmax])

From 39bdcb85e4f78858239c6664ec1b1b58639899af Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 14:29:59 +0800
Subject: [PATCH 19/33] Fix errors and remove debug code

---
 modules/diffusion/ddpm.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 76bde96bf..6dadbf443 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -219,7 +219,7 @@ def p_losses(self, x_start, t, cond, noise=None):
     def inference(self, cond, b=1, x_start=None, device=None):
         depth = hparams.get('diff_depth', self.k_step)
         noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device)
-        if x_start is None or depth > self.k_step:
+        if x_start is None or depth >= self.k_step:
             t_max = self.k_step
             x = noise
         elif depth > 0:
@@ -231,13 +231,12 @@ def inference(self, cond, b=1, x_start=None, device=None):
             t_max = 0
             x = x_start
 
-        if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and tmax > 0:
+        if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and t_max > 0:
            algorithm = hparams.get('diff_accelerator', 'ddim')
-            algorithm = 'pndm'
            if algorithm == 'dpm-solver':
                from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
                # 1. Define the noise schedule.
-                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:tmax])
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t_max])
@@ -276,7 +275,7 @@ def wrapped(x, t, **kwargs):
             elif algorithm == 'unipc':
                 from inference.uni_pc import NoiseScheduleVP, model_wrapper, UniPC
                 # 1. Define the noise schedule.
-                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:tmax])
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t_max])
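[editor's note — betas[:t_max] above truncates the discrete noise schedule so that DPM-Solver++ and UniPC integrate only over the shallow segment of steps 0 .. t_max-1 instead of the full schedule; the commit message of PATCH 16 flagged these two samplers as the remaining issue, and this truncation (together with the t_max > 0 guard) is the corresponding fix.]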
From eb114d6a2e9bad7b4f134fc70692f10c9853b199 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 14:46:01 +0800
Subject: [PATCH 20/33] Support K_step < timesteps (shallow-only diffusion)

---
 modules/diffusion/ddpm.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 6dadbf443..62f6d5bb5 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -81,6 +81,12 @@ def __init__(self, out_dims, num_feats=1, timesteps=1000, k_step=1000,
         alphas_cumprod = np.cumprod(alphas, axis=0)
         alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
 
+        self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False)
+        if self.use_shallow_diffusion:
+            assert k_step <= timesteps, 'K_step should not be larger than timesteps.'
+        else:
+            assert k_step == timesteps, 'K_step must equal timesteps if use_shallow_diffusion is False.'
+
         self.timesteps = timesteps
         self.k_step = k_step
         self.noise_list = deque(maxlen=4)
@@ -219,16 +225,19 @@ def p_losses(self, x_start, t, cond, noise=None):
     def inference(self, cond, b=1, x_start=None, device=None):
         depth = hparams.get('diff_depth', self.k_step)
         noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device)
-        if x_start is None or depth >= self.k_step:
+        if self.use_shallow_diffusion:
+            t_max = min(depth, self.k_step)
+        else:
             t_max = self.k_step
+
+        if t_max >= self.timesteps:
             x = noise
-        elif depth > 0:
-            t_max = depth
+        elif t_max > 0:
+            assert x_start is not None, 'Missing shallow diffusion source.'
             x = self.q_sample(
                 x_start, torch.full((b,), t_max - 1, device=device, dtype=torch.long), noise
             )
         else:
-            t_max = 0
             x = x_start

From 348e7cc84f900f4dd3eb991db9743e62d43a8f41 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Tue, 8 Aug 2023 15:59:04 +0800
Subject: [PATCH 21/33] Fix argument passing

---
 modules/toplevel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/toplevel.py b/modules/toplevel.py
index 0fd577add..e915f9dc9 100644
--- a/modules/toplevel.py
+++ b/modules/toplevel.py
@@ -228,7 +228,7 @@ def forward(
             ]
             condition += torch.stack(variance_embeds, dim=-1).sum(-1)
 
-        variance_outputs = self.variance_predictor(condition, variance_inputs, infer)
+        variance_outputs = self.variance_predictor(condition, variance_inputs, infer=infer)
 
         if infer:
             variances_pred_out = self.collect_variance_outputs(variance_outputs)

From 554c4ac692e1c15e1dd4ab459442495f4c6a3497 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Wed, 9 Aug 2023 14:41:56 +0800
Subject: [PATCH 22/33] Add missing checks

---
 modules/diffusion/ddpm.py | 1 +
 scripts/infer.py          | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py
index 62f6d5bb5..7c4215bf1 100644
--- a/modules/diffusion/ddpm.py
+++ b/modules/diffusion/ddpm.py
@@ -238,6 +238,7 @@ def inference(self, cond, b=1, x_start=None, device=None):
                 x_start, torch.full((b,), t_max - 1, device=device, dtype=torch.long), noise
             )
         else:
+            assert x_start is not None, 'Missing shallow diffusion source.'
             x = x_start
 
diff --git a/scripts/infer.py b/scripts/infer.py
index d73b6268c..0d6b8f5eb 100644
--- a/scripts/infer.py
+++ b/scripts/infer.py
@@ -112,6 +112,8 @@ def acoustic(
     if depth >= 0:
         assert depth <= hparams['K_step'], f'Diffusion depth should not be larger than K_step {hparams["K_step"]}.'
         hparams['diff_depth'] = depth
+    elif hparams.get('use_shallow_diffusion', False):
+        depth = hparams['diff_depth']
     else:
         depth = hparams['K_step']  # gaussian start (full depth diffusion)
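[editor's summary sketch — not part of the patch series. The boundary logic that patches 16/19/20/22 converge on, condensed for reading; names mirror GaussianDiffusion.inference():

    t_max = min(depth, self.k_step) if self.use_shallow_diffusion else self.k_step
    if t_max >= self.timesteps:
        x = noise                                # gaussian start: full-depth diffusion
    elif t_max > 0:
        assert x_start is not None, 'Missing shallow diffusion source.'
        t = torch.full((b,), t_max - 1, device=device, dtype=torch.long)
        x = self.q_sample(x_start, t, noise)     # shallow start from the aux mel
    else:
        assert x_start is not None, 'Missing shallow diffusion source.'
        x = x_start                              # depth 0: pass the aux mel through
    # ...then run t_max reverse steps (optionally accelerated) starting from x
]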
hparams['diff_depth'] = depth + elif hparams.get('use_shallow_diffusion', False): + depth = hparams['diff_depth'] else: depth = hparams['K_step'] # gaussian start (full depth diffusion) From b04b0391e98c55e85a2b969197e765d346ecbf9c Mon Sep 17 00:00:00 2001 From: autumn <2> Date: Thu, 17 Aug 2023 19:09:23 +0800 Subject: [PATCH 23/33] add glow decoder --- modules/shallow/fast_speech2_decoder.py | 2 +- modules/shallow/fs2_decoder.py | 300 +++++++ modules/shallow/glow.py | 1000 +++++++++++++++++++++++ modules/shallow/light_decoder.py | 109 +++ modules/shallow/noise_decoder.py | 100 +++ modules/shallow/shallow_adapter.py | 53 +- modules/toplevel.py | 2 +- 7 files changed, 1537 insertions(+), 29 deletions(-) create mode 100644 modules/shallow/fs2_decoder.py create mode 100644 modules/shallow/glow.py create mode 100644 modules/shallow/light_decoder.py create mode 100644 modules/shallow/noise_decoder.py diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py index 50774e6da..ec264f3ce 100644 --- a/modules/shallow/fast_speech2_decoder.py +++ b/modules/shallow/fast_speech2_decoder.py @@ -84,7 +84,7 @@ def build_loss(self): return fs2_loss() - def forward(self, x,infer): + def forward(self, x,infer,**kwargs): x=x.transpose(1, 2) x=self.inconv(x) for i in self.conv: diff --git a/modules/shallow/fs2_decoder.py b/modules/shallow/fs2_decoder.py new file mode 100644 index 000000000..073819dd1 --- /dev/null +++ b/modules/shallow/fs2_decoder.py @@ -0,0 +1,300 @@ +import math + +import torch +from torch import nn +import torch.nn.functional as F + + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape +class RelativeFFTBlock(nn.Module): + """ FFT Block with Relative Multi-Head Attention """ + + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=None, block_length=None): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, + window_size=window_size, p_dropout=p_dropout, block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN( + hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask=None): + + if x_mask is not None: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + else: + attn_mask = None + + for i in range(self.n_layers): + if x_mask is not None: + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + if x_mask is not None: + x = x * x_mask + return x + + +class RelativeSelfAttention(nn.Module): + """ Relative Multi-Head Attention """ + + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., block_length=None, proximal_bias=False, 
proximal_init=False): + super(RelativeSelfAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, + t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, + t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1) + ) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position( + rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." 
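+            # Add a proximity bias of -log(1 + |i - j|) to the logits so that
+            # each frame attends more strongly to its neighbours.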
+ scores = scores + \ + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like( + scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4*(1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position( + p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s) + output = output + \ + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view( + b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:, + slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [0, length-1]])) + + # Reshape and slice out the padded elements. + x_final = x_flat.view( + [batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] + return x_final + + def _absolute_position_to_relative_position(self, x): + batch, heads, length, _ = x.size() + # padd along column + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, length-1]])) + x_flat = x.view([batch, heads, length**2 + length*(length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2*length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """ + Bias for self-attention to encourage attention to close positions. 
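+        Returns a [1, 1, length, length] tensor of -log(1 + |i - j|) values.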
+ """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-4): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + n_dims = len(x.shape) + mean = torch.mean(x, 1, keepdim=True) + variance = torch.mean((x - mean)**2, 1, keepdim=True) + + x = (x - mean) * torch.rsqrt(variance + self.eps) + + shape = [1, -1] + [1] * (n_dims - 2) + x = x * self.gamma.view(*shape) + self.beta.view(*shape) + return x + + +class FFN(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + + self.conv = nn.Conv1d( + in_channels, out_channels, kernel_size, padding=kernel_size//2) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask=None): + if x_mask is not None: + x = self.conv(x * x_mask) + else: + x = self.conv(x ) + + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + if x_mask is not None: + x=x * x_mask + return x + + +class fs2_loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self,y, x): + x=(x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y,x) + + +class attention_fs2_decoder(nn.Module): + def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,n_heads,attention_ffn_kernel_size,parame): + super().__init__() + self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv = RelativeFFTBlock(hidden_channels=n_chans,filter_channels=n_chans*4, n_heads=n_heads, n_layers=n_layers, kernel_size=attention_ffn_kernel_size, p_dropout=dropout_rate) + self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + + + + def build_loss(self): + + return fs2_loss() + + def forward(self, x,infer,**kwargs): + x=x.transpose(1, 2) + x=self.inconv(x) + + + x=self.conv(x) + x=self.outconv(x).transpose(1, 2) + if infer: + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + pass diff --git a/modules/shallow/glow.py b/modules/shallow/glow.py new file mode 100644 index 000000000..c4be2b6ee --- /dev/null +++ b/modules/shallow/glow.py @@ -0,0 +1,1000 @@ +import math +from typing import Optional + +import torch +import torch.nn as nn + +import torch.nn.functional as F + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +class RelativeFFTBlock(nn.Module): + """ FFT Block with Relative Multi-Head Attention """ + + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., + window_size=None, block_length=None): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = 
nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, + window_size=window_size, p_dropout=p_dropout, + block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN( + hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask=None): + + if x_mask is not None: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + else: + attn_mask = None + + for i in range(self.n_layers): + if x_mask is not None: + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + if x_mask is not None: + x = x * x_mask + return x + + +class RelativeSelfAttention(nn.Module): + """ Relative Multi-Head Attention """ + + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., + block_length=None, proximal_bias=False, proximal_init=False): + super(RelativeSelfAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, + t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, + t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1) + ) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." 
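+            # Add content-to-position scores from learned embeddings of the
+            # relative offsets in [-window_size, window_size].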
+ key_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position( + rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + \ + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like( + scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position( + p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s) + output = output + \ + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view( + b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:, + slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [0, length - 1]])) + + # Reshape and slice out the padded elements. + x_final = x_flat.view( + [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:] + return x_final + + def _absolute_position_to_relative_position(self, x): + batch, heads, length, _ = x.size() + # padd along column + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, length - 1]])) + x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """ + Bias for self-attention to encourage attention to close positions. 
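+        Returns a [1, 1, length, length] bias of -log(1 + |i - j|).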
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-4):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class FFN(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask=None):
+        if x_mask is not None:
+            x = self.conv(x * x_mask)
+        else:
+            x = self.conv(x)
+
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+Conv1dModel = nn.Conv1d  # ugly mutable global switch; should be removed
+
+
+class Depthwise_Separable_Conv1D(nn.Module):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            bias=True,
+            padding_mode='zeros',  # TODO: refine this type
+            device=None,
+            dtype=None
+    ):
+        super().__init__()
+        self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
+                                    groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias,
+                                    padding_mode=padding_mode, device=device, dtype=dtype)
+        self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias,
+                                    device=device, dtype=dtype)
+
+    def forward(self, input):
+        return self.point_conv(self.depth_conv(input))
+
+
+def set_Conv1dModel(use_depthwise_conv):
+    global Conv1dModel
+    Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+@torch.jit.script
+def add_and_GRU(input_a, input_b):
+    in_act = input_a + input_b
+    x1, x2 = in_act.chunk(2, dim=1)
+    t_act = torch.tanh(x2)
+    s_act = torch.sigmoid(x1)
+    acts = t_act * s_act
+    return acts
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels  # for the conditioning input
+        self.p_dropout = p_dropout
+
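+        # Each layer pairs a dilated conv (via Conv1dModel) with a 1x1
+        # conditioning projection and a 1x1 res/skip projection, mirroring
+        # the non-causal WaveNet stack; the gated tanh-sigmoid unit joins
+        # the two halves of each 2 * hidden_channels activation.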
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        self.condition_layers = torch.nn.ModuleList()
+
+        # if gin_channels != 0:
+        #     cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+        #     # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
+        #     self.cond_layer=cond_layer
+
+        for i in range(n_layers):
+
+            if gin_channels != 0:
+                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
+                # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
+                # self.cond_layer = cond_layer
+            else:
+                cond_layer = nn.Identity()
+            self.condition_layers.append(cond_layer)
+
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
+                                   dilation=dilation, padding=padding)
+            # in_layer = weight_norm_modules(in_layer, name='weight')
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            # res_skip_layer = weight_norm_modules(res_skip_layer, name='weight')
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask=None, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        # if g is not None:
+        #     g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+
+            if g is not None:
+                condition = self.condition_layers[i](g)
+            else:
+                condition = torch.zeros_like(x_in)
+
+            # acts = fused_add_tanh_sigmoid_multiply(  # gated tanh-sigmoid unit, as in WaveNet
+            #     x_in,
+            #     condition,
+            #     n_channels_tensor)
+            acts = add_and_GRU(  # gated tanh-sigmoid unit, as in WaveNet
+                x_in,
+                condition,
+            )
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                if x_mask is not None:
+                    x = (x + res_acts) * x_mask
+                else:
+                    x = x + res_acts
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+
+        if x_mask is not None:
+            out = output * x_mask
+        else:
+            out = output
+        return out
+
+    # def remove_weight_norm(self):
+    #     if self.gin_channels != 0:
+    #         remove_weight_norm_modules(self.cond_layer)
+    #     for l in self.in_layers:
+    #         remove_weight_norm_modules(l)
+    #     for l in self.res_skip_layers:
+    #         remove_weight_norm_modules(l)
+
+
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 p_dropout=0,
+                 gin_channels=0,
+                 mean_only=False,
+                 wn_sharing_parameter=None  # WN module shared across flows (see ResidualCouplingBlock)
+                 ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
+                      gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        if x_mask is not None:
+            h = self.pre(x0) * x_mask
+        else:
+            h = self.pre(x0)
+        h = self.enc(h, x_mask, g=g)
+
+        if x_mask is not None:
+            stats = self.post(h) * x_mask
+        else:
+            stats = self.post(h)
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            if x_mask is not None:
+                x1 = m + x1 * torch.exp(logs) * x_mask
+            else:
+                x1 = m + x1 * torch.exp(logs)
+            # x1 = m + x1 * torch.exp(logs) * x_mask  # inverse process
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            if x_mask is not None:
+                x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            else:
+                x1 = (x1 - m) * torch.exp(-logs)
+            # x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
+
+
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0,
+                 share_parameter=False
+                 ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+
+        self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
+                     gin_channels=gin_channels) if share_parameter else None
+
+        for i in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
+            self.flows.append(Flip())
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        if not reverse:
+            logdet_tot = 0
+            for flow in self.flows:
+                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
+                logdet_tot += logdet
+        else:
+            logdet_tot = None
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x, logdet_tot
+
+
+# class TextEncoder(nn.Module):
+#     def __init__(self,
+#                  out_channels,
+#                  hidden_channels,
+#                  kernel_size,
+#                  n_layers,
+#                  gin_channels=0,
+#                  filter_channels=None,
+#                  n_heads=None,
+#                  p_dropout=None):
+#         super().__init__()
+#         self.out_channels = out_channels
+#         self.hidden_channels = hidden_channels
+#         self.kernel_size = kernel_size
+#         self.n_layers = n_layers
+#         self.gin_channels = gin_channels
+#         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+#         self.f0_emb = nn.Embedding(256, hidden_channels)
+#
+#         self.enc_ = attentions.Encoder(
+#             hidden_channels,
+#             filter_channels,
+#             n_heads,
+#             n_layers,
+#             kernel_size,
+#             p_dropout)
+#
+#     def forward(self, x, x_mask, f0=None, noice_scale=1):
+#         x = x + self.f0_emb(f0).transpose(1, 2)
+#         x = self.enc_(x * x_mask, x_mask)
+#         stats = self.proj(x) * x_mask
+#         m, logs = torch.split(stats, self.out_channels, dim=1)
+#         z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
+#
+#         return z, m, logs, x_mask
+
+
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
+ None means non-conditional LayerNorm. Defaults to None. + """ + + def __init__( + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 + + ): + super().__init__() + self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + + self.norm = nn.LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(intermediate_dim, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + + def forward(self, x: torch.Tensor, ) -> torch.Tensor: + residual = x + x = self.dwconv(x) + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x = self.dropout(x) + + x = residual + self.drop_path(x) + return x + + +class condition_latent_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_latent_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = 
nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + + return stats + + +class condition_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + + return stats + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + + condition_in_chans, + + condition_encoder_hidden_channels, + condition_encoder_n_heads, + condition_encoder_n_layers, + condition_encoder_kernel_size, + condition_encoder_dropout_rate, + + inter_channels, + hidden_channels, + + condition_channels, + + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, + ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', + + **kwargs): + + super().__init__() + self.inter_channels = inter_channels + self.ues_condition = ues_condition + + self.use_latent = use_latent + + if use_latent_encoder and use_latent: + if latent_encoder_type == 'attention': + self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=latent_encoder_n_heads, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=latent_encoder_kernel_size, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + elif latent_encoder_type == 'convnext': + self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=None, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=None, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + else: + raise RuntimeError("unsupport_latent_encoder") + + elif ((not use_latent_encoder) and use_latent): + self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) + + if ues_condition_encoder and ues_condition: + if condition_encoder_type == 'attention': + self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=condition_encoder_n_heads, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + 
filter_channels=condition_encoder_filter_channels) + elif condition_encoder_type == 'convnext': + self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=None, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + else: + raise RuntimeError("unsupport__encoder") + elif ((not ues_condition_encoder) and ues_condition): + self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) + + self.use_depthwise_conv = use_depthwise_conv + + # self.enc_p = TextEncoder( + # inter_channels, + # hidden_channels, + # filter_channels=filter_channels, + # n_heads=n_heads, + # n_layers=n_layers, + # kernel_size=kernel_size, + # p_dropout=p_dropout + # ) + + set_Conv1dModel(self.use_depthwise_conv) + + if ues_condition: + condition_channelsw = condition_channels + else: + condition_channelsw = 0 + + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, + gin_channels=condition_channelsw, share_parameter=flow_share_parameter) + + def forward(self, c, mel, x_mask=None): + + # vol proj + + # f0 predict + + # encoder + if self.use_latent: + z_ptemp, m_p, logs_p = self.latent_encoder(c) + else: + m_p, logs_p = None, None + # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) + + # flow + if self.ues_condition: + condition = self.condition_encoder(c) + z_p, logdet = self.flow(mel, x_mask, g=condition) + else: + z_p, logdet = self.flow(mel, x_mask, g=None) + + return x_mask, (z_p, m_p, logs_p), logdet, + + @torch.no_grad() + def infer(self, c, noice_scale=0.35, seed=None, ): + if seed is not None: + + if c.device == torch.device("cuda"): + torch.cuda.manual_seed_all(seed) + else: + torch.manual_seed(seed) + + if self.use_latent: + z_p, m_p, logs_p = self.latent_encoder(c) + else: + z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale + + # vol proj + + # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) + # o, _ = self.flow(z_p, g=g, reverse=True) + + if self.ues_condition: + condition = self.condition_encoder(c) + # z_p, logdet = self.flow(mel, x_mask, g=condition) + o, _ = self.flow(z_p, g=condition, reverse=True) + else: + o, _ = self.flow(z_p, g=None, reverse=True) + + return o + + +class glow_loss_L(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, pack_loss,target): + + z, m, logs, logdet, mask = pack_loss + # z, m, logs, logdet, mask = None + + l = torch.sum(logs) + 0.5 * torch.sum( + torch.exp(-2 * logs) * ((z - m) ** 2)) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + if mask is not None: + l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes + else: + l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l + + +class glow_decoder(nn.Module): + def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, + condition_encoder_kernel_size, 
condition_encoder_dropout_rate, flow_hidden_channels, + flow_condition_channels, parame,flow_infer_seed=None,flow_infer_scale=0.35, + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, + use_latent=True, + ues_condition_encoder=False, ues_condition=False, + condition_encoder_type='attention'): + super().__init__() + self.use_latent=use_latent + self.flow_infer_seed=flow_infer_seed + self.flow_infer_scale=flow_infer_scale + self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, + latent_encoder_n_heads=latent_encoder_n_heads, + latent_encoder_n_layers=latent_encoder_n_layers, + latent_encoder_kernel_size=latent_encoder_kernel_size, + latent_encoder_dropout_rate=latent_encoder_dropout_rate, + + condition_in_chans=encoder_hidden, + + condition_encoder_hidden_channels=condition_encoder_hidden_channels, + condition_encoder_n_heads=condition_encoder_n_heads, + condition_encoder_n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + condition_encoder_dropout_rate=condition_encoder_dropout_rate, + + inter_channels=out_dims, + hidden_channels=flow_hidden_channels, + + condition_channels=flow_condition_channels, + + condition_encoder_filter_channels=condition_encoder_filter_channels, + + latent_encoder_filter_channels=latent_encoder_filter_channels, + + use_depthwise_conv=use_depthwise_conv, + + flow_share_parameter=flow_share_parameter, + n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, + use_latent_encoder=use_latent_encoder, + use_latent=use_latent, + ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, + condition_encoder_type=condition_encoder_type) + + def build_loss(self): + if self.use_latent: + + return glow_loss_L() + + def forward(self, x, infer, x_gt): + + if infer: + out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + return out + else: + + + x = x.transpose(1, 2) + x_gt=x_gt.transpose(1, 2) + + x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt) + pack_loss=(z_p, m_p, logs_p, logdet, x_mask ) + return pack_loss + + + + + pass diff --git a/modules/shallow/light_decoder.py b/modules/shallow/light_decoder.py new file mode 100644 index 000000000..bb2624765 --- /dev/null +++ b/modules/shallow/light_decoder.py @@ -0,0 +1,109 @@ +from typing import Optional + +import torch +import torch.nn as nn + +class GLU(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + out, gate = x.chunk(2, dim=self.dim) + return out * gate.sigmoid() + + +class ConvNeXtBlock(nn.Module): + """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. + + Args: + dim (int): Number of input channels. + intermediate_dim (int): Dimensionality of the intermediate layer. + layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. + Defaults to None. + adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. + None means non-conditional LayerNorm. Defaults to None. 
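+
+    This variant gates the pointwise expansion with a GLU, so the second
+    linear layer projects from intermediate_dim // 2 back to dim.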
+ """ + + def __init__( + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 + + ): + super().__init__() + self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + + + + self.norm = nn.LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.act2=GLU(2) + self.pwconv2 = nn.Linear(intermediate_dim//2, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.dropout=nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + + + + def forward(self, x: torch.Tensor, ) -> torch.Tensor: + residual = x + x=self.act(x) + x = self.dwconv(x) + + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act2(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x=self.dropout(x) + + x = residual + self.drop_path (x) + return x + + +class fs2_loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self,y, x): + x=(x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y,x) + + +class noise_decoder(nn.Module): + def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): + super().__init__() + self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) + self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + + + + def build_loss(self): + + return fs2_loss() + + def forward(self, x,infer,**kwargs): + x=x.transpose(1, 2) + x=self.inconv(x) + + for i in self.conv: + x=i(x) + x=self.outconv(x).transpose(1, 2) + if infer: + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + pass diff --git a/modules/shallow/noise_decoder.py b/modules/shallow/noise_decoder.py new file mode 100644 index 000000000..862caf911 --- /dev/null +++ b/modules/shallow/noise_decoder.py @@ -0,0 +1,100 @@ +from typing import Optional + +import torch +import torch.nn as nn + + + + +class ConvNeXtBlock(nn.Module): + """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. + + Args: + dim (int): Number of input channels. + intermediate_dim (int): Dimensionality of the intermediate layer. + layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. + Defaults to None. + adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. + None means non-conditional LayerNorm. Defaults to None. 
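+
+    This variant adds a conditioning tensor onto the depthwise output through
+    an extra 1x1 convolution, letting the stack refine a noise input towards
+    the conditioned spectrogram.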
+ """ + + def __init__( + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 + + ): + super().__init__() + self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + + + + self.norm = nn.LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(intermediate_dim, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.dropout=nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + self.con = nn.Conv1d(dim, dim, kernel_size=1, ) + + + def forward(self, x: torch.Tensor,y ) -> torch.Tensor: + residual = x + x = self.dwconv(x) + x=x+self.con(y) + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x=self.dropout(x) + + x = residual + self.drop_path (x) + return x + + +class fs2_loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self,y, x): + x=(x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y,x) + + +class noise_decoder(nn.Module): + def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): + super().__init__() + self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) + self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + + + + def build_loss(self): + + return fs2_loss() + + def forward(self, x,infer,**kwargs): + x=x.transpose(1, 2) + x=self.inconv(x) + y=torch.randn_like(x) + for i in self.conv: + y=i(y,x) + x=self.outconv(y).transpose(1, 2) + if infer: + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + pass diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py index f2811423d..11cf5d29f 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/shallow/shallow_adapter.py @@ -1,9 +1,13 @@ import torch import torch.nn as nn -cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode'} +cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode', + 'ns': 'modules.shallow.noise_decoder.noise_decoder', 'ld': 'modules.shallow.light_decoder.noise_decoder','att_fs2':'modules.shallow.fs2_decoder.attention_fs2_decoder' + ,'glow':'modules.shallow.glow.glow_decoder' + } encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} + def build_object_from_class_name(cls_str, parent_cls, strict, *args, **kwargs): import importlib @@ -28,50 +32,45 @@ def filter_kwargs(dict_to_filter, kwarg_obj): class shallow_adapt(nn.Module): - def __init__(self, parame, out_dims,vocab_size): + def __init__(self, parame, out_dims, vocab_size): super().__init__() self.parame = parame - decodeparame=parame['shallow_diffusion_args']['aux_decoder_args'] - decodeparame[ 'encoder_hidden'] = parame['hidden_size'] + decodeparame = parame['shallow_diffusion_args']['aux_decoder_args'] + if decodeparame.get('encoder_hidden') is None: + 
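+            # Fall back to the main model's hidden size when the aux decoder
+            # config does not set encoder_hidden explicitly.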
decodeparame['encoder_hidden'] = parame['hidden_size'] decodeparame['out_dims'] = out_dims decodeparame['parame'] = parame - encoderparame=parame['shallow_diffusion_args']['aux_encoder_args'] + encoderparame = parame['shallow_diffusion_args']['aux_encoder_args'] encoderparame['parame'] = parame encoderparame['vocab_size'] = vocab_size self.decoder = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']], - nn.Module, - parame['shallow_diffusion_args']['aux_decoder_strict_hparams'], - **decodeparame) - + nn.Module, + parame['shallow_diffusion_args']['aux_decoder_strict_hparams'], + **decodeparame) if not parame['shallow_diffusion_args']['aux_share_encoder']: # todo - self.use_encoder=True - self.encoder=build_object_from_class_name(encoder_cls_map[parame['shallow_diffusion_args']['aux_encoder_arch']], - nn.Module, - parame['shallow_diffusion_args']['aux_encoder_strict_hparams'], - **encoderparame) + self.use_encoder = True + self.encoder = build_object_from_class_name( + encoder_cls_map[parame['shallow_diffusion_args']['aux_encoder_arch']], + nn.Module, + parame['shallow_diffusion_args']['aux_encoder_strict_hparams'], + **encoderparame) else: self.use_encoder = False - - - - - def forward(self, condition, infer=False,txt_tokens=None, mel2ph=None, f0=None, - key_shift=None, speed=None, - spk_embed_id=None, **kwargs): + def forward(self, condition, infer=False, txt_tokens=None, mel2ph=None, f0=None, + key_shift=None, speed=None, + spk_embed_id=None,gt_mel=None, **kwargs): if self.use_encoder: - condition=self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed, - spk_embed_id=spk_embed_id, **kwargs) + condition = self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, + key_shift=key_shift, speed=speed, + spk_embed_id=spk_embed_id, **kwargs) - return self.decoder(condition,infer) + return self.decoder(condition, infer,gt_mel) def get_loss(self): return self.decoder.build_loss() - - diff --git a/modules/toplevel.py b/modules/toplevel.py index 0fd577add..41a1fe939 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -97,7 +97,7 @@ def forward( if self.train_aux_decoder: aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad) aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs) + key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id,gt_mel=gt_mel, **kwargs) else: aux_out = None if self.train_diffusion: From 3a4e77a059d994ed4311cfd543a5f2676d0759f5 Mon Sep 17 00:00:00 2001 From: autumn <2> Date: Fri, 18 Aug 2023 00:02:07 +0800 Subject: [PATCH 24/33] add glow decoder --- modules/shallow/glow.py | 44 ++++++++++++++++++++++++++---- modules/shallow/shallow_adapter.py | 4 +-- modules/toplevel.py | 2 +- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/modules/shallow/glow.py b/modules/shallow/glow.py index c4be2b6ee..65db536ad 100644 --- a/modules/shallow/glow.py +++ b/modules/shallow/glow.py @@ -759,7 +759,7 @@ def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, inter_channels, hidden_channels, - condition_channels, + condition_channels,flow_wavenet_lay=4, condition_encoder_filter_channels=None, @@ -847,7 +847,7 @@ def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, else: condition_channelsw = 0 - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, + self.flow = 
ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, gin_channels=condition_channelsw, share_parameter=flow_share_parameter) def forward(self, c, mel, x_mask=None): @@ -921,12 +921,17 @@ def forward(self, pack_loss,target): return l + + + + + class glow_decoder(nn.Module): def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,flow_infer_seed=None,flow_infer_scale=0.35, + flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, condition_encoder_filter_channels=None, latent_encoder_filter_channels=None, @@ -957,6 +962,7 @@ def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, lat condition_encoder_dropout_rate=condition_encoder_dropout_rate, inter_channels=out_dims, + flow_wavenet_lay=flow_wavenet_lay, hidden_channels=flow_hidden_channels, condition_channels=flow_condition_channels, @@ -974,24 +980,50 @@ def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, lat ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, condition_encoder_type=condition_encoder_type) + self.use_mask=use_mask + self.use_norm=use_norm + + def norm(self,x): + x = (x - (-5)) / (0 - (-5)) * 2 - 1 + return x + + def denorm(self,x): + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + def build_loss(self): + + if self.use_latent: return glow_loss_L() - def forward(self, x, infer, x_gt): + def forward(self, x, infer, x_gt,mask): + if not self.use_mask or infer: + mask=None + else: + mask=mask.transpose(1, 2) + + + if infer: out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + if self.use_norm: + out = self.denorm(out) return out else: + if self.use_norm: + x_gt = self.norm(x_gt) x = x.transpose(1, 2) x_gt=x_gt.transpose(1, 2) - x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt) - pack_loss=(z_p, m_p, logs_p, logdet, x_mask ) + x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) + + + pack_loss = (z_p, m_p, logs_p, logdet, x_mask) return pack_loss diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py index 11cf5d29f..27a52ce6a 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/shallow/shallow_adapter.py @@ -63,14 +63,14 @@ def __init__(self, parame, out_dims, vocab_size): def forward(self, condition, infer=False, txt_tokens=None, mel2ph=None, f0=None, key_shift=None, speed=None, - spk_embed_id=None,gt_mel=None, **kwargs): + spk_embed_id=None,gt_mel=None,mask=None, **kwargs): if self.use_encoder: condition = self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, **kwargs) - return self.decoder(condition, infer,gt_mel) + return self.decoder(condition, infer,gt_mel,mask) def get_loss(self): return self.decoder.build_loss() diff --git a/modules/toplevel.py b/modules/toplevel.py index 41a1fe939..0d4567599 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -97,7 +97,7 @@ def forward( if self.train_aux_decoder: aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad) 
aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id,gt_mel=gt_mel, **kwargs) + key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id,gt_mel=gt_mel,mask=((mel2ph > 0).float()[:, :, None]), **kwargs) else: aux_out = None if self.train_diffusion: From 4f6e50fa6422e7290e6a91cc0b7c161f5f12a8fb Mon Sep 17 00:00:00 2001 From: autumn <2> Date: Fri, 18 Aug 2023 12:16:39 +0800 Subject: [PATCH 25/33] add convnext glow decoder --- modules/shallow/convnext_glow.py | 1116 ++++++++++++++++++++++++++++ modules/shallow/shallow_adapter.py | 2 +- 2 files changed, 1117 insertions(+), 1 deletion(-) create mode 100644 modules/shallow/convnext_glow.py diff --git a/modules/shallow/convnext_glow.py b/modules/shallow/convnext_glow.py new file mode 100644 index 000000000..0420b0100 --- /dev/null +++ b/modules/shallow/convnext_glow.py @@ -0,0 +1,1116 @@ +import math +from typing import Optional + +import torch +import torch.nn as nn + +import torch.nn.functional as F + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +class RelativeFFTBlock(nn.Module): + """ FFT Block with Relative Multi-Head Attention """ + + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., + window_size=None, block_length=None): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, + window_size=window_size, p_dropout=p_dropout, + block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN( + hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask=None): + + if x_mask is not None: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + else: + attn_mask = None + + for i in range(self.n_layers): + if x_mask is not None: + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + if x_mask is not None: + x = x * x_mask + return x + + +class RelativeSelfAttention(nn.Module): + """ Relative Multi-Head Attention """ + + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., + block_length=None, proximal_bias=False, proximal_init=False): + super(RelativeSelfAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + 
self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, + t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, + t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1) + ) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position( + rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + \ + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like( + scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position( + p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s) + output = output + \ + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view( + b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. 
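+        # For a sequence of length L, 2L-1 relative offsets [-(L-1), ..., L-1]
+        # are needed, while the learned table only covers [-window_size, window_size]
+        # (2 * window_size + 1 rows). Pad the table symmetrically when
+        # L - 1 > window_size, otherwise slice out the centre rows that are needed.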
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = F.pad(
+                relative_embeddings,
+                convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+        return used_relative_embeddings
+
+    def _relative_position_to_absolute_position(self, x):
+        batch, heads, length, _ = x.size()
+        # Concat columns of pad to shift from relative to absolute indexing.
+        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+        # Concat extra elements so that it adds up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
+
+        # Reshape and slice out the padded elements.
+        x_final = x_flat.view(
+            [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        batch, heads, length, _ = x.size()
+        # Pad along the column dimension.
+        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
+        # Add zeros at the beginning that will skew the elements after the reshape.
+        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        """
+        Bias for self-attention to encourage attention to close positions.
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-4):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class FFN(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask=None):
+        if x_mask is not None:
+            x = self.conv(x * x_mask)
+        else:
+            x = self.conv(x)
+
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+# FIXME: hacky mutable global alias (reassigned by set_Conv1dModel below); should be removed.
+Conv1dModel = nn.Conv1d
+
+
+class Depthwise_Separable_Conv1D(nn.Module):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            bias=True,
+            padding_mode='zeros',  # TODO: refine this type
+            device=None,
+            dtype=None
+    ):
+        super().__init__()
+        self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
+                                    groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias,
+                                    padding_mode=padding_mode, device=device, dtype=dtype)
+        self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias,
+                                    device=device, dtype=dtype)
+
+    def forward(self, input):
+        return self.point_conv(self.depth_conv(input))
+
+
+def set_Conv1dModel(use_depthwise_conv):
+    global Conv1dModel
+    Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+@torch.jit.script
+def add_and_GRU(input_a, input_b):
+    in_act = input_a + input_b
+    x1, x2 = in_act.chunk(2, dim=1)
+    t_act = torch.tanh(x2)
+    s_act = torch.sigmoid(x1)
+    acts = t_act * s_act
+    return acts
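+# A note on the two jitted helpers above: both implement the WaveNet-style
+# gated activation. The (2*C)-channel pre-activation is split into halves,
+# one passed through tanh and one through sigmoid, and the two are multiplied.
+# A minimal shape sketch (hidden size 192 is an illustrative value only):
+#
+#     a = torch.randn(1, 2 * 192, 100)   # layer output, 2 * hidden channels
+#     b = torch.zeros_like(a)            # condition term (zeros when g is None)
+#     out = add_and_GRU(a, b)            # -> shape (1, 192, 100)
+#
+# Despite the name, this is a gating nonlinearity, not a GRU cell.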
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels  # used for the condition input
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        self.condition_layers = torch.nn.ModuleList()
+
+        for i in range(n_layers):
+            # one condition projection per layer (instead of a single shared cond_layer)
+            if gin_channels != 0:
+                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
+            else:
+                cond_layer = nn.Identity()
+            self.condition_layers.append(cond_layer)
+
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
+                                   dilation=dilation, padding=padding)
+            self.in_layers.append(in_layer)
+
+            # the last layer does not need a residual branch
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask=None, g=None, **kwargs):
+        output = torch.zeros_like(x)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+
+            if g is not None:
+                condition = self.condition_layers[i](g)
+            else:
+                condition = torch.zeros_like(x_in)
+
+            # WaveNet-style gated activation (see add_and_GRU above)
+            acts = add_and_GRU(x_in, condition)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                if x_mask is not None:
+                    x = (x + res_acts) * x_mask
+                else:
+                    x = x + res_acts
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+
+        if x_mask is not None:
+            out = output * x_mask
+        else:
+            out = output
+        return out
+
+
+class ConvNeXtBlock_condition(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal,
+    with an optional condition added to the intermediate (pointwise) activation.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        dilation (int): Dilation of the depthwise conv.
+        padding (int): Padding of the depthwise conv.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+        condione (int): Number of condition channels; 0 disables conditioning.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            intermediate_dim: int, dilation, padding,
+            layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0,
+            condione: int = 0
+    ):
+        super().__init__()
+        if condione != 0:
+            self.cond_layer = torch.nn.Conv1d(condione, intermediate_dim, 1)
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=padding, groups=dim,
+                                dilation=dilation)  # depthwise conv
+
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value is not None and layer_scale_init_value > 0
+            else None
+        )
+        # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.drop_path = nn.Identity()
+        self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity()
+
+    def forward(self, x: torch.Tensor, condition=None) -> torch.Tensor:
+        residual = x
+        x = self.dwconv(x)
+        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
+
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        if condition is not None:
+            condition = self.cond_layer(condition)
+        else:
+            condition = torch.zeros_like(x.transpose(1, 2))
+        x = x + condition.transpose(1, 2)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)
+        x = self.dropout(x)
+
+        x = residual + self.drop_path(x)
+        return x
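+# The conditioning above is additive at the widest point of the block: the
+# condition (B, condione, T) is projected to intermediate_dim by a 1x1 conv
+# and added to the pointwise activation before the GELU. A minimal sketch
+# (channel sizes are illustrative assumptions, not values from the config):
+#
+#     blk = ConvNeXtBlock_condition(dim=192, intermediate_dim=576, dilation=1,
+#                                   padding=3, layer_scale_init_value=1e-6, condione=256)
+#     x = torch.randn(1, 192, 100)       # (B, C, T)
+#     g = torch.randn(1, 256, 100)       # condition, same time axis
+#     y = blk(x, condition=g)            # -> (1, 192, 100), residual preserved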
+class CONVnext_flow(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0, innx=3):
+        super().__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels  # used for the condition input
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        self.condition_layers = torch.nn.ModuleList()
+
+        for i in range(n_layers):
+            kernel_size = 7  # the ConvNeXt blocks use a fixed kernel size of 7
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+
+            in_layer = ConvNeXtBlock_condition(dim=hidden_channels, intermediate_dim=innx * hidden_channels,
+                                               drop_out=p_dropout, dilation=dilation, padding=padding,
+                                               layer_scale_init_value=1e-6, condione=gin_channels)
+            self.in_layers.append(in_layer)
+
+    def forward(self, x, x_mask=None, g=None, **kwargs):
+        for i in range(self.n_layers):
+            x = self.in_layers[i](x, g)
+            if x_mask is not None:
+                x = x * x_mask
+        return x
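+# The coupling layer below follows the Glow/VITS residual coupling scheme.
+# Channels are split into halves (x0, x1); the inner network predicts a mean m
+# (and, unless mean_only, a log-scale logs) for x1 from x0:
+#
+#     forward:  y1 = m + x1 * exp(logs),    log|det J| = sum(logs)
+#     reverse:  x1 = (y1 - m) * exp(-logs)
+#
+# Because x0 passes through unchanged, the transform is exactly invertible,
+# which is what lets infer() run the same stack in reverse.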
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 p_dropout=0,
+                 gin_channels=0,
+                 mean_only=False,
+                 wn_sharing_parameter=None  # optional inner network shared across flow steps
+                 ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = CONVnext_flow(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
+                                 gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        if x_mask is not None:
+            h = self.pre(x0) * x_mask
+        else:
+            h = self.pre(x0)
+        h = self.enc(h, x_mask, g=g)
+
+        if x_mask is not None:
+            stats = self.post(h) * x_mask
+        else:
+            stats = self.post(h)
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            if x_mask is not None:
+                x1 = m + x1 * torch.exp(logs) * x_mask
+            else:
+                x1 = m + x1 * torch.exp(logs)
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            # inverse process
+            if x_mask is not None:
+                x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            else:
+                x1 = (x1 - m) * torch.exp(-logs)
+            x = torch.cat([x0, x1], 1)
+            return x
+
+
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0,
+                 share_parameter=False
+                 ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+
+        self.wn = CONVnext_flow(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
+                                gin_channels=gin_channels) if share_parameter else None
+
+        for i in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
+            self.flows.append(Flip())
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        if not reverse:
+            logdet_tot = 0
+            for flow in self.flows:
+                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
+                logdet_tot += logdet
+        else:
+            logdet_tot = None
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x, logdet_tot
+
+
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            intermediate_dim: int,
+            layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0
+    ):
+        super().__init__()
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
+
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value is not None and layer_scale_init_value > 0
+            else None
+        )
+        # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.drop_path = nn.Identity()
+        self.dropout = nn.Dropout(drop_out) if drop_out > 0.
else nn.Identity() + + def forward(self, x: torch.Tensor, ) -> torch.Tensor: + residual = x + x = self.dwconv(x) + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x = self.dropout(x) + + x = residual + self.drop_path(x) + return x + + +class condition_latent_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_latent_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + + return stats + + +class condition_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels, 
kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + + return stats + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + + condition_in_chans, + + condition_encoder_hidden_channels, + condition_encoder_n_heads, + condition_encoder_n_layers, + condition_encoder_kernel_size, + condition_encoder_dropout_rate, + + inter_channels, + hidden_channels, + + condition_channels,flow_wavenet_lay=4, + + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, + ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', + + **kwargs): + + super().__init__() + self.inter_channels = inter_channels + self.ues_condition = ues_condition + + self.use_latent = use_latent + + if use_latent_encoder and use_latent: + if latent_encoder_type == 'attention': + self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=latent_encoder_n_heads, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=latent_encoder_kernel_size, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + elif latent_encoder_type == 'convnext': + self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=None, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=None, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + else: + raise RuntimeError("unsupport_latent_encoder") + + elif ((not use_latent_encoder) and use_latent): + self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) + + if ues_condition_encoder and ues_condition: + if condition_encoder_type == 'attention': + self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=condition_encoder_n_heads, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + elif condition_encoder_type == 'convnext': + self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=None, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + else: + raise RuntimeError("unsupport__encoder") + elif ((not ues_condition_encoder) and ues_condition): + self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) + + self.use_depthwise_conv = use_depthwise_conv + + # self.enc_p = TextEncoder( + # inter_channels, + # hidden_channels, + # filter_channels=filter_channels, + # 
n_heads=n_heads, + # n_layers=n_layers, + # kernel_size=kernel_size, + # p_dropout=p_dropout + # ) + + set_Conv1dModel(self.use_depthwise_conv) + + if ues_condition: + condition_channelsw = condition_channels + else: + condition_channelsw = 0 + + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, + gin_channels=condition_channelsw, share_parameter=flow_share_parameter) + + def forward(self, c, mel, x_mask=None): + + # vol proj + + # f0 predict + + # encoder + if self.use_latent: + z_ptemp, m_p, logs_p = self.latent_encoder(c) + else: + m_p, logs_p = None, None + # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) + + # flow + if self.ues_condition: + condition = self.condition_encoder(c) + z_p, logdet = self.flow(mel, x_mask, g=condition) + else: + z_p, logdet = self.flow(mel, x_mask, g=None) + + return x_mask, (z_p, m_p, logs_p), logdet, + + @torch.no_grad() + def infer(self, c, noice_scale=0.35, seed=None, ): + if seed is not None: + + if c.device == torch.device("cuda"): + torch.cuda.manual_seed_all(seed) + else: + torch.manual_seed(seed) + + if self.use_latent: + z_p, m_p, logs_p = self.latent_encoder(c) + else: + z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale + + # vol proj + + # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) + # o, _ = self.flow(z_p, g=g, reverse=True) + + if self.ues_condition: + condition = self.condition_encoder(c) + # z_p, logdet = self.flow(mel, x_mask, g=condition) + o, _ = self.flow(z_p, g=condition, reverse=True) + else: + o, _ = self.flow(z_p, g=None, reverse=True) + + return o + + +class glow_loss_L(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, pack_loss,target): + + z, m, logs, logdet, mask = pack_loss + # z, m, logs, logdet, mask = None + + l = torch.sum(logs) + 0.5 * torch.sum( + torch.exp(-2 * logs) * ((z - m) ** 2)) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + if mask is not None: + l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes + else: + l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l + + + + + + + +class glow_decoder_convnext(nn.Module): + def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, + condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, + flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, + use_latent=True, + ues_condition_encoder=False, ues_condition=False, + condition_encoder_type='attention'): + super().__init__() + self.use_latent=use_latent + self.flow_infer_seed=flow_infer_seed + self.flow_infer_scale=flow_infer_scale + self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, + latent_encoder_n_heads=latent_encoder_n_heads, + 
latent_encoder_n_layers=latent_encoder_n_layers, + latent_encoder_kernel_size=latent_encoder_kernel_size, + latent_encoder_dropout_rate=latent_encoder_dropout_rate, + + condition_in_chans=encoder_hidden, + + condition_encoder_hidden_channels=condition_encoder_hidden_channels, + condition_encoder_n_heads=condition_encoder_n_heads, + condition_encoder_n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + condition_encoder_dropout_rate=condition_encoder_dropout_rate, + + inter_channels=out_dims, + flow_wavenet_lay=flow_wavenet_lay, + hidden_channels=flow_hidden_channels, + + condition_channels=flow_condition_channels, + + condition_encoder_filter_channels=condition_encoder_filter_channels, + + latent_encoder_filter_channels=latent_encoder_filter_channels, + + use_depthwise_conv=use_depthwise_conv, + + flow_share_parameter=flow_share_parameter, + n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, + use_latent_encoder=use_latent_encoder, + use_latent=use_latent, + ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, + condition_encoder_type=condition_encoder_type) + + self.use_mask=use_mask + self.use_norm=use_norm + + def norm(self,x): + x = (x - (-5)) / (0 - (-5)) * 2 - 1 + return x + + def denorm(self,x): + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + + def build_loss(self): + + + if self.use_latent: + + return glow_loss_L() + + def forward(self, x, infer, x_gt,mask): + if not self.use_mask or infer: + mask=None + else: + mask=mask.transpose(1, 2) + + + + + if infer: + out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + if self.use_norm: + out = self.denorm(out) + return out + else: + if self.use_norm: + x_gt = self.norm(x_gt) + + + x = x.transpose(1, 2) + x_gt=x_gt.transpose(1, 2) + + x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) + + + pack_loss = (z_p, m_p, logs_p, logdet, x_mask) + return pack_loss + + + + + pass diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py index 27a52ce6a..403c32f38 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/shallow/shallow_adapter.py @@ -3,7 +3,7 @@ cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode', 'ns': 'modules.shallow.noise_decoder.noise_decoder', 'ld': 'modules.shallow.light_decoder.noise_decoder','att_fs2':'modules.shallow.fs2_decoder.attention_fs2_decoder' - ,'glow':'modules.shallow.glow.glow_decoder' + ,'glow':'modules.shallow.glow.glow_decoder','glow_convnext':'modules.shallow.convnext_glow.glow_decoder_convnext' } encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} From bf1d62c7f1f0b077ec942ebba2d8a9f1f51c08ba Mon Sep 17 00:00:00 2001 From: autumn <2> Date: Sun, 20 Aug 2023 13:12:18 +0800 Subject: [PATCH 26/33] fix fs2 --- modules/shallow/convnext_glow.py | 32 +- modules/shallow/fast_speech2_decoder.py | 2 +- modules/shallow/fs2_decoder.py | 2 +- modules/shallow/gglow.py | 1033 +++++++++++++++++++++++ modules/shallow/shallow_adapter.py | 2 +- 5 files changed, 1061 insertions(+), 10 deletions(-) create mode 100644 modules/shallow/gglow.py diff --git a/modules/shallow/convnext_glow.py b/modules/shallow/convnext_glow.py index 0420b0100..f9e8dbd6b 100644 --- a/modules/shallow/convnext_glow.py +++ b/modules/shallow/convnext_glow.py @@ -956,7 +956,7 @@ def forward(self, c, mel, x_mask=None): return x_mask, (z_p, m_p, logs_p), logdet, - @torch.no_grad() + def infer(self, 
c, noice_scale=0.35, seed=None, ): if seed is not None: @@ -984,6 +984,14 @@ def infer(self, c, noice_scale=0.35, seed=None, ): return o +class fs2_loss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self,y, x): + x=(x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y,x) + class glow_loss_L(nn.Module): def __init__(self): @@ -1015,7 +1023,7 @@ def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, lat latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, + flow_condition_channels, parame,ft_flow=False,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, condition_encoder_filter_channels=None, latent_encoder_filter_channels=None, @@ -1031,6 +1039,7 @@ def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, lat self.use_latent=use_latent self.flow_infer_seed=flow_infer_seed self.flow_infer_scale=flow_infer_scale + self.ft_flow=ft_flow self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, latent_encoder_n_heads=latent_encoder_n_heads, latent_encoder_n_layers=latent_encoder_n_layers, @@ -1076,26 +1085,35 @@ def denorm(self,x): return x def build_loss(self): - + if self.ft_flow: + return fs2_loss() if self.use_latent: return glow_loss_L() + + def forward(self, x, infer, x_gt,mask): if not self.use_mask or infer: mask=None else: mask=mask.transpose(1, 2) + if self.ft_flow and not infer: + out = self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, + seed=self.flow_infer_seed).transpose(1, 2) + return out + if infer: - out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) - if self.use_norm: - out = self.denorm(out) - return out + with torch.no_grad(): + out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + if self.use_norm: + out = self.denorm(out) + return out else: if self.use_norm: x_gt = self.norm(x_gt) diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/shallow/fast_speech2_decoder.py index ec264f3ce..61dc04860 100644 --- a/modules/shallow/fast_speech2_decoder.py +++ b/modules/shallow/fast_speech2_decoder.py @@ -84,7 +84,7 @@ def build_loss(self): return fs2_loss() - def forward(self, x,infer,**kwargs): + def forward(self, x,infer,*args,**kwargs): x=x.transpose(1, 2) x=self.inconv(x) for i in self.conv: diff --git a/modules/shallow/fs2_decoder.py b/modules/shallow/fs2_decoder.py index 073819dd1..acb3408be 100644 --- a/modules/shallow/fs2_decoder.py +++ b/modules/shallow/fs2_decoder.py @@ -287,7 +287,7 @@ def build_loss(self): return fs2_loss() - def forward(self, x,infer,**kwargs): + def forward(self, x,infer,*args,**kwargs): x=x.transpose(1, 2) x=self.inconv(x) diff --git a/modules/shallow/gglow.py b/modules/shallow/gglow.py new file mode 100644 index 000000000..817005579 --- /dev/null +++ b/modules/shallow/gglow.py @@ -0,0 +1,1033 @@ +import math +from typing import Optional + +import torch +import torch.nn as nn + +import torch.nn.functional as F + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l 
for item in sublist] + return pad_shape + + +class RelativeFFTBlock(nn.Module): + """ FFT Block with Relative Multi-Head Attention """ + + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., + window_size=None, block_length=None): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, + window_size=window_size, p_dropout=p_dropout, + block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN( + hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask=None): + + if x_mask is not None: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + else: + attn_mask = None + + for i in range(self.n_layers): + if x_mask is not None: + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + if x_mask is not None: + x = x * x_mask + return x + + +class RelativeSelfAttention(nn.Module): + """ Relative Multi-Head Attention """ + + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., + block_length=None, proximal_bias=False, proximal_init=False): + super(RelativeSelfAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn( + n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, + 
t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, + t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1) + ) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position( + rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + \ + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like( + scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position( + p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s) + output = output + \ + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view( + b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:, + slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, convert_pad_shape( + [[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad(x_flat, convert_pad_shape( + [[0, 0], [0, 0], [0, length - 1]])) + + # Reshape and slice out the padded elements. 
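+        # (A worked shape example: for length 3, the (3, 5) relative-logit
+        # matrix is padded to (3, 6), flattened to 18 elements, padded by
+        # length - 1 = 2, viewed as (4, 5); the block at rows :3, columns 2:
+        # then holds the absolute-position scores.)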
+        x_final = x_flat.view(
+            [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        batch, heads, length, _ = x.size()
+        # Pad along the column dimension.
+        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
+        # Add zeros at the beginning that will skew the elements after the reshape.
+        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        """
+        Bias for self-attention to encourage attention to close positions.
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-4):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class FFN(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask=None):
+        if x_mask is not None:
+            x = self.conv(x * x_mask)
+        else:
+            x = self.conv(x)
+
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+# FIXME: hacky mutable global alias (reassigned by set_Conv1dModel below); should be removed.
+Conv1dModel = nn.Conv1d
+
+
+class Depthwise_Separable_Conv1D(nn.Module):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            bias=True,
+            padding_mode='zeros',  # TODO: refine this type
+            device=None,
+            dtype=None
+    ):
+        super().__init__()
+        self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
+                                    groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias,
+                                    padding_mode=padding_mode, device=device, dtype=dtype)
+        self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias,
+                                    device=device, dtype=dtype)
+
+    def forward(self, input):
+        return self.point_conv(self.depth_conv(input))
+
+
+def set_Conv1dModel(use_depthwise_conv):
+    global Conv1dModel
+    Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+@torch.jit.script
+def add_and_GRU(input_a, input_b):
+    # WaveNet-style gated activation (tanh/sigmoid gating), not an actual GRU
+    in_act = input_a + input_b
+    x1, x2 = in_act.chunk(2, dim=1)
+    t_act = torch.tanh(x2)
+    s_act = torch.sigmoid(x1)
+    acts = t_act * s_act
+    return acts
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels  # used for the condition input
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        self.condition_layers = torch.nn.ModuleList()
+
+        for i in range(n_layers):
+            # one condition projection per layer (instead of a single shared cond_layer)
+            if gin_channels != 0:
+                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
+            else:
+                cond_layer = nn.Identity()
+            self.condition_layers.append(cond_layer)
+
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
+                                   dilation=dilation, padding=padding)
+            self.in_layers.append(in_layer)
+
+            # the last layer does not need a residual branch
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask=None, g=None, **kwargs):
+        output = torch.zeros_like(x)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+
+            if g is not None:
+                condition = self.condition_layers[i](g)
+            else:
+                condition = torch.zeros_like(x_in)
+
+            acts = add_and_GRU(x_in, condition)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                if x_mask is not None:
+                    x = (x + res_acts) * x_mask
+                else:
+                    x = x + res_acts
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+
+        if x_mask is not None:
+            out = output * x_mask
+        else:
+            out = output
+        return out
+
+    # def remove_weight_norm(self):
+    #     if self.gin_channels != 0:
+    #         remove_weight_norm_modules(self.cond_layer)
+    #     for l in self.in_layers:
+    #         remove_weight_norm_modules(l)
+    #     for l in self.res_skip_layers:
+    #         remove_weight_norm_modules(l)
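+# Design note: the upstream WN this was adapted from projects the condition
+# once with a single Conv1d producing 2 * hidden_channels * n_layers channels
+# and slices it per layer; this variant instead keeps one 1x1 projection per
+# layer in self.condition_layers. The capacity is identical, just split into
+# n_layers narrow convolutions instead of one wide one.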
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 p_dropout=0,
+                 gin_channels=0,
+                 mean_only=False,
+                 wn_sharing_parameter=None  # optional inner network shared across flow steps
+                 ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
+                      gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        if x_mask is not None:
+            h = self.pre(x0) * x_mask
+        else:
+            h = self.pre(x0)
+        h = self.enc(h, x_mask, g=g)
+
+        if x_mask is not None:
+            stats = self.post(h) * x_mask
+        else:
+            stats = self.post(h)
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            if x_mask is not None:
+                x1 = m + x1 * torch.exp(logs) * x_mask
+            else:
+                x1 = m + x1 * torch.exp(logs)
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            # inverse process
+            if x_mask is not None:
+                x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            else:
+                x1 = (x1 - m) * torch.exp(-logs)
+            x = torch.cat([x0, x1], 1)
+            return x
+
+
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0,
+                 share_parameter=False
+                 ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+
+        self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
+                     gin_channels=gin_channels) if share_parameter else None
+
+        for i in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
+            self.flows.append(Flip())
+
+    def forward(self, x, x_mask=None, g=None, reverse=False):
+        if not reverse:
+            logdet_tot = 0
+            for flow in self.flows:
+                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
+                logdet_tot += logdet
+        else:
+            logdet_tot = None
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x, logdet_tot
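+# How the flow is used (a minimal sketch; channel sizes are illustrative
+# assumptions): during training the mel is pushed through the flow in the
+# forward direction, giving a latent z and a log-determinant that feed the
+# Gaussian likelihood in glow_loss_L; at inference a latent is drawn (or
+# produced by the latent encoder) and run through the same flow in reverse.
+#
+#     flow = ResidualCouplingBlock(channels=128, hidden_channels=192,
+#                                  kernel_size=5, dilation_rate=1, n_layers=4)
+#     mel = torch.randn(1, 128, 100)         # (B, out_dims, T)
+#     z, logdet = flow(mel)                  # forward: latent + log|det J|
+#     mel_rec, _ = flow(z, reverse=True)     # reverse: exact inverse of forward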
self.f0_emb(f0).transpose(1, 2) +# x = self.enc_(x * x_mask, x_mask) +# stats = self.proj(x) * x_mask +# m, logs = torch.split(stats, self.out_channels, dim=1) +# z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask +# +# return z, m, logs, x_mask + + +class ConvNeXtBlock(nn.Module): + """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. + + Args: + dim (int): Number of input channels. + intermediate_dim (int): Dimensionality of the intermediate layer. + layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. + Defaults to None. + adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. + None means non-conditional LayerNorm. Defaults to None. + """ + + def __init__( + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 + + ): + super().__init__() + self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + + self.norm = nn.LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(intermediate_dim, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + + def forward(self, x: torch.Tensor, ) -> torch.Tensor: + residual = x + x = self.dwconv(x) + x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) + + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) + x = self.dropout(x) + + x = residual + self.drop_path(x) + return x + + +class condition_latent_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_latent_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, 1) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) + + def forward(self, x, noice_scale=1): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + m, logs = torch.chunk(stats, 2, 
1) + z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) + + return z, m, logs, + + +class condition_encoder_att(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, + n_layers=n_layers, + kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) + + self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + x = self.enc(x) + stats = self.proj_out(x) + + return stats + + +class condition_encoder_convnext(nn.Module): + def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, + filter_channels=None): + super().__init__() + if filter_channels is None: + filter_channels = n_chans * 4 + + self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + self.conv = nn.ModuleList( + [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + + self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, + padding=condition_encoder_kernel_size // 2) + + def forward(self, x, ): + x = self.proj_in(x) + + for i in self.conv: + x = i(x) + stats = self.proj_out(x) + + return stats + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + + condition_in_chans, + + condition_encoder_hidden_channels, + condition_encoder_n_heads, + condition_encoder_n_layers, + condition_encoder_kernel_size, + condition_encoder_dropout_rate, + + inter_channels, + hidden_channels, + + condition_channels,flow_wavenet_lay=4, + + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, + ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', + + **kwargs): + + super().__init__() + self.inter_channels = inter_channels + self.ues_condition = ues_condition + + self.use_latent = use_latent + + if use_latent_encoder and use_latent: + if latent_encoder_type == 'attention': + self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=latent_encoder_n_heads, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=latent_encoder_kernel_size, + dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + elif latent_encoder_type == 'convnext': + self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, + out_channels=inter_channels, + n_chans=latent_encoder_hidden_channels, + n_heads=None, + n_layers=latent_encoder_n_layers, + condition_encoder_kernel_size=None, + 
dropout_rate=latent_encoder_dropout_rate, + filter_channels=latent_encoder_filter_channels) + else: + raise RuntimeError("unsupport_latent_encoder") + + elif ((not use_latent_encoder) and use_latent): + self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) + + if ues_condition_encoder and ues_condition: + if condition_encoder_type == 'attention': + self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=condition_encoder_n_heads, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + elif condition_encoder_type == 'convnext': + self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, + out_channels=condition_channels, + n_chans=condition_encoder_hidden_channels, + n_heads=None, + n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + dropout_rate=condition_encoder_dropout_rate, + filter_channels=condition_encoder_filter_channels) + else: + raise RuntimeError("unsupport__encoder") + elif ((not ues_condition_encoder) and ues_condition): + self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) + + self.use_depthwise_conv = use_depthwise_conv + + # self.enc_p = TextEncoder( + # inter_channels, + # hidden_channels, + # filter_channels=filter_channels, + # n_heads=n_heads, + # n_layers=n_layers, + # kernel_size=kernel_size, + # p_dropout=p_dropout + # ) + + set_Conv1dModel(self.use_depthwise_conv) + + if ues_condition: + condition_channelsw = condition_channels + else: + condition_channelsw = 0 + + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, + gin_channels=condition_channelsw, share_parameter=flow_share_parameter) + + def forward(self, c, mel, x_mask=None): + + # vol proj + + # f0 predict + + # encoder + if self.use_latent: + z_ptemp, m_p, logs_p = self.latent_encoder(c) + else: + m_p, logs_p = None, None + # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) + + # flow + if self.ues_condition: + condition = self.condition_encoder(c) + z_p, logdet = self.flow(mel, x_mask, g=condition) + else: + z_p, logdet = self.flow(mel, x_mask, g=None) + + return x_mask, (z_p, m_p, logs_p), logdet, + + @torch.no_grad() + def infer(self, c, noice_scale=0.35, seed=None, ): + if seed is not None: + + if c.device == torch.device("cuda"): + torch.cuda.manual_seed_all(seed) + else: + torch.manual_seed(seed) + + if self.use_latent: + z_p, m_p, logs_p = self.latent_encoder(c) + else: + z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale + + z_p=z_p.cuda() + + # vol proj + + # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) + # o, _ = self.flow(z_p, g=g, reverse=True) + + if self.ues_condition: + condition = self.condition_encoder(c) + # z_p, logdet = self.flow(mel, x_mask, g=condition) + o, _ = self.flow(z_p, g=condition, reverse=True) + else: + o, _ = self.flow(z_p, g=None, reverse=True) + + return o + + +class glow_loss_L(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, pack_loss,target): + + z, m, logs, logdet, mask = pack_loss + # z, m, logs, logdet, mask = None + + l = 0.5 * torch.sum( + torch.exp(-2 * 
logdet) * ((z ) ** 2)) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + if mask is not None: + l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes + else: + l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l + + + + + +class glow_decoder(nn.Module): + def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, + latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, + condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, + condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, + flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, + condition_encoder_filter_channels=None, + + latent_encoder_filter_channels=None, + + use_depthwise_conv=False, + + flow_share_parameter=False, + n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, + use_latent=True, + ues_condition_encoder=False, ues_condition=False, + condition_encoder_type='attention'): + super().__init__() + self.use_latent=use_latent + self.flow_infer_seed=flow_infer_seed + self.flow_infer_scale=flow_infer_scale + self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, + latent_encoder_n_heads=latent_encoder_n_heads, + latent_encoder_n_layers=latent_encoder_n_layers, + latent_encoder_kernel_size=latent_encoder_kernel_size, + latent_encoder_dropout_rate=latent_encoder_dropout_rate, + + condition_in_chans=encoder_hidden, + + condition_encoder_hidden_channels=condition_encoder_hidden_channels, + condition_encoder_n_heads=condition_encoder_n_heads, + condition_encoder_n_layers=condition_encoder_n_layers, + condition_encoder_kernel_size=condition_encoder_kernel_size, + condition_encoder_dropout_rate=condition_encoder_dropout_rate, + + inter_channels=out_dims, + flow_wavenet_lay=flow_wavenet_lay, + hidden_channels=flow_hidden_channels, + + condition_channels=flow_condition_channels, + + condition_encoder_filter_channels=condition_encoder_filter_channels, + + latent_encoder_filter_channels=latent_encoder_filter_channels, + + use_depthwise_conv=use_depthwise_conv, + + flow_share_parameter=flow_share_parameter, + n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, + use_latent_encoder=use_latent_encoder, + use_latent=use_latent, + ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, + condition_encoder_type=condition_encoder_type) + + self.use_mask=use_mask + self.use_norm=use_norm + + def norm(self,x): + x = (x - (-5)) / (0 - (-5)) * 2 - 1 + return x + + def denorm(self,x): + x=(x + 1) / 2 * (0 - (-5)) + (-5) + return x + + def build_loss(self): + + + if self.use_latent: + + return glow_loss_L() + + return glow_loss_L() + def forward(self, x, infer, x_gt,mask): + if not self.use_mask or infer: + mask=None + else: + mask=mask.transpose(1, 2) + + + + + if infer: + out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) + if self.use_norm: + out = self.denorm(out) + return out + else: + if self.use_norm: + x_gt = self.norm(x_gt) + + + x = x.transpose(1, 2) + x_gt=x_gt.transpose(1, 2) + + x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) + + + pack_loss 
= (z_p, m_p, logs_p, logdet, x_mask) + return pack_loss + + + + + pass diff --git a/modules/shallow/shallow_adapter.py b/modules/shallow/shallow_adapter.py index 403c32f38..1a35c8507 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/shallow/shallow_adapter.py @@ -3,7 +3,7 @@ cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode', 'ns': 'modules.shallow.noise_decoder.noise_decoder', 'ld': 'modules.shallow.light_decoder.noise_decoder','att_fs2':'modules.shallow.fs2_decoder.attention_fs2_decoder' - ,'glow':'modules.shallow.glow.glow_decoder','glow_convnext':'modules.shallow.convnext_glow.glow_decoder_convnext' + ,'glow':'modules.shallow.glow.glow_decoder','glow_convnext':'modules.shallow.convnext_glow.glow_decoder_convnext','gglow':'modules.shallow.gglow.glow_decoder','fast_speech2_decoders':'modules.shallow.fast_speech2_decoders.fs2_decode' } encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} From 6bbdf6c549a33af62a1d7278001cc1661ebbd97e Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Mon, 21 Aug 2023 18:16:26 +0800 Subject: [PATCH 27/33] Support using gt mel as source during validation --- configs/acoustic.yaml | 1 + modules/toplevel.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 82f2163a8..84ef7d721 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -82,6 +82,7 @@ diff_depth: 400 shallow_diffusion_args: train_aux_decoder: true train_diffusion: true + val_gt_start: false aux_share_encoder: true aux_encoder_strict_hparams: false aux_encoder_arch: fs2 diff --git a/modules/toplevel.py b/modules/toplevel.py index 211727d0e..2145489f2 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -49,11 +49,11 @@ def __init__(self, vocab_size, out_dims): ) self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False) + self.shallow_args = hparams['shallow_diffusion_args'] if self.use_shallow_diffusion: - shallow_args = hparams['shallow_diffusion_args'] - self.train_aux_decoder = shallow_args['train_aux_decoder'] - self.train_diffusion = shallow_args['train_diffusion'] - self.aux_decoder_grad = shallow_args['aux_decoder_grad'] + self.train_aux_decoder = self.shallow_args['train_aux_decoder'] + self.train_diffusion = self.shallow_args['train_diffusion'] + self.aux_decoder_grad = self.shallow_args['aux_decoder_grad'] self.aux_decoder = shallow_adapt(hparams, out_dims,vocab_size) self.diffusion = GaussianDiffusion( @@ -79,21 +79,22 @@ def forward( txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, **kwargs ) - if infer: if self.use_shallow_diffusion: aux_mel_pred = self.aux_decoder(condition, infer=True,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs) - aux_mel_pred *= ((mel2ph > 0).float()[:, :, None]) + if gt_mel is not None and self.shallow_args['val_gt_start']: + src_mel = gt_mel + else: + src_mel = aux_mel_pred else: - aux_mel_pred = None - mel_pred = self.diffusion(condition, src_spec=aux_mel_pred, infer=True) + aux_mel_pred = src_mel = None + mel_pred = self.diffusion(condition, src_spec=src_mel, infer=True) mel_pred *= ((mel2ph > 0).float()[:, :, None]) return ShallowDiffusionOutput(aux_out=aux_mel_pred, diff_out=mel_pred) else: if self.use_shallow_diffusion: - # TODO: replace the following placeholder with real calling code if self.train_aux_decoder: aux_cond = condition * self.aux_decoder_grad + condition.detach() * 
(1 - self.aux_decoder_grad) aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, From f1cc641bf3b0b5a481822269427839696565d782 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 21 Sep 2023 14:59:54 +0800 Subject: [PATCH 28/33] Clean files and configs --- configs/acoustic.yaml | 15 +- modules/{shallow => aux_decoder}/__init__.py | 0 .../fast_speech2_decoder.py | 48 +- .../{shallow => aux_decoder}/fs2_decoder.py | 0 .../shallow_adapter.py | 6 +- modules/shallow/convnext_glow.py | 1134 ----------------- modules/shallow/gglow.py | 1033 --------------- modules/shallow/glow.py | 1032 --------------- modules/shallow/light_decoder.py | 109 -- modules/shallow/noise_decoder.py | 100 -- modules/toplevel.py | 2 +- 11 files changed, 35 insertions(+), 3444 deletions(-) rename modules/{shallow => aux_decoder}/__init__.py (100%) rename modules/{shallow => aux_decoder}/fast_speech2_decoder.py (62%) rename modules/{shallow => aux_decoder}/fs2_decoder.py (100%) rename modules/{shallow => aux_decoder}/shallow_adapter.py (85%) delete mode 100644 modules/shallow/convnext_glow.py delete mode 100644 modules/shallow/gglow.py delete mode 100644 modules/shallow/glow.py delete mode 100644 modules/shallow/light_decoder.py delete mode 100644 modules/shallow/noise_decoder.py diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 84ef7d721..e892018d8 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -79,18 +79,23 @@ schedule_type: 'linear' # shallow diffusion use_shallow_diffusion: false diff_depth: 400 + shallow_diffusion_args: train_aux_decoder: true train_diffusion: true + shared_encoder: true val_gt_start: false aux_share_encoder: true aux_encoder_strict_hparams: false - aux_encoder_arch: fs2 - aux_encoder_args: {} - aux_decoder_grad: 0.1 - aux_decoder_arch: fs2 + aux_decoder_arch: convnext + aux_decoder_args: + num_channels: 512 + num_layers: 6 + kernel_size: 7 + dropout_rate: 0.1 aux_decoder_strict_hparams: true - aux_decoder_args: {} + aux_decoder_grad: 0.1 + lambda_aux_mel_loss: 0.2 # train and eval diff --git a/modules/shallow/__init__.py b/modules/aux_decoder/__init__.py similarity index 100% rename from modules/shallow/__init__.py rename to modules/aux_decoder/__init__.py diff --git a/modules/shallow/fast_speech2_decoder.py b/modules/aux_decoder/fast_speech2_decoder.py similarity index 62% rename from modules/shallow/fast_speech2_decoder.py rename to modules/aux_decoder/fast_speech2_decoder.py index 61dc04860..1ccb76fcc 100644 --- a/modules/shallow/fast_speech2_decoder.py +++ b/modules/aux_decoder/fast_speech2_decoder.py @@ -4,8 +4,6 @@ import torch.nn as nn - - class ConvNeXtBlock(nn.Module): """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. @@ -19,17 +17,15 @@ class ConvNeXtBlock(nn.Module): """ def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 + self, + dim: int, + intermediate_dim: int, + layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 ): super().__init__() self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers self.act = nn.GELU() @@ -41,14 +37,13 @@ def __init__( ) # self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() self.drop_path = nn.Identity() - self.dropout=nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() + self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() def forward(self, x: torch.Tensor, ) -> torch.Tensor: residual = x x = self.dwconv(x) x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - x = self.norm(x) x = self.pwconv1(x) x = self.act(x) @@ -56,9 +51,9 @@ def forward(self, x: torch.Tensor, ) -> torch.Tensor: if self.gamma is not None: x = self.gamma * x x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x=self.dropout(x) + x = self.dropout(x) - x = residual + self.drop_path (x) + x = residual + self.drop_path(x) return x @@ -66,31 +61,30 @@ class fs2_loss(nn.Module): def __init__(self): super().__init__() - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) + def forward(self, y, x): + x = (x - (-5)) / (0 - (-5)) * 2 - 1 + return nn.L1Loss()(y, x) class fs2_decode(nn.Module): - def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): + def __init__(self, encoder_hidden, out_dims, n_chans, kernel_size, dropout_rate, n_layers, parame): super().__init__() - self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) - self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - + self.inconv = nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans, intermediate_dim=n_chans * 4, layer_scale_init_value=1e-6, + drop_out=dropout_rate) for _ in range(n_layers)]) + self.outconv = nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) def build_loss(self): return fs2_loss() - def forward(self, x,infer,*args,**kwargs): - x=x.transpose(1, 2) - x=self.inconv(x) + def forward(self, x, infer, *args, **kwargs): + x = x.transpose(1, 2) + x = self.inconv(x) for i in self.conv: - x=i(x) - x=self.outconv(x).transpose(1, 2) + x = i(x) + x = self.outconv(x).transpose(1, 2) if infer: - x=(x + 1) / 2 * (0 - (-5)) + (-5) + x = (x + 1) / 2 * (0 - (-5)) + (-5) return x pass diff --git a/modules/shallow/fs2_decoder.py b/modules/aux_decoder/fs2_decoder.py similarity index 100% rename from modules/shallow/fs2_decoder.py rename to modules/aux_decoder/fs2_decoder.py diff --git a/modules/shallow/shallow_adapter.py b/modules/aux_decoder/shallow_adapter.py similarity index 85% rename from modules/shallow/shallow_adapter.py rename to modules/aux_decoder/shallow_adapter.py index 1a35c8507..53d47c56a 100644 --- a/modules/shallow/shallow_adapter.py +++ b/modules/aux_decoder/shallow_adapter.py @@ -1,9 +1,9 @@ import torch import torch.nn as nn -cls_map = {'fs2': 'modules.shallow.fast_speech2_decoder.fs2_decode', - 'ns': 'modules.shallow.noise_decoder.noise_decoder', 'ld': 'modules.shallow.light_decoder.noise_decoder','att_fs2':'modules.shallow.fs2_decoder.attention_fs2_decoder' - ,'glow':'modules.shallow.glow.glow_decoder','glow_convnext':'modules.shallow.convnext_glow.glow_decoder_convnext','gglow':'modules.shallow.gglow.glow_decoder','fast_speech2_decoders':'modules.shallow.fast_speech2_decoders.fs2_decode' +cls_map = {'fs2': 'modules.aux_decoder.fast_speech2_decoder.fs2_decode', + 'ns': 'modules.aux_decoder.noise_decoder.noise_decoder', 'ld': 
'modules.aux_decoder.light_decoder.noise_decoder','att_fs2':'modules.aux_decoder.fs2_decoder.attention_fs2_decoder' + ,'glow':'modules.aux_decoder.glow.glow_decoder','glow_convnext':'modules.aux_decoder.convnext_glow.glow_decoder_convnext','gglow':'modules.aux_decoder.gglow.glow_decoder','fast_speech2_decoders':'modules.aux_decoder.fast_speech2_decoders.fs2_decode' } encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} diff --git a/modules/shallow/convnext_glow.py b/modules/shallow/convnext_glow.py deleted file mode 100644 index f9e8dbd6b..000000000 --- a/modules/shallow/convnext_glow.py +++ /dev/null @@ -1,1134 +0,0 @@ -import math -from typing import Optional - -import torch -import torch.nn as nn - -import torch.nn.functional as F - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -class RelativeFFTBlock(nn.Module): - """ FFT Block with Relative Multi-Head Attention """ - - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., - window_size=None, block_length=None): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.block_length = block_length - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, - window_size=window_size, p_dropout=p_dropout, - block_length=block_length)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN( - hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask=None): - - if x_mask is not None: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - else: - attn_mask = None - - for i in range(self.n_layers): - if x_mask is not None: - x = x * x_mask - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - if x_mask is not None: - x = x * x_mask - return x - - -class RelativeSelfAttention(nn.Module): - """ Relative Multi-Head Attention """ - - def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., - block_length=None, proximal_bias=False, proximal_init=False): - super(RelativeSelfAttention, self).__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.p_dropout = p_dropout - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels ** -0.5 - self.emb_rel_k = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - 
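-            # One learned embedding per relative offset in [-window_size, window_size]
-            # (2 * window_size + 1 rows, shared across heads when heads_share is True):
-            # emb_rel_k above biases the attention logits, emb_rel_v below re-weights
-            # the attention-averaged values.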
self.emb_rel_v = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - if proximal_init: - self.conv_k.weight.data.copy_(self.conv_q.weight.data) - self.conv_k.bias.data.copy_(self.conv_q.bias.data) - nn.init.xavier_uniform_(self.conv_v.weight) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, - t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, - t_s).transpose(2, 3) - - scores = torch.matmul(query, key.transpose(-2, -1) - ) / math.sqrt(self.k_channels) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query, key_relative_embeddings) - rel_logits = self._relative_position_to_absolute_position( - rel_logits) - scores_local = rel_logits / math.sqrt(self.k_channels) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + \ - self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - block_mask = torch.ones_like( - scores).triu(-self.block_length).tril(self.block_length) - scores = scores * block_mask + -1e4 * (1 - block_mask) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position( - p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s) - output = output + \ - self._matmul_with_relative_values( - relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view( - b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. 
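-        # For sequence length L and window W the table has 2W + 1 rows; it is padded
-        # up to at least 2L - 1 rows and then sliced to exactly the 2L - 1 relative
-        # offsets in use, keeping the op free of data-dependent control flow.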
- pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, - slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [0, length - 1]])) - - # Reshape and slice out the padded elements. - x_final = x_flat.view( - [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:] - return x_final - - def _absolute_position_to_relative_position(self, x): - batch, heads, length, _ = x.size() - # padd along column - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, length - 1]])) - x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """ - Bias for self-attention to encourage attention to close positions. - """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-4): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - n_dims = len(x.shape) - mean = torch.mean(x, 1, keepdim=True) - variance = torch.mean((x - mean) ** 2, 1, keepdim=True) - - x = (x - mean) * torch.rsqrt(variance + self.eps) - - shape = [1, -1] + [1] * (n_dims - 2) - x = x * self.gamma.view(*shape) + self.beta.view(*shape) - return x - - -class FFN(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - - self.conv = nn.Conv1d( - in_channels, out_channels, kernel_size, padding=kernel_size // 2) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask=None): - if x_mask is not None: - x = self.conv(x * x_mask) - else: - x = self.conv(x) - - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - if x_mask is not None: - x = x * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -Conv1dModel = nn.Conv1d # 有毒 删 - - -class Depthwise_Separable_Conv1D(nn.Module): - def 
__init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - bias=True, - padding_mode='zeros', # TODO: refine this type - device=None, - dtype=None - ): - super().__init__() - self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, - groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias, - padding_mode=padding_mode, device=device, dtype=dtype) - self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, - device=device, dtype=dtype) - - def forward(self, input): - return self.point_conv(self.depth_conv(input)) - - -def set_Conv1dModel(use_depthwise_conv): - global Conv1dModel - Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -@torch.jit.script -def add_and_GRU(input_a, input_b): - in_act = input_a + input_b - x1, x2 = in_act.chunk(2, dim=1) - t_act = torch.tanh(x2) - s_act = torch.sigmoid(x1) - acts = t_act * s_act - return acts - - -class WN(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() - assert (kernel_size % 2 == 1) - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels # condition用的 - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - self.condition_layers = torch.nn.ModuleList() - - # if gin_channels != 0: - # cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) - # # self.cond_layer = weight_norm_modules(cond_layer, name='weight') - # self.cond_layer=cond_layer - - for i in range(n_layers): - - if gin_channels != 0: - cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1) - # self.cond_layer = weight_norm_modules(cond_layer, name='weight') - # self.cond_layer = cond_layer - else: - cond_layer = nn.Identity() - self.condition_layers.append(cond_layer) - - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size, - dilation=dilation, padding=padding) - # in_layer = weight_norm_modules(in_layer, name='weight') - self.in_layers.append(in_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - # res_skip_layer = weight_norm_modules(res_skip_layer, name='weight') - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask=None, g=None, **kwargs): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - # if g is not None: - # g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - - if g is not None: - - condition = self.condition_layers[i](g) - else: - condition = torch.zeros_like(x_in) - - # acts = fused_add_tanh_sigmoid_multiply( # GRU 这不就是wavnet的那个 GRU - # x_in, - # 
condition, - # n_channels_tensor) - acts = add_and_GRU( # GRU 这不就是wavnet的那个 GRU - x_in, - condition, - ) - acts = self.drop(acts) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:, :self.hidden_channels, :] - if x_mask is not None: - x = (x + res_acts) * x_mask - else: - x = x + res_acts - output = output + res_skip_acts[:, self.hidden_channels:, :] - else: - output = output + res_skip_acts - - if x_mask is not None: - out = output * x_mask - else: - out = output - return out -pass -class ConvNeXtBlock_condition(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, dilation, padding, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0,condione: int=0 - - ): - super().__init__() - if condione!=0: - self.cond_layer = torch.nn.Conv1d(condione, intermediate_dim, 1) - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=padding, groups=dim,dilation=dilation) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout = nn.Dropout(drop_out) if drop_out > 0. 
else nn.Identity() - - def forward(self, x: torch.Tensor,condition=None ) -> torch.Tensor: - - - - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - x = self.norm(x) - x = self.pwconv1(x) - if condition is not None: - - condition = self.cond_layer(condition) - else: - condition = torch.zeros_like(x.transpose(1, 2)) - - x=x+condition.transpose(1, 2) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x = self.dropout(x) - - x = residual + self.drop_path(x) - return x - -pass - - -class CONVnext_flow(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0,innx=3): - super().__init__() - assert (kernel_size % 2 == 1) - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels # condition用的 - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - self.condition_layers = torch.nn.ModuleList() - - # if gin_channels != 0: - # cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) - # # self.cond_layer = weight_norm_modules(cond_layer, name='weight') - # self.cond_layer=cond_layer - - for i in range(n_layers): - kernel_size=7 - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - - - - in_layer = ConvNeXtBlock_condition(dim=hidden_channels, intermediate_dim=innx * hidden_channels, drop_out=p_dropout, - dilation=dilation, padding=padding,layer_scale_init_value=1e-6,condione=gin_channels) - # in_layer = weight_norm_modules(in_layer, name='weight') - self.in_layers.append(in_layer) - - # last one is not necessary - - def forward(self, x, x_mask=None, g=None, **kwargs): - - - # if g is not None: - # g = self.cond_layer(g) - - for i in range(self.n_layers): - - - x = self.in_layers[i](x,g) - - if x_mask is not None: - x = x * x_mask - else: - x = x - - - - - - - - return x - - - - -class ResidualCouplingLayer(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False, - wn_sharing_parameter=None # 不明的共享权重 - ): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = CONVnext_flow(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, - gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask=None, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - if x_mask is not None: - h = self.pre(x0) * x_mask - else: - h = self.pre(x0) - h = self.enc(h, x_mask, g=g) - - if x_mask is not None: - stats = self.post(h) * x_mask - else: - stats = self.post(h) - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = 
torch.zeros_like(m) - - if not reverse: - if x_mask is not None: - x1 = m + x1 * torch.exp(logs) * x_mask - else: - x1 = m + x1 * torch.exp(logs) - # x1 = m + x1 * torch.exp(logs) * x_mask # 逆过程 - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1, 2]) - return x, logdet - else: - if x_mask is not None: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - else: - x1 = (x1 - m) * torch.exp(-logs) - # x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x - - -class ResidualCouplingBlock(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0, - share_parameter=False - ): - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - - self.wn = CONVnext_flow(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, - gin_channels=gin_channels) if share_parameter else None - - for i in range(n_flows): - self.flows.append( - ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, - gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn)) - self.flows.append(Flip()) - - def forward(self, x, x_mask=None, g=None, reverse=False): - if not reverse: - logdet_tot = 0 - for flow in self.flows: - x, logdet = flow(x, x_mask, g=g, reverse=reverse) - logdet_tot += logdet - else: - logdet_tot = None - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x, logdet_tot - - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout = nn.Dropout(drop_out) if drop_out > 0. 
else nn.Identity() - - def forward(self, x: torch.Tensor, ) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x = self.dropout(x) - - x = residual + self.drop_path(x) - return x - - -class condition_latent_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_latent_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - - return stats - - -class condition_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels, 
kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - - return stats - - -class SynthesizerTrn(nn.Module): - """ - Synthesizer for Training - """ - - def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - - condition_in_chans, - - condition_encoder_hidden_channels, - condition_encoder_n_heads, - condition_encoder_n_layers, - condition_encoder_kernel_size, - condition_encoder_dropout_rate, - - inter_channels, - hidden_channels, - - condition_channels,flow_wavenet_lay=4, - - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, - ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', - - **kwargs): - - super().__init__() - self.inter_channels = inter_channels - self.ues_condition = ues_condition - - self.use_latent = use_latent - - if use_latent_encoder and use_latent: - if latent_encoder_type == 'attention': - self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=latent_encoder_n_heads, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=latent_encoder_kernel_size, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - elif latent_encoder_type == 'convnext': - self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=None, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=None, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - else: - raise RuntimeError("unsupport_latent_encoder") - - elif ((not use_latent_encoder) and use_latent): - self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) - - if ues_condition_encoder and ues_condition: - if condition_encoder_type == 'attention': - self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=condition_encoder_n_heads, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - elif condition_encoder_type == 'convnext': - self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=None, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - else: - raise RuntimeError("unsupport__encoder") - elif ((not ues_condition_encoder) and ues_condition): - self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) - - self.use_depthwise_conv = use_depthwise_conv - - # self.enc_p = TextEncoder( - # inter_channels, - # hidden_channels, - # filter_channels=filter_channels, - # 
n_heads=n_heads, - # n_layers=n_layers, - # kernel_size=kernel_size, - # p_dropout=p_dropout - # ) - - set_Conv1dModel(self.use_depthwise_conv) - - if ues_condition: - condition_channelsw = condition_channels - else: - condition_channelsw = 0 - - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, - gin_channels=condition_channelsw, share_parameter=flow_share_parameter) - - def forward(self, c, mel, x_mask=None): - - # vol proj - - # f0 predict - - # encoder - if self.use_latent: - z_ptemp, m_p, logs_p = self.latent_encoder(c) - else: - m_p, logs_p = None, None - # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) - - # flow - if self.ues_condition: - condition = self.condition_encoder(c) - z_p, logdet = self.flow(mel, x_mask, g=condition) - else: - z_p, logdet = self.flow(mel, x_mask, g=None) - - return x_mask, (z_p, m_p, logs_p), logdet, - - - def infer(self, c, noice_scale=0.35, seed=None, ): - if seed is not None: - - if c.device == torch.device("cuda"): - torch.cuda.manual_seed_all(seed) - else: - torch.manual_seed(seed) - - if self.use_latent: - z_p, m_p, logs_p = self.latent_encoder(c) - else: - z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale - - # vol proj - - # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) - # o, _ = self.flow(z_p, g=g, reverse=True) - - if self.ues_condition: - condition = self.condition_encoder(c) - # z_p, logdet = self.flow(mel, x_mask, g=condition) - o, _ = self.flow(z_p, g=condition, reverse=True) - else: - o, _ = self.flow(z_p, g=None, reverse=True) - - return o - -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) - - -class glow_loss_L(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, pack_loss,target): - - z, m, logs, logdet, mask = pack_loss - # z, m, logs, logdet, mask = None - - l = torch.sum(logs) + 0.5 * torch.sum( - torch.exp(-2 * logs) * ((z - m) ** 2)) # neg normal likelihood w/o the constant term - l = l - torch.sum(logdet) # log jacobian determinant - if mask is not None: - l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes - else: - l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes - l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term - return l - - - - - - - -class glow_decoder_convnext(nn.Module): - def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, - condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,ft_flow=False,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, - use_latent=True, - ues_condition_encoder=False, ues_condition=False, - condition_encoder_type='attention'): - super().__init__() - self.use_latent=use_latent - self.flow_infer_seed=flow_infer_seed - self.flow_infer_scale=flow_infer_scale - 
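-        # ft_flow switches the module into a fine-tuning mode: during training the
-        # flow is still run in the generation direction (via infer()) and its output
-        # is supervised with the L1 loss from fs2_loss instead of the flow likelihood
-        # (see build_loss() and forward() below).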
self.ft_flow=ft_flow - self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, - latent_encoder_n_heads=latent_encoder_n_heads, - latent_encoder_n_layers=latent_encoder_n_layers, - latent_encoder_kernel_size=latent_encoder_kernel_size, - latent_encoder_dropout_rate=latent_encoder_dropout_rate, - - condition_in_chans=encoder_hidden, - - condition_encoder_hidden_channels=condition_encoder_hidden_channels, - condition_encoder_n_heads=condition_encoder_n_heads, - condition_encoder_n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - condition_encoder_dropout_rate=condition_encoder_dropout_rate, - - inter_channels=out_dims, - flow_wavenet_lay=flow_wavenet_lay, - hidden_channels=flow_hidden_channels, - - condition_channels=flow_condition_channels, - - condition_encoder_filter_channels=condition_encoder_filter_channels, - - latent_encoder_filter_channels=latent_encoder_filter_channels, - - use_depthwise_conv=use_depthwise_conv, - - flow_share_parameter=flow_share_parameter, - n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, - use_latent_encoder=use_latent_encoder, - use_latent=use_latent, - ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, - condition_encoder_type=condition_encoder_type) - - self.use_mask=use_mask - self.use_norm=use_norm - - def norm(self,x): - x = (x - (-5)) / (0 - (-5)) * 2 - 1 - return x - - def denorm(self,x): - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - - def build_loss(self): - if self.ft_flow: - return fs2_loss() - - if self.use_latent: - - return glow_loss_L() - - - - def forward(self, x, infer, x_gt,mask): - if not self.use_mask or infer: - mask=None - else: - mask=mask.transpose(1, 2) - - if self.ft_flow and not infer: - out = self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, - seed=self.flow_infer_seed).transpose(1, 2) - return out - - - - - if infer: - with torch.no_grad(): - out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) - if self.use_norm: - out = self.denorm(out) - return out - else: - if self.use_norm: - x_gt = self.norm(x_gt) - - - x = x.transpose(1, 2) - x_gt=x_gt.transpose(1, 2) - - x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) - - - pack_loss = (z_p, m_p, logs_p, logdet, x_mask) - return pack_loss - - - - - pass diff --git a/modules/shallow/gglow.py b/modules/shallow/gglow.py deleted file mode 100644 index 817005579..000000000 --- a/modules/shallow/gglow.py +++ /dev/null @@ -1,1033 +0,0 @@ -import math -from typing import Optional - -import torch -import torch.nn as nn - -import torch.nn.functional as F - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -class RelativeFFTBlock(nn.Module): - """ FFT Block with Relative Multi-Head Attention """ - - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., - window_size=None, block_length=None): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.block_length = block_length - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = 
nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, - window_size=window_size, p_dropout=p_dropout, - block_length=block_length)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN( - hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask=None): - - if x_mask is not None: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - else: - attn_mask = None - - for i in range(self.n_layers): - if x_mask is not None: - x = x * x_mask - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - if x_mask is not None: - x = x * x_mask - return x - - -class RelativeSelfAttention(nn.Module): - """ Relative Multi-Head Attention """ - - def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., - block_length=None, proximal_bias=False, proximal_init=False): - super(RelativeSelfAttention, self).__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.p_dropout = p_dropout - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels ** -0.5 - self.emb_rel_k = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - if proximal_init: - self.conv_k.weight.data.copy_(self.conv_q.weight.data) - self.conv_k.bias.data.copy_(self.conv_q.bias.data) - nn.init.xavier_uniform_(self.conv_v.weight) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, - t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, - t_s).transpose(2, 3) - - scores = torch.matmul(query, key.transpose(-2, -1) - ) / math.sqrt(self.k_channels) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." 
- key_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query, key_relative_embeddings) - rel_logits = self._relative_position_to_absolute_position( - rel_logits) - scores_local = rel_logits / math.sqrt(self.k_channels) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + \ - self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - block_mask = torch.ones_like( - scores).triu(-self.block_length).tril(self.block_length) - scores = scores * block_mask + -1e4 * (1 - block_mask) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position( - p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s) - output = output + \ - self._matmul_with_relative_values( - relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view( - b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, - slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [0, length - 1]])) - - # Reshape and slice out the padded elements. - x_final = x_flat.view( - [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:] - return x_final - - def _absolute_position_to_relative_position(self, x): - batch, heads, length, _ = x.size() - # padd along column - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, length - 1]])) - x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """ - Bias for self-attention to encourage attention to close positions. 
- """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-4): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - n_dims = len(x.shape) - mean = torch.mean(x, 1, keepdim=True) - variance = torch.mean((x - mean) ** 2, 1, keepdim=True) - - x = (x - mean) * torch.rsqrt(variance + self.eps) - - shape = [1, -1] + [1] * (n_dims - 2) - x = x * self.gamma.view(*shape) + self.beta.view(*shape) - return x - - -class FFN(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - - self.conv = nn.Conv1d( - in_channels, out_channels, kernel_size, padding=kernel_size // 2) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask=None): - if x_mask is not None: - x = self.conv(x * x_mask) - else: - x = self.conv(x) - - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - if x_mask is not None: - x = x * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -Conv1dModel = nn.Conv1d # 有毒 删 - - -class Depthwise_Separable_Conv1D(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - bias=True, - padding_mode='zeros', # TODO: refine this type - device=None, - dtype=None - ): - super().__init__() - self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, - groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias, - padding_mode=padding_mode, device=device, dtype=dtype) - self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, - device=device, dtype=dtype) - - def forward(self, input): - return self.point_conv(self.depth_conv(input)) - - -def set_Conv1dModel(use_depthwise_conv): - global Conv1dModel - Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -@torch.jit.script -def add_and_GRU(input_a, input_b): - in_act = input_a + input_b - x1, x2 = in_act.chunk(2, dim=1) - t_act = torch.tanh(x2) - s_act = torch.sigmoid(x1) - acts = t_act * s_act - return acts - - -class WN(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() - assert (kernel_size % 2 == 1) - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels # condition用的 - self.p_dropout = p_dropout - - 
-class WN(torch.nn.Module):
-    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
-        super(WN, self).__init__()
-        assert (kernel_size % 2 == 1)
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels  # used for the condition
-        self.p_dropout = p_dropout
-
-        self.in_layers = torch.nn.ModuleList()
-        self.res_skip_layers = torch.nn.ModuleList()
-        self.drop = nn.Dropout(p_dropout)
-        self.condition_layers = torch.nn.ModuleList()
-
-        # if gin_channels != 0:
-        #     cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-        #     # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
-        #     self.cond_layer=cond_layer
-
-        for i in range(n_layers):
-
-            if gin_channels != 0:
-                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
-                # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
-                # self.cond_layer = cond_layer
-            else:
-                cond_layer = nn.Identity()
-            self.condition_layers.append(cond_layer)
-
-            dilation = dilation_rate ** i
-            padding = int((kernel_size * dilation - dilation) / 2)
-            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
-                                   dilation=dilation, padding=padding)
-            # in_layer = weight_norm_modules(in_layer, name='weight')
-            self.in_layers.append(in_layer)
-
-            # last one is not necessary
-            if i < n_layers - 1:
-                res_skip_channels = 2 * hidden_channels
-            else:
-                res_skip_channels = hidden_channels
-
-            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
-            # res_skip_layer = weight_norm_modules(res_skip_layer, name='weight')
-            self.res_skip_layers.append(res_skip_layer)
-
-    def forward(self, x, x_mask=None, g=None, **kwargs):
-        output = torch.zeros_like(x)
-        n_channels_tensor = torch.IntTensor([self.hidden_channels])
-
-        # if g is not None:
-        #     g = self.cond_layer(g)
-
-        for i in range(self.n_layers):
-            x_in = self.in_layers[i](x)
-
-            if g is not None:
-
-                condition = self.condition_layers[i](g)
-            else:
-                condition = torch.zeros_like(x_in)
-
-            # acts = fused_add_tanh_sigmoid_multiply(  # "GRU" -- this is just the WaveNet gated unit
-            #     x_in,
-            #     condition,
-            #     n_channels_tensor)
-            acts = add_and_GRU(  # "GRU" -- this is just the WaveNet gated unit
-                x_in,
-                condition,
-            )
-            acts = self.drop(acts)
-
-            res_skip_acts = self.res_skip_layers[i](acts)
-            if i < self.n_layers - 1:
-                res_acts = res_skip_acts[:, :self.hidden_channels, :]
-                if x_mask is not None:
-                    x = (x + res_acts) * x_mask
-                else:
-                    x = x + res_acts
-                output = output + res_skip_acts[:, self.hidden_channels:, :]
-            else:
-                output = output + res_skip_acts
-
-        if x_mask is not None:
-            out = output * x_mask
-        else:
-            out = output
-        return out
-
-    # def remove_weight_norm(self):
-    #     if self.gin_channels != 0:
-    #         remove_weight_norm_modules(self.cond_layer)
-    #     for l in self.in_layers:
-    #         remove_weight_norm_modules(l)
-    #     for l in self.res_skip_layers:
-    #         remove_weight_norm_modules(l)
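The coupling layer below transforms only half of the channels, using shift/scale statistics predicted from the untouched half, so the mapping is invertible in closed form and its log-determinant is simply the sum of the predicted log-scales. A self-contained sketch of the forward/inverse pair (random tensors stand in for the WN-predicted `m` and `logs`):

import torch

x0 = torch.randn(1, 40, 100)          # untouched half (drives the WN in the real layer)
x1 = torch.randn(1, 40, 100)          # half that gets transformed
m = torch.randn(1, 40, 100)           # shift, stand-in for the post-conv output
logs = 0.1 * torch.randn(1, 40, 100)  # log-scale, stand-in for the post-conv output

# forward: y1 = m + x1 * exp(logs), log|det J| = sum(logs)
y1 = m + x1 * torch.exp(logs)
logdet = torch.sum(logs, dim=[1, 2])

# inverse recovers x1 exactly, no iterative solve needed
x1_rec = (y1 - m) * torch.exp(-logs)
assert torch.allclose(x1, x1_rec, atol=1e-5)

Note also that zero-initializing `self.post` makes every coupling start out as the identity map, a common trick to stabilize early flow training.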
-
-
-class ResidualCouplingLayer(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 p_dropout=0,
-                 gin_channels=0,
-                 mean_only=False,
-                 wn_sharing_parameter=None  # shared weights; purpose unclear
-                 ):
-        assert channels % 2 == 0, "channels should be divisible by 2"
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.half_channels = channels // 2
-        self.mean_only = mean_only
-
-        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
-                      gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
-        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
-        self.post.weight.data.zero_()
-        self.post.bias.data.zero_()
-
-    def forward(self, x, x_mask=None, g=None, reverse=False):
-        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
-        if x_mask is not None:
-            h = self.pre(x0) * x_mask
-        else:
-            h = self.pre(x0)
-        h = self.enc(h, x_mask, g=g)
-
-        if x_mask is not None:
-            stats = self.post(h) * x_mask
-        else:
-            stats = self.post(h)
-        if not self.mean_only:
-            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
-        else:
-            m = stats
-            logs = torch.zeros_like(m)
-
-        if not reverse:
-            if x_mask is not None:
-                x1 = m + x1 * torch.exp(logs) * x_mask
-            else:
-                x1 = m + x1 * torch.exp(logs)
-            # x1 = m + x1 * torch.exp(logs) * x_mask  # inverse process
-            x = torch.cat([x0, x1], 1)
-            logdet = torch.sum(logs, [1, 2])
-            return x, logdet
-        else:
-            if x_mask is not None:
-                x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            else:
-                x1 = (x1 - m) * torch.exp(-logs)
-            # x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            x = torch.cat([x0, x1], 1)
-            return x
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 n_flows=4,
-                 gin_channels=0,
-                 share_parameter=False
-                 ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-
-        self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
-                     gin_channels=gin_channels) if share_parameter else None
-
-        for i in range(n_flows):
-            self.flows.append(
-                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
-                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
-            self.flows.append(Flip())
-
-    def forward(self, x, x_mask=None, g=None, reverse=False):
-        if not reverse:
-            logdet_tot = 0
-            for flow in self.flows:
-                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
-                logdet_tot += logdet
-        else:
-            logdet_tot = None
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x, logdet_tot
-
-
-# class TextEncoder(nn.Module):
-#     def __init__(self,
-#                  out_channels,
-#                  hidden_channels,
-#                  kernel_size,
-#                  n_layers,
-#                  gin_channels=0,
-#                  filter_channels=None,
-#                  n_heads=None,
-#                  p_dropout=None):
-#         super().__init__()
-#         self.out_channels = out_channels
-#         self.hidden_channels = hidden_channels
-#         self.kernel_size = kernel_size
-#         self.n_layers = n_layers
-#         self.gin_channels = gin_channels
-#         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-#         self.f0_emb = nn.Embedding(256, hidden_channels)
-#
-#         self.enc_ = attentions.Encoder(
-#             hidden_channels,
-#             filter_channels,
-#             n_heads,
-#             n_layers,
-#             kernel_size,
-#             p_dropout)
-#
-#     def forward(self, x, x_mask, f0=None, noice_scale=1):
-#         x = x + self.f0_emb(f0).transpose(1, 2)
-#         x = self.enc_(x * x_mask, x_mask)
-#         stats = self.proj(x) * x_mask
-#         m, logs = torch.split(stats, self.out_channels, dim=1)
-#         z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
-#
-#         return z, m, logs, x_mask
-
-
-class ConvNeXtBlock(nn.Module):
-    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
-
-    Args:
-        dim (int): Number of input channels.
-        intermediate_dim (int): Dimensionality of the intermediate layer.
-        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
-            Defaults to None.
-        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
- None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() - - def forward(self, x: torch.Tensor, ) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x = self.dropout(x) - - x = residual + self.drop_path(x) - return x - - -class condition_latent_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_latent_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = 
nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - - return stats - - -class condition_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - - return stats - - -class SynthesizerTrn(nn.Module): - """ - Synthesizer for Training - """ - - def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - - condition_in_chans, - - condition_encoder_hidden_channels, - condition_encoder_n_heads, - condition_encoder_n_layers, - condition_encoder_kernel_size, - condition_encoder_dropout_rate, - - inter_channels, - hidden_channels, - - condition_channels,flow_wavenet_lay=4, - - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, - ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', - - **kwargs): - - super().__init__() - self.inter_channels = inter_channels - self.ues_condition = ues_condition - - self.use_latent = use_latent - - if use_latent_encoder and use_latent: - if latent_encoder_type == 'attention': - self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=latent_encoder_n_heads, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=latent_encoder_kernel_size, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - elif latent_encoder_type == 'convnext': - self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=None, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=None, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - else: - raise RuntimeError("unsupport_latent_encoder") - - elif ((not use_latent_encoder) and use_latent): - self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) - - if ues_condition_encoder and ues_condition: - if condition_encoder_type == 'attention': - self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=condition_encoder_n_heads, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - 
dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - elif condition_encoder_type == 'convnext': - self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=None, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - else: - raise RuntimeError("unsupport__encoder") - elif ((not ues_condition_encoder) and ues_condition): - self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) - - self.use_depthwise_conv = use_depthwise_conv - - # self.enc_p = TextEncoder( - # inter_channels, - # hidden_channels, - # filter_channels=filter_channels, - # n_heads=n_heads, - # n_layers=n_layers, - # kernel_size=kernel_size, - # p_dropout=p_dropout - # ) - - set_Conv1dModel(self.use_depthwise_conv) - - if ues_condition: - condition_channelsw = condition_channels - else: - condition_channelsw = 0 - - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, - gin_channels=condition_channelsw, share_parameter=flow_share_parameter) - - def forward(self, c, mel, x_mask=None): - - # vol proj - - # f0 predict - - # encoder - if self.use_latent: - z_ptemp, m_p, logs_p = self.latent_encoder(c) - else: - m_p, logs_p = None, None - # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) - - # flow - if self.ues_condition: - condition = self.condition_encoder(c) - z_p, logdet = self.flow(mel, x_mask, g=condition) - else: - z_p, logdet = self.flow(mel, x_mask, g=None) - - return x_mask, (z_p, m_p, logs_p), logdet, - - @torch.no_grad() - def infer(self, c, noice_scale=0.35, seed=None, ): - if seed is not None: - - if c.device == torch.device("cuda"): - torch.cuda.manual_seed_all(seed) - else: - torch.manual_seed(seed) - - if self.use_latent: - z_p, m_p, logs_p = self.latent_encoder(c) - else: - z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale - - z_p=z_p.cuda() - - # vol proj - - # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) - # o, _ = self.flow(z_p, g=g, reverse=True) - - if self.ues_condition: - condition = self.condition_encoder(c) - # z_p, logdet = self.flow(mel, x_mask, g=condition) - o, _ = self.flow(z_p, g=condition, reverse=True) - else: - o, _ = self.flow(z_p, g=None, reverse=True) - - return o - - -class glow_loss_L(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, pack_loss,target): - - z, m, logs, logdet, mask = pack_loss - # z, m, logs, logdet, mask = None - - l = 0.5 * torch.sum( - torch.exp(-2 * logdet) * ((z ) ** 2)) # neg normal likelihood w/o the constant term - l = l - torch.sum(logdet) # log jacobian determinant - if mask is not None: - l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes - else: - l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes - l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term - return l - - - - - -class glow_decoder(nn.Module): - def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - condition_encoder_hidden_channels, 
condition_encoder_n_heads, condition_encoder_n_layers, - condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, - use_latent=True, - ues_condition_encoder=False, ues_condition=False, - condition_encoder_type='attention'): - super().__init__() - self.use_latent=use_latent - self.flow_infer_seed=flow_infer_seed - self.flow_infer_scale=flow_infer_scale - self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, - latent_encoder_n_heads=latent_encoder_n_heads, - latent_encoder_n_layers=latent_encoder_n_layers, - latent_encoder_kernel_size=latent_encoder_kernel_size, - latent_encoder_dropout_rate=latent_encoder_dropout_rate, - - condition_in_chans=encoder_hidden, - - condition_encoder_hidden_channels=condition_encoder_hidden_channels, - condition_encoder_n_heads=condition_encoder_n_heads, - condition_encoder_n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - condition_encoder_dropout_rate=condition_encoder_dropout_rate, - - inter_channels=out_dims, - flow_wavenet_lay=flow_wavenet_lay, - hidden_channels=flow_hidden_channels, - - condition_channels=flow_condition_channels, - - condition_encoder_filter_channels=condition_encoder_filter_channels, - - latent_encoder_filter_channels=latent_encoder_filter_channels, - - use_depthwise_conv=use_depthwise_conv, - - flow_share_parameter=flow_share_parameter, - n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, - use_latent_encoder=use_latent_encoder, - use_latent=use_latent, - ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, - condition_encoder_type=condition_encoder_type) - - self.use_mask=use_mask - self.use_norm=use_norm - - def norm(self,x): - x = (x - (-5)) / (0 - (-5)) * 2 - 1 - return x - - def denorm(self,x): - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - - def build_loss(self): - - - if self.use_latent: - - return glow_loss_L() - - return glow_loss_L() - def forward(self, x, infer, x_gt,mask): - if not self.use_mask or infer: - mask=None - else: - mask=mask.transpose(1, 2) - - - - - if infer: - out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) - if self.use_norm: - out = self.denorm(out) - return out - else: - if self.use_norm: - x_gt = self.norm(x_gt) - - - x = x.transpose(1, 2) - x_gt=x_gt.transpose(1, 2) - - x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) - - - pack_loss = (z_p, m_p, logs_p, logdet, x_mask) - return pack_loss - - - - - pass diff --git a/modules/shallow/glow.py b/modules/shallow/glow.py deleted file mode 100644 index 65db536ad..000000000 --- a/modules/shallow/glow.py +++ /dev/null @@ -1,1032 +0,0 @@ -import math -from typing import Optional - -import torch -import torch.nn as nn - -import torch.nn.functional as F - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -class RelativeFFTBlock(nn.Module): - """ FFT Block with Relative Multi-Head Attention """ - - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, 
kernel_size=1, p_dropout=0., - window_size=None, block_length=None): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.block_length = block_length - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, - window_size=window_size, p_dropout=p_dropout, - block_length=block_length)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN( - hidden_channels, hidden_channels, kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask=None): - - if x_mask is not None: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - else: - attn_mask = None - - for i in range(self.n_layers): - if x_mask is not None: - x = x * x_mask - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - if x_mask is not None: - x = x * x_mask - return x - - -class RelativeSelfAttention(nn.Module): - """ Relative Multi-Head Attention """ - - def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., - block_length=None, proximal_bias=False, proximal_init=False): - super(RelativeSelfAttention, self).__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.p_dropout = p_dropout - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels ** -0.5 - self.emb_rel_k = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - if proximal_init: - self.conv_k.weight.data.copy_(self.conv_q.weight.data) - self.conv_k.bias.data.copy_(self.conv_q.bias.data) - nn.init.xavier_uniform_(self.conv_v.weight) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, - t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, - t_s).transpose(2, 3) - - scores = torch.matmul(query, key.transpose(-2, -1) 
- ) / math.sqrt(self.k_channels) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query, key_relative_embeddings) - rel_logits = self._relative_position_to_absolute_position( - rel_logits) - scores_local = rel_logits / math.sqrt(self.k_channels) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + \ - self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - block_mask = torch.ones_like( - scores).triu(-self.block_length).tril(self.block_length) - scores = scores * block_mask + -1e4 * (1 - block_mask) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position( - p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s) - output = output + \ - self._matmul_with_relative_values( - relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view( - b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, - slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [0, length - 1]])) - - # Reshape and slice out the padded elements. 
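        # (Concretely: element (i, k) of the padded buffer sits at flat index
        # 2L*i + k; re-viewing the buffer with row width 2L-1 shifts row i by
        # i columns, so relative entry (i, k) lands at absolute column i + k,
        # and slicing columns L-1 onward yields a[i, j] = r[i, j - i + L - 1].)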
-        x_final = x_flat.view(
-            [batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
-        return x_final
-
-    def _absolute_position_to_relative_position(self, x):
-        batch, heads, length, _ = x.size()
-        # pad along column
-        x = F.pad(x, convert_pad_shape(
-            [[0, 0], [0, 0], [0, 0], [0, length - 1]]))
-        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
-        # add 0's in the beginning that will skew the elements after reshape
-        x_flat = F.pad(x_flat, convert_pad_shape(
-            [[0, 0], [0, 0], [length, 0]]))
-        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
-        return x_final
-
-    def _attention_bias_proximal(self, length):
-        """
-        Bias for self-attention to encourage attention to close positions.
-        """
-        r = torch.arange(length, dtype=torch.float32)
-        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
-        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
-
-
-class LayerNorm(nn.Module):
-    def __init__(self, channels, eps=1e-4):
-        super().__init__()
-        self.channels = channels
-        self.eps = eps
-
-        self.gamma = nn.Parameter(torch.ones(channels))
-        self.beta = nn.Parameter(torch.zeros(channels))
-
-    def forward(self, x):
-        n_dims = len(x.shape)
-        mean = torch.mean(x, 1, keepdim=True)
-        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
-
-        x = (x - mean) * torch.rsqrt(variance + self.eps)
-
-        shape = [1, -1] + [1] * (n_dims - 2)
-        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
-        return x
-
-
-class FFN(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.activation = activation
-
-        self.conv = nn.Conv1d(
-            in_channels, out_channels, kernel_size, padding=kernel_size // 2)
-        self.drop = nn.Dropout(p_dropout)
-
-    def forward(self, x, x_mask=None):
-        if x_mask is not None:
-            x = self.conv(x * x_mask)
-        else:
-            x = self.conv(x)
-
-        if self.activation == "gelu":
-            x = x * torch.sigmoid(1.702 * x)
-        else:
-            x = torch.relu(x)
-        x = self.drop(x)
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-
-class Flip(nn.Module):
-    def forward(self, x, *args, reverse=False, **kwargs):
-        x = torch.flip(x, [1])
-        if not reverse:
-            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
-            return x, logdet
-        else:
-            return x
-
-
-Conv1dModel = nn.Conv1d  # hacky; should be removed
-
-
-class Depthwise_Separable_Conv1D(nn.Module):
-    def __init__(
-            self,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=1,
-            padding=0,
-            dilation=1,
-            bias=True,
-            padding_mode='zeros',  # TODO: refine this type
-            device=None,
-            dtype=None
-    ):
-        super().__init__()
-        self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
-                                    groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias,
-                                    padding_mode=padding_mode, device=device, dtype=dtype)
-        self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias,
-                                    device=device, dtype=dtype)
-
-    def forward(self, input):
-        return self.point_conv(self.depth_conv(input))
-
-
-def set_Conv1dModel(use_depthwise_conv):
-    global Conv1dModel
-    Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d
-
-
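`Depthwise_Separable_Conv1D` factorizes a dense Conv1d into a per-channel (grouped) convolution plus a 1x1 pointwise mix, trading a small accuracy risk for roughly a k-fold parameter and FLOP reduction. A quick standalone comparison (hypothetical channel count and kernel size):

import torch.nn as nn

c_in, c_out, k = 192, 192, 5
dense = nn.Conv1d(c_in, c_out, k, padding=k // 2)
depth = nn.Conv1d(c_in, c_in, k, padding=k // 2, groups=c_in)  # spatial mixing only
point = nn.Conv1d(c_in, c_out, 1)                              # channel mixing only

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(dense))                 # 184512
print(count(depth) + count(point))  # 38208, ~4.8x fewer parameters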
-@torch.jit.script
-def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
-    n_channels_int = n_channels[0]
-    in_act = input_a + input_b
-    t_act = torch.tanh(in_act[:, :n_channels_int, :])
-    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
-    acts = t_act * s_act
-    return acts
-
-
-@torch.jit.script
-def add_and_GRU(input_a, input_b):
-    in_act = input_a + input_b
-    x1, x2 = in_act.chunk(2, dim=1)
-    t_act = torch.tanh(x2)
-    s_act = torch.sigmoid(x1)
-    acts = t_act * s_act
-    return acts
-
-
-class WN(torch.nn.Module):
-    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
-        super(WN, self).__init__()
-        assert (kernel_size % 2 == 1)
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels  # used for the condition
-        self.p_dropout = p_dropout
-
-        self.in_layers = torch.nn.ModuleList()
-        self.res_skip_layers = torch.nn.ModuleList()
-        self.drop = nn.Dropout(p_dropout)
-        self.condition_layers = torch.nn.ModuleList()
-
-        # if gin_channels != 0:
-        #     cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-        #     # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
-        #     self.cond_layer=cond_layer
-
-        for i in range(n_layers):
-
-            if gin_channels != 0:
-                cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels, 1)
-                # self.cond_layer = weight_norm_modules(cond_layer, name='weight')
-                # self.cond_layer = cond_layer
-            else:
-                cond_layer = nn.Identity()
-            self.condition_layers.append(cond_layer)
-
-            dilation = dilation_rate ** i
-            padding = int((kernel_size * dilation - dilation) / 2)
-            in_layer = Conv1dModel(hidden_channels, 2 * hidden_channels, kernel_size,
-                                   dilation=dilation, padding=padding)
-            # in_layer = weight_norm_modules(in_layer, name='weight')
-            self.in_layers.append(in_layer)
-
-            # last one is not necessary
-            if i < n_layers - 1:
-                res_skip_channels = 2 * hidden_channels
-            else:
-                res_skip_channels = hidden_channels
-
-            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
-            # res_skip_layer = weight_norm_modules(res_skip_layer, name='weight')
-            self.res_skip_layers.append(res_skip_layer)
-
-    def forward(self, x, x_mask=None, g=None, **kwargs):
-        output = torch.zeros_like(x)
-        n_channels_tensor = torch.IntTensor([self.hidden_channels])
-
-        # if g is not None:
-        #     g = self.cond_layer(g)
-
-        for i in range(self.n_layers):
-            x_in = self.in_layers[i](x)
-
-            if g is not None:
-
-                condition = self.condition_layers[i](g)
-            else:
-                condition = torch.zeros_like(x_in)
-
-            # acts = fused_add_tanh_sigmoid_multiply(  # "GRU" -- this is just the WaveNet gated unit
-            #     x_in,
-            #     condition,
-            #     n_channels_tensor)
-            acts = add_and_GRU(  # "GRU" -- this is just the WaveNet gated unit
-                x_in,
-                condition,
-            )
-            acts = self.drop(acts)
-
-            res_skip_acts = self.res_skip_layers[i](acts)
-            if i < self.n_layers - 1:
-                res_acts = res_skip_acts[:, :self.hidden_channels, :]
-                if x_mask is not None:
-                    x = (x + res_acts) * x_mask
-                else:
-                    x = x + res_acts
-                output = output + res_skip_acts[:, self.hidden_channels:, :]
-            else:
-                output = output + res_skip_acts
-
-        if x_mask is not None:
-            out = output * x_mask
-        else:
-            out = output
-        return out
-
-    # def remove_weight_norm(self):
-    #     if self.gin_channels != 0:
-    #         remove_weight_norm_modules(self.cond_layer)
-    #     for l in self.in_layers:
-    #         remove_weight_norm_modules(l)
-    #     for l in self.res_skip_layers:
-    #         remove_weight_norm_modules(l)
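The residual/skip bookkeeping in `WN.forward` follows the usual WaveNet pattern: every layer but the last emits twice the hidden channels, half feeding the next layer through a residual connection and half accumulating into the output. A standalone sketch of just that accumulation loop (toy sizes, a tanh standing in for the gated dilated convolution):

import torch
import torch.nn as nn

hidden, n_layers, T = 8, 3, 20
x = torch.randn(1, hidden, T)
output = torch.zeros_like(x)

# every layer but the last emits 2*hidden channels: a residual half and a skip half
res_skip = nn.ModuleList([nn.Conv1d(hidden, 2 * hidden, 1) for _ in range(n_layers - 1)])
res_skip.append(nn.Conv1d(hidden, hidden, 1))  # last layer: skip channels only

for i, layer in enumerate(res_skip):
    acts = torch.tanh(x)  # stand-in for the gated dilated convolution
    out_i = layer(acts)
    if i < n_layers - 1:
        x = x + out_i[:, :hidden, :]            # residual path feeds the next layer
        output = output + out_i[:, hidden:, :]  # skip path accumulates the output
    else:
        output = output + out_i
print(output.shape)  # torch.Size([1, 8, 20])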
-
-
-class ResidualCouplingLayer(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 p_dropout=0,
-                 gin_channels=0,
-                 mean_only=False,
-                 wn_sharing_parameter=None  # shared weights; purpose unclear
-                 ):
-        assert channels % 2 == 0, "channels should be divisible by 2"
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.half_channels = channels // 2
-        self.mean_only = mean_only
-
-        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
-                      gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
-        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
-        self.post.weight.data.zero_()
-        self.post.bias.data.zero_()
-
-    def forward(self, x, x_mask=None, g=None, reverse=False):
-        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
-        if x_mask is not None:
-            h = self.pre(x0) * x_mask
-        else:
-            h = self.pre(x0)
-        h = self.enc(h, x_mask, g=g)
-
-        if x_mask is not None:
-            stats = self.post(h) * x_mask
-        else:
-            stats = self.post(h)
-        if not self.mean_only:
-            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
-        else:
-            m = stats
-            logs = torch.zeros_like(m)
-
-        if not reverse:
-            if x_mask is not None:
-                x1 = m + x1 * torch.exp(logs) * x_mask
-            else:
-                x1 = m + x1 * torch.exp(logs)
-            # x1 = m + x1 * torch.exp(logs) * x_mask  # inverse process
-            x = torch.cat([x0, x1], 1)
-            logdet = torch.sum(logs, [1, 2])
-            return x, logdet
-        else:
-            if x_mask is not None:
-                x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            else:
-                x1 = (x1 - m) * torch.exp(-logs)
-            # x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            x = torch.cat([x0, x1], 1)
-            return x
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 n_flows=4,
-                 gin_channels=0,
-                 share_parameter=False
-                 ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-
-        self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0,
-                     gin_channels=gin_channels) if share_parameter else None
-
-        for i in range(n_flows):
-            self.flows.append(
-                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
-                                      gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
-            self.flows.append(Flip())
-
-    def forward(self, x, x_mask=None, g=None, reverse=False):
-        if not reverse:
-            logdet_tot = 0
-            for flow in self.flows:
-                x, logdet = flow(x, x_mask, g=g, reverse=reverse)
-                logdet_tot += logdet
-        else:
-            logdet_tot = None
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x, logdet_tot
-
-
-# class TextEncoder(nn.Module):
-#     def __init__(self,
-#                  out_channels,
-#                  hidden_channels,
-#                  kernel_size,
-#                  n_layers,
-#                  gin_channels=0,
-#                  filter_channels=None,
-#                  n_heads=None,
-#                  p_dropout=None):
-#         super().__init__()
-#         self.out_channels = out_channels
-#         self.hidden_channels = hidden_channels
-#         self.kernel_size = kernel_size
-#         self.n_layers = n_layers
-#         self.gin_channels = gin_channels
-#         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-#         self.f0_emb = nn.Embedding(256, hidden_channels)
-#
-#         self.enc_ = attentions.Encoder(
-#             hidden_channels,
-#             filter_channels,
-#             n_heads,
-#             n_layers,
-#             kernel_size,
-#             p_dropout)
-#
-#     def forward(self, x, x_mask, f0=None, noice_scale=1):
-#         x = x +
self.f0_emb(f0).transpose(1, 2) -# x = self.enc_(x * x_mask, x_mask) -# stats = self.proj(x) * x_mask -# m, logs = torch.split(stats, self.out_channels, dim=1) -# z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask -# -# return z, m, logs, x_mask - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout = nn.Dropout(drop_out) if drop_out > 0. else nn.Identity() - - def forward(self, x: torch.Tensor, ) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x = self.dropout(x) - - x = residual + self.drop_path(x) - return x - - -class condition_latent_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_latent_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, 1) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels * 2, 1) - - def forward(self, x, noice_scale=1): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - m, logs = torch.chunk(stats, 2, 
1) - z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) - - return z, m, logs, - - -class condition_encoder_att(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.enc = RelativeFFTBlock(hidden_channels=n_chans, filter_channels=filter_channels, n_heads=n_heads, - n_layers=n_layers, - kernel_size=condition_encoder_kernel_size, p_dropout=dropout_rate) - - self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - x = self.enc(x) - stats = self.proj_out(x) - - return stats - - -class condition_encoder_convnext(nn.Module): - def __init__(self, in_chans, out_channels, n_chans, n_heads, n_layers, condition_encoder_kernel_size, dropout_rate, - filter_channels=None): - super().__init__() - if filter_channels is None: - filter_channels = n_chans * 4 - - self.proj_in = nn.Conv1d(in_chans, n_chans, condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - self.conv = nn.ModuleList( - [ConvNeXtBlock(dim=n_chans, intermediate_dim=filter_channels, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - - self.proj_out = nn.Conv1d(n_chans, out_channels, kernel_size=condition_encoder_kernel_size, - padding=condition_encoder_kernel_size // 2) - - def forward(self, x, ): - x = self.proj_in(x) - - for i in self.conv: - x = i(x) - stats = self.proj_out(x) - - return stats - - -class SynthesizerTrn(nn.Module): - """ - Synthesizer for Training - """ - - def __init__(self, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - - condition_in_chans, - - condition_encoder_hidden_channels, - condition_encoder_n_heads, - condition_encoder_n_layers, - condition_encoder_kernel_size, - condition_encoder_dropout_rate, - - inter_channels, - hidden_channels, - - condition_channels,flow_wavenet_lay=4, - - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, use_latent=True, - ues_condition_encoder=False, ues_condition=False, condition_encoder_type='attention', - - **kwargs): - - super().__init__() - self.inter_channels = inter_channels - self.ues_condition = ues_condition - - self.use_latent = use_latent - - if use_latent_encoder and use_latent: - if latent_encoder_type == 'attention': - self.latent_encoder = condition_latent_encoder_att(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=latent_encoder_n_heads, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=latent_encoder_kernel_size, - dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - elif latent_encoder_type == 'convnext': - self.latent_encoder = condition_latent_encoder_convnext(in_chans=condition_in_chans, - out_channels=inter_channels, - n_chans=latent_encoder_hidden_channels, - n_heads=None, - n_layers=latent_encoder_n_layers, - condition_encoder_kernel_size=None, - 
dropout_rate=latent_encoder_dropout_rate, - filter_channels=latent_encoder_filter_channels) - else: - raise RuntimeError("unsupport_latent_encoder") - - elif ((not use_latent_encoder) and use_latent): - self.condition_encoder = nn.Conv1d(condition_in_chans, inter_channels, kernel_size=7, padding=3) - - if ues_condition_encoder and ues_condition: - if condition_encoder_type == 'attention': - self.condition_encoder = condition_encoder_att(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=condition_encoder_n_heads, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - elif condition_encoder_type == 'convnext': - self.condition_encoder = condition_encoder_convnext(in_chans=condition_in_chans, - out_channels=condition_channels, - n_chans=condition_encoder_hidden_channels, - n_heads=None, - n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - dropout_rate=condition_encoder_dropout_rate, - filter_channels=condition_encoder_filter_channels) - else: - raise RuntimeError("unsupport__encoder") - elif ((not ues_condition_encoder) and ues_condition): - self.condition_encoder = nn.Conv1d(condition_in_chans, condition_channels, kernel_size=7, padding=3) - - self.use_depthwise_conv = use_depthwise_conv - - # self.enc_p = TextEncoder( - # inter_channels, - # hidden_channels, - # filter_channels=filter_channels, - # n_heads=n_heads, - # n_layers=n_layers, - # kernel_size=kernel_size, - # p_dropout=p_dropout - # ) - - set_Conv1dModel(self.use_depthwise_conv) - - if ues_condition: - condition_channelsw = condition_channels - else: - condition_channelsw = 0 - - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,n_flows=flow_wavenet_lay, - gin_channels=condition_channelsw, share_parameter=flow_share_parameter) - - def forward(self, c, mel, x_mask=None): - - # vol proj - - # f0 predict - - # encoder - if self.use_latent: - z_ptemp, m_p, logs_p = self.latent_encoder(c) - else: - m_p, logs_p = None, None - # z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) - - # flow - if self.ues_condition: - condition = self.condition_encoder(c) - z_p, logdet = self.flow(mel, x_mask, g=condition) - else: - z_p, logdet = self.flow(mel, x_mask, g=None) - - return x_mask, (z_p, m_p, logs_p), logdet, - - @torch.no_grad() - def infer(self, c, noice_scale=0.35, seed=None, ): - if seed is not None: - - if c.device == torch.device("cuda"): - torch.cuda.manual_seed_all(seed) - else: - torch.manual_seed(seed) - - if self.use_latent: - z_p, m_p, logs_p = self.latent_encoder(c) - else: - z_p = torch.randn_like(torch.zeros(1, self.inter_channels, c.size()[2])) * noice_scale - - # vol proj - - # z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) - # o, _ = self.flow(z_p, g=g, reverse=True) - - if self.ues_condition: - condition = self.condition_encoder(c) - # z_p, logdet = self.flow(mel, x_mask, g=condition) - o, _ = self.flow(z_p, g=condition, reverse=True) - else: - o, _ = self.flow(z_p, g=None, reverse=True) - - return o - - -class glow_loss_L(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, pack_loss,target): - - z, m, logs, logdet, mask = pack_loss - # z, m, logs, logdet, mask = None - - l = torch.sum(logs) + 0.5 * torch.sum( - torch.exp(-2 * logs) * 
((z - m) ** 2)) # neg normal likelihood w/o the constant term - l = l - torch.sum(logdet) # log jacobian determinant - if mask is not None: - l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes - else: - l = l / torch.sum(torch.ones_like(z)) # averaging across batch, channel and time axes - l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term - return l - - - - - - - -class glow_decoder(nn.Module): - def __init__(self, encoder_hidden, out_dims, latent_encoder_hidden_channels, latent_encoder_n_heads, - latent_encoder_n_layers, latent_encoder_kernel_size, latent_encoder_dropout_rate, - condition_encoder_hidden_channels, condition_encoder_n_heads, condition_encoder_n_layers, - condition_encoder_kernel_size, condition_encoder_dropout_rate, flow_hidden_channels, - flow_condition_channels, parame,use_mask=True,use_norm=True,flow_wavenet_lay=4,flow_infer_seed=None,flow_infer_scale=0.35, - condition_encoder_filter_channels=None, - - latent_encoder_filter_channels=None, - - use_depthwise_conv=False, - - flow_share_parameter=False, - n_flow_layer=4, latent_encoder_type='attention', use_latent_encoder=True, - use_latent=True, - ues_condition_encoder=False, ues_condition=False, - condition_encoder_type='attention'): - super().__init__() - self.use_latent=use_latent - self.flow_infer_seed=flow_infer_seed - self.flow_infer_scale=flow_infer_scale - self.glow_decoder = SynthesizerTrn(latent_encoder_hidden_channels=latent_encoder_hidden_channels, - latent_encoder_n_heads=latent_encoder_n_heads, - latent_encoder_n_layers=latent_encoder_n_layers, - latent_encoder_kernel_size=latent_encoder_kernel_size, - latent_encoder_dropout_rate=latent_encoder_dropout_rate, - - condition_in_chans=encoder_hidden, - - condition_encoder_hidden_channels=condition_encoder_hidden_channels, - condition_encoder_n_heads=condition_encoder_n_heads, - condition_encoder_n_layers=condition_encoder_n_layers, - condition_encoder_kernel_size=condition_encoder_kernel_size, - condition_encoder_dropout_rate=condition_encoder_dropout_rate, - - inter_channels=out_dims, - flow_wavenet_lay=flow_wavenet_lay, - hidden_channels=flow_hidden_channels, - - condition_channels=flow_condition_channels, - - condition_encoder_filter_channels=condition_encoder_filter_channels, - - latent_encoder_filter_channels=latent_encoder_filter_channels, - - use_depthwise_conv=use_depthwise_conv, - - flow_share_parameter=flow_share_parameter, - n_flow_layer=n_flow_layer, latent_encoder_type=latent_encoder_type, - use_latent_encoder=use_latent_encoder, - use_latent=use_latent, - ues_condition_encoder=ues_condition_encoder, ues_condition=ues_condition, - condition_encoder_type=condition_encoder_type) - - self.use_mask=use_mask - self.use_norm=use_norm - - def norm(self,x): - x = (x - (-5)) / (0 - (-5)) * 2 - 1 - return x - - def denorm(self,x): - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - - def build_loss(self): - - - if self.use_latent: - - return glow_loss_L() - - def forward(self, x, infer, x_gt,mask): - if not self.use_mask or infer: - mask=None - else: - mask=mask.transpose(1, 2) - - - - - if infer: - out=self.glow_decoder.infer(x.transpose(1, 2), noice_scale=self.flow_infer_scale, seed=self.flow_infer_seed).transpose(1, 2) - if self.use_norm: - out = self.denorm(out) - return out - else: - if self.use_norm: - x_gt = self.norm(x_gt) - - - x = x.transpose(1, 2) - x_gt=x_gt.transpose(1, 2) - - x_mask, (z_p, m_p, logs_p), logdet=self.glow_decoder(x,x_gt,x_mask=mask) - - - pack_loss = (z_p, m_p, logs_p, 
logdet, x_mask) - return pack_loss - - - - - pass diff --git a/modules/shallow/light_decoder.py b/modules/shallow/light_decoder.py deleted file mode 100644 index bb2624765..000000000 --- a/modules/shallow/light_decoder.py +++ /dev/null @@ -1,109 +0,0 @@ -from typing import Optional - -import torch -import torch.nn as nn - -class GLU(nn.Module): - def __init__(self, dim): - super().__init__() - self.dim = dim - - def forward(self, x): - out, gate = x.chunk(2, dim=self.dim) - return out * gate.sigmoid() - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.act2=GLU(2) - self.pwconv2 = nn.Linear(intermediate_dim//2, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout=nn.Dropout(drop_out) if drop_out > 0. 
else nn.Identity() - - - - def forward(self, x: torch.Tensor, ) -> torch.Tensor: - residual = x - x=self.act(x) - x = self.dwconv(x) - - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act2(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x=self.dropout(x) - - x = residual + self.drop_path (x) - return x - - -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) - - -class noise_decoder(nn.Module): - def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): - super().__init__() - self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) - self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - - - def build_loss(self): - - return fs2_loss() - - def forward(self, x,infer,**kwargs): - x=x.transpose(1, 2) - x=self.inconv(x) - - for i in self.conv: - x=i(x) - x=self.outconv(x).transpose(1, 2) - if infer: - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - pass diff --git a/modules/shallow/noise_decoder.py b/modules/shallow/noise_decoder.py deleted file mode 100644 index 862caf911..000000000 --- a/modules/shallow/noise_decoder.py +++ /dev/null @@ -1,100 +0,0 @@ -from typing import Optional - -import torch -import torch.nn as nn - - - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. - None means non-conditional LayerNorm. Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: Optional[float] = None,drop_path: float=0.0,drop_out: float=0.0 - - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - - - - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.drop_path = nn.Identity() - self.dropout=nn.Dropout(drop_out) if drop_out > 0. 
else nn.Identity() - self.con = nn.Conv1d(dim, dim, kernel_size=1, ) - - - def forward(self, x: torch.Tensor,y ) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x=x+self.con(y) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - - - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - x=self.dropout(x) - - x = residual + self.drop_path (x) - return x - - -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) - - -class noise_decoder(nn.Module): - def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,parame): - super().__init__() - self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans,intermediate_dim=n_chans*4,layer_scale_init_value=1e-6,drop_out=dropout_rate) for _ in range(n_layers)]) - self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - - - def build_loss(self): - - return fs2_loss() - - def forward(self, x,infer,**kwargs): - x=x.transpose(1, 2) - x=self.inconv(x) - y=torch.randn_like(x) - for i in self.conv: - y=i(y,x) - x=self.outconv(y).transpose(1, 2) - if infer: - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - pass diff --git a/modules/toplevel.py b/modules/toplevel.py index 09b85d6ba..7c00561be 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -17,7 +17,7 @@ from modules.fastspeech.param_adaptor import ParameterAdaptorModule from modules.fastspeech.tts_modules import RhythmRegulator, LengthRegulator from modules.fastspeech.variance_encoder import FastSpeech2Variance -from modules.shallow.shallow_adapter import shallow_adapt +from modules.aux_decoder.shallow_adapter import shallow_adapt from utils.hparams import hparams From 20d5bb5b63c24b60f2f4f9a1ee5fb158095d302a Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 21 Sep 2023 20:06:55 +0800 Subject: [PATCH 29/33] Clean and refactor aux decoder --- configs/acoustic.yaml | 4 - configs/templates/config_acoustic.yaml | 17 + modules/aux_decoder/__init__.py | 66 ++++ .../{fast_speech2_decoder.py => convnext.py} | 55 ++-- modules/aux_decoder/fs2_decoder.py | 300 ------------------ modules/aux_decoder/shallow_adapter.py | 76 ----- modules/toplevel.py | 25 +- training/acoustic_task.py | 12 +- 8 files changed, 125 insertions(+), 430 deletions(-) rename modules/aux_decoder/{fast_speech2_decoder.py => convnext.py} (61%) delete mode 100644 modules/aux_decoder/fs2_decoder.py delete mode 100644 modules/aux_decoder/shallow_adapter.py diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index e892018d8..174cd9943 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -83,17 +83,13 @@ diff_depth: 400 shallow_diffusion_args: train_aux_decoder: true train_diffusion: true - shared_encoder: true val_gt_start: false - aux_share_encoder: true - aux_encoder_strict_hparams: false aux_decoder_arch: convnext aux_decoder_args: num_channels: 512 num_layers: 6 kernel_size: 7 dropout_rate: 0.1 - aux_decoder_strict_hparams: true aux_decoder_grad: 0.1 lambda_aux_mel_loss: 0.2 diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 0291177a1..12d0b1dba 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -40,9 +40,26 @@ 
augmentation_args: domain: log # or linear scale: 1.0 +K_step: 1000 residual_channels: 512 residual_layers: 20 +# shallow diffusion +use_shallow_diffusion: false +diff_depth: 400 +shallow_diffusion_args: + train_aux_decoder: true + train_diffusion: true + val_gt_start: false + aux_decoder_arch: convnext + aux_decoder_args: + num_channels: 512 + num_layers: 6 + kernel_size: 7 + dropout_rate: 0.1 + aux_decoder_grad: 0.1 +lambda_aux_mel_loss: 0.2 + optimizer_args: lr: 0.0004 lr_scheduler_args: diff --git a/modules/aux_decoder/__init__.py b/modules/aux_decoder/__init__.py index e69de29bb..b408e4b7b 100644 --- a/modules/aux_decoder/__init__.py +++ b/modules/aux_decoder/__init__.py @@ -0,0 +1,66 @@ +import torch.nn +from torch import nn + +from .convnext import ConvNeXtDecoder +from utils import filter_kwargs + +AUX_DECODERS = { + 'convnext': ConvNeXtDecoder +} +AUX_LOSSES = { + 'convnext': nn.L1Loss +} + + +def build_aux_decoder( + in_dims: int, out_dims: int, + aux_decoder_arch: str, aux_decoder_args: dict +) -> torch.nn.Module: + decoder_cls = AUX_DECODERS[aux_decoder_arch] + kwargs = filter_kwargs(aux_decoder_args, decoder_cls) + return AUX_DECODERS[aux_decoder_arch](in_dims, out_dims, **kwargs) + + +def build_aux_loss(aux_decoder_arch): + return AUX_LOSSES[aux_decoder_arch]() + + +class AuxDecoderAdaptor(nn.Module): + def __init__(self, in_dims: int, out_dims: int, num_feats: int, + spec_min: list, spec_max: list, + aux_decoder_arch: str, aux_decoder_args: dict): + super().__init__() + self.decoder = build_aux_decoder( + in_dims=in_dims, out_dims=out_dims * num_feats, + aux_decoder_arch=aux_decoder_arch, + aux_decoder_args=aux_decoder_args + ) + self.out_dims = out_dims + self.n_feats = num_feats + if spec_min is not None and spec_max is not None: + # spec: [B, T, M] or [B, F, T, M] + # spec_min and spec_max: [1, 1, M] or [1, 1, F, M] => transpose(-3, -2) => [1, 1, M] or [1, F, 1, M] + spec_min = torch.FloatTensor(spec_min)[None, None, :].transpose(-3, -2) + spec_max = torch.FloatTensor(spec_max)[None, None, :].transpose(-3, -2) + self.register_buffer('spec_min', spec_min, persistent=False) + self.register_buffer('spec_max', spec_max, persistent=False) + + def norm_spec(self, x): + return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 + + def denorm_spec(self, x): + return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min + + def forward(self, condition, infer=False): + x = self.decoder(condition, infer=infer) # [B, T, F x C] + + if self.n_feats > 1: + # This is the temporary solution since PyTorch 1.13 + # does not support exporting aten::unflatten to ONNX + # x = x.unflatten(dim=2, sizes=(self.n_feats, self.in_dims)) + x = x.reshape(-1, x.shape[1], self.n_feats, self.out_dims) # [B, T, F, C] + x = x.transpose(1, 2) # [B, F, T, C] + if infer: + x = self.denorm_spec(x) + + return x # [B, T, C] or [B, F, T, C] diff --git a/modules/aux_decoder/fast_speech2_decoder.py b/modules/aux_decoder/convnext.py similarity index 61% rename from modules/aux_decoder/fast_speech2_decoder.py rename to modules/aux_decoder/convnext.py index 1ccb76fcc..a03959ddf 100644 --- a/modules/aux_decoder/fast_speech2_decoder.py +++ b/modules/aux_decoder/convnext.py @@ -12,15 +12,13 @@ class ConvNeXtBlock(nn.Module): intermediate_dim (int): Dimensionality of the intermediate layer. layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. Defaults to None. - adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. 
- None means non-conditional LayerNorm. Defaults to None. """ def __init__( self, dim: int, intermediate_dim: int, - layer_scale_init_value: Optional[float] = None, drop_path: float = 0.0, drop_out: float = 0.0 + layer_scale_init_value: Optional[float] = None, drop_out: float = 0.0 ): super().__init__() @@ -57,34 +55,33 @@ def forward(self, x: torch.Tensor, ) -> torch.Tensor: return x -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, y, x): - x = (x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y, x) - - -class fs2_decode(nn.Module): - def __init__(self, encoder_hidden, out_dims, n_chans, kernel_size, dropout_rate, n_layers, parame): +class ConvNeXtDecoder(nn.Module): + def __init__( + self, in_dims, out_dims, /, *, + num_channels=512, num_layers=6, kernel_size=7, dropout_rate=0.1 + ): super().__init__() - self.inconv = nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = nn.ModuleList([ConvNeXtBlock(dim=n_chans, intermediate_dim=n_chans * 4, layer_scale_init_value=1e-6, - drop_out=dropout_rate) for _ in range(n_layers)]) - self.outconv = nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - def build_loss(self): - - return fs2_loss() + self.inconv = nn.Conv1d( + in_dims, num_channels, kernel_size, + stride=1, padding=(kernel_size - 1) // 2 + ) + self.conv = nn.ModuleList( + ConvNeXtBlock( + dim=num_channels, intermediate_dim=num_channels * 4, + layer_scale_init_value=1e-6, drop_out=dropout_rate + ) for _ in range(num_layers) + ) + self.outconv = nn.Conv1d( + num_channels, out_dims, kernel_size, + stride=1, padding=(kernel_size - 1) // 2 + ) - def forward(self, x, infer, *args, **kwargs): + # noinspection PyUnusedLocal + def forward(self, x, infer=False): x = x.transpose(1, 2) x = self.inconv(x) - for i in self.conv: - x = i(x) - x = self.outconv(x).transpose(1, 2) - if infer: - x = (x + 1) / 2 * (0 - (-5)) + (-5) + for conv in self.conv: + x = conv(x) + x = self.outconv(x) + x = x.transpose(1, 2) return x - pass diff --git a/modules/aux_decoder/fs2_decoder.py b/modules/aux_decoder/fs2_decoder.py deleted file mode 100644 index acb3408be..000000000 --- a/modules/aux_decoder/fs2_decoder.py +++ /dev/null @@ -1,300 +0,0 @@ -import math - -import torch -from torch import nn -import torch.nn.functional as F - - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape -class RelativeFFTBlock(nn.Module): - """ FFT Block with Relative Multi-Head Attention """ - - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=None, block_length=None): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.block_length = block_length - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(RelativeSelfAttention(hidden_channels, hidden_channels, n_heads, - window_size=window_size, p_dropout=p_dropout, block_length=block_length)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN( - hidden_channels, hidden_channels, 
kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask=None): - - if x_mask is not None: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - else: - attn_mask = None - - for i in range(self.n_layers): - if x_mask is not None: - x = x * x_mask - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - if x_mask is not None: - x = x * x_mask - return x - - -class RelativeSelfAttention(nn.Module): - """ Relative Multi-Head Attention """ - - def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., block_length=None, proximal_bias=False, proximal_init=False): - super(RelativeSelfAttention, self).__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.p_dropout = p_dropout - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn( - n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - if proximal_init: - self.conv_k.weight.data.copy_(self.conv_q.weight.data) - self.conv_k.bias.data.copy_(self.conv_q.bias.data) - nn.init.xavier_uniform_(self.conv_v.weight) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, - t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, - t_s).transpose(2, 3) - - scores = torch.matmul(query, key.transpose(-2, -1) - ) / math.sqrt(self.k_channels) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query, key_relative_embeddings) - rel_logits = self._relative_position_to_absolute_position( - rel_logits) - scores_local = rel_logits / math.sqrt(self.k_channels) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." 
- scores = scores + \ - self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - block_mask = torch.ones_like( - scores).triu(-self.block_length).tril(self.block_length) - scores = scores * block_mask + -1e4*(1 - block_mask) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position( - p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s) - output = output + \ - self._matmul_with_relative_values( - relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view( - b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, - slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [0, length-1]])) - - # Reshape and slice out the padded elements. - x_final = x_flat.view( - [batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] - return x_final - - def _absolute_position_to_relative_position(self, x): - batch, heads, length, _ = x.size() - # padd along column - x = F.pad(x, convert_pad_shape( - [[0, 0], [0, 0], [0, 0], [0, length-1]])) - x_flat = x.view([batch, heads, length**2 + length*(length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, convert_pad_shape( - [[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2*length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """ - Bias for self-attention to encourage attention to close positions. 
- """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-4): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - n_dims = len(x.shape) - mean = torch.mean(x, 1, keepdim=True) - variance = torch.mean((x - mean)**2, 1, keepdim=True) - - x = (x - mean) * torch.rsqrt(variance + self.eps) - - shape = [1, -1] + [1] * (n_dims - 2) - x = x * self.gamma.view(*shape) + self.beta.view(*shape) - return x - - -class FFN(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, p_dropout=0., activation=None): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - - self.conv = nn.Conv1d( - in_channels, out_channels, kernel_size, padding=kernel_size//2) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask=None): - if x_mask is not None: - x = self.conv(x * x_mask) - else: - x = self.conv(x ) - - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - if x_mask is not None: - x=x * x_mask - return x - - -class fs2_loss(nn.Module): - def __init__(self): - super().__init__() - - def forward(self,y, x): - x=(x - (-5)) / (0 - (-5)) * 2 - 1 - return nn.L1Loss()(y,x) - - -class attention_fs2_decoder(nn.Module): - def __init__(self,encoder_hidden,out_dims,n_chans,kernel_size,dropout_rate,n_layers,n_heads,attention_ffn_kernel_size,parame): - super().__init__() - self.inconv=nn.Conv1d(encoder_hidden, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - self.conv = RelativeFFTBlock(hidden_channels=n_chans,filter_channels=n_chans*4, n_heads=n_heads, n_layers=n_layers, kernel_size=attention_ffn_kernel_size, p_dropout=dropout_rate) - self.outconv=nn.Conv1d(n_chans, out_dims, kernel_size, stride=1, padding=(kernel_size - 1) // 2) - - - - def build_loss(self): - - return fs2_loss() - - def forward(self, x,infer,*args,**kwargs): - x=x.transpose(1, 2) - x=self.inconv(x) - - - x=self.conv(x) - x=self.outconv(x).transpose(1, 2) - if infer: - x=(x + 1) / 2 * (0 - (-5)) + (-5) - return x - pass diff --git a/modules/aux_decoder/shallow_adapter.py b/modules/aux_decoder/shallow_adapter.py deleted file mode 100644 index 53d47c56a..000000000 --- a/modules/aux_decoder/shallow_adapter.py +++ /dev/null @@ -1,76 +0,0 @@ -import torch -import torch.nn as nn - -cls_map = {'fs2': 'modules.aux_decoder.fast_speech2_decoder.fs2_decode', - 'ns': 'modules.aux_decoder.noise_decoder.noise_decoder', 'ld': 'modules.aux_decoder.light_decoder.noise_decoder','att_fs2':'modules.aux_decoder.fs2_decoder.attention_fs2_decoder' - ,'glow':'modules.aux_decoder.glow.glow_decoder','glow_convnext':'modules.aux_decoder.convnext_glow.glow_decoder_convnext','gglow':'modules.aux_decoder.gglow.glow_decoder','fast_speech2_decoders':'modules.aux_decoder.fast_speech2_decoders.fs2_decode' - } -encoder_cls_map = {'fs2': 'modules.fastspeech.acoustic_encoder.FastSpeech2Acoustic'} - - -def build_object_from_class_name(cls_str, parent_cls, strict, *args, **kwargs): - import importlib - - pkg = ".".join(cls_str.split(".")[:-1]) - cls_name = cls_str.split(".")[-1] - cls_type = 
getattr(importlib.import_module(pkg), cls_name) - if parent_cls is not None: - assert issubclass(cls_type, parent_cls), f'| {cls_type} is not subclass of {parent_cls}.' - if strict: - return cls_type(*args, **kwargs) - return cls_type(*args, **filter_kwargs(kwargs, cls_type)) - - -def filter_kwargs(dict_to_filter, kwarg_obj): - import inspect - - sig = inspect.signature(kwarg_obj) - filter_keys = [param.name for param in sig.parameters.values() if param.kind == param.POSITIONAL_OR_KEYWORD] - filtered_dict = {filter_key: dict_to_filter[filter_key] for filter_key in filter_keys if - filter_key in dict_to_filter} - return filtered_dict - - -class shallow_adapt(nn.Module): - def __init__(self, parame, out_dims, vocab_size): - super().__init__() - self.parame = parame - - decodeparame = parame['shallow_diffusion_args']['aux_decoder_args'] - if decodeparame.get('encoder_hidden') is None: - decodeparame['encoder_hidden'] = parame['hidden_size'] - decodeparame['out_dims'] = out_dims - decodeparame['parame'] = parame - - encoderparame = parame['shallow_diffusion_args']['aux_encoder_args'] - encoderparame['parame'] = parame - encoderparame['vocab_size'] = vocab_size - self.decoder = build_object_from_class_name(cls_map[parame['shallow_diffusion_args']['aux_decoder_arch']], - nn.Module, - parame['shallow_diffusion_args']['aux_decoder_strict_hparams'], - **decodeparame) - - if not parame['shallow_diffusion_args']['aux_share_encoder']: - # todo - self.use_encoder = True - self.encoder = build_object_from_class_name( - encoder_cls_map[parame['shallow_diffusion_args']['aux_encoder_arch']], - nn.Module, - parame['shallow_diffusion_args']['aux_encoder_strict_hparams'], - **encoderparame) - else: - self.use_encoder = False - - def forward(self, condition, infer=False, txt_tokens=None, mel2ph=None, f0=None, - key_shift=None, speed=None, - spk_embed_id=None,gt_mel=None,mask=None, **kwargs): - - if self.use_encoder: - condition = self.encoder(txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed, - spk_embed_id=spk_embed_id, **kwargs) - - return self.decoder(condition, infer,gt_mel,mask) - - def get_loss(self): - return self.decoder.build_loss() diff --git a/modules/toplevel.py b/modules/toplevel.py index 7c00561be..1b917c585 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -6,6 +6,7 @@ from torch import Tensor from basics.base_module import CategorizedModule +from modules.aux_decoder import AuxDecoderAdaptor from modules.commons.common_layers import ( XavierUniformInitLinear as Linear, NormalInitEmbedding as Embedding @@ -17,7 +18,6 @@ from modules.fastspeech.param_adaptor import ParameterAdaptorModule from modules.fastspeech.tts_modules import RhythmRegulator, LengthRegulator from modules.fastspeech.variance_encoder import FastSpeech2Variance -from modules.aux_decoder.shallow_adapter import shallow_adapt from utils.hparams import hparams @@ -27,16 +27,6 @@ def __init__(self, *, aux_out=None, diff_out=None): self.diff_out = diff_out -# TODO: replace the following placeholder with real modules -class ExampleAuxDecoder(nn.Module): - def __init__(self, out_dims): - super().__init__() - self.out_dims = out_dims - - def forward(self, condition, infer=True): - return torch.randn(condition.shape[0], condition.shape[1], self.out_dims, device=condition.device) - - class DiffSingerAcoustic(ParameterAdaptorModule, CategorizedModule): @property def category(self): @@ -54,7 +44,12 @@ def __init__(self, vocab_size, out_dims): self.train_aux_decoder = 
self.shallow_args['train_aux_decoder'] self.train_diffusion = self.shallow_args['train_diffusion'] self.aux_decoder_grad = self.shallow_args['aux_decoder_grad'] - self.aux_decoder = shallow_adapt(hparams, out_dims,vocab_size) + self.aux_decoder = AuxDecoderAdaptor( + in_dims=hparams['hidden_size'], out_dims=out_dims, num_feats=1, + spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], + aux_decoder_arch=self.shallow_args['aux_decoder_arch'], + aux_decoder_args=self.shallow_args['aux_decoder_args'] + ) self.diffusion = GaussianDiffusion( out_dims=out_dims, @@ -81,8 +76,7 @@ def forward( ) if infer: if self.use_shallow_diffusion: - aux_mel_pred = self.aux_decoder(condition, infer=True,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id, **kwargs) + aux_mel_pred = self.aux_decoder(condition, infer=True) aux_mel_pred *= ((mel2ph > 0).float()[:, :, None]) if gt_mel is not None and self.shallow_args['val_gt_start']: src_mel = gt_mel @@ -97,8 +91,7 @@ def forward( if self.use_shallow_diffusion: if self.train_aux_decoder: aux_cond = condition * self.aux_decoder_grad + condition.detach() * (1 - self.aux_decoder_grad) - aux_out = self.aux_decoder(aux_cond, infer=False,txt_tokens=txt_tokens, mel2ph=mel2ph, f0=f0, - key_shift=key_shift, speed=speed,spk_embed_id=spk_embed_id,gt_mel=gt_mel,mask=((mel2ph > 0).float()[:, :, None]), **kwargs) + aux_out = self.aux_decoder(aux_cond, infer=False) else: aux_out = None if self.train_diffusion: diff --git a/training/acoustic_task.py b/training/acoustic_task.py index 6969b5f5d..04dedb65c 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -9,6 +9,7 @@ from basics.base_dataset import BaseDataset from basics.base_task import BaseTask from basics.base_vocoder import BaseVocoder +from modules.aux_decoder import build_aux_loss from modules.losses.diff_loss import DiffusionNoiseLoss from modules.toplevel import DiffSingerAcoustic, ShallowDiffusionOutput from modules.vocoders.registry import get_vocoder_cls @@ -62,9 +63,9 @@ def __init__(self): self.dataset_cls = AcousticDataset self.use_shallow_diffusion = hparams['use_shallow_diffusion'] if self.use_shallow_diffusion: - shallow_args = hparams['shallow_diffusion_args'] - self.train_aux_decoder = shallow_args['train_aux_decoder'] - self.train_diffusion = shallow_args['train_diffusion'] + self.shallow_args = hparams['shallow_diffusion_args'] + self.train_aux_decoder = self.shallow_args['train_aux_decoder'] + self.train_diffusion = self.shallow_args['train_diffusion'] self.use_vocoder = hparams['infer'] or hparams['val_with_vocoder'] if self.use_vocoder: @@ -85,7 +86,7 @@ def build_model(self): # noinspection PyAttributeOutsideInit def build_losses_and_metrics(self): if self.use_shallow_diffusion: - self.aux_mel_loss = self.model.aux_decoder.get_loss() + self.aux_mel_loss = build_aux_loss(self.shallow_args['aux_decoder_arch']) self.lambda_aux_mel_loss = hparams['lambda_aux_mel_loss'] self.mel_loss = DiffusionNoiseLoss(loss_type=hparams['diff_loss_type']) @@ -118,7 +119,8 @@ def run_model(self, sample, infer=False): if output.aux_out is not None: aux_out = output.aux_out - aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, target) + norm_gt = self.model.aux_decoder.norm_spec(target) + aux_mel_loss = self.lambda_aux_mel_loss * self.aux_mel_loss(aux_out, norm_gt) losses['aux_mel_loss'] = aux_mel_loss if output.diff_out is not None: From ef87664e441f1844ae9f1d000751ef57ab603f85 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: 
Thu, 21 Sep 2023 23:54:10 +0800 Subject: [PATCH 30/33] Fix KeyError --- modules/toplevel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/toplevel.py b/modules/toplevel.py index 1b917c585..38cdbe7c9 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -39,7 +39,7 @@ def __init__(self, vocab_size, out_dims): ) self.use_shallow_diffusion = hparams.get('use_shallow_diffusion', False) - self.shallow_args = hparams['shallow_diffusion_args'] + self.shallow_args = hparams.get('shallow_diffusion_args', {}) if self.use_shallow_diffusion: self.train_aux_decoder = self.shallow_args['train_aux_decoder'] self.train_diffusion = self.shallow_args['train_diffusion'] From 2986c888084f2e63074e6984fc0257dd047705fe Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 21 Sep 2023 23:54:47 +0800 Subject: [PATCH 31/33] Support exporting shallow diffusion to ONNX --- deployment/exporters/acoustic_exporter.py | 88 ++++++++++++++++------- deployment/modules/diffusion.py | 37 ++++++++-- deployment/modules/toplevel.py | 49 +++++++------ modules/aux_decoder/__init__.py | 8 ++- utils/onnx_helper.py | 42 ++++++----- 5 files changed, 148 insertions(+), 76 deletions(-) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 34cf2a016..ebfd75a10 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -33,12 +33,22 @@ def __init__( self.spk_map: dict = self.build_spk_map() self.vocab = TokenTextEncoder(vocab_list=build_phoneme_list()) self.model = self.build_model() - self.fs2_cache_path = self.cache_dir / 'fs2.onnx' + self.fs2_aux_cache_path = self.cache_dir / ( + 'fs2_aux.onnx' if self.model.use_shallow_diffusion else 'fs2.onnx' + ) self.diffusion_cache_path = self.cache_dir / 'diffusion.onnx' # Attributes for logging self.model_class_name = remove_suffix(self.model.__class__.__name__, 'ONNX') - self.fs2_class_name = remove_suffix(self.model.fs2.__class__.__name__, 'ONNX') + fs2_aux_cls_logging = [remove_suffix(self.model.fs2.__class__.__name__, 'ONNX')] + if self.model.use_shallow_diffusion: + fs2_aux_cls_logging.append(remove_suffix( + self.model.aux_decoder.decoder.__class__.__name__, 'ONNX' + )) + self.fs2_aux_class_name = ', '.join(fs2_aux_cls_logging) + self.aux_decoder_class_name = remove_suffix( + self.model.aux_decoder.decoder.__class__.__name__, 'ONNX' + ) if self.model.use_shallow_diffusion else None self.denoiser_class_name = remove_suffix(self.model.diffusion.denoise_fn.__class__.__name__, 'ONNX') self.diffusion_class_name = remove_suffix(self.model.diffusion.__class__.__name__, 'ONNX') @@ -86,11 +96,11 @@ def export(self, path: Path): def export_model(self, path: Path): self._torch_export_model() - fs2_onnx = self._optimize_fs2_graph(onnx.load(self.fs2_cache_path)) + fs2_aux_onnx = self._optimize_fs2_aux_graph(onnx.load(self.fs2_aux_cache_path)) diffusion_onnx = self._optimize_diffusion_graph(onnx.load(self.diffusion_cache_path)) - model_onnx = self._merge_fs2_diffusion_graphs(fs2_onnx, diffusion_onnx) + model_onnx = self._merge_fs2_aux_diffusion_graphs(fs2_aux_onnx, diffusion_onnx) onnx.save(model_onnx, path) - self.fs2_cache_path.unlink() + self.fs2_aux_cache_path.unlink() self.diffusion_cache_path.unlink() print(f'| export model => {path}') @@ -105,7 +115,7 @@ def export_attachments(self, path: Path): @torch.no_grad() def _torch_export_model(self): - # Prepare inputs for FastSpeech2 tracing + # Prepare inputs for FastSpeech2 and aux decoder tracing n_frames = 10 tokens = 
torch.LongTensor([[1]]).to(self.device) durations = torch.LongTensor([[n_frames]]).to(self.device) @@ -161,22 +171,30 @@ def _torch_export_model(self): 1: 'n_frames' } - # PyTorch ONNX export for FastSpeech2 - print(f'Exporting {self.fs2_class_name}...') + # PyTorch ONNX export for FastSpeech2 and aux decoder + output_names = ['condition'] + if self.model.use_shallow_diffusion: + output_names.append('aux_mel') + dynamix_axes['aux_mel'] = { + 1: 'n_frames' + } + print(f'Exporting {self.fs2_aux_class_name}...') torch.onnx.export( - self.model.view_as_fs2(), + self.model.view_as_fs2_aux(), arguments, - self.fs2_cache_path, + self.fs2_aux_cache_path, input_names=input_names, - output_names=['condition'], + output_names=output_names, dynamic_axes=dynamix_axes, opset_version=15 ) + condition = torch.rand((1, n_frames, hparams['hidden_size']), device=self.device) + # Prepare inputs for denoiser tracing and GaussianDiffusion scripting shape = (1, 1, hparams['audio_num_mel_bins'], n_frames) noise = torch.randn(shape, device=self.device) - condition = torch.rand((1, hparams['hidden_size'], n_frames), device=self.device) + x_start = torch.randn((1, n_frames, hparams['audio_num_mel_bins']),device=self.device) step = (torch.rand((1,), device=self.device) * hparams['K_step']).long() print(f'Tracing {self.denoiser_class_name} denoiser...') @@ -186,20 +204,24 @@ def _torch_export_model(self): ( noise, step, - condition + condition.transpose(1, 2) ) ) print(f'Scripting {self.diffusion_class_name}...') + diffusion_inputs = [ + condition, + *([x_start, 100] if self.model.use_shallow_diffusion else []) + ] diffusion = torch.jit.script( diffusion, example_inputs=[ ( - condition.transpose(1, 2), + *diffusion_inputs, 1 # p_sample branch ), ( - condition.transpose(1, 2), + *diffusion_inputs, 200 # p_sample_plms branch ) ] @@ -210,12 +232,14 @@ def _torch_export_model(self): torch.onnx.export( diffusion, ( - condition.transpose(1, 2), + *diffusion_inputs, 200 ), self.diffusion_cache_path, input_names=[ - 'condition', 'speedup' + 'condition', + *(['x_start', 'depth'] if self.model.use_shallow_diffusion else []), + 'speedup' ], output_names=[ 'mel' @@ -224,6 +248,7 @@ def _torch_export_model(self): 'condition': { 1: 'n_frames' }, + **({'x_start': {1: 'n_frames'}} if self.model.use_shallow_diffusion else {}), 'mel': { 1: 'n_frames' } @@ -252,11 +277,11 @@ def _perform_spk_mix(self, spk_mix: Dict[str, float]): ) # => [1, H] return spk_mix_embed - def _optimize_fs2_graph(self, fs2: onnx.ModelProto) -> onnx.ModelProto: - print(f'Running ONNX Simplifier on {self.fs2_class_name}...') + def _optimize_fs2_aux_graph(self, fs2: onnx.ModelProto) -> onnx.ModelProto: + print(f'Running ONNX Simplifier on {self.fs2_aux_class_name}...') fs2, check = onnxsim.simplify(fs2, include_subgraph=True) assert check, 'Simplified ONNX model could not be validated' - print(f'| optimize graph: {self.fs2_class_name}') + print(f'| optimize graph: {self.fs2_aux_class_name}') return fs2 def _optimize_diffusion_graph(self, diffusion: onnx.ModelProto) -> onnx.ModelProto: @@ -282,18 +307,33 @@ def _optimize_diffusion_graph(self, diffusion: onnx.ModelProto) -> onnx.ModelPro print(f'| optimize graph: {self.diffusion_class_name}') return diffusion - def _merge_fs2_diffusion_graphs(self, fs2: onnx.ModelProto, diffusion: onnx.ModelProto) -> onnx.ModelProto: - onnx_helper.model_add_prefixes(fs2, dim_prefix='fs2.', ignored_pattern=r'(n_tokens)|(n_frames)') + def _merge_fs2_aux_diffusion_graphs(self, fs2: onnx.ModelProto, diffusion: onnx.ModelProto) -> 
onnx.ModelProto: + onnx_helper.model_add_prefixes( + fs2, dim_prefix=('fs2aux.' if self.model.use_shallow_diffusion else 'fs2.'), + ignored_pattern=r'(n_tokens)|(n_frames)' + ) onnx_helper.model_add_prefixes(diffusion, dim_prefix='diffusion.', ignored_pattern='n_frames') - print(f'Merging {self.fs2_class_name} and {self.diffusion_class_name} ' + print(f'Merging {self.fs2_aux_class_name} and {self.diffusion_class_name} ' f'back into {self.model_class_name}...') merged = onnx.compose.merge_models( - fs2, diffusion, io_map=[('condition', 'condition')], + fs2, diffusion, io_map=[ + ('condition', 'condition'), + *([('aux_mel', 'x_start')] if self.model.use_shallow_diffusion else []), + ], prefix1='', prefix2='', doc_string='', producer_name=fs2.producer_name, producer_version=fs2.producer_version, domain=fs2.domain, model_version=fs2.model_version ) merged.graph.name = fs2.graph.name + + print(f'Running ONNX Simplifier on {self.model_class_name}...') + merged, check = onnxsim.simplify( + merged, + include_subgraph=True + ) + assert check, 'Simplified ONNX model could not be validated' + print(f'| optimize graph: {self.model_class_name}') + return merged # noinspection PyMethodMayBeStatic diff --git a/deployment/modules/diffusion.py b/deployment/modules/diffusion.py index 8905bebda..3c139f649 100644 --- a/deployment/modules/diffusion.py +++ b/deployment/modules/diffusion.py @@ -16,6 +16,12 @@ def extract(a, t): # noinspection PyMethodOverriding class GaussianDiffusionONNX(GaussianDiffusion): + def q_sample(self, x_start, t, noise): + return ( + extract(self.sqrt_alphas_cumprod, t) * x_start + + extract(self.sqrt_one_minus_alphas_cumprod, t) * noise + ) + def p_sample(self, x, t, cond): x_pred = self.denoise_fn(x, t, cond) x_recon = ( @@ -74,18 +80,37 @@ def p_sample_plms(self, x_prev, t, interval: int, cond, noise_list: List[Tensor] x_prev = self.plms_get_x_pred(x_prev, noise_pred_prime, t, t_prev) return noise_pred, x_prev + def norm_spec(self, x): + k = (self.spec_max - self.spec_min) / 2. + b = (self.spec_max + self.spec_min) / 2. + return (x - b) / k + def denorm_spec(self, x): - d = (self.spec_max - self.spec_min) / 2. - m = (self.spec_max + self.spec_min) / 2. - return x * d + m + k = (self.spec_max - self.spec_min) / 2. + b = (self.spec_max + self.spec_min) / 2. 
+ return x * k + b - def forward(self, condition, speedup: int): + def forward(self, condition, x_start=None, depth: int = 1000, speedup: int = 1): condition = condition.transpose(1, 2) # [1, T, H] => [1, H, T] device = condition.device n_frames = condition.shape[2] - step_range = torch.arange(0, self.k_step, speedup, dtype=torch.long, device=device).flip(0)[:, None] - x = torch.randn((1, self.num_feats, self.out_dims, n_frames), device=device) + noise = torch.randn((1, self.num_feats, self.out_dims, n_frames), device=device) + if x_start is None: + step_range = torch.arange(0, self.k_step, speedup, dtype=torch.long, device=device).flip(0)[:, None] + x = noise + else: + depth = min(depth, self.k_step) + step_range = torch.arange(0, depth, speedup, dtype=torch.long, device=device).flip(0)[:, None] + x_start = self.norm_spec(x_start).transpose(-2, -1) + if self.num_feats == 1: + x_start = x_start[:, None, :, :] + if depth > 0: + x = self.q_sample( + x_start, torch.full((1,), depth - 1, device=device, dtype=torch.long), noise + ) + else: + x = x_start if speedup > 1: for t in step_range: diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py index 2cbbda8fb..027997218 100644 --- a/deployment/modules/toplevel.py +++ b/deployment/modules/toplevel.py @@ -1,6 +1,6 @@ -import numpy as np import copy +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -37,7 +37,7 @@ def __init__(self, vocab_size, out_dims): spec_max=hparams['spec_max'] ) - def forward_fs2( + def forward_fs2_aux( self, tokens: Tensor, durations: Tensor, @@ -46,41 +46,40 @@ def forward_fs2( gender: Tensor = None, velocity: Tensor = None, spk_embed: Tensor = None - ) -> Tensor: - return self.fs2( + ): + condition = self.fs2( tokens, durations, f0, variances=variances, gender=gender, velocity=velocity, spk_embed=spk_embed ) + if self.use_shallow_diffusion: + aux_mel_pred = self.aux_decoder(condition, infer=True) + return condition, aux_mel_pred + else: + return condition + + def forward_shallow_diffusion( + self, condition: Tensor, x_start: Tensor, + depth: int, speedup: int + ) -> Tensor: + return self.diffusion(condition, x_start=x_start, depth=depth, speedup=speedup) - def forward_diffusion(self, condition: Tensor, speedup: int) -> Tensor: - return self.diffusion(condition, speedup) + def forward_diffusion(self, condition: Tensor, speedup: int): + return self.diffusion(condition, speedup=speedup) - def view_as_fs2(self) -> nn.Module: + def view_as_fs2_aux(self) -> nn.Module: model = copy.deepcopy(self) - try: - del model.variance_embeds - del model.variance_adaptor - except AttributeError: - pass del model.diffusion - model.forward = model.forward_fs2 + model.forward = model.forward_fs2_aux return model - def view_as_adaptor(self) -> nn.Module: - model = copy.deepcopy(self) - del model.fs2 - del model.diffusion - raise NotImplementedError() - def view_as_diffusion(self) -> nn.Module: model = copy.deepcopy(self) del model.fs2 - try: - del model.variance_embeds - del model.variance_adaptor - except AttributeError: - pass - model.forward = model.forward_diffusion + if self.use_shallow_diffusion: + del model.aux_decoder + model.forward = model.forward_shallow_diffusion + else: + model.forward = model.forward_diffusion return model diff --git a/modules/aux_decoder/__init__.py b/modules/aux_decoder/__init__.py index b408e4b7b..54ceb2113 100644 --- a/modules/aux_decoder/__init__.py +++ b/modules/aux_decoder/__init__.py @@ -46,10 +46,14 @@ def __init__(self, in_dims: int, out_dims: 
int, num_feats: int, self.register_buffer('spec_max', spec_max, persistent=False) def norm_spec(self, x): - return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 + k = (self.spec_max - self.spec_min) / 2. + b = (self.spec_max + self.spec_min) / 2. + return (x - b) / k def denorm_spec(self, x): - return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min + k = (self.spec_max - self.spec_min) / 2. + b = (self.spec_max + self.spec_min) / 2. + return x * k + b def forward(self, condition, infer=False): x = self.decoder(condition, infer=infer) # [B, T, F x C] diff --git a/utils/onnx_helper.py b/utils/onnx_helper.py index 9fc3f6fad..bebe97565 100644 --- a/utils/onnx_helper.py +++ b/utils/onnx_helper.py @@ -277,27 +277,31 @@ def _extract_conv_nodes_recursive(subgraph: GraphProto): to_be_removed.append(sub_node) [subgraph.node.remove(_n) for _n in to_be_removed] + toplevel_if_idx = toplevel_if_node = None + # Find the **last** If node in toplevel graph for i, n in enumerate(graph.node): if n.op_type == 'If': - for a in n.attribute: - b = onnx.helper.get_attribute_value(a) - _extract_conv_nodes_recursive(b) - # Insert the extracted nodes before the first 'If' node which carries the main denoising loop. - for key in reversed(node_dict): - alias, node = node_dict[key] - # Rename output of the node. - out_name = node.output[0] - node.output.remove(node.output[0]) - node.output.insert(0, alias) - # Insert node into the main graph. - graph.node.insert(i, node) - # Rename value info of the output. - for v in graph.value_info: - if v.name == out_name: - v.name = alias - break - _verbose(f'| extract conditioner projection: \'{node.name}\'') - break + toplevel_if_idx = i + toplevel_if_node = n + if toplevel_if_node is not None: + for a in toplevel_if_node.attribute: + b = onnx.helper.get_attribute_value(a) + _extract_conv_nodes_recursive(b) + # Insert the extracted nodes before the first 'If' node which carries the main denoising loop. + for key in reversed(node_dict): + alias, node = node_dict[key] + # Rename output of the node. + out_name = node.output[0] + node.output.remove(node.output[0]) + node.output.insert(0, alias) + # Insert node into the main graph. + graph.node.insert(toplevel_if_idx, node) + # Rename value info of the output. 
+ for v in graph.value_info: + if v.name == out_name: + v.name = alias + break + _verbose(f'| extract conditioner projection: \'{node.name}\'') def graph_remove_unused_values(graph: GraphProto): From 1a8fb72db3db0983c1a0e0dfa0be2ecebcdde9e5 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 22 Sep 2023 01:19:32 +0800 Subject: [PATCH 32/33] Add missing logic to ONNX --- deployment/modules/diffusion.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deployment/modules/diffusion.py b/deployment/modules/diffusion.py index 3c139f649..c8a03fe5a 100644 --- a/deployment/modules/diffusion.py +++ b/deployment/modules/diffusion.py @@ -105,7 +105,9 @@ def forward(self, condition, x_start=None, depth: int = 1000, speedup: int = 1): x_start = self.norm_spec(x_start).transpose(-2, -1) if self.num_feats == 1: x_start = x_start[:, None, :, :] - if depth > 0: + if depth >= self.timesteps: + x = noise + elif depth > 0: x = self.q_sample( x_start, torch.full((1,), depth - 1, device=device, dtype=torch.long), noise ) From acf00e483fdc45e080d7b9d4c42f979964da1a4f Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 22 Sep 2023 21:43:03 +0800 Subject: [PATCH 33/33] Rename `diff_depth` to `K_step_infer` --- configs/acoustic.yaml | 2 +- configs/templates/config_acoustic.yaml | 2 +- modules/diffusion/ddpm.py | 2 +- scripts/infer.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 174cd9943..92c7aa7f9 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -78,7 +78,7 @@ schedule_type: 'linear' # shallow diffusion use_shallow_diffusion: false -diff_depth: 400 +K_step_infer: 400 shallow_diffusion_args: train_aux_decoder: true diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 12d0b1dba..72e3c2dfd 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -46,7 +46,7 @@ residual_layers: 20 # shallow diffusion use_shallow_diffusion: false -diff_depth: 400 +K_step_infer: 400 shallow_diffusion_args: train_aux_decoder: true train_diffusion: true diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py index 7c4215bf1..d8bdc4442 100644 --- a/modules/diffusion/ddpm.py +++ b/modules/diffusion/ddpm.py @@ -223,7 +223,7 @@ def p_losses(self, x_start, t, cond, noise=None): return x_recon, noise def inference(self, cond, b=1, x_start=None, device=None): - depth = hparams.get('diff_depth', self.k_step) + depth = hparams.get('K_step_infer', self.k_step) noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device) if self.use_shallow_diffusion: t_max = min(depth, self.k_step) diff --git a/scripts/infer.py b/scripts/infer.py index 9108ff353..8c6e6e835 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -111,9 +111,9 @@ def acoustic( if depth >= 0: assert depth <= hparams['K_step'], f'Diffusion depth should not be larger than K_step {hparams["K_step"]}.' - hparams['diff_depth'] = depth + hparams['K_step_infer'] = depth elif hparams.get('use_shallow_diffusion', False): - depth = hparams['diff_depth'] + depth = hparams['K_step_infer'] else: depth = hparams['K_step'] # gaussian start (full depth diffusion)
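
Taken together, the shallow diffusion patches change how inference picks its starting point: instead of always denoising from pure Gaussian noise, the aux decoder's rough mel prediction can be diffused forward by the configured depth (K_step_infer, renamed from diff_depth in PATCH 33) and handed to the reverse process. Below is a minimal sketch of that branching, following the q_sample and forward logic added to deployment/modules/diffusion.py; the wrapper name is hypothetical, and normalization plus axis reshaping of the aux mel are omitted for brevity:

    import torch

    def choose_diffusion_start(diffusion, aux_mel, depth: int):
        # aux_mel: aux decoder output, already normalized to the model's
        # spec range and laid out as [B, 1, M, T] like the denoiser input.
        noise = torch.randn_like(aux_mel)
        if depth >= diffusion.timesteps:
            # Full-depth diffusion degenerates to the classic Gaussian start.
            return noise
        if depth <= 0:
            # Depth 0 keeps the aux prediction untouched.
            return aux_mel
        # Diffuse the aux prediction forward to step (depth - 1); the reverse
        # process then only needs to denoise the remaining `depth` steps.
        t = torch.full((aux_mel.shape[0],), depth - 1,
                       dtype=torch.long, device=aux_mel.device)
        return diffusion.q_sample(aux_mel, t, noise)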
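
The refactor in PATCH 29 replaces the string-keyed class map of shallow_adapter.py with an explicit registry plus a normalization-aware adaptor. Here is a hypothetical usage of that API; AuxDecoderAdaptor and the aux_decoder_args keys come from modules/aux_decoder/__init__.py and the config template, while the hidden size, mel dimension, sequence length, and spec range below are illustrative assumptions:

    import torch
    from modules.aux_decoder import AuxDecoderAdaptor

    adaptor = AuxDecoderAdaptor(
        in_dims=256, out_dims=128, num_feats=1,       # assumed hidden/mel sizes
        spec_min=[-5.0] * 128, spec_max=[0.0] * 128,  # assumed mel value range
        aux_decoder_arch='convnext',
        aux_decoder_args={
            'num_channels': 512,   # defaults from config_acoustic.yaml
            'num_layers': 6,
            'kernel_size': 7,
            'dropout_rate': 0.1,
        },
    )
    condition = torch.randn(1, 100, 256)      # [B, T, H] encoder output
    aux_mel = adaptor(condition, infer=True)  # denormalized mel, [B, T, M]

During training the adaptor skips denorm_spec, which is why acoustic_task.py now normalizes the ground truth with norm_spec(target) before computing the L1 aux loss.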
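
The k/b normalization introduced in these patches appears in AuxDecoderAdaptor, in GaussianDiffusionONNX, and implicitly in the training loss; keeping the three in sync is what lets the aux mel seed q_sample directly. A quick worked check that the form is exactly invertible (the range values here are examples):

    import torch

    spec_min, spec_max = -5.0, 0.0  # example range; real values come from hparams
    k = (spec_max - spec_min) / 2.  # 2.5
    b = (spec_max + spec_min) / 2.  # -2.5
    x = torch.rand(4) * (spec_max - spec_min) + spec_min
    normed = (x - b) / k                      # norm_spec: [spec_min, spec_max] -> [-1, 1]
    assert torch.allclose(normed * k + b, x)  # denorm_spec restores the original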
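
Finally, a rough sketch of the graph merge performed by _merge_fs2_aux_diffusion_graphs when shallow diffusion is enabled. The real exporter first prefixes dimension names via onnx_helper.model_add_prefixes and passes extra metadata kwargs; both are omitted here, and the file names are placeholders:

    import onnx
    import onnx.compose

    fs2_aux = onnx.load('fs2_aux.onnx')      # outputs: condition, aux_mel
    diffusion = onnx.load('diffusion.onnx')  # inputs: condition, x_start, speedup, ...

    merged = onnx.compose.merge_models(
        fs2_aux, diffusion,
        io_map=[
            ('condition', 'condition'),  # encoder condition feeds the denoiser
            ('aux_mel', 'x_start'),      # aux mel seeds the shallow start
        ],
        prefix1='', prefix2='',
    )
    onnx.save(merged, 'acoustic.onnx')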