From d7427586b6fccbdf29f4741a6ac31fc7a86e571d Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 01:58:05 +0800 Subject: [PATCH 01/15] Drop support for discrete F0 embedding (reserved in ONNX exporter) --- configs/acoustic.yaml | 1 - deployment/modules/fastspeech2.py | 17 ++++++++++++++--- docs/ConfigurationSchemas.md | 16 ---------------- modules/fastspeech/acoustic_encoder.py | 18 +++--------------- utils/pitch_utils.py | 19 ------------------- 5 files changed, 17 insertions(+), 54 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index f32b878e3..974373cc7 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -56,7 +56,6 @@ energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 use_spk_id: false -f0_embed_type: continuous use_energy_embed: false use_breathiness_embed: false use_key_shift_embed: false diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index 52f6a4d28..b2ee086c4 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -1,17 +1,22 @@ import copy +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from modules.commons.common_layers import NormalInitEmbedding as Embedding from modules.fastspeech.acoustic_encoder import FastSpeech2Acoustic from modules.fastspeech.variance_encoder import FastSpeech2Variance from utils.hparams import hparams -from utils.pitch_utils import ( - f0_bin, f0_mel_min, f0_mel_max -) from utils.text_encoder import PAD_INDEX +f0_bin = 256 +f0_max = 1100.0 +f0_min = 50.0 +f0_mel_min = 1127 * np.log(1 + f0_min / 700) +f0_mel_max = 1127 * np.log(1 + f0_max / 700) + def f0_to_coarse(f0): f0_mel = 1127 * (1 + f0 / 700).log() @@ -38,6 +43,12 @@ def forward(self, dur): class FastSpeech2AcousticONNX(FastSpeech2Acoustic): def __init__(self, vocab_size): super().__init__(vocab_size=vocab_size) + + # for temporary compatibility; will be completely removed in the future + self.f0_embed_type = 
hparams.get('f0_embed_type', 'discrete') + if self.f0_embed_type == 'discrete': + self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) + self.lr = LengthRegulator() if hparams.get('use_key_shift_embed', False): self.shift_min, self.shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index f9fdf60cb..8f2abe364 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -648,22 +648,6 @@ Length of sinusoidal smoothing convolution kernel (in seconds) on extracted ener default0.12 -### f0_embed_type - -Map f0 to embedding using: - -- `torch.nn.Linear` if 'continuous' -- `torch.nn.Embedding` if 'discrete' - - - - - - - - -
visibilityacoustic
scopenn
customizabilitynormal
typestr
defaultcontinuous
constraintsChoose from 'continuous', 'discrete'.
- ### f0_max Maximum base frequency (F0) in Hz for pitch extraction. diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 61b04a4fa..5d1ab1699 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -8,7 +8,6 @@ ) from modules.fastspeech.tts_modules import FastSpeech2Encoder, mel2ph_to_dur from utils.hparams import hparams -from utils.pitch_utils import f0_to_coarse from utils.text_encoder import PAD_INDEX @@ -25,14 +24,7 @@ def __init__(self, vocab_size): use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) - self.f0_embed_type = hparams.get('f0_embed_type', 'discrete') - if self.f0_embed_type == 'discrete': - self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) - elif self.f0_embed_type == 'continuous': - self.pitch_embed = Linear(1, hparams['hidden_size']) - else: - raise ValueError('f0_embed_type must be \'discrete\' or \'continuous\'.') - + self.pitch_embed = Linear(1, hparams['hidden_size']) self.variance_embed_list = [] self.use_energy_embed = hparams.get('use_energy_embed', False) self.use_breathiness_embed = hparams.get('use_breathiness_embed', False) @@ -100,12 +92,8 @@ def forward( spk_embed = self.spk_embed(spk_embed_id)[:, None, :] condition += spk_embed - if self.f0_embed_type == 'discrete': - pitch = f0_to_coarse(f0) - pitch_embed = self.pitch_embed(pitch) - else: - f0_mel = (1 + f0 / 700).log() - pitch_embed = self.pitch_embed(f0_mel[:, :, None]) + f0_mel = (1 + f0 / 700).log() + pitch_embed = self.pitch_embed(f0_mel[:, :, None]) condition += pitch_embed condition = self.forward_variance_embedding( diff --git a/utils/pitch_utils.py b/utils/pitch_utils.py index fe706f94f..57ae943f7 100644 --- a/utils/pitch_utils.py +++ b/utils/pitch_utils.py @@ -1,23 +1,4 @@ import numpy as np -import torch - -f0_bin = 256 -f0_max = 1100.0 -f0_min = 50.0 -f0_mel_min = 1127 * np.log(1 + f0_min / 700) -f0_mel_max = 1127 * np.log(1 + f0_max / 700) - 
- -def f0_to_coarse(f0): - is_torch = isinstance(f0, torch.Tensor) - f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 - - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 - f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) - assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) - return f0_coarse def norm_f0(f0, uv=None): From 9505b5496d70a3403d06fe8841b8f32a76d551cd Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 02:15:21 +0800 Subject: [PATCH 02/15] Drop support for `interp_uv` configuration key --- augmentation/spec_stretch.py | 2 +- configs/acoustic.yaml | 1 - docs/ConfigurationSchemas.md | 12 ------------ preprocessing/acoustic_binarizer.py | 2 +- 4 files changed, 2 insertions(+), 15 deletions(-) diff --git a/augmentation/spec_stretch.py b/augmentation/spec_stretch.py index 5d90518ec..8944f56a6 100644 --- a/augmentation/spec_stretch.py +++ b/augmentation/spec_stretch.py @@ -48,7 +48,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) ).cpu().numpy() f0, _ = self.pe.get_pitch( - wav, aug_item['length'], hparams, speed=speed, interp_uv=hparams['interp_uv'] + wav, aug_item['length'], hparams, speed=speed, interp_uv=True ) aug_item['f0'] = f0.astype(np.float32) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 974373cc7..99beb58c4 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -51,7 +51,6 @@ spec_min: [-5] spec_max: [0] mel_vmin: -6. #-6. 
mel_vmax: 1.5 -interp_uv: true energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index 8f2abe364..fa7d56413 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -856,18 +856,6 @@ Hop size or step length (in number of waveform samples) of mel and feature extra default512 -### interp_uv - -Whether to apply linear interpolation to unvoiced parts in f0. - - - - - - - -
visibilityacoustic
scopepreprocessing
customizabilityreserved
typeboolean
defaulttrue
- ### lambda_aux_mel_loss Coefficient of aux mel loss when calculating total loss of acoustic model with shallow diffusion. diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index c6cf48b24..fdc87a22a 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -108,7 +108,7 @@ def process_item(self, item_name, meta_data, binarization_args): if pitch_extractor is None: pitch_extractor = initialize_pe() gt_f0, uv = pitch_extractor.get_pitch( - wav, length, hparams, interp_uv=hparams['interp_uv'] + wav, length, hparams, interp_uv=True ) if uv.all(): # All unvoiced print(f'Skipped \'{item_name}\': empty gt f0') From c56f3f35f2591a75ffe9772121c5b37bd92feecc Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 17:40:39 +0800 Subject: [PATCH 03/15] Drop support for `train_set_name` and `valid_set_name` configuration keys --- basics/base_task.py | 4 ++-- configs/base.yaml | 2 -- docs/ConfigurationSchemas.md | 24 ------------------------ 3 files changed, 2 insertions(+), 28 deletions(-) diff --git a/basics/base_task.py b/basics/base_task.py index 8a180e0b0..38e83386b 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -87,8 +87,8 @@ def _finish_init(self): # Training, validation and testing ########### def setup(self, stage): - self.train_dataset = self.dataset_cls(hparams['train_set_name']) - self.valid_dataset = self.dataset_cls(hparams['valid_set_name']) + self.train_dataset = self.dataset_cls('train') + self.valid_dataset = self.dataset_cls('valid') self.num_replicas = (self.trainer.distributed_sampler_kwargs or {}).get('num_replicas', 1) def get_need_freeze_state_dict_key(self, model_state_dict) -> list: diff --git a/configs/base.yaml b/configs/base.yaml index 57ec1c583..1171e85e2 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -67,8 +67,6 @@ max_batch_frames: 32000 max_batch_size: 100000 max_val_batch_frames: 60000 max_val_batch_size: 1 -train_set_name: 'train' 
-valid_set_name: 'valid' pe: 'parselmouth' pe_ckpt: '' f0_min: 65 diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index fa7d56413..e47cf7a38 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -1731,18 +1731,6 @@ Total number of diffusion steps. default1000 -### train_set_name - -Name of the training set used in binary filenames, TensorBoard keys, etc. - - - - - - - -
visibilityall
scopepreprocessing, training
customizabilityreserved
typestr
defaulttrain
- ### use_breathiness_embed Whether to accept and embed breathiness values into the model. @@ -1877,18 +1865,6 @@ Whether to load and use the vocoder to generate audio during validation. Validat defaulttrue -### valid_set_name - -Name of the validation set used in binary filenames, TensorBoard keys, etc. - - - - - - - -
visibilityall
scopepreprocessing, training
customizabilityreserved
typestr
defaultvalid
- ### variances_prediction_args Arguments for prediction of variance parameters other than pitch, like energy, breathiness, etc. From 9f79b75d46edb7132335d78efc37517a71cb5c4f Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 17:47:36 +0800 Subject: [PATCH 04/15] Drop support for linear domain of random time stretching augmentation --- configs/acoustic.yaml | 1 - configs/templates/config_acoustic.yaml | 1 - docs/ConfigurationSchemas.md | 16 ---------------- preprocessing/acoustic_binarizer.py | 11 ++--------- 4 files changed, 2 insertions(+), 27 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 99beb58c4..fd85e990c 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -39,7 +39,6 @@ augmentation_args: random_time_stretching: enabled: false range: [0.5, 2.] - domain: log # or linear scale: 0.75 raw_data_dir: 'data/opencpop/raw' diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index ef32ef5ac..84f6a7079 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -37,7 +37,6 @@ augmentation_args: random_time_stretching: enabled: true range: [0.5, 2.] - domain: log # or linear scale: 0.75 residual_channels: 512 diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index e47cf7a38..723fe9129 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -165,22 +165,6 @@ Arguments for random time stretching augmentation. typedict -### augmentation_args.random_time_stretching.domain - -The domain where random time stretching factors are uniformly distributed in. - -- If 'linear', stretching ratio $x$ will be uniformly distributed in $[V_{min}, V_{max}]$. -- If 'log', $\ln{x}$ will be uniformly distributed in $[\ln{V_{min}}, \ln{V_{max}}]$. - - - - - - - - -
visibilityacoustic
scopepreprocessing
customizabilitynot recommended
typestr
defaultlog
constraintChoose from 'log', 'linear'.
- ### augmentation_args.random_time_stretching.enabled Whether to apply random time stretching augmentation. diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index fdc87a22a..7ec1ae6f0 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -225,12 +225,10 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_time_stretching'] speed_min, speed_max = aug_args['range'] - domain = aug_args['domain'] assert hparams.get('use_speed_embed', False), \ 'Random time stretching augmentation requires use_speed_embed == True.' assert 0 < speed_min < 1 < speed_max, \ 'Random time stretching augmentation must have a range where 0 < min < 1 < max.' - assert domain in ['log', 'linear'], 'domain must be \'log\' or \'linear\'.' aug_ins = SpectrogramStretchAugmentation(self.raw_data_dirs, aug_args, pe=aug_pe) scale = aug_args['scale'] @@ -241,13 +239,8 @@ def arrange_data_augmentation(self, data_iterator): aug_items = random.choices(all_item_names, k=k_from_raw) + random.choices(aug_list, k=k_from_aug + k_mutate) for aug_type, aug_item in zip(aug_types, aug_items): - if domain == 'log': - # Uniform distribution in log domain - speed = speed_min * (speed_max / speed_min) ** random.random() - else: - # Uniform distribution in linear domain - rand = random.uniform(-1, 1) - speed = 1 + (speed_max - 1) * rand if rand >= 0 else 1 + (1 - speed_min) * rand + # Uniform distribution in log domain + speed = speed_min * (speed_max / speed_min) ** random.random() if aug_type == 0: aug_task = { 'name': aug_item, From 996786721973fa956adc452aefadb408df66ccc8 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 18:04:33 +0800 Subject: [PATCH 05/15] Drop support for `num_pad_tokens` configuration key --- configs/acoustic.yaml | 1 - configs/variance.yaml | 1 - docs/BestPractices.md | 6 +----- 
docs/ConfigurationSchemas.md | 14 -------------- utils/text_encoder.py | 23 ++++------------------- 5 files changed, 5 insertions(+), 40 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index fd85e990c..f3f8b40d5 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -45,7 +45,6 @@ raw_data_dir: 'data/opencpop/raw' binary_data_dir: 'data/opencpop/binary' binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer dictionary: dictionaries/opencpop-extension.txt -num_pad_tokens: 1 spec_min: [-5] spec_max: [0] mel_vmin: -6. #-6. diff --git a/configs/variance.yaml b/configs/variance.yaml index 1951f685e..4a780bdd8 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -29,7 +29,6 @@ raw_data_dir: 'data/opencpop_variance/raw' binary_data_dir: 'data/opencpop_variance/binary' binarizer_cls: preprocessing.variance_binarizer.VarianceBinarizer dictionary: dictionaries/opencpop-extension.txt -num_pad_tokens: 1 use_spk_id: false diff --git a/docs/BestPractices.md b/docs/BestPractices.md index b142a3873..23829de79 100644 --- a/docs/BestPractices.md +++ b/docs/BestPractices.md @@ -132,11 +132,7 @@ Once the coverage checks passed, a phoneme distribution summary will be saved in ![phoneme-distribution](resources/phoneme-distribution.jpg) -During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. By default, there are one padding index before all real phonemes IDs. You may edit the number of padding indices, but it is not recommended to do so: - -```yaml -num_pad_tokens: 1 -``` +During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. There are one padding index (marked as `default2 -### num_pad_tokens - -Number of padding phoneme indexes before all real tokens. - -Due to some historical reasons, old checkpoints may have 3 padding tokens called \, \ and \. 
After refactoring, all padding tokens are called \, and only the first one (token == 0) will be used. - - - - - - - -
visibilityacoustic, variance
scopenn, preprocess
customizabilitynot recommended
typeint
default1
- ### num_sanity_val_steps Number of sanity validation steps at the beginning. diff --git a/utils/text_encoder.py b/utils/text_encoder.py index 605b7e80e..4b7815c46 100644 --- a/utils/text_encoder.py +++ b/utils/text_encoder.py @@ -1,19 +1,9 @@ import numpy as np -from utils.hparams import hparams - PAD = '' PAD_INDEX = 0 -def strip_ids(ids, ids_to_strip): - """Strip ids_to_strip from the end ids.""" - ids = list(ids) - while ids and ids[-1] in ids_to_strip: - ids.pop() - return ids - - class TokenTextEncoder: """Encoder based on a user-supplied vocabulary (file or list).""" @@ -26,30 +16,25 @@ def __init__(self, vocab_list): Args: vocab_list: If not None, a list of elements of the vocabulary. """ - self.num_reserved_ids = hparams.get('num_pad_tokens', 3) - assert self.num_reserved_ids > 0, 'num_pad_tokens must be positive' self.vocab_list = sorted(vocab_list) def encode(self, sentence): """Converts a space-separated string of phones to a list of ids.""" phones = sentence.strip().split() if isinstance(sentence, str) else sentence - return [self.vocab_list.index(ph) + self.num_reserved_ids if ph != PAD else PAD_INDEX for ph in phones] + return [self.vocab_list.index(ph) + 1 if ph != PAD else PAD_INDEX for ph in phones] def decode(self, ids, strip_padding=False): if strip_padding: ids = np.trim_zeros(ids) ids = list(ids) return ' '.join([ - self.vocab_list[_id - self.num_reserved_ids] if _id >= self.num_reserved_ids else PAD + self.vocab_list[_id - 1] if _id >= 1 else PAD for _id in ids ]) - def pad(self): - pass - @property def vocab_size(self): - return len(self.vocab_list) + self.num_reserved_ids + return len(self.vocab_list) + 1 def __len__(self): return self.vocab_size @@ -64,5 +49,5 @@ def store_to_file(self, filename): filename: Full path of the file to store the vocab to. 
""" with open(filename, 'w', encoding='utf8') as f: - [print(PAD, file=f) for _ in range(self.num_reserved_ids)] + print(PAD, file=f) [print(tok, file=f) for tok in self.vocab_list] From a09221e072e6c559333083771c4dd034f550e3e8 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 18:50:53 +0800 Subject: [PATCH 06/15] Drop support for code backup before training --- basics/base_task.py | 10 ---------- configs/base.yaml | 5 ----- docs/ConfigurationSchemas.md | 12 ------------ 3 files changed, 27 deletions(-) diff --git a/basics/base_task.py b/basics/base_task.py index 38e83386b..e176f2c9a 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -3,7 +3,6 @@ import pathlib import shutil import sys -from datetime import datetime from typing import Dict import matplotlib @@ -16,7 +15,6 @@ import torch.utils.data from torchmetrics import Metric, MeanMetric import lightning.pytorch as pl -from lightning.pytorch.callbacks import LearningRateMonitor from lightning.pytorch.utilities.rank_zero import rank_zero_debug, rank_zero_info, rank_zero_only from basics.base_module import CategorizedModule @@ -451,14 +449,6 @@ def start(cls): if not hparams['infer']: # train @rank_zero_only def train_payload_copy(): - # copy_code = input(f'{hparams["save_codes"]} code backup? 
y/n: ') == 'y' - copy_code = True # backup code every time - if copy_code: - code_dir = work_dir / 'codes' / datetime.now().strftime('%Y%m%d%H%M%S') - code_dir.mkdir(exist_ok=True, parents=True) - for c in hparams['save_codes']: - shutil.copytree(c, code_dir / c, dirs_exist_ok=True) - print(f'| Copied codes to {code_dir}.') # Copy spk_map.json and dictionary.txt to work dir binary_dir = pathlib.Path(hparams['binary_data_dir']) spk_map = work_dir / 'spk_map.json' diff --git a/configs/base.yaml b/configs/base.yaml index 1171e85e2..34708c275 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,11 +1,6 @@ # task task_cls: null seed: 1234 -save_codes: - - configs - - modules - - training - - utils ############# # dataset diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index 5ee45f1a4..bd7ac7697 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -1487,18 +1487,6 @@ Training performance on some datasets may be very sensitive to this value. Chang default6 -### save_codes - -Files in these folders will be backed up every time a training starts. - - - - - - - -
visibilityall
scopetraining
customizabilitynormal
typelist
default[configs, modules, training, utils]
- ### schedule_type The diffusion schedule type. From bae5779e5d5a18415ae659a602bccc16711f705d Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 20:18:53 +0800 Subject: [PATCH 07/15] Drop support for `ffn_padding` configuration key --- configs/base.yaml | 1 - docs/ConfigurationSchemas.md | 14 +--- modules/commons/common_layers.py | 89 ++++++-------------------- modules/fastspeech/acoustic_encoder.py | 3 +- modules/fastspeech/tts_modules.py | 62 ++++++------------ modules/fastspeech/variance_encoder.py | 7 +- 6 files changed, 44 insertions(+), 132 deletions(-) diff --git a/configs/base.yaml b/configs/base.yaml index 34708c275..0fb73eaaa 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -31,7 +31,6 @@ enc_layers: 4 num_heads: 2 enc_ffn_kernel_size: 9 ffn_act: gelu -ffn_padding: 'SAME' use_spk_id: false ########### diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index bd7ac7697..6e3799f04 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -673,18 +673,6 @@ Activation function of TransformerFFNLayer in FastSpeech2 encoder: constraintsChoose from 'relu', 'gelu', 'swish'. -### ffn_padding - -Padding mode of TransformerFFNLayer convolution in FastSpeech2 encoder. - - - - - - - -
visibilityacoustic, variance
scopenn
customizabilitynot recommended
typestr
defaultSAME
- ### fft_size Fast Fourier Transforms parameter for mel extraction. @@ -1042,7 +1030,7 @@ Minimum mel spectrogram heatmap value for TensorBoard plotting. ### melody_encoder_args -Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, `enc_ffn_kernel_size`, `ffn_padding`, `ffn_act`, `dropout`, `num_heads`, `use_pos_embed`, `rel_pos`. If either of the parameter does not exist in this configuration key, it inherits from the linguistic encoder. +Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, `enc_ffn_kernel_size`, `ffn_act`, `dropout`, `num_heads`, `use_pos_embed`, `rel_pos`. If either of the parameter does not exist in this configuration key, it inherits from the linguistic encoder. diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 7e5fd08d4..9ea2c2638 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -104,35 +104,13 @@ def max_positions(): return int(1e5) # an arbitrary large number -class ConvTBC(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, padding=0): - super(ConvTBC, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.padding = padding - - self.weight = torch.nn.Parameter(torch.Tensor( - self.kernel_size, in_channels, out_channels)) - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) - - def forward(self, x): - return torch.conv_tbc(x.contiguous(), self.weight, self.bias, self.padding) - - class TransformerFFNLayer(nn.Module): - def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'): + def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0., act='gelu'): super().__init__() self.kernel_size = kernel_size self.dropout = dropout self.act = act - if padding == 'SAME': - self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) - elif padding 
== 'LEFT': - self.ffn_1 = nn.Sequential( - nn.ConstantPad1d((kernel_size - 1, 0), 0.0), - nn.Conv1d(hidden_size, filter_size, kernel_size) - ) + self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) if self.act == 'relu': self.act_fn = ReLU() elif self.act == 'gelu': @@ -152,44 +130,18 @@ def forward(self, x): return x -class BatchNorm1dTBC(nn.Module): - def __init__(self, c): - super(BatchNorm1dTBC, self).__init__() - self.bn = nn.BatchNorm1d(c) - - def forward(self, x): - """ - - :param x: [T, B, C] - :return: [T, B, C] - """ - x = x.permute(1, 2, 0) # [B, C, T] - x = self.bn(x) # [B, C, T] - x = x.permute(2, 0, 1) # [T, B, C] - return x - - class EncSALayer(nn.Module): def __init__(self, c, num_heads, dropout, attention_dropout=0.1, - relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'): + relu_dropout=0.1, kernel_size=9, act='gelu'): super().__init__() - self.c = c self.dropout = dropout - self.num_heads = num_heads - if num_heads > 0: - if norm == 'ln': - self.layer_norm1 = LayerNorm(c) - elif norm == 'bn': - self.layer_norm1 = BatchNorm1dTBC(c) - self.self_attn = MultiheadAttention( - self.c, num_heads, dropout=attention_dropout, bias=False, - ) - if norm == 'ln': - self.layer_norm2 = LayerNorm(c) - elif norm == 'bn': - self.layer_norm2 = BatchNorm1dTBC(c) + self.layer_norm1 = LayerNorm(c) + self.self_attn = MultiheadAttention( + c, num_heads, dropout=attention_dropout, bias=False, + ) + self.layer_norm2 = LayerNorm(c) self.ffn = TransformerFFNLayer( - c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act + c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act ) def forward(self, x, encoder_padding_mask=None, **kwargs): @@ -197,18 +149,17 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): if layer_norm_training is not None: self.layer_norm1.training = layer_norm_training self.layer_norm2.training = layer_norm_training - if self.num_heads > 0: - residual = x 
- x = self.layer_norm1(x) - x, _, = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=encoder_padding_mask - ) - x = F.dropout(x, self.dropout, training=self.training) - x = residual + x - x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + residual = x + x = self.layer_norm1(x) + x, _, = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask + ) + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] residual = x x = self.layer_norm2(x) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 5d1ab1699..494354cb2 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -18,8 +18,7 @@ def __init__(self, vocab_size): self.dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], - ffn_kernel_size=hparams['enc_ffn_kernel_size'], - ffn_padding=hparams['ffn_padding'], ffn_act=hparams['ffn_act'], + ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index bc5eff265..5b31909b2 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -4,7 +4,7 @@ import torch.nn as nn from torch.nn import functional as F -from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer, BatchNorm1dTBC +from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer from modules.commons.espnet_positional_embedding import RelPositionalEncoding DEFAULT_MAX_SOURCE_POSITIONS = 2000 @@ -12,14 +12,13 @@ class TransformerEncoderLayer(nn.Module): - 
def __init__(self, hidden_size, dropout, kernel_size=None, padding='SAME', act='gelu', num_heads=2, norm='ln'): + def __init__(self, hidden_size, dropout, kernel_size=None, act='gelu', num_heads=2): super().__init__() self.op = EncSALayer( hidden_size, num_heads, dropout=dropout, attention_dropout=0.0, relu_dropout=dropout, kernel_size=kernel_size, - padding=padding, - norm=norm, act=act + act=act ) def forward(self, x, **kwargs): @@ -63,7 +62,7 @@ class DurationPredictor(torch.nn.Module): """ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, - dropout_rate=0.1, offset=1.0, padding='SAME', dur_loss_type='mse'): + dropout_rate=0.1, offset=1.0, dur_loss_type='mse'): """Initialize duration predictor module. Args: in_dims (int): Input dimension. @@ -77,18 +76,14 @@ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, self.offset = offset self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.loss_type = dur_loss_type if self.loss_type in ['mse', 'huber']: @@ -141,7 +136,7 @@ def forward(self, xs, x_masks=None, infer=True): class VariancePredictor(torch.nn.Module): def __init__(self, vmin, vmax, in_dims, n_layers=5, n_chans=512, kernel_size=5, - dropout_rate=0.1, padding='SAME'): + dropout_rate=0.1): """Initialize variance predictor module. Args: in_dims (int): Input dimension. 
@@ -156,18 +151,14 @@ def __init__(self, vmin, vmax, in_dims, self.vmax = vmax self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.linear = torch.nn.Linear(n_chans, 1) self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096) self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) @@ -195,7 +186,7 @@ def forward(self, xs, infer=True): class PitchPredictor(torch.nn.Module): def __init__(self, vmin, vmax, num_bins, deviation, in_dims, n_layers=5, n_chans=384, kernel_size=5, - dropout_rate=0.1, padding='SAME'): + dropout_rate=0.1): """Initialize pitch predictor module. Args: in_dims (int): Input dimension. 
@@ -214,18 +205,14 @@ def __init__(self, vmin, vmax, num_bins, deviation, self.base_pitch_embed = torch.nn.Linear(1, in_dims) self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.linear = torch.nn.Linear(n_chans, num_bins) self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096) self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) @@ -364,31 +351,23 @@ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None): class FastSpeech2Encoder(nn.Module): def __init__(self, embed_tokens, hidden_size, num_layers, - ffn_kernel_size=9, ffn_padding='SAME', ffn_act='gelu', - dropout=None, num_heads=2, use_last_norm=True, norm='ln', - use_pos_embed=True, rel_pos=True): + ffn_kernel_size=9, ffn_act='gelu', + dropout=None, num_heads=2, use_pos_embed=True, rel_pos=True): super().__init__() self.num_layers = num_layers embed_dim = self.hidden_size = hidden_size self.dropout = dropout self.use_pos_embed = use_pos_embed - self.use_last_norm = use_last_norm self.layers = nn.ModuleList([ TransformerEncoderLayer( self.hidden_size, self.dropout, - kernel_size=ffn_kernel_size, padding=ffn_padding, act=ffn_act, + kernel_size=ffn_kernel_size, act=ffn_act, num_heads=num_heads ) for _ in range(self.num_layers) ]) - if self.use_last_norm: - if norm == 'ln': - self.layer_norm = nn.LayerNorm(embed_dim) - elif norm == 'bn': - self.layer_norm = BatchNorm1dTBC(embed_dim) - else: - self.layer_norm = None + self.layer_norm = 
nn.LayerNorm(embed_dim) self.embed_tokens = embed_tokens # redundant, but have to persist for compatibility with old checkpoints self.embed_scale = math.sqrt(hidden_size) @@ -438,8 +417,7 @@ def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_ for layer in self.layers: x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB hiddens.append(x) - if self.use_last_norm: - x = self.layer_norm(x) * nonpadding_mask_TB + x = self.layer_norm(x) * nonpadding_mask_TB if return_hiddens: x = torch.stack(hiddens, 0) # [L, T, B, C] x = x.transpose(1, 2) # [L, B, T, C] diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 8e2117f6b..82702b3b6 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -27,8 +27,7 @@ def __init__(self, vocab_size): self.encoder = FastSpeech2Encoder( self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], - ffn_kernel_size=hparams['enc_ffn_kernel_size'], - ffn_padding=hparams['ffn_padding'], ffn_act=hparams['ffn_act'], + ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) @@ -41,7 +40,6 @@ def __init__(self, vocab_size): n_chans=dur_hparams['hidden_size'], n_layers=dur_hparams['num_layers'], dropout_rate=dur_hparams['dropout'], - padding=hparams['ffn_padding'], kernel_size=dur_hparams['kernel_size'], offset=dur_hparams['log_offset'], dur_loss_type=dur_hparams['loss_type'] @@ -109,8 +107,7 @@ def get_hparam(key): self.encoder = FastSpeech2Encoder( None, hidden_size, num_layers=get_hparam('enc_layers'), - ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), - ffn_padding=get_hparam('ffn_padding'), ffn_act=get_hparam('ffn_act'), + ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), ffn_act=get_hparam('ffn_act'), 
dropout=get_hparam('dropout'), num_heads=get_hparam('num_heads'), use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos') ) From 204593946ed749368bc60535d01f2bcccaf43fab Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 20:37:08 +0800 Subject: [PATCH 08/15] Drop support for random seeding --- basics/base_binarizer.py | 1 - basics/base_task.py | 4 +--- configs/base.yaml | 1 - docs/ConfigurationSchemas.md | 12 ------------ 4 files changed, 1 insertion(+), 17 deletions(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index 38b191077..ddad6e02e 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -173,7 +173,6 @@ def process(self): self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names) if self.binarization_args['shuffle']: - random.seed(hparams['seed']) random.shuffle(self.item_names) self.binary_data_dir.mkdir(parents=True, exist_ok=True) diff --git a/basics/base_task.py b/basics/base_task.py index e176f2c9a..c933e2a99 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -342,8 +342,7 @@ def train_dataloader(self): size_reversed=True, required_batch_count_multiple=hparams['accumulate_grad_batches'], shuffle_sample=True, - shuffle_batch=True, - seed=hparams['seed'] + shuffle_batch=True ) return torch.utils.data.DataLoader( self.train_dataset, @@ -394,7 +393,6 @@ def on_test_end(self): @classmethod def start(cls): - pl.seed_everything(hparams['seed'], workers=True) task = cls() # if pre_train is not None: diff --git a/configs/base.yaml b/configs/base.yaml index 0fb73eaaa..4eed200e9 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,6 +1,5 @@ # task task_cls: null -seed: 1234 ############# # dataset diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index 6e3799f04..46f0f23f5 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -1488,18 +1488,6 @@ The diffusion schedule type.
typedict
constraintsChoose from 'linear', 'cosine'.
-### seed - -The global random seed used to shuffle data, initializing model weights, etc. - - - - - - - -
visibilityall
scopepreprocessing, training
customizabilitynormal
typeint
default1234
- ### shallow_diffusion_args Arguments for shallow_diffusion. From 1d2a9e80527347564ad7e12126d698ac3f9ff4c9 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 21:19:56 +0800 Subject: [PATCH 09/15] Add placeholder to load old checkpoint --- modules/fastspeech/tts_modules.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index 5b31909b2..590c4da92 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -79,6 +79,7 @@ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans self.conv.append(torch.nn.Sequential( + torch.nn.Identity(), # this is a placeholder for ConstantPad1d which is now merged into Conv1d torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), From 26e8f07f6b471217b6f2c6eff61c8ac4754b2832 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 7 Feb 2024 21:21:00 +0800 Subject: [PATCH 10/15] Remove duplicate txt_embed layer (resuming may raise errors) --- modules/fastspeech/acoustic_encoder.py | 2 +- modules/fastspeech/tts_modules.py | 3 +-- modules/fastspeech/variance_encoder.py | 4 ++-- utils/__init__.py | 6 +++++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 494354cb2..2099c9008 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -17,7 +17,7 @@ def __init__(self, vocab_size): self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) self.dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( - self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], + hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], 
ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index 590c4da92..1dd164d17 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -351,7 +351,7 @@ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None): class FastSpeech2Encoder(nn.Module): - def __init__(self, embed_tokens, hidden_size, num_layers, + def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, ffn_act='gelu', dropout=None, num_heads=2, use_pos_embed=True, rel_pos=True): super().__init__() @@ -370,7 +370,6 @@ def __init__(self, embed_tokens, hidden_size, num_layers, ]) self.layer_norm = nn.LayerNorm(embed_dim) - self.embed_tokens = embed_tokens # redundant, but have to persist for compatibility with old checkpoints self.embed_scale = math.sqrt(hidden_size) self.padding_idx = 0 self.rel_pos = rel_pos diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 82702b3b6..82e0a88e8 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -26,7 +26,7 @@ def __init__(self, vocab_size): self.ph_dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( - self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], + hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] @@ -106,7 +106,7 @@ def get_hparam(key): self.note_glide_embed = Embedding(len(hparams['glide_types']) + 1, hidden_size, padding_idx=0) self.encoder = FastSpeech2Encoder( - None, hidden_size, num_layers=get_hparam('enc_layers'), + 
hidden_size=hidden_size, num_layers=get_hparam('enc_layers'), ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), ffn_act=get_hparam('ffn_act'), dropout=get_hparam('dropout'), num_heads=get_hparam('num_heads'), use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos') diff --git a/utils/__init__.py b/utils/__init__.py index 356fef7a5..abb5df151 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -165,9 +165,12 @@ def filter_kwargs(dict_to_filter, kwarg_obj): def load_ckpt( cur_model, ckpt_base_dir, ckpt_steps=None, - prefix_in_ckpt='model', key_in_ckpt='state_dict', + prefix_in_ckpt='model', ignored_prefixes=None, key_in_ckpt='state_dict', strict=True, device='cpu' ): + if ignored_prefixes is None: + # NOTICE: this is for compatibility with old checkpoints which have duplicate txt_embed layer in them. + ignored_prefixes = ['model.fs2.encoder.embed_tokens'] if not isinstance(ckpt_base_dir, pathlib.Path): ckpt_base_dir = pathlib.Path(ckpt_base_dir) if ckpt_base_dir.is_file(): @@ -197,6 +200,7 @@ def load_ckpt( state_dict = OrderedDict({ k[len(prefix_in_ckpt) + 1:]: v for k, v in state_dict.items() if k.startswith(f'{prefix_in_ckpt}.') + if all(not k.startswith(p) for p in ignored_prefixes) }) if not strict: cur_model_state_dict = cur_model.state_dict() From 49c23602ce55a7c943211951633f8d9e9a8e7935 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 8 Feb 2024 15:03:44 +0800 Subject: [PATCH 11/15] Remove migration script and error message for transcriptions.txt --- preprocessing/acoustic_binarizer.py | 35 ++++++------- scripts/migrate.py | 78 ----------------------------- 2 files changed, 14 insertions(+), 99 deletions(-) delete mode 100644 scripts/migrate.py diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 7ec1ae6f0..bba3b7b6b 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -55,27 +55,20 @@ def __init__(self): def load_meta_data(self, raw_data_dir: 
pathlib.Path, ds_id, spk_id): meta_data_dict = {} - if (raw_data_dir / 'transcriptions.csv').exists(): - with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: - for utterance_label in csv.DictReader(f): - item_name = utterance_label['name'] - temp_dict = { - 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': utterance_label['ph_seq'].split(), - 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], - 'spk_id': spk_id, - 'spk_name': self.speakers[ds_id], - } - assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ - f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' - meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict - else: - raise FileNotFoundError( - f'transcriptions.csv not found in {raw_data_dir}. ' - 'If this is a dataset with the old transcription format, please consider ' - 'migrating it to the new format via the following command:\n' - 'python scripts/migrate.py txt ' - ) + with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: + for utterance_label in csv.DictReader(f): + item_name = utterance_label['name'] + temp_dict = { + 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), + 'ph_seq': utterance_label['ph_seq'].split(), + 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], + 'spk_id': spk_id, + 'spk_name': self.speakers[ds_id], + } + assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ + f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
+ meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict + self.items.update(meta_data_dict) @torch.no_grad() diff --git a/scripts/migrate.py b/scripts/migrate.py deleted file mode 100644 index f1125f3d5..000000000 --- a/scripts/migrate.py +++ /dev/null @@ -1,78 +0,0 @@ -import pathlib -from collections import OrderedDict - -import click - - -@click.group() -def main(): - pass - - -@main.command(help='Migrate checkpoint files of MIDI-less acoustic models from old format') -@click.argument('input_ckpt', metavar='INPUT') -@click.argument('output_ckpt', metavar='OUTPUT') -@click.option('--overwrite', is_flag=True, show_default=True, help='Overwrite the existing file') -def ckpt( - input_ckpt: str, - output_ckpt: str, - overwrite: bool = False -): - input_ckpt = pathlib.Path(input_ckpt).resolve() - output_ckpt = pathlib.Path(output_ckpt).resolve() - assert input_ckpt.exists(), 'The input file does not exist.' - assert overwrite or not output_ckpt.exists(), \ - 'The output file already exists or is the same as the input file.\n' \ - 'This is not recommended because migration scripts may not be stable, ' \ - 'and you may be at risk of losing your model.\n' \ - 'If you are sure to OVERWRITE the existing file, please re-run this script with the \'--overwrite\' argument.' 
- - import torch - ckpt_loaded = torch.load(input_ckpt, map_location='cpu') - if 'category' in ckpt_loaded: - print('This checkpoint file is already in the new format.') - exit(0) - state_dict: OrderedDict = ckpt_loaded['state_dict'] - ckpt_loaded['optimizer_states'][0]['state'].clear() - new_state_dict = OrderedDict() - for key in state_dict: - if key.startswith('model.fs2'): - # keep model.fs2.xxx - new_state_dict[key] = state_dict[key] - else: - # model.xxx => model.diffusion.xxx - path = key.split('.', maxsplit=1)[1] - new_state_dict[f'model.diffusion.{path}'] = state_dict[key] - ckpt_loaded['category'] = 'acoustic' - ckpt_loaded['state_dict'] = new_state_dict - torch.save(ckpt_loaded, output_ckpt) - - -@main.command(help='Migrate transcriptions.txt in old datasets to transcriptions.csv') -@click.argument('input_txt', metavar='INPUT') -def txt( - input_txt: str -): - input_txt = pathlib.Path(input_txt).resolve() - assert input_txt.exists(), 'The input file does not exist.' - with open(input_txt, 'r', encoding='utf8') as f: - utterances = f.readlines() - utterances = [u.split('|') for u in utterances] - utterances = [ - { - 'name': u[0], - 'ph_seq': u[2], - 'ph_dur': u[5] - } - for u in utterances - ] - - import csv - with open(input_txt.with_suffix('.csv'), 'w', encoding='utf8', newline='') as f: - writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur']) - writer.writeheader() - writer.writerows(utterances) - - -if __name__ == '__main__': - main() From 1bc60dd2f01d5c51afe50d08a6a50b3a459acca8 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 8 Feb 2024 15:38:54 +0800 Subject: [PATCH 12/15] Remove seed from batch shuffling --- utils/training_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/training_utils.py b/utils/training_utils.py index 98fb6da47..d406985cd 100644 --- a/utils/training_utils.py +++ b/utils/training_utils.py @@ -103,7 +103,7 @@ def __init__(self, dataset, max_batch_frames, max_batch_size, 
sub_indices=None, def __form_batches(self): if self.formed == self.epoch + self.seed: return - rng = np.random.default_rng(self.seed + self.epoch) + rng = np.random.default_rng() # Create indices if self.shuffle_sample: if self.sub_indices is not None: From 969e313529fba3c88380fb0f06fdc76751954fd1 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 24 Feb 2024 01:32:14 +0800 Subject: [PATCH 13/15] Use direct access on some hparam keys --- augmentation/spec_stretch.py | 4 ++-- basics/base_task.py | 6 +++--- deployment/exporters/acoustic_exporter.py | 6 +++--- deployment/modules/fastspeech2.py | 10 +++++----- inference/ds_acoustic.py | 4 ++-- modules/diffusion/ddpm.py | 6 +++--- modules/pe/__init__.py | 2 +- preprocessing/acoustic_binarizer.py | 8 ++++---- scripts/binarize.py | 2 +- scripts/infer.py | 2 +- utils/phoneme_utils.py | 2 +- utils/training_utils.py | 2 +- 12 files changed, 27 insertions(+), 27 deletions(-) diff --git a/augmentation/spec_stretch.py b/augmentation/spec_stretch.py index 9c22e363b..7eeeda2dc 100644 --- a/augmentation/spec_stretch.py +++ b/augmentation/spec_stretch.py @@ -38,7 +38,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) aug_item['mel'] = mel - if speed != 1. or hparams.get('use_speed_embed', False): + if speed != 1. or hparams['use_speed_embed']: aug_item['length'] = mel.shape[0] aug_item['speed'] = int(np.round(hparams['hop_size'] * speed)) / hparams['hop_size'] # real speed aug_item['seconds'] /= aug_item['speed'] @@ -83,7 +83,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) align_length=aug_item['length'] ) - if key_shift != 0. or hparams.get('use_key_shift_embed', False): + if key_shift != 0. 
or hparams['use_key_shift_embed']: if replace_spk_id is None: aug_item['key_shift'] = key_shift else: diff --git a/basics/base_task.py b/basics/base_task.py index c933e2a99..eabfcae6e 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -128,9 +128,9 @@ def load_finetune_ckpt( self.load_state_dict(state_dict, strict=False) def load_pre_train_model(self): - pre_train_ckpt_path = hparams.get('finetune_ckpt_path') - blacklist = hparams.get('finetune_ignored_params') - # whitelist=hparams.get('pre_train_whitelist') + pre_train_ckpt_path = hparams['finetune_ckpt_path'] + blacklist = hparams['finetune_ignored_params'] + # whitelist=hparams['pre_train_whitelist'] if blacklist is None: blacklist = [] # if whitelist is None: diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index ab4252469..d637f34f1 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -58,7 +58,7 @@ def __init__( if hparams['use_spk_id'] else None self.export_spk: List[Tuple[str, Dict[str, float]]] = export_spk \ if hparams['use_spk_id'] and export_spk is not None else [] - if hparams.get('use_key_shift_embed', False) and not self.expose_gender: + if hparams['use_key_shift_embed'] and not self.expose_gender: shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] key_shift = freeze_gender * shift_max if freeze_gender >= 0. 
else freeze_gender * abs(shift_min) key_shift = max(min(key_shift, shift_max), shift_min) # clip key shift @@ -143,14 +143,14 @@ def _torch_export_model(self): for v_name in self.model.fs2.variance_embed_list } } - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: if self.expose_gender: kwargs['gender'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device) input_names.append('gender') dynamix_axes['gender'] = { 1: 'n_frames' } - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if self.expose_velocity: kwargs['velocity'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device) input_names.append('velocity') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index b2ee086c4..157e031d8 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -45,14 +45,14 @@ def __init__(self, vocab_size): super().__init__(vocab_size=vocab_size) # for temporary compatibility; will be completely removed in the future - self.f0_embed_type = hparams.get('f0_embed_type', 'discrete') + self.f0_embed_type = hparams['f0_embed_type'] if self.f0_embed_type == 'discrete': self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) self.lr = LengthRegulator() - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: self.shift_min, self.shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: self.speed_min, self.speed_max = hparams['augmentation_args']['random_time_stretching']['range'] # noinspection PyMethodOverriding @@ -82,7 +82,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= ], dim=-1).sum(-1) condition += variance_embeds - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: if hasattr(self, 'frozen_key_shift'): key_shift_embed = 
self.key_shift_embed(self.frozen_key_shift[:, None, None]) else: @@ -92,7 +92,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= key_shift_embed = self.key_shift_embed(key_shift[:, :, None]) condition += key_shift_embed - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if velocity is not None: velocity = torch.clip(velocity, min=self.speed_min, max=self.speed_max) speed_embed = self.speed_embed(velocity[:, :, None]) diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 02f6b3a92..a67f5b166 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -111,7 +111,7 @@ def preprocess_input(self, param, idx=0): )).to(self.device)[None] summary[v_name] = 'manual' - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] gender = param.get('gender') if gender is None: @@ -135,7 +135,7 @@ def preprocess_input(self, param, idx=0): min=shift_min, max=shift_max ) - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if param.get('velocity') is None: summary['velocity'] = 'default' batch['speed'] = torch.FloatTensor([1.]).to(self.device)[:, None] # => [B=1, T=1] diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py index cd8295a7c..89aea5eb4 100644 --- a/modules/diffusion/ddpm.py +++ b/modules/diffusion/ddpm.py @@ -36,7 +36,7 @@ def noise_like(shape, device, repeat=False): return repeat_noise() if repeat else noise() -def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.01)): +def linear_beta_schedule(timesteps, max_beta=0.01): """ linear schedule """ @@ -239,8 +239,8 @@ def inference(self, cond, b=1, x_start=None, device=None): assert x_start is not None, 'Missing shallow diffusion source.' 
x = x_start - if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and t_max > 0: - algorithm = hparams.get('diff_accelerator', 'ddim') + if hparams['pndm_speedup'] > 1 and t_max > 0: + algorithm = hparams['diff_accelerator'] if algorithm == 'dpm-solver': from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver # 1. Define the noise schedule. diff --git a/modules/pe/__init__.py b/modules/pe/__init__.py index 99d3dae95..edf747a32 100644 --- a/modules/pe/__init__.py +++ b/modules/pe/__init__.py @@ -6,7 +6,7 @@ def initialize_pe(): - pe = hparams.get('pe', 'parselmouth') + pe = hparams['pe'] pe_ckpt = hparams['pe_ckpt'] if pe == 'parselmouth': return ParselmouthPE() diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index ec5d88841..36b609ffb 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -189,10 +189,10 @@ def process_item(self, item_name, meta_data, binarization_args): processed_input['tension'] = tension.cpu().numpy() - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: processed_input['key_shift'] = 0. - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: processed_input['speed'] = 1. return processed_input @@ -207,7 +207,7 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_pitch_shifting'] key_shift_min, key_shift_max = aug_args['range'] - assert hparams.get('use_key_shift_embed', False), \ + assert hparams['use_key_shift_embed'], \ 'Random pitch shifting augmentation requires use_key_shift_embed == True.' assert key_shift_min < 0 < key_shift_max, \ 'Random pitch shifting augmentation must have a range where min < 0 < max.' 
@@ -273,7 +273,7 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_time_stretching'] speed_min, speed_max = aug_args['range'] - assert hparams.get('use_speed_embed', False), \ + assert hparams['use_speed_embed'], \ 'Random time stretching augmentation requires use_speed_embed == True.' assert 0 < speed_min < 1 < speed_max, \ 'Random time stretching augmentation must have a range where 0 < min < 1 < max.' diff --git a/scripts/binarize.py b/scripts/binarize.py index 767e947a3..74abd2ba2 100644 --- a/scripts/binarize.py +++ b/scripts/binarize.py @@ -13,7 +13,7 @@ def binarize(): - binarizer_cls = hparams.get("binarizer_cls", 'basics.base_binarizer.BaseBinarizer') + binarizer_cls = hparams["binarizer_cls"] pkg = ".".join(binarizer_cls.split(".")[:-1]) cls_name = binarizer_cls.split(".")[-1] binarizer_cls = getattr(importlib.import_module(pkg), cls_name) diff --git a/scripts/infer.py b/scripts/infer.py index 8c6e6e835..e56a71186 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -123,7 +123,7 @@ def acoustic( spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None for param in params: - if gender is not None and hparams.get('use_key_shift_embed'): + if gender is not None and hparams['use_key_shift_embed']: param['gender'] = gender if spk_mix is not None: diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 7c6f317f5..269122a6d 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -29,7 +29,7 @@ def locate_dictionary(): """ assert 'dictionary' in hparams or 'g2p_dictionary' in hparams, \ 'Please specify a dictionary file in your config.' 
- config_dict_path = pathlib.Path(hparams.get('dictionary', hparams.get('g2p_dictionary'))) + config_dict_path = pathlib.Path(hparams['dictionary']) if config_dict_path.exists(): return config_dict_path work_dir = pathlib.Path(hparams['work_dir']) diff --git a/utils/training_utils.py b/utils/training_utils.py index d406985cd..26d24eec5 100644 --- a/utils/training_utils.py +++ b/utils/training_utils.py @@ -113,7 +113,7 @@ def __form_batches(self): indices = rng.permutation(len(self.dataset)) if self.sort_by_similar_size: - grid = int(hparams.get('sampler_frame_count_grid', 6)) + grid = int(hparams['sampler_frame_count_grid']) assert grid > 0 sizes = (np.round(np.array(self.dataset.sizes)[indices] / grid) * grid).clip(grid, None) sizes *= (-1 if self.size_reversed else 1) From a95d49186f3b8794b032f0c7ffd6411ffad20ec8 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 25 Feb 2024 15:43:49 +0800 Subject: [PATCH 14/15] Fix duplicate keys in YAML --- configs/variance.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/configs/variance.yaml b/configs/variance.yaml index 8ad481ecb..e1c0338f1 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -79,10 +79,6 @@ energy_smooth_width: 0.12 breathiness_db_min: -96.0 breathiness_db_max: -20.0 breathiness_smooth_width: 0.12 -tension_logit_min: -10.0 -tension_logit_max: 10.0 -tension_smooth_width: 0.12 - voicing_db_min: -96.0 voicing_db_max: -12.0 voicing_smooth_width: 0.12 From 1e6f1540521a7bc69d00cdf3e30487769af65c60 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 25 Feb 2024 15:46:58 +0800 Subject: [PATCH 15/15] Rename `pndm_speedup` to `diff_speedup` --- configs/acoustic.yaml | 2 +- configs/variance.yaml | 2 +- docs/ConfigurationSchemas.md | 22 +++++++++++----------- modules/diffusion/ddpm.py | 10 +++++----- scripts/infer.py | 10 ++++++++-- 5 files changed, 26 insertions(+), 20 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index a5d2226ce..07a97be47 100644 --- 
a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -66,7 +66,7 @@ timesteps: 1000 max_beta: 0.02 rel_pos: true diff_accelerator: ddim -pndm_speedup: 10 +diff_speedup: 10 hidden_size: 256 residual_layers: 20 residual_channels: 512 diff --git a/configs/variance.yaml b/configs/variance.yaml index e1c0338f1..4136dffa7 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -104,7 +104,7 @@ max_beta: 0.02 diff_decoder_type: 'wavenet' diff_loss_type: l2 diff_accelerator: ddim -pndm_speedup: 10 +diff_speedup: 10 # train and eval num_sanity_val_steps: 1 diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md index 46f0f23f5..171e13068 100644 --- a/docs/ConfigurationSchemas.md +++ b/docs/ConfigurationSchemas.md @@ -402,6 +402,17 @@ Loss type of the DDPM. constraintsChoose from 'l1', 'l2'. +### diff_speedup + +Diffusion sampling speed-up ratio. 1 means no speeding up. + + + + + + +
visibilityacoustic, variance
typeint
default10
constraintsMust be a factor of K_step.
+ ### dilation_cycle_length Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks. @@ -1351,17 +1362,6 @@ Strategy name for the Lightning trainer. defaultauto -### pndm_speedup - -Diffusion sampling speed-up ratio. 1 means no speeding up. - - - - - - -
visibilityacoustic, variance
typeint
default10
constraintsMust be a factor of K_step.
- ### predict_breathiness Whether to enable breathiness prediction. diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py index 89aea5eb4..7b91122fd 100644 --- a/modules/diffusion/ddpm.py +++ b/modules/diffusion/ddpm.py @@ -239,7 +239,7 @@ def inference(self, cond, b=1, x_start=None, device=None): assert x_start is not None, 'Missing shallow diffusion source.' x = x_start - if hparams['pndm_speedup'] > 1 and t_max > 0: + if hparams['diff_speedup'] > 1 and t_max > 0: algorithm = hparams['diff_accelerator'] if algorithm == 'dpm-solver': from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver @@ -270,7 +270,7 @@ def wrapped(x, t, **kwargs): # costs and the sample quality. dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") - steps = t_max // hparams["pndm_speedup"] + steps = t_max // hparams["diff_speedup"] self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False) x = dpm_solver.sample( x, @@ -308,7 +308,7 @@ def wrapped(x, t, **kwargs): # costs and the sample quality. 
uni_pc = UniPC(model_fn, noise_schedule, variant='bh2') - steps = t_max // hparams["pndm_speedup"] + steps = t_max // hparams["diff_speedup"] self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False) x = uni_pc.sample( x, @@ -320,7 +320,7 @@ def wrapped(x, t, **kwargs): self.bar.close() elif algorithm == 'pndm': self.noise_list = deque(maxlen=4) - iteration_interval = hparams['pndm_speedup'] + iteration_interval = hparams['diff_speedup'] for i in tqdm( reversed(range(0, t_max, iteration_interval)), desc='sample time step', total=t_max // iteration_interval, disable=not hparams['infer'], leave=False @@ -330,7 +330,7 @@ def wrapped(x, t, **kwargs): iteration_interval, cond=cond ) elif algorithm == 'ddim': - iteration_interval = hparams['pndm_speedup'] + iteration_interval = hparams['diff_speedup'] for i in tqdm( reversed(range(0, t_max, iteration_interval)), desc='sample time step', total=t_max // iteration_interval, disable=not hparams['infer'], leave=False diff --git a/scripts/infer.py b/scripts/infer.py index e56a71186..3618bcb6d 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -119,7 +119,10 @@ def acoustic( if speedup > 0: assert depth % speedup == 0, f'Acceleration ratio must be factor of diffusion depth {depth}.' - hparams['pndm_speedup'] = speedup + hparams['diff_speedup'] = speedup + elif 'diff_speedup' not in hparams: + # NOTICE: this is for compatibility + hparams['diff_speedup'] = hparams['pndm_speedup'] spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None for param in params: @@ -213,7 +216,10 @@ def variance( if speedup > 0: assert hparams['K_step'] % speedup == 0, f'Acceleration ratio must be factor of K_step {hparams["K_step"]}.' 
-        hparams['pndm_speedup'] = speedup
+        hparams['diff_speedup'] = speedup
+    elif 'diff_speedup' not in hparams:
+        # NOTICE: fall back to the legacy 'pndm_speedup' key so configs saved before the rename keep working
+        hparams['diff_speedup'] = hparams['pndm_speedup']

     spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None

     for param in params: