diff --git a/augmentation/spec_stretch.py b/augmentation/spec_stretch.py index 6f1a394ef..7eeeda2dc 100644 --- a/augmentation/spec_stretch.py +++ b/augmentation/spec_stretch.py @@ -38,7 +38,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) aug_item['mel'] = mel - if speed != 1. or hparams.get('use_speed_embed', False): + if speed != 1. or hparams['use_speed_embed']: aug_item['length'] = mel.shape[0] aug_item['speed'] = int(np.round(hparams['hop_size'] * speed)) / hparams['hop_size'] # real speed aug_item['seconds'] /= aug_item['speed'] @@ -50,7 +50,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) f0, _ = self.pe.get_pitch( wav, samplerate=hparams['audio_sample_rate'], length=aug_item['length'], hop_size=hparams['hop_size'], f0_min=hparams['f0_min'], f0_max=hparams['f0_max'], - speed=speed, interp_uv=hparams['interp_uv'] + speed=speed, interp_uv=True ) aug_item['f0'] = f0.astype(np.float32) @@ -83,7 +83,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) align_length=aug_item['length'] ) - if key_shift != 0. or hparams.get('use_key_shift_embed', False): + if key_shift != 0. or hparams['use_key_shift_embed']: if replace_spk_id is None: aug_item['key_shift'] = key_shift else: diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index 38b191077..ddad6e02e 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -173,7 +173,6 @@ def process(self): self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names) if self.binarization_args['shuffle']: - random.seed(hparams['seed']) random.shuffle(self.item_names) self.binary_data_dir.mkdir(parents=True, exist_ok=True) diff --git a/basics/base_task.py b/basics/base_task.py index 8a180e0b0..eabfcae6e 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -3,7 +3,6 @@ import pathlib import shutil import sys -from datetime import datetime from typing import Dict import matplotlib @@ -16,7 +15,6 @@ import torch.utils.data from torchmetrics import Metric, MeanMetric import lightning.pytorch as pl -from lightning.pytorch.callbacks import LearningRateMonitor from lightning.pytorch.utilities.rank_zero import rank_zero_debug, rank_zero_info, rank_zero_only from basics.base_module import CategorizedModule @@ -87,8 +85,8 @@ def _finish_init(self): # Training, validation and testing ########### def setup(self, stage): - self.train_dataset = self.dataset_cls(hparams['train_set_name']) - self.valid_dataset = self.dataset_cls(hparams['valid_set_name']) + self.train_dataset = self.dataset_cls('train') + self.valid_dataset = self.dataset_cls('valid') self.num_replicas = (self.trainer.distributed_sampler_kwargs or {}).get('num_replicas', 1) def get_need_freeze_state_dict_key(self, model_state_dict) -> list: @@ -130,9 +128,9 @@ def load_finetune_ckpt( self.load_state_dict(state_dict, strict=False) def load_pre_train_model(self): - pre_train_ckpt_path = hparams.get('finetune_ckpt_path') - blacklist = hparams.get('finetune_ignored_params') - # whitelist=hparams.get('pre_train_whitelist') + pre_train_ckpt_path = hparams['finetune_ckpt_path'] + blacklist = hparams['finetune_ignored_params'] + # whitelist=hparams['pre_train_whitelist'] if blacklist is None: blacklist = [] # if whitelist is None: @@ -344,8 +342,7 @@ def train_dataloader(self): size_reversed=True, required_batch_count_multiple=hparams['accumulate_grad_batches'], shuffle_sample=True, - shuffle_batch=True, - seed=hparams['seed'] + 
shuffle_batch=True ) return torch.utils.data.DataLoader( self.train_dataset, @@ -396,7 +393,6 @@ def on_test_end(self): @classmethod def start(cls): - pl.seed_everything(hparams['seed'], workers=True) task = cls() # if pre_train is not None: @@ -451,14 +447,6 @@ def start(cls): if not hparams['infer']: # train @rank_zero_only def train_payload_copy(): - # copy_code = input(f'{hparams["save_codes"]} code backup? y/n: ') == 'y' - copy_code = True # backup code every time - if copy_code: - code_dir = work_dir / 'codes' / datetime.now().strftime('%Y%m%d%H%M%S') - code_dir.mkdir(exist_ok=True, parents=True) - for c in hparams['save_codes']: - shutil.copytree(c, code_dir / c, dirs_exist_ok=True) - print(f'| Copied codes to {code_dir}.') # Copy spk_map.json and dictionary.txt to work dir binary_dir = pathlib.Path(hparams['binary_data_dir']) spk_map = work_dir / 'spk_map.json' diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index b7e7dcfda..07a97be47 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -39,26 +39,22 @@ augmentation_args: random_time_stretching: enabled: false range: [0.5, 2.] - domain: log # or linear scale: 0.75 raw_data_dir: 'data/opencpop/raw' binary_data_dir: 'data/opencpop/binary' binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer dictionary: dictionaries/opencpop-extension.txt -num_pad_tokens: 1 spec_min: [-5] spec_max: [0] mel_vmin: -6. #-6. mel_vmax: 1.5 -interp_uv: true energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 voicing_smooth_width: 0.12 tension_smooth_width: 0.12 use_spk_id: false -f0_embed_type: continuous use_energy_embed: false use_breathiness_embed: false use_voicing_embed: false @@ -70,7 +66,7 @@ timesteps: 1000 max_beta: 0.02 rel_pos: true diff_accelerator: ddim -pndm_speedup: 10 +diff_speedup: 10 hidden_size: 256 residual_layers: 20 residual_channels: 512 diff --git a/configs/base.yaml b/configs/base.yaml index 57ec1c583..4eed200e9 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,11 +1,5 @@ # task task_cls: null -seed: 1234 -save_codes: - - configs - - modules - - training - - utils ############# # dataset @@ -36,7 +30,6 @@ enc_layers: 4 num_heads: 2 enc_ffn_kernel_size: 9 ffn_act: gelu -ffn_padding: 'SAME' use_spk_id: false ########### @@ -67,8 +60,6 @@ max_batch_frames: 32000 max_batch_size: 100000 max_val_batch_frames: 60000 max_val_batch_size: 1 -train_set_name: 'train' -valid_set_name: 'valid' pe: 'parselmouth' pe_ckpt: '' f0_min: 65 diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 57c7c8653..cb0142502 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -43,7 +43,6 @@ augmentation_args: random_time_stretching: enabled: true range: [0.5, 2.] 
- domain: log # or linear scale: 0.75 residual_channels: 512 diff --git a/configs/variance.yaml b/configs/variance.yaml index 276ce3084..4136dffa7 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -29,7 +29,6 @@ raw_data_dir: 'data/opencpop_variance/raw' binary_data_dir: 'data/opencpop_variance/binary' binarizer_cls: preprocessing.variance_binarizer.VarianceBinarizer dictionary: dictionaries/opencpop-extension.txt -num_pad_tokens: 1 use_spk_id: false @@ -80,10 +79,6 @@ energy_smooth_width: 0.12 breathiness_db_min: -96.0 breathiness_db_max: -20.0 breathiness_smooth_width: 0.12 -tension_logit_min: -10.0 -tension_logit_max: 10.0 -tension_smooth_width: 0.12 - voicing_db_min: -96.0 voicing_db_max: -12.0 voicing_smooth_width: 0.12 @@ -109,7 +104,7 @@ max_beta: 0.02 diff_decoder_type: 'wavenet' diff_loss_type: l2 diff_accelerator: ddim -pndm_speedup: 10 +diff_speedup: 10 # train and eval num_sanity_val_steps: 1 diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index ab4252469..d637f34f1 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -58,7 +58,7 @@ def __init__( if hparams['use_spk_id'] else None self.export_spk: List[Tuple[str, Dict[str, float]]] = export_spk \ if hparams['use_spk_id'] and export_spk is not None else [] - if hparams.get('use_key_shift_embed', False) and not self.expose_gender: + if hparams['use_key_shift_embed'] and not self.expose_gender: shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] key_shift = freeze_gender * shift_max if freeze_gender >= 0. else freeze_gender * abs(shift_min) key_shift = max(min(key_shift, shift_max), shift_min) # clip key shift @@ -143,14 +143,14 @@ def _torch_export_model(self): for v_name in self.model.fs2.variance_embed_list } } - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: if self.expose_gender: kwargs['gender'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device) input_names.append('gender') dynamix_axes['gender'] = { 1: 'n_frames' } - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if self.expose_velocity: kwargs['velocity'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device) input_names.append('velocity') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index 52f6a4d28..157e031d8 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -1,17 +1,22 @@ import copy +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from modules.commons.common_layers import NormalInitEmbedding as Embedding from modules.fastspeech.acoustic_encoder import FastSpeech2Acoustic from modules.fastspeech.variance_encoder import FastSpeech2Variance from utils.hparams import hparams -from utils.pitch_utils import ( - f0_bin, f0_mel_min, f0_mel_max -) from utils.text_encoder import PAD_INDEX +f0_bin = 256 +f0_max = 1100.0 +f0_min = 50.0 +f0_mel_min = 1127 * np.log(1 + f0_min / 700) +f0_mel_max = 1127 * np.log(1 + f0_max / 700) + def f0_to_coarse(f0): f0_mel = 1127 * (1 + f0 / 700).log() @@ -38,10 +43,16 @@ def forward(self, dur): class FastSpeech2AcousticONNX(FastSpeech2Acoustic): def __init__(self, vocab_size): super().__init__(vocab_size=vocab_size) + + # for temporary compatibility; will be completely removed in the future + self.f0_embed_type = hparams['f0_embed_type'] + if self.f0_embed_type == 'discrete': + 
self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) + self.lr = LengthRegulator() - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: self.shift_min, self.shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: self.speed_min, self.speed_max = hparams['augmentation_args']['random_time_stretching']['range'] # noinspection PyMethodOverriding @@ -71,7 +82,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= ], dim=-1).sum(-1) condition += variance_embeds - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: if hasattr(self, 'frozen_key_shift'): key_shift_embed = self.key_shift_embed(self.frozen_key_shift[:, None, None]) else: @@ -81,7 +92,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= key_shift_embed = self.key_shift_embed(key_shift[:, :, None]) condition += key_shift_embed - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if velocity is not None: velocity = torch.clip(velocity, min=self.speed_min, max=self.speed_max) speed_embed = self.speed_embed(velocity[:, :, None]) diff --git a/docs/BestPractices.md b/docs/BestPractices.md index b142a3873..23829de79 100644 --- a/docs/BestPractices.md +++ b/docs/BestPractices.md @@ -132,11 +132,7 @@ Once the coverage checks passed, a phoneme distribution summary will be saved in ![phoneme-distribution](resources/phoneme-distribution.jpg) -During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. By default, there are one padding index before all real phonemes IDs. You may edit the number of padding indices, but it is not recommended to do so: - -```yaml -num_pad_tokens: 1 -``` +During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. There are one padding index (marked as `typedict -### augmentation_args.random_time_stretching.domain - -The domain where random time stretching factors are uniformly distributed in. - -- If 'linear', stretching ratio $x$ will be uniformly distributed in $[V_{min}, V_{max}]$. -- If 'log', $\ln{x}$ will be uniformly distributed in $[\ln{V_{min}}, \ln{V_{max}}]$. - - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic</td></tr>
<tr><td><b>scope</b></td><td>preprocessing</td></tr>
<tr><td><b>customizability</b></td><td>not recommended</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>log</td></tr>
<tr><td><b>constraints</b></td><td>Choose from 'log', 'linear'.</td></tr>
</table>
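For reference, a minimal sketch of the two sampling schemes this key selected between, mirroring the branch removed from `preprocessing/acoustic_binarizer.py` later in this diff:

```python
import random

def sample_stretch_ratio(speed_min: float, speed_max: float, domain: str = 'log') -> float:
    if domain == 'log':
        # ln(x) is uniform in [ln(speed_min), ln(speed_max)]
        return speed_min * (speed_max / speed_min) ** random.random()
    # 'linear': uniform on each side of 1.0, so the unstretched speed stays centered
    rand = random.uniform(-1, 1)
    return 1 + (speed_max - 1) * rand if rand >= 0 else 1 + (1 - speed_min) * rand
```

After this change, only the log-domain behavior remains.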
- ### augmentation_args.random_time_stretching.enabled Whether to apply random time stretching augmentation. @@ -418,6 +402,17 @@ Loss type of the DDPM. constraintsChoose from 'l1', 'l2'. +### diff_speedup + +Diffusion sampling speed-up ratio. 1 means no speeding up. + + + + + + +
<table>
<tr><td><b>visibility</b></td><td>acoustic, variance</td></tr>
<tr><td><b>type</b></td><td>int</td></tr>
<tr><td><b>default</b></td><td>10</td></tr>
<tr><td><b>constraints</b></td><td>Must be a factor of K_step.</td></tr>
</table>
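As a sketch of what the ratio controls (see the DDIM/PNDM loops in `modules/diffusion/ddpm.py` further down), the accelerated sampler only visits every `diff_speedup`-th timestep; the helper name here is hypothetical:

```python
def accelerated_timesteps(t_max: int, diff_speedup: int) -> list:
    # only every diff_speedup-th step of the schedule is denoised,
    # hence the requirement that the ratio divides K_step evenly
    assert t_max % diff_speedup == 0, 'diff_speedup must be a factor of K_step'
    return list(reversed(range(0, t_max, diff_speedup)))

print(len(accelerated_timesteps(1000, 10)))  # 100 denoising steps instead of 1000
```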
+ ### dilation_cycle_length Length k of the cycle $2^0, 2^1 ...., 2^k$ of convolution dilation factors through WaveNet residual blocks. @@ -648,22 +643,6 @@ Length of sinusoidal smoothing convolution kernel (in seconds) on extracted ener default0.12 -### f0_embed_type - -Map f0 to embedding using: - -- `torch.nn.Linear` if 'continuous' -- `torch.nn.Embedding` if 'discrete' - - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic</td></tr>
<tr><td><b>scope</b></td><td>nn</td></tr>
<tr><td><b>customizability</b></td><td>normal</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>continuous</td></tr>
<tr><td><b>constraints</b></td><td>Choose from 'continuous', 'discrete'.</td></tr>
</table>
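The two code paths this key selected look roughly as below; after this diff, only 'continuous' survives in `modules/fastspeech/acoustic_encoder.py`, while 'discrete' is kept in `deployment/modules/fastspeech2.py` solely for old checkpoints:

```python
import torch
import torch.nn as nn

hidden_size = 256
f0 = torch.full((1, 100), 440.0)  # [B, T] f0 curve in Hz

# 'continuous': mel-scaled f0 through a Linear layer (the path that remains)
pitch_linear = nn.Linear(1, hidden_size)
embed_continuous = pitch_linear((1 + f0 / 700).log()[:, :, None])  # [B, T, H]

# 'discrete' (legacy): f0 quantized into coarse bins, then an Embedding lookup,
# i.e. pitch_table(f0_to_coarse(f0)) with f0_to_coarse as defined above
pitch_table = nn.Embedding(300, hidden_size, padding_idx=0)
```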
- ### f0_max Maximum base frequency (F0) in Hz for pitch extraction. @@ -705,18 +684,6 @@ Activation function of TransformerFFNLayer in FastSpeech2 encoder: constraintsChoose from 'relu', 'gelu', 'swish'. -### ffn_padding - -Padding mode of TransformerFFNLayer convolution in FastSpeech2 encoder. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic, variance</td></tr>
<tr><td><b>scope</b></td><td>nn</td></tr>
<tr><td><b>customizability</b></td><td>not recommended</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>SAME</td></tr>
</table>
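The branch this key controlled in `TransformerFFNLayer` (removed from `modules/commons/common_layers.py` below) amounted to:

```python
import torch.nn as nn

hidden_size, filter_size, kernel_size = 256, 1024, 9

# 'SAME' (now the only behavior): symmetric padding, output aligned with input
ffn_same = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2)

# 'LEFT' (removed): causal left-padding, so each frame sees only past context
ffn_left = nn.Sequential(
    nn.ConstantPad1d((kernel_size - 1, 0), 0.0),
    nn.Conv1d(hidden_size, filter_size, kernel_size),
)
```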
- ### fft_size Fast Fourier Transforms parameter for mel extraction. @@ -872,18 +839,6 @@ Hop size or step length (in number of waveform samples) of mel and feature extra default512 -### interp_uv - -Whether to apply linear interpolation to unvoiced parts in f0. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic</td></tr>
<tr><td><b>scope</b></td><td>preprocessing</td></tr>
<tr><td><b>customizability</b></td><td>reserved</td></tr>
<tr><td><b>type</b></td><td>boolean</td></tr>
<tr><td><b>default</b></td><td>true</td></tr>
</table>
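A minimal NumPy sketch of what the flag asks for; the actual handling lives inside the `pe.get_pitch` implementations, so this is an assumption about behavior, not the extractor's code:

```python
import numpy as np

def interp_unvoiced(f0: np.ndarray, uv: np.ndarray) -> np.ndarray:
    # bridge unvoiced gaps linearly using the surrounding voiced frames
    f0 = f0.copy()
    if uv.any() and not uv.all():
        f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
    return f0
```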
- ### lambda_aux_mel_loss Coefficient of aux mel loss when calculating total loss of acoustic model with shallow diffusion. @@ -1086,7 +1041,7 @@ Minimum mel spectrogram heatmap value for TensorBoard plotting. ### melody_encoder_args -Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, `enc_ffn_kernel_size`, `ffn_padding`, `ffn_act`, `dropout`, `num_heads`, `use_pos_embed`, `rel_pos`. If either of the parameter does not exist in this configuration key, it inherits from the linguistic encoder. +Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, `enc_ffn_kernel_size`, `ffn_act`, `dropout`, `num_heads`, `use_pos_embed`, `rel_pos`. If either of the parameter does not exist in this configuration key, it inherits from the linguistic encoder. @@ -1140,20 +1095,6 @@ The number of attention heads of `torch.nn.MultiheadAttention` in FastSpeech2 en
typedict
default2
-### num_pad_tokens - -Number of padding phoneme indexes before all real tokens. - -Due to some historical reasons, old checkpoints may have 3 padding tokens called \, \ and \. After refactoring, all padding tokens are called \, and only the first one (token == 0) will be used. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic, variance</td></tr>
<tr><td><b>scope</b></td><td>nn, preprocess</td></tr>
<tr><td><b>customizability</b></td><td>not recommended</td></tr>
<tr><td><b>type</b></td><td>int</td></tr>
<tr><td><b>default</b></td><td>1</td></tr>
</table>
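With the key removed, the id layout is fixed; a sketch of what `TokenTextEncoder` (see `utils/text_encoder.py` below) now does, with a made-up three-phoneme vocabulary:

```python
vocab_list = sorted(['SP', 'a', 'ai'])   # real phonemes, sorted by name
PAD_INDEX = 0                            # the single reserved padding id
ids = {ph: i + 1 for i, ph in enumerate(vocab_list)}
vocab_size = len(vocab_list) + 1         # +1 for the padding token
```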
- ### num_sanity_val_steps Number of sanity validation steps at the beginning. @@ -1421,17 +1362,6 @@ Strategy name for the Lightning trainer. defaultauto -### pndm_speedup - -Diffusion sampling speed-up ratio. 1 means no speeding up. - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>acoustic, variance</td></tr>
<tr><td><b>type</b></td><td>int</td></tr>
<tr><td><b>default</b></td><td>10</td></tr>
<tr><td><b>constraints</b></td><td>Must be a factor of K_step.</td></tr>
</table>
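This key is renamed to `diff_speedup`; old configurations are mapped at load time by the compatibility shim added in `scripts/infer.py`:

```python
hparams = {'pndm_speedup': 10}  # stand-in for a legacy config
# compatibility shim, as added in scripts/infer.py in this diff
if 'diff_speedup' not in hparams:
    hparams['diff_speedup'] = hparams['pndm_speedup']
```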
- ### predict_breathiness Whether to enable breathiness prediction. @@ -1545,18 +1475,6 @@ Training performance on some datasets may be very sensitive to this value. Chang default6 -### save_codes - -Files in these folders will be backed up every time a training starts. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>all</td></tr>
<tr><td><b>scope</b></td><td>training</td></tr>
<tr><td><b>customizability</b></td><td>normal</td></tr>
<tr><td><b>type</b></td><td>list</td></tr>
<tr><td><b>default</b></td><td>[configs, modules, training, utils]</td></tr>
</table>
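The backup loop this key drove, removed from `basics/base_task.py` above, was roughly the following; `work_dir` here is a hypothetical run directory:

```python
import pathlib
import shutil
from datetime import datetime

work_dir = pathlib.Path('checkpoints/my_experiment')  # hypothetical
save_codes = ['configs', 'modules', 'training', 'utils']

code_dir = work_dir / 'codes' / datetime.now().strftime('%Y%m%d%H%M%S')
code_dir.mkdir(exist_ok=True, parents=True)
for c in save_codes:
    shutil.copytree(c, code_dir / c, dirs_exist_ok=True)
```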
- ### schedule_type The diffusion schedule type. @@ -1570,18 +1488,6 @@ The diffusion schedule type. constraintsChoose from 'linear', 'cosine'. -### seed - -The global random seed used to shuffle data, initializing model weights, etc. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>all</td></tr>
<tr><td><b>scope</b></td><td>preprocessing, training</td></tr>
<tr><td><b>customizability</b></td><td>normal</td></tr>
<tr><td><b>type</b></td><td>int</td></tr>
<tr><td><b>default</b></td><td>1234</td></tr>
</table>
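The two call sites this key fed, both removed in this diff, were:

```python
import random
import lightning.pytorch as pl

seed = 1234
pl.seed_everything(seed, workers=True)  # removed from BaseTask.start()
random.seed(seed)                       # removed from BaseBinarizer.process()
```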
- ### shallow_diffusion_args Arguments for shallow_diffusion. @@ -1759,18 +1665,6 @@ Total number of diffusion steps. default1000 -### train_set_name - -Name of the training set used in binary filenames, TensorBoard keys, etc. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>all</td></tr>
<tr><td><b>scope</b></td><td>preprocessing, training</td></tr>
<tr><td><b>customizability</b></td><td>reserved</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>train</td></tr>
</table>
- ### use_breathiness_embed Whether to accept and embed breathiness values into the model. @@ -1905,18 +1799,6 @@ Whether to load and use the vocoder to generate audio during validation. Validat defaulttrue -### valid_set_name - -Name of the validation set used in binary filenames, TensorBoard keys, etc. - - - - - - - -
<table>
<tr><td><b>visibility</b></td><td>all</td></tr>
<tr><td><b>scope</b></td><td>preprocessing, training</td></tr>
<tr><td><b>customizability</b></td><td>reserved</td></tr>
<tr><td><b>type</b></td><td>str</td></tr>
<tr><td><b>default</b></td><td>valid</td></tr>
</table>
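Together with `train_set_name`, this key is replaced by fixed literals where the datasets are constructed; the new `setup()` in `basics/base_task.py` reads:

```python
def setup(self, stage):
    self.train_dataset = self.dataset_cls('train')  # was hparams['train_set_name']
    self.valid_dataset = self.dataset_cls('valid')  # was hparams['valid_set_name']
```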
- ### variances_prediction_args Arguments for prediction of variance parameters other than pitch, like energy, breathiness, etc. diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 02f6b3a92..a67f5b166 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -111,7 +111,7 @@ def preprocess_input(self, param, idx=0): )).to(self.device)[None] summary[v_name] = 'manual' - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range'] gender = param.get('gender') if gender is None: @@ -135,7 +135,7 @@ def preprocess_input(self, param, idx=0): min=shift_min, max=shift_max ) - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: if param.get('velocity') is None: summary['velocity'] = 'default' batch['speed'] = torch.FloatTensor([1.]).to(self.device)[:, None] # => [B=1, T=1] diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 7e5fd08d4..9ea2c2638 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -104,35 +104,13 @@ def max_positions(): return int(1e5) # an arbitrary large number -class ConvTBC(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, padding=0): - super(ConvTBC, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.padding = padding - - self.weight = torch.nn.Parameter(torch.Tensor( - self.kernel_size, in_channels, out_channels)) - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) - - def forward(self, x): - return torch.conv_tbc(x.contiguous(), self.weight, self.bias, self.padding) - - class TransformerFFNLayer(nn.Module): - def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'): + def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0., act='gelu'): super().__init__() self.kernel_size = kernel_size self.dropout = dropout self.act = act - if padding == 'SAME': - self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) - elif padding == 'LEFT': - self.ffn_1 = nn.Sequential( - nn.ConstantPad1d((kernel_size - 1, 0), 0.0), - nn.Conv1d(hidden_size, filter_size, kernel_size) - ) + self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) if self.act == 'relu': self.act_fn = ReLU() elif self.act == 'gelu': @@ -152,44 +130,18 @@ def forward(self, x): return x -class BatchNorm1dTBC(nn.Module): - def __init__(self, c): - super(BatchNorm1dTBC, self).__init__() - self.bn = nn.BatchNorm1d(c) - - def forward(self, x): - """ - - :param x: [T, B, C] - :return: [T, B, C] - """ - x = x.permute(1, 2, 0) # [B, C, T] - x = self.bn(x) # [B, C, T] - x = x.permute(2, 0, 1) # [T, B, C] - return x - - class EncSALayer(nn.Module): def __init__(self, c, num_heads, dropout, attention_dropout=0.1, - relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'): + relu_dropout=0.1, kernel_size=9, act='gelu'): super().__init__() - self.c = c self.dropout = dropout - self.num_heads = num_heads - if num_heads > 0: - if norm == 'ln': - self.layer_norm1 = LayerNorm(c) - elif norm == 'bn': - self.layer_norm1 = BatchNorm1dTBC(c) - self.self_attn = MultiheadAttention( - self.c, num_heads, dropout=attention_dropout, bias=False, - ) - if norm == 'ln': - self.layer_norm2 = LayerNorm(c) - elif norm == 'bn': - self.layer_norm2 = BatchNorm1dTBC(c) + self.layer_norm1 = 
LayerNorm(c) + self.self_attn = MultiheadAttention( + c, num_heads, dropout=attention_dropout, bias=False, + ) + self.layer_norm2 = LayerNorm(c) self.ffn = TransformerFFNLayer( - c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act + c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act ) def forward(self, x, encoder_padding_mask=None, **kwargs): @@ -197,18 +149,17 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): if layer_norm_training is not None: self.layer_norm1.training = layer_norm_training self.layer_norm2.training = layer_norm_training - if self.num_heads > 0: - residual = x - x = self.layer_norm1(x) - x, _, = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=encoder_padding_mask - ) - x = F.dropout(x, self.dropout, training=self.training) - x = residual + x - x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + residual = x + x = self.layer_norm1(x) + x, _, = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask + ) + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] residual = x x = self.layer_norm2(x) diff --git a/modules/diffusion/ddpm.py b/modules/diffusion/ddpm.py index cd8295a7c..7b91122fd 100644 --- a/modules/diffusion/ddpm.py +++ b/modules/diffusion/ddpm.py @@ -36,7 +36,7 @@ def noise_like(shape, device, repeat=False): return repeat_noise() if repeat else noise() -def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.01)): +def linear_beta_schedule(timesteps, max_beta=0.01): """ linear schedule """ @@ -239,8 +239,8 @@ def inference(self, cond, b=1, x_start=None, device=None): assert x_start is not None, 'Missing shallow diffusion source.' x = x_start - if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1 and t_max > 0: - algorithm = hparams.get('diff_accelerator', 'ddim') + if hparams['diff_speedup'] > 1 and t_max > 0: + algorithm = hparams['diff_accelerator'] if algorithm == 'dpm-solver': from inference.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver # 1. Define the noise schedule. @@ -270,7 +270,7 @@ def wrapped(x, t, **kwargs): # costs and the sample quality. dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") - steps = t_max // hparams["pndm_speedup"] + steps = t_max // hparams["diff_speedup"] self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False) x = dpm_solver.sample( x, @@ -308,7 +308,7 @@ def wrapped(x, t, **kwargs): # costs and the sample quality. 
uni_pc = UniPC(model_fn, noise_schedule, variant='bh2') - steps = t_max // hparams["pndm_speedup"] + steps = t_max // hparams["diff_speedup"] self.bar = tqdm(desc="sample time step", total=steps, disable=not hparams['infer'], leave=False) x = uni_pc.sample( x, @@ -320,7 +320,7 @@ def wrapped(x, t, **kwargs): self.bar.close() elif algorithm == 'pndm': self.noise_list = deque(maxlen=4) - iteration_interval = hparams['pndm_speedup'] + iteration_interval = hparams['diff_speedup'] for i in tqdm( reversed(range(0, t_max, iteration_interval)), desc='sample time step', total=t_max // iteration_interval, disable=not hparams['infer'], leave=False @@ -330,7 +330,7 @@ def wrapped(x, t, **kwargs): iteration_interval, cond=cond ) elif algorithm == 'ddim': - iteration_interval = hparams['pndm_speedup'] + iteration_interval = hparams['diff_speedup'] for i in tqdm( reversed(range(0, t_max, iteration_interval)), desc='sample time step', total=t_max // iteration_interval, disable=not hparams['infer'], leave=False diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 666e7f659..9ab4ed633 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -8,7 +8,6 @@ ) from modules.fastspeech.tts_modules import FastSpeech2Encoder, mel2ph_to_dur from utils.hparams import hparams -from utils.pitch_utils import f0_to_coarse from utils.text_encoder import PAD_INDEX @@ -18,21 +17,13 @@ def __init__(self, vocab_size): self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) self.dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( - self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], - ffn_kernel_size=hparams['enc_ffn_kernel_size'], - ffn_padding=hparams['ffn_padding'], ffn_act=hparams['ffn_act'], + hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], + ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) - self.f0_embed_type = hparams.get('f0_embed_type', 'discrete') - if self.f0_embed_type == 'discrete': - self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX) - elif self.f0_embed_type == 'continuous': - self.pitch_embed = Linear(1, hparams['hidden_size']) - else: - raise ValueError('f0_embed_type must be \'discrete\' or \'continuous\'.') - + self.pitch_embed = Linear(1, hparams['hidden_size']) self.variance_embed_list = [] self.use_energy_embed = hparams.get('use_energy_embed', False) self.use_breathiness_embed = hparams.get('use_breathiness_embed', False) @@ -106,12 +97,8 @@ def forward( spk_embed = self.spk_embed(spk_embed_id)[:, None, :] condition += spk_embed - if self.f0_embed_type == 'discrete': - pitch = f0_to_coarse(f0) - pitch_embed = self.pitch_embed(pitch) - else: - f0_mel = (1 + f0 / 700).log() - pitch_embed = self.pitch_embed(f0_mel[:, :, None]) + f0_mel = (1 + f0 / 700).log() + pitch_embed = self.pitch_embed(f0_mel[:, :, None]) condition += pitch_embed condition = self.forward_variance_embedding( diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index bc5eff265..1dd164d17 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -4,7 +4,7 @@ import torch.nn as nn from torch.nn import functional as F -from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer, BatchNorm1dTBC 
+from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer from modules.commons.espnet_positional_embedding import RelPositionalEncoding DEFAULT_MAX_SOURCE_POSITIONS = 2000 @@ -12,14 +12,13 @@ class TransformerEncoderLayer(nn.Module): - def __init__(self, hidden_size, dropout, kernel_size=None, padding='SAME', act='gelu', num_heads=2, norm='ln'): + def __init__(self, hidden_size, dropout, kernel_size=None, act='gelu', num_heads=2): super().__init__() self.op = EncSALayer( hidden_size, num_heads, dropout=dropout, attention_dropout=0.0, relu_dropout=dropout, kernel_size=kernel_size, - padding=padding, - norm=norm, act=act + act=act ) def forward(self, x, **kwargs): @@ -63,7 +62,7 @@ class DurationPredictor(torch.nn.Module): """ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, - dropout_rate=0.1, offset=1.0, padding='SAME', dur_loss_type='mse'): + dropout_rate=0.1, offset=1.0, dur_loss_type='mse'): """Initialize duration predictor module. Args: in_dims (int): Input dimension. @@ -77,18 +76,15 @@ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, self.offset = offset self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Identity(), # this is a placeholder for ConstantPad1d which is now merged into Conv1d + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.loss_type = dur_loss_type if self.loss_type in ['mse', 'huber']: @@ -141,7 +137,7 @@ def forward(self, xs, x_masks=None, infer=True): class VariancePredictor(torch.nn.Module): def __init__(self, vmin, vmax, in_dims, n_layers=5, n_chans=512, kernel_size=5, - dropout_rate=0.1, padding='SAME'): + dropout_rate=0.1): """Initialize variance predictor module. Args: in_dims (int): Input dimension. @@ -156,18 +152,14 @@ def __init__(self, vmin, vmax, in_dims, self.vmax = vmax self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.linear = torch.nn.Linear(n_chans, 1) self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096) self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) @@ -195,7 +187,7 @@ def forward(self, xs, infer=True): class PitchPredictor(torch.nn.Module): def __init__(self, vmin, vmax, num_bins, deviation, in_dims, n_layers=5, n_chans=384, kernel_size=5, - dropout_rate=0.1, padding='SAME'): + dropout_rate=0.1): """Initialize pitch predictor module. Args: in_dims (int): Input dimension. 
@@ -214,18 +206,14 @@ def __init__(self, vmin, vmax, num_bins, deviation, self.base_pitch_embed = torch.nn.Linear(1, in_dims) self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.padding = padding for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv += [torch.nn.Sequential( - torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2) - if padding == 'SAME' - else (kernel_size - 1, 0), 0), - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0), + self.conv.append(torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) - )] + )) self.linear = torch.nn.Linear(n_chans, num_bins) self.embed_positions = SinusoidalPositionalEmbedding(in_dims, 0, init_size=4096) self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) @@ -363,34 +351,25 @@ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None): class FastSpeech2Encoder(nn.Module): - def __init__(self, embed_tokens, hidden_size, num_layers, - ffn_kernel_size=9, ffn_padding='SAME', ffn_act='gelu', - dropout=None, num_heads=2, use_last_norm=True, norm='ln', - use_pos_embed=True, rel_pos=True): + def __init__(self, hidden_size, num_layers, + ffn_kernel_size=9, ffn_act='gelu', + dropout=None, num_heads=2, use_pos_embed=True, rel_pos=True): super().__init__() self.num_layers = num_layers embed_dim = self.hidden_size = hidden_size self.dropout = dropout self.use_pos_embed = use_pos_embed - self.use_last_norm = use_last_norm self.layers = nn.ModuleList([ TransformerEncoderLayer( self.hidden_size, self.dropout, - kernel_size=ffn_kernel_size, padding=ffn_padding, act=ffn_act, + kernel_size=ffn_kernel_size, act=ffn_act, num_heads=num_heads ) for _ in range(self.num_layers) ]) - if self.use_last_norm: - if norm == 'ln': - self.layer_norm = nn.LayerNorm(embed_dim) - elif norm == 'bn': - self.layer_norm = BatchNorm1dTBC(embed_dim) - else: - self.layer_norm = None + self.layer_norm = nn.LayerNorm(embed_dim) - self.embed_tokens = embed_tokens # redundant, but have to persist for compatibility with old checkpoints self.embed_scale = math.sqrt(hidden_size) self.padding_idx = 0 self.rel_pos = rel_pos @@ -438,8 +417,7 @@ def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_ for layer in self.layers: x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB hiddens.append(x) - if self.use_last_norm: - x = self.layer_norm(x) * nonpadding_mask_TB + x = self.layer_norm(x) * nonpadding_mask_TB if return_hiddens: x = torch.stack(hiddens, 0) # [L, T, B, C] x = x.transpose(1, 2) # [L, B, T, C] diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 8e2117f6b..82e0a88e8 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -26,9 +26,8 @@ def __init__(self, vocab_size): self.ph_dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( - self.txt_embed, hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], - ffn_kernel_size=hparams['enc_ffn_kernel_size'], - ffn_padding=hparams['ffn_padding'], ffn_act=hparams['ffn_act'], + hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], + ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] ) @@ -41,7 
+40,6 @@ def __init__(self, vocab_size): n_chans=dur_hparams['hidden_size'], n_layers=dur_hparams['num_layers'], dropout_rate=dur_hparams['dropout'], - padding=hparams['ffn_padding'], kernel_size=dur_hparams['kernel_size'], offset=dur_hparams['log_offset'], dur_loss_type=dur_hparams['loss_type'] @@ -108,9 +106,8 @@ def get_hparam(key): self.note_glide_embed = Embedding(len(hparams['glide_types']) + 1, hidden_size, padding_idx=0) self.encoder = FastSpeech2Encoder( - None, hidden_size, num_layers=get_hparam('enc_layers'), - ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), - ffn_padding=get_hparam('ffn_padding'), ffn_act=get_hparam('ffn_act'), + hidden_size=hidden_size, num_layers=get_hparam('enc_layers'), + ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), ffn_act=get_hparam('ffn_act'), dropout=get_hparam('dropout'), num_heads=get_hparam('num_heads'), use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos') ) diff --git a/modules/pe/__init__.py b/modules/pe/__init__.py index 99d3dae95..edf747a32 100644 --- a/modules/pe/__init__.py +++ b/modules/pe/__init__.py @@ -6,7 +6,7 @@ def initialize_pe(): - pe = hparams.get('pe', 'parselmouth') + pe = hparams['pe'] pe_ckpt = hparams['pe_ckpt'] if pe == 'parselmouth': return ParselmouthPE() diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 6df8e7a55..36b609ffb 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -64,27 +64,20 @@ def __init__(self): def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): meta_data_dict = {} - if (raw_data_dir / 'transcriptions.csv').exists(): - with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: - for utterance_label in csv.DictReader(f): - item_name = utterance_label['name'] - temp_dict = { - 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': utterance_label['ph_seq'].split(), - 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], - 'spk_id': spk_id, - 'spk_name': self.speakers[ds_id], - } - assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ - f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' - meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict - else: - raise FileNotFoundError( - f'transcriptions.csv not found in {raw_data_dir}. ' - 'If this is a dataset with the old transcription format, please consider ' - 'migrating it to the new format via the following command:\n' - 'python scripts/migrate.py txt ' - ) + with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: + for utterance_label in csv.DictReader(f): + item_name = utterance_label['name'] + temp_dict = { + 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), + 'ph_seq': utterance_label['ph_seq'].split(), + 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], + 'spk_id': spk_id, + 'spk_name': self.speakers[ds_id], + } + assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ + f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
+ meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict + self.items.update(meta_data_dict) @torch.no_grad() @@ -119,7 +112,7 @@ def process_item(self, item_name, meta_data, binarization_args): gt_f0, uv = pitch_extractor.get_pitch( wav, samplerate=hparams['audio_sample_rate'], length=length, hop_size=hparams['hop_size'], f0_min=hparams['f0_min'], f0_max=hparams['f0_max'], - interp_uv=hparams['interp_uv'] + interp_uv=True ) if uv.all(): # All unvoiced print(f'Skipped \'{item_name}\': empty gt f0') @@ -196,10 +189,10 @@ def process_item(self, item_name, meta_data, binarization_args): processed_input['tension'] = tension.cpu().numpy() - if hparams.get('use_key_shift_embed', False): + if hparams['use_key_shift_embed']: processed_input['key_shift'] = 0. - if hparams.get('use_speed_embed', False): + if hparams['use_speed_embed']: processed_input['speed'] = 1. return processed_input @@ -214,7 +207,7 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_pitch_shifting'] key_shift_min, key_shift_max = aug_args['range'] - assert hparams.get('use_key_shift_embed', False), \ + assert hparams['use_key_shift_embed'], \ 'Random pitch shifting augmentation requires use_key_shift_embed == True.' assert key_shift_min < 0 < key_shift_max, \ 'Random pitch shifting augmentation must have a range where min < 0 < max.' @@ -280,12 +273,10 @@ def arrange_data_augmentation(self, data_iterator): from augmentation.spec_stretch import SpectrogramStretchAugmentation aug_args = self.augmentation_args['random_time_stretching'] speed_min, speed_max = aug_args['range'] - domain = aug_args['domain'] - assert hparams.get('use_speed_embed', False), \ + assert hparams['use_speed_embed'], \ 'Random time stretching augmentation requires use_speed_embed == True.' assert 0 < speed_min < 1 < speed_max, \ 'Random time stretching augmentation must have a range where 0 < min < 1 < max.' - assert domain in ['log', 'linear'], 'domain must be \'log\' or \'linear\'.' 
aug_ins = SpectrogramStretchAugmentation(self.raw_data_dirs, aug_args, pe=aug_pe) scale = aug_args['scale'] @@ -296,13 +287,8 @@ def arrange_data_augmentation(self, data_iterator): aug_items = random.choices(all_item_names, k=k_from_raw) + random.choices(aug_list, k=k_from_aug + k_mutate) for aug_type, aug_item in zip(aug_types, aug_items): - if domain == 'log': - # Uniform distribution in log domain - speed = speed_min * (speed_max / speed_min) ** random.random() - else: - # Uniform distribution in linear domain - rand = random.uniform(-1, 1) - speed = 1 + (speed_max - 1) * rand if rand >= 0 else 1 + (1 - speed_min) * rand + # Uniform distribution in log domain + speed = speed_min * (speed_max / speed_min) ** random.random() if aug_type == 0: aug_task = { 'name': aug_item, diff --git a/scripts/binarize.py b/scripts/binarize.py index 767e947a3..74abd2ba2 100644 --- a/scripts/binarize.py +++ b/scripts/binarize.py @@ -13,7 +13,7 @@ def binarize(): - binarizer_cls = hparams.get("binarizer_cls", 'basics.base_binarizer.BaseBinarizer') + binarizer_cls = hparams["binarizer_cls"] pkg = ".".join(binarizer_cls.split(".")[:-1]) cls_name = binarizer_cls.split(".")[-1] binarizer_cls = getattr(importlib.import_module(pkg), cls_name) diff --git a/scripts/infer.py b/scripts/infer.py index 8c6e6e835..3618bcb6d 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -119,11 +119,14 @@ def acoustic( if speedup > 0: assert depth % speedup == 0, f'Acceleration ratio must be factor of diffusion depth {depth}.' - hparams['pndm_speedup'] = speedup + hparams['diff_speedup'] = speedup + elif 'diff_speedup' not in hparams: + # NOTICE: this is for compatibility + hparams['diff_speedup'] = hparams['pndm_speedup'] spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None for param in params: - if gender is not None and hparams.get('use_key_shift_embed'): + if gender is not None and hparams['use_key_shift_embed']: param['gender'] = gender if spk_mix is not None: @@ -213,7 +216,10 @@ def variance( if speedup > 0: assert hparams['K_step'] % speedup == 0, f'Acceleration ratio must be factor of K_step {hparams["K_step"]}.' - hparams['pndm_speedup'] = speedup + hparams['diff_speedup'] = speedup + elif 'diff_speedup' not in hparams: + # NOTICE: this is for compatibility + hparams['diff_speedup'] = hparams['pndm_speedup'] spk_mix = parse_commandline_spk_mix(spk) if hparams['use_spk_id'] and spk is not None else None for param in params: diff --git a/scripts/migrate.py b/scripts/migrate.py deleted file mode 100644 index f1125f3d5..000000000 --- a/scripts/migrate.py +++ /dev/null @@ -1,78 +0,0 @@ -import pathlib -from collections import OrderedDict - -import click - - -@click.group() -def main(): - pass - - -@main.command(help='Migrate checkpoint files of MIDI-less acoustic models from old format') -@click.argument('input_ckpt', metavar='INPUT') -@click.argument('output_ckpt', metavar='OUTPUT') -@click.option('--overwrite', is_flag=True, show_default=True, help='Overwrite the existing file') -def ckpt( - input_ckpt: str, - output_ckpt: str, - overwrite: bool = False -): - input_ckpt = pathlib.Path(input_ckpt).resolve() - output_ckpt = pathlib.Path(output_ckpt).resolve() - assert input_ckpt.exists(), 'The input file does not exist.' 
- assert overwrite or not output_ckpt.exists(), \ - 'The output file already exists or is the same as the input file.\n' \ - 'This is not recommended because migration scripts may not be stable, ' \ - 'and you may be at risk of losing your model.\n' \ - 'If you are sure to OVERWRITE the existing file, please re-run this script with the \'--overwrite\' argument.' - - import torch - ckpt_loaded = torch.load(input_ckpt, map_location='cpu') - if 'category' in ckpt_loaded: - print('This checkpoint file is already in the new format.') - exit(0) - state_dict: OrderedDict = ckpt_loaded['state_dict'] - ckpt_loaded['optimizer_states'][0]['state'].clear() - new_state_dict = OrderedDict() - for key in state_dict: - if key.startswith('model.fs2'): - # keep model.fs2.xxx - new_state_dict[key] = state_dict[key] - else: - # model.xxx => model.diffusion.xxx - path = key.split('.', maxsplit=1)[1] - new_state_dict[f'model.diffusion.{path}'] = state_dict[key] - ckpt_loaded['category'] = 'acoustic' - ckpt_loaded['state_dict'] = new_state_dict - torch.save(ckpt_loaded, output_ckpt) - - -@main.command(help='Migrate transcriptions.txt in old datasets to transcriptions.csv') -@click.argument('input_txt', metavar='INPUT') -def txt( - input_txt: str -): - input_txt = pathlib.Path(input_txt).resolve() - assert input_txt.exists(), 'The input file does not exist.' - with open(input_txt, 'r', encoding='utf8') as f: - utterances = f.readlines() - utterances = [u.split('|') for u in utterances] - utterances = [ - { - 'name': u[0], - 'ph_seq': u[2], - 'ph_dur': u[5] - } - for u in utterances - ] - - import csv - with open(input_txt.with_suffix('.csv'), 'w', encoding='utf8', newline='') as f: - writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur']) - writer.writeheader() - writer.writerows(utterances) - - -if __name__ == '__main__': - main() diff --git a/utils/__init__.py b/utils/__init__.py index 356fef7a5..abb5df151 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -165,9 +165,12 @@ def filter_kwargs(dict_to_filter, kwarg_obj): def load_ckpt( cur_model, ckpt_base_dir, ckpt_steps=None, - prefix_in_ckpt='model', key_in_ckpt='state_dict', + prefix_in_ckpt='model', ignored_prefixes=None, key_in_ckpt='state_dict', strict=True, device='cpu' ): + if ignored_prefixes is None: + # NOTICE: this is for compatibility with old checkpoints which have duplicate txt_embed layer in them. + ignored_prefixes = ['model.fs2.encoder.embed_tokens'] if not isinstance(ckpt_base_dir, pathlib.Path): ckpt_base_dir = pathlib.Path(ckpt_base_dir) if ckpt_base_dir.is_file(): @@ -197,6 +200,7 @@ def load_ckpt( state_dict = OrderedDict({ k[len(prefix_in_ckpt) + 1:]: v for k, v in state_dict.items() if k.startswith(f'{prefix_in_ckpt}.') + if all(not k.startswith(p) for p in ignored_prefixes) }) if not strict: cur_model_state_dict = cur_model.state_dict() diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 7c6f317f5..269122a6d 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -29,7 +29,7 @@ def locate_dictionary(): """ assert 'dictionary' in hparams or 'g2p_dictionary' in hparams, \ 'Please specify a dictionary file in your config.' 
- config_dict_path = pathlib.Path(hparams.get('dictionary', hparams.get('g2p_dictionary'))) + config_dict_path = pathlib.Path(hparams['dictionary']) if config_dict_path.exists(): return config_dict_path work_dir = pathlib.Path(hparams['work_dir']) diff --git a/utils/pitch_utils.py b/utils/pitch_utils.py index fe706f94f..57ae943f7 100644 --- a/utils/pitch_utils.py +++ b/utils/pitch_utils.py @@ -1,23 +1,4 @@ import numpy as np -import torch - -f0_bin = 256 -f0_max = 1100.0 -f0_min = 50.0 -f0_mel_min = 1127 * np.log(1 + f0_min / 700) -f0_mel_max = 1127 * np.log(1 + f0_max / 700) - - -def f0_to_coarse(f0): - is_torch = isinstance(f0, torch.Tensor) - f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 - - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 - f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) - assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) - return f0_coarse def norm_f0(f0, uv=None): diff --git a/utils/text_encoder.py b/utils/text_encoder.py index 605b7e80e..4b7815c46 100644 --- a/utils/text_encoder.py +++ b/utils/text_encoder.py @@ -1,19 +1,9 @@ import numpy as np -from utils.hparams import hparams - PAD = '' PAD_INDEX = 0 -def strip_ids(ids, ids_to_strip): - """Strip ids_to_strip from the end ids.""" - ids = list(ids) - while ids and ids[-1] in ids_to_strip: - ids.pop() - return ids - - class TokenTextEncoder: """Encoder based on a user-supplied vocabulary (file or list).""" @@ -26,30 +16,25 @@ def __init__(self, vocab_list): Args: vocab_list: If not None, a list of elements of the vocabulary. """ - self.num_reserved_ids = hparams.get('num_pad_tokens', 3) - assert self.num_reserved_ids > 0, 'num_pad_tokens must be positive' self.vocab_list = sorted(vocab_list) def encode(self, sentence): """Converts a space-separated string of phones to a list of ids.""" phones = sentence.strip().split() if isinstance(sentence, str) else sentence - return [self.vocab_list.index(ph) + self.num_reserved_ids if ph != PAD else PAD_INDEX for ph in phones] + return [self.vocab_list.index(ph) + 1 if ph != PAD else PAD_INDEX for ph in phones] def decode(self, ids, strip_padding=False): if strip_padding: ids = np.trim_zeros(ids) ids = list(ids) return ' '.join([ - self.vocab_list[_id - self.num_reserved_ids] if _id >= self.num_reserved_ids else PAD + self.vocab_list[_id - 1] if _id >= 1 else PAD for _id in ids ]) - def pad(self): - pass - @property def vocab_size(self): - return len(self.vocab_list) + self.num_reserved_ids + return len(self.vocab_list) + 1 def __len__(self): return self.vocab_size @@ -64,5 +49,5 @@ def store_to_file(self, filename): filename: Full path of the file to store the vocab to. 
""" with open(filename, 'w', encoding='utf8') as f: - [print(PAD, file=f) for _ in range(self.num_reserved_ids)] + print(PAD, file=f) [print(tok, file=f) for tok in self.vocab_list] diff --git a/utils/training_utils.py b/utils/training_utils.py index 98fb6da47..26d24eec5 100644 --- a/utils/training_utils.py +++ b/utils/training_utils.py @@ -103,7 +103,7 @@ def __init__(self, dataset, max_batch_frames, max_batch_size, sub_indices=None, def __form_batches(self): if self.formed == self.epoch + self.seed: return - rng = np.random.default_rng(self.seed + self.epoch) + rng = np.random.default_rng() # Create indices if self.shuffle_sample: if self.sub_indices is not None: @@ -113,7 +113,7 @@ def __form_batches(self): indices = rng.permutation(len(self.dataset)) if self.sort_by_similar_size: - grid = int(hparams.get('sampler_frame_count_grid', 6)) + grid = int(hparams['sampler_frame_count_grid']) assert grid > 0 sizes = (np.round(np.array(self.dataset.sizes)[indices] / grid) * grid).clip(grid, None) sizes *= (-1 if self.size_reversed else 1)