From da622be4a66387a6c27ea93cce7f958b2cf6621f Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 16 Jul 2024 00:35:10 +0800 Subject: [PATCH 01/44] Add multi-dictionary preprocessing and training --- basics/base_binarizer.py | 98 +++++--- basics/base_exporter.py | 15 ++ basics/base_task.py | 26 +- deployment/exporters/acoustic_exporter.py | 17 +- deployment/exporters/variance_exporter.py | 17 +- deployment/modules/fastspeech2.py | 2 +- inference/ds_acoustic.py | 18 +- inference/ds_variance.py | 18 +- modules/fastspeech/acoustic_encoder.py | 2 +- modules/fastspeech/variance_encoder.py | 2 +- preprocessing/acoustic_binarizer.py | 12 +- preprocessing/variance_binarizer.py | 10 +- training/acoustic_task.py | 2 +- training/variance_task.py | 4 +- utils/phoneme_utils.py | 279 ++++++++++++++-------- utils/text_encoder.py | 53 ---- 16 files changed, 318 insertions(+), 257 deletions(-) delete mode 100644 utils/text_encoder.py diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index ddad6e02e..896bfd3a4 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -13,9 +13,8 @@ from utils.hparams import hparams from utils.indexed_datasets import IndexedDatasetBuilder from utils.multiprocess_utils import chunked_multiprocess_run -from utils.phoneme_utils import build_phoneme_list, locate_dictionary +from utils.phoneme_utils import load_phoneme_dictionary from utils.plot import distribution_to_figure -from utils.text_encoder import TokenTextEncoder class BinarizationError(Exception): @@ -58,17 +57,22 @@ def __init__(self, data_dir=None, data_attrs=None): self.augmentation_args = hparams.get('augmentation_args', {}) self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.spk_map = None + self.spk_map = {} self.spk_ids = hparams['spk_ids'] self.speakers = hparams['speakers'] self.build_spk_map() + self.lang_map = {} + self.dictionaries = hparams['dictionaries'] + self.languages = hparams['languages'] + self.build_lang_map() + self.items = {} self.item_names: list = None self._train_item_names: list = None self._valid_item_names: list = None - self.phone_encoder = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() self.timestep = hparams['hop_size'] / hparams['audio_sample_rate'] def build_spk_map(self): @@ -83,7 +87,6 @@ def build_spk_map(self): assert max(self.spk_ids) < hparams['num_spk'], \ f'Index in spk_id sequence {self.spk_ids} is out of range. All values should be smaller than num_spk.' - self.spk_map = {} for spk_name, spk_id in zip(self.speakers, self.spk_ids): if spk_name in self.spk_map and self.spk_map[spk_name] != spk_id: raise ValueError(f'Invalid speaker ID assignment. Name \'{spk_name}\' is assigned ' @@ -92,7 +95,19 @@ def build_spk_map(self): print("| spk_map: ", self.spk_map) - def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): + def build_lang_map(self): + assert isinstance(self.languages, list), 'Languages must be a list' + assert len(self.languages) == len(self.raw_data_dirs), \ + 'Number of raw data dirs must equal number of language names!' + for lang in self.languages: + assert lang in self.dictionaries, f'Unrecognized language name: {lang}' + assert len(self.dictionaries.keys()) <= hparams['num_lang'], \ + 'Number of languages must not be greater than num_lang!' 
+ + for lang_id, lang_name in enumerate(sorted(self.dictionaries.keys())): + self.lang_map[lang_name] = lang_id + + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): raise NotImplementedError() def split_train_valid_set(self, item_names): @@ -167,8 +182,8 @@ def meta_data_iterator(self, prefix): def process(self): # load each dataset - for ds_id, spk_id, data_dir in zip(range(len(self.raw_data_dirs)), self.spk_ids, self.raw_data_dirs): - self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk_id=spk_id) + for ds_id, (data_dir, spk, lang) in enumerate(zip(self.raw_data_dirs, self.speakers, self.languages)): + self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk=spk, lang=lang) self.item_names = sorted(list(self.items.keys())) self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names) @@ -177,11 +192,15 @@ def process(self): self.binary_data_dir.mkdir(parents=True, exist_ok=True) - # Copy spk_map and dictionary to binary data dir + # Copy spk_map, lang_map and dictionary to binary data dir spk_map_fn = self.binary_data_dir / 'spk_map.json' with open(spk_map_fn, 'w', encoding='utf-8') as f: - json.dump(self.spk_map, f) - shutil.copy(locate_dictionary(), self.binary_data_dir / 'dictionary.txt') + json.dump(self.spk_map, f, ensure_ascii=False) + lang_map_fn = self.binary_data_dir / 'lang_map.json' + with open(lang_map_fn, 'w', encoding='utf-8') as f: + json.dump(self.spk_map, f, ensure_ascii=False) + for lang, dict_path in hparams['dictionaries'].items(): + shutil.copy(dict_path, self.binary_data_dir / f'dictionary-{lang}.txt') self.check_coverage() # Process valid set and train set @@ -197,40 +216,45 @@ def process(self): def check_coverage(self): # Group by phonemes in the dictionary. - ph_required = set(build_phoneme_list()) - phoneme_map = {} - for ph in ph_required: - phoneme_map[ph] = 0 - ph_occurred = [] + ph_idx_required = set(range(1, len(self.phoneme_dictionary))) + ph_idx_occurred = set() + ph_idx_count_map = { + idx: 0 + for idx in ph_idx_required + } # Load and count those phones that appear in the actual data for item_name in self.items: - ph_occurred += self.items[item_name]['ph_seq'] - if len(ph_occurred) == 0: - raise BinarizationError(f'Empty tokens in {item_name}.') - for ph in ph_occurred: - if ph not in ph_required: - continue - phoneme_map[ph] += 1 - ph_occurred = set(ph_occurred) + ph_idx_occurred.update(self.items[item_name]['ph_seq']) + for idx in self.items[item_name]['ph_seq']: + ph_idx_count_map[idx] += 1 + ph_count_map = { + self.phoneme_dictionary.decode_one(idx, scalar=False): count + for idx, count in ph_idx_count_map.items() + } print('===== Phoneme Distribution Summary =====') - for i, key in enumerate(sorted(phoneme_map.keys())): - if i == len(ph_required) - 1: + keys = sorted(ph_count_map.keys(), key=lambda v: v[0] if isinstance(v, tuple) else v) + for i, key in enumerate(keys): + if i == len(ph_count_map) - 1: end = '\n' elif i % 10 == 9: end = ',\n' else: end = ', ' - print(f'\'{key}\': {phoneme_map[key]}', end=end) + if isinstance(key, tuple): + key_disp = '(' + ', '.join(key) + ')' + else: + key_disp = key + print(f'{key_disp}: {ph_count_map[key]}', end=end) # Draw graph. 
- x = sorted(phoneme_map.keys()) - values = [phoneme_map[k] for k in x] + xs = [str(k) for k in keys] + ys = [ph_count_map[k] for k in keys] plt = distribution_to_figure( title='Phoneme Distribution Summary', x_label='Phoneme', y_label='Number of occurrences', - items=x, values=values + items=xs, values=ys ) filename = self.binary_data_dir / 'phoneme_distribution.jpg' plt.savefig(fname=filename, @@ -239,12 +263,14 @@ def check_coverage(self): print(f'| save summary to \'{filename}\'') # Check unrecognizable or missing phonemes - if ph_occurred != ph_required: - unrecognizable_phones = ph_occurred.difference(ph_required) - missing_phones = ph_required.difference(ph_occurred) - raise BinarizationError('transcriptions and dictionary mismatch.\n' - f' (+) {sorted(unrecognizable_phones)}\n' - f' (-) {sorted(missing_phones)}') + if ph_idx_occurred != ph_idx_required: + missing_phones = sorted({ + self.phoneme_dictionary.decode_one(idx, scalar=False) + for idx in ph_idx_required.difference(ph_idx_occurred) + }, key=lambda v: v[0] if isinstance(v, tuple) else v) + raise BinarizationError( + f'The following phonemes are not covered in transcriptions: {sorted(missing_phones)}' + ) def process_dataset(self, prefix, num_workers=0, apply_augmentation=False): args = [] diff --git a/basics/base_exporter.py b/basics/base_exporter.py index cc016004a..e2e65f534 100644 --- a/basics/base_exporter.py +++ b/basics/base_exporter.py @@ -1,4 +1,6 @@ import json +import pathlib +import shutil from pathlib import Path from typing import Union @@ -44,6 +46,19 @@ def export_model(self, path: Path): """ raise NotImplementedError() + # noinspection PyMethodMayBeStatic + def export_dictionaries(self, path: Path): + dicts = hparams.get('dictionaries') + if dicts is not None: + for lang in dicts.keys(): + fn = f'dictionary-{lang}.txt' + shutil.copy(pathlib.Path(hparams['work_dir']) / fn, path) + print(f'| export dictionary => {path / fn}') + else: + fn = 'dictionary.txt' + shutil.copy(pathlib.Path(hparams['work_dir']) / fn, path) + print(f'| export dictionary => {path / fn}') + def export_attachments(self, path: Path): """ Exports related files and configs (e.g. the dictionary) to the target directory. 
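
For reference, the export logic above assumes a configuration of roughly the following shape. This is a hedged sketch: the language names, dictionary paths and counts are invented placeholders, not values shipped with this change.

    # Hypothetical hparams fragment for a bilingual setup (all values invented):
    hparams = {
        'dictionaries': {                 # one pronunciation dictionary per language
            'zh': 'dictionaries/zh-dict.txt',
            'en': 'dictionaries/en-dict.txt',
        },
        'languages': ['zh', 'zh', 'en'],  # one language tag per entry in raw_data_dirs
        'num_lang': 2,                    # must be at least len(dictionaries)
        'work_dir': 'checkpoints/my_exp',
    }
    # With this config, export_dictionaries() copies
    # checkpoints/my_exp/dictionary-zh.txt and checkpoints/my_exp/dictionary-en.txt
    # into the export directory; a config without 'dictionaries' falls back to the
    # single legacy checkpoints/my_exp/dictionary.txt.
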
diff --git a/basics/base_task.py b/basics/base_task.py index 768f8e311..767f1458d 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -8,7 +8,6 @@ import matplotlib import utils -from utils.text_encoder import TokenTextEncoder matplotlib.use('Agg') @@ -24,7 +23,7 @@ DsBatchSampler, DsTensorBoardLogger, get_latest_checkpoint_path, get_strategy ) -from utils.phoneme_utils import locate_dictionary, build_phoneme_list +from utils.phoneme_utils import load_phoneme_dictionary torch.multiprocessing.set_sharing_strategy(os.getenv('TORCH_SHARE_STRATEGY', 'file_system')) @@ -71,7 +70,7 @@ def __init__(self, *args, **kwargs): self.skip_immediate_validation = False self.skip_immediate_ckpt_save = False - self.phone_encoder = self.build_phone_encoder() + self.phoneme_dictionary = load_phoneme_dictionary() self.build_model() self.valid_losses: Dict[str, Metric] = {} @@ -165,11 +164,6 @@ def load_pre_train_model(self): else: raise RuntimeError("") - @staticmethod - def build_phone_encoder(): - phone_list = build_phoneme_list() - return TokenTextEncoder(vocab_list=phone_list) - def _build_model(self): raise NotImplementedError() @@ -448,21 +442,19 @@ def start(cls): if not hparams['infer']: # train @rank_zero_only def train_payload_copy(): - # Copy spk_map.json and dictionary.txt to work dir + # Copy files to work_dir binary_dir = pathlib.Path(hparams['binary_data_dir']) spk_map = work_dir / 'spk_map.json' spk_map_src = binary_dir / 'spk_map.json' if not spk_map.exists() and spk_map_src.exists(): shutil.copy(spk_map_src, spk_map) print(f'| Copied spk map to {spk_map}.') - dictionary = work_dir / 'dictionary.txt' - dict_src = binary_dir / 'dictionary.txt' - if not dictionary.exists(): - if dict_src.exists(): - shutil.copy(dict_src, dictionary) - else: - shutil.copy(locate_dictionary(), dictionary) - print(f'| Copied dictionary to {dictionary}.') + for lang in hparams['dictionaries'].keys(): + dict_dst = work_dir / f'dictionary-{lang}.txt' + dict_src = binary_dir / f'dictionary-{lang}.txt' + if not dict_dst.exists(): + shutil.copy(dict_src, dict_dst) + print(f'| Copied dictionary for language \'{lang}\' to {dict_dst}.') train_payload_copy() trainer.fit(task, ckpt_path=get_latest_checkpoint_path(work_dir)) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 4f0f533e2..1f56a9ce4 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -1,4 +1,3 @@ -import shutil from pathlib import Path from typing import List, Union, Tuple, Dict @@ -12,8 +11,7 @@ from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST from utils import load_ckpt, onnx_helper, remove_suffix from utils.hparams import hparams -from utils.phoneme_utils import locate_dictionary, build_phoneme_list -from utils.text_encoder import TokenTextEncoder +from utils.phoneme_utils import load_phoneme_dictionary class DiffSingerAcousticExporter(BaseExporter): @@ -32,7 +30,7 @@ def __init__( self.model_name: str = hparams['exp_name'] self.ckpt_steps: int = ckpt_steps self.spk_map: dict = self.build_spk_map() - self.vocab = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() self.model = self.build_model() self.fs2_aux_cache_path = self.cache_dir / ( 'fs2_aux.onnx' if self.model.use_shallow_diffusion else 'fs2.onnx' @@ -80,7 +78,7 @@ def __init__( def build_model(self) -> DiffSingerAcousticONNX: model = DiffSingerAcousticONNX( - vocab_size=len(self.vocab), + 
vocab_size=len(self.phoneme_dictionary), out_dims=hparams['audio_num_mel_bins'] ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=self.ckpt_steps, @@ -111,7 +109,7 @@ def export_attachments(self, path: Path): path / f'{self.model_name}.{spk[0]}.emb', self._perform_spk_mix(spk[1]) ) - self._export_dictionary(path / 'dictionary.txt') + self.export_dictionaries(path) self._export_phonemes(path / f'{self.model_name}.phonemes.txt') model_name = self.model_name @@ -395,11 +393,6 @@ def _export_spk_embed(self, path: Path, spk_embed: torch.Tensor): f.write(spk_embed.cpu().numpy().tobytes()) print(f'| export spk embed => {path}') - # noinspection PyMethodMayBeStatic - def _export_dictionary(self, path: Path): - print(f'| export dictionary => {path}') - shutil.copy(locate_dictionary(), path) - def _export_phonemes(self, path: Path): - self.vocab.store_to_file(path) + self.phoneme_dictionary.dump(path) print(f'| export phonemes => {path}') diff --git a/deployment/exporters/variance_exporter.py b/deployment/exporters/variance_exporter.py index 1af433ae4..4e594c407 100644 --- a/deployment/exporters/variance_exporter.py +++ b/deployment/exporters/variance_exporter.py @@ -1,4 +1,3 @@ -import shutil from pathlib import Path from typing import Union, List, Tuple, Dict @@ -12,8 +11,7 @@ from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST from utils import load_ckpt, onnx_helper, remove_suffix from utils.hparams import hparams -from utils.phoneme_utils import locate_dictionary, build_phoneme_list -from utils.text_encoder import TokenTextEncoder +from utils.phoneme_utils import load_phoneme_dictionary class DiffSingerVarianceExporter(BaseExporter): @@ -32,7 +30,7 @@ def __init__( self.model_name: str = hparams['exp_name'] self.ckpt_steps: int = ckpt_steps self.spk_map: dict = self.build_spk_map() - self.vocab = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() self.model = self.build_model() self.linguistic_encoder_cache_path = self.cache_dir / 'linguistic.onnx' self.dur_predictor_cache_path = self.cache_dir / 'dur.onnx' @@ -83,7 +81,7 @@ def __init__( def build_model(self) -> DiffSingerVarianceONNX: model = DiffSingerVarianceONNX( - vocab_size=len(self.vocab) + vocab_size=len(self.phoneme_dictionary) ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=self.ckpt_steps, prefix_in_ckpt='model', strict=True, device=self.device) @@ -142,7 +140,7 @@ def export_attachments(self, path: Path): path / f'{self.model_name}.{spk[0]}.emb', self._perform_spk_mix(spk[1]) ) - self._export_dictionary(path / 'dictionary.txt') + self.export_dictionaries(path) self._export_phonemes((path / f'{self.model_name}.phonemes.txt')) model_name = self.model_name @@ -771,11 +769,6 @@ def _export_spk_embed(self, path: Path, spk_embed: torch.Tensor): f.write(spk_embed.cpu().numpy().tobytes()) print(f'| export spk embed => {path}') - # noinspection PyMethodMayBeStatic - def _export_dictionary(self, path: Path): - print(f'| export dictionary => {path}') - shutil.copy(locate_dictionary(), path) - def _export_phonemes(self, path: Path): - self.vocab.store_to_file(path) + self.phoneme_dictionary.dump(path) print(f'| export phonemes => {path}') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index d0a3c7b5a..48a3afb40 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -9,7 +9,7 @@ from modules.fastspeech.acoustic_encoder import FastSpeech2Acoustic from 
modules.fastspeech.variance_encoder import FastSpeech2Variance from utils.hparams import hparams -from utils.text_encoder import PAD_INDEX +from utils.phoneme_utils import PAD_INDEX f0_bin = 256 f0_max = 1100.0 diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index a67f5b166..7b93d38cb 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -1,12 +1,11 @@ -from collections import OrderedDict - -import tqdm import json import pathlib +from collections import OrderedDict +from typing import Dict import numpy as np import torch -from typing import Dict +import tqdm from basics.base_svs_infer import BaseSVSInfer from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST @@ -16,8 +15,7 @@ from utils import load_ckpt from utils.hparams import hparams from utils.infer_utils import cross_fade, resample_align_curve, save_wav -from utils.phoneme_utils import build_phoneme_list -from utils.text_encoder import TokenTextEncoder +from utils.phoneme_utils import load_phoneme_dictionary class DiffSingerAcousticInfer(BaseSVSInfer): @@ -37,7 +35,7 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N if hparams.get('use_tension_embed', False): self.variances_to_embed.add('tension') - self.ph_encoder = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() if hparams['use_spk_id']: with open(pathlib.Path(hparams['work_dir']) / 'spk_map.json', 'r', encoding='utf8') as f: self.spk_map = json.load(f) @@ -50,7 +48,7 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N def build_model(self, ckpt_steps=None): model = DiffSingerAcoustic( - vocab_size=len(self.ph_encoder), + vocab_size=len(self.phoneme_dictionary), out_dims=hparams['audio_num_mel_bins'] ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=ckpt_steps, @@ -73,7 +71,9 @@ def preprocess_input(self, param, idx=0): """ batch = {} summary = OrderedDict() - txt_tokens = torch.LongTensor([self.ph_encoder.encode(param['ph_seq'])]).to(self.device) # => [B, T_txt] + txt_tokens = torch.LongTensor([ + self.phoneme_dictionary.encode(param['ph_seq']) + ]).to(self.device) # => [B, T_txt] batch['tokens'] = txt_tokens ph_dur = torch.from_numpy(np.array(param['ph_dur'].split(), np.float32)).to(self.device) diff --git a/inference/ds_variance.py b/inference/ds_variance.py index c8a9b090a..86abd1abd 100644 --- a/inference/ds_variance.py +++ b/inference/ds_variance.py @@ -1,31 +1,29 @@ import copy import json - -import tqdm import pathlib from collections import OrderedDict +from typing import List, Tuple import librosa import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +import tqdm from scipy import interpolate -from typing import List, Tuple from basics.base_svs_infer import BaseSVSInfer +from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST from modules.fastspeech.tts_modules import ( LengthRegulator, RhythmRegulator, mel2ph_to_dur ) -from modules.fastspeech.param_adaptor import VARIANCE_CHECKLIST from modules.toplevel import DiffSingerVariance from utils import load_ckpt from utils.hparams import hparams from utils.infer_utils import resample_align_curve -from utils.phoneme_utils import build_phoneme_list +from utils.phoneme_utils import load_phoneme_dictionary from utils.pitch_utils import interp_f0 -from utils.text_encoder import TokenTextEncoder class DiffSingerVarianceInfer(BaseSVSInfer): @@ -34,7 +32,7 @@ def __init__( predictions: set = None ): 
super().__init__(device=device) - self.ph_encoder = TokenTextEncoder(vocab_list=build_phoneme_list()) + self.phoneme_dictionary = load_phoneme_dictionary() if hparams['use_spk_id']: with open(pathlib.Path(hparams['work_dir']) / 'spk_map.json', 'r', encoding='utf8') as f: self.spk_map = json.load(f) @@ -76,7 +74,7 @@ def __init__( def build_model(self, ckpt_steps=None): model = DiffSingerVariance( - vocab_size=len(self.ph_encoder) + vocab_size=len(self.phoneme_dictionary) ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=ckpt_steps, prefix_in_ckpt='model', strict=True, device=self.device) @@ -97,7 +95,9 @@ def preprocess_input( """ batch = {} summary = OrderedDict() - txt_tokens = torch.LongTensor([self.ph_encoder.encode(param['ph_seq'].split())]).to(self.device) # [B=1, T_ph] + txt_tokens = torch.LongTensor([ + self.phoneme_dictionary.encode(param['ph_seq'].split()) + ]).to(self.device) # [B=1, T_ph] T_ph = txt_tokens.shape[1] batch['tokens'] = txt_tokens ph_num = torch.from_numpy(np.array([param['ph_num'].split()], np.int64)).to(self.device) # [B=1, T_w] diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 9ab4ed633..b1507837e 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -8,7 +8,7 @@ ) from modules.fastspeech.tts_modules import FastSpeech2Encoder, mel2ph_to_dur from utils.hparams import hparams -from utils.text_encoder import PAD_INDEX +from utils.phoneme_utils import PAD_INDEX class FastSpeech2Acoustic(nn.Module): diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 82e0a88e8..2031d89ce 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -8,7 +8,7 @@ ) from modules.fastspeech.tts_modules import FastSpeech2Encoder, DurationPredictor from utils.hparams import hparams -from utils.text_encoder import PAD_INDEX +from utils.phoneme_utils import PAD_INDEX class FastSpeech2Variance(nn.Module): diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index b61c88f88..bf799abe5 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -67,17 +67,19 @@ def __init__(self): "See https://github.com/openvpi/DiffSinger/releases/tag/v2.3.0 for more details." ) - def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): meta_data_dict = {} with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f: for utterance_label in csv.DictReader(f): item_name = utterance_label['name'] temp_dict = { 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': utterance_label['ph_seq'].split(), + 'ph_seq': self.phoneme_dictionary.encode(utterance_label['ph_seq'], lang=lang), 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], - 'spk_id': spk_id, - 'spk_name': self.speakers[ds_id], + 'spk_id': self.spk_map[spk], + 'spk_name': spk, + 'language_id': self.lang_map[lang], + 'language_name': lang, } assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
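
Since `ph_seq` is now stored as encoded IDs at load time, here is a small usage sketch of the `PhonemeDictionary` class this patch introduces in utils/phoneme_utils.py (further below). The dictionary files are toy data written to a temporary directory purely for illustration; nothing here comes from the shipped dictionaries.

    import pathlib
    import tempfile

    from utils.phoneme_utils import PhonemeDictionary

    # Two toy dictionaries; with more than one language, every phoneme is
    # namespaced as '<lang>/<phoneme>' (AP and SP stay global).
    tmp = pathlib.Path(tempfile.mkdtemp())
    (tmp / 'zh.txt').write_text('a\ta\nshi\tsh ir\n', encoding='utf8')
    (tmp / 'en.txt').write_text('ah\taa\n', encoding='utf8')

    pd = PhonemeDictionary(
        dictionaries={'zh': tmp / 'zh.txt', 'en': tmp / 'en.txt'},
        merged_groups=[['zh/a', 'en/aa']],  # alias the two vowels onto one shared ID
    )
    ids = pd.encode('a SP en/aa', lang='zh')  # bare names get the 'zh/' prefix
    assert ids[0] == ids[2]                   # merged aliases encode to the same ID
    print(pd.decode(ids, lang='zh'))          # -> 'zh/a SP zh/a'
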
@@ -106,7 +108,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'seconds': seconds, 'length': length, 'mel': mel, - 'tokens': np.array(self.phone_encoder.encode(meta_data['ph_seq']), dtype=np.int64), + 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64), 'ph_dur': np.array(meta_data['ph_dur']).astype(np.float32), } diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 2882cf769..99ed65c31 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -108,7 +108,7 @@ def load_attr_from_ds(self, ds_id, name, attr, idx=0): ds = ds[idx] return ds.get(attr) - def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): meta_data_dict = {} with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf8') as f: @@ -130,10 +130,12 @@ def require(attr): temp_dict = { 'ds_idx': item_idx, - 'spk_id': spk_id, + 'spk_id': self.spk_map[spk], 'spk_name': self.speakers[ds_id], + 'language_id': self.lang_map[lang], + 'language_name': lang, 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': require('ph_seq').split(), + 'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), 'ph_dur': [float(x) for x in require('ph_dur').split()] } @@ -249,7 +251,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'spk_name': meta_data['spk_name'], 'seconds': seconds, 'length': length, - 'tokens': np.array(self.phone_encoder.encode(meta_data['ph_seq']), dtype=np.int64) + 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64) } ph_dur_sec = torch.FloatTensor(meta_data['ph_dur']).to(self.device) diff --git a/training/acoustic_task.py b/training/acoustic_task.py index de6a9adb5..79f05003f 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -92,7 +92,7 @@ def __init__(self): def _build_model(self): return DiffSingerAcoustic( - vocab_size=len(self.phone_encoder), + vocab_size=len(self.phoneme_dictionary), out_dims=hparams['audio_num_mel_bins'] ) diff --git a/training/variance_task.py b/training/variance_task.py index 88a844952..0a33301e6 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -113,7 +113,7 @@ def __init__(self): def _build_model(self): return DiffSingerVariance( - vocab_size=len(self.phone_encoder), + vocab_size=len(self.phoneme_dictionary), ) # noinspection PyAttributeOutsideInit @@ -295,7 +295,7 @@ def sample_get(key, idx, abs_idx): def plot_dur(self, data_idx, gt_dur, pred_dur, txt=None): gt_dur = gt_dur[0].cpu().numpy() pred_dur = pred_dur[0].cpu().numpy() - txt = self.phone_encoder.decode(txt[0].cpu().numpy()).split() + txt = self.phoneme_dictionary.decode(txt[0].cpu().numpy()).split() title_text = f"{self.valid_dataset.metadata['spk_names'][data_idx]} - {self.valid_dataset.metadata['names'][data_idx]}" self.logger.all_rank_experiment.add_figure(f'dur_{data_idx}', dur_to_figure( gt_dur, pred_dur, txt, title_text diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 269122a6d..1547bc9eb 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -1,99 +1,190 @@ +import json import pathlib - -try: - from lightning.pytorch.utilities.rank_zero import rank_zero_info -except ModuleNotFoundError: - rank_zero_info = print +from typing import Dict, List, Union from utils.hparams import hparams -_initialized = False -_ALL_CONSONANTS_SET = set() -_ALL_VOWELS_SET = set() -_dictionary = { - 'AP': ['AP'], - 'SP': ['SP'] -} 
-_phoneme_list: list - - -def locate_dictionary(): - """ - Search and locate the dictionary file. - Order: - 1. hparams['dictionary'] - 2. hparams['g2p_dictionary'] - 3. 'dictionary.txt' in hparams['work_dir'] - 4. file with same name as hparams['g2p_dictionary'] in hparams['work_dir'] - :return: pathlib.Path of the dictionary file - """ - assert 'dictionary' in hparams or 'g2p_dictionary' in hparams, \ - 'Please specify a dictionary file in your config.' - config_dict_path = pathlib.Path(hparams['dictionary']) - if config_dict_path.exists(): - return config_dict_path - work_dir = pathlib.Path(hparams['work_dir']) - ckpt_dict_path = work_dir / config_dict_path.name - if ckpt_dict_path.exists(): - return ckpt_dict_path - ckpt_dict_path = work_dir / 'dictionary.txt' - if ckpt_dict_path.exists(): - return ckpt_dict_path - raise FileNotFoundError('Unable to locate the dictionary file. ' - 'Please specify the right dictionary in your config.') - - -def _build_dict_and_list(): - global _dictionary, _phoneme_list - - _set = set() - with open(locate_dictionary(), 'r', encoding='utf8') as _df: - _lines = _df.readlines() - for _line in _lines: - _pinyin, _ph_str = _line.strip().split('\t') - _dictionary[_pinyin] = _ph_str.split() - for _list in _dictionary.values(): - [_set.add(ph) for ph in _list] - _phoneme_list = sorted(list(_set)) - rank_zero_info('| load phoneme set: ' + str(_phoneme_list)) - - -def _initialize_consonants_and_vowels(): - # Currently we only support two-part consonant-vowel phoneme systems. - for _ph_list in _dictionary.values(): - _ph_count = len(_ph_list) - if _ph_count == 0 or _ph_list[0] in ['AP', 'SP']: - continue - elif len(_ph_list) == 1: - _ALL_VOWELS_SET.add(_ph_list[0]) +PAD_INDEX = 0 + + +class PhonemeDictionary: + def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[List[str]] = None): + all_phonemes = {'AP', 'SP'} + self._multi_langs = len(dictionaries) > 1 + for lang, dict_path in dictionaries.items(): + with open(dict_path, 'r', encoding='utf8') as dict_file: + for line in dict_file: + _, phonemes = line.strip().split('\t') + phonemes = phonemes.split() + for phoneme in phonemes: + if '/' in phoneme: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in dictionary '{dict_path}': " + f"should not contain the reserved character '/'." + ) + if self._multi_langs: + all_phonemes.add(f'{lang}/{phoneme}') + else: + all_phonemes.add(phoneme) + if merged_groups is None: + merged_groups = [] else: - _ALL_CONSONANTS_SET.add(_ph_list[0]) - _ALL_VOWELS_SET.add(_ph_list[1]) - - -def _initialize(): - global _initialized - if not _initialized: - _build_dict_and_list() - _initialize_consonants_and_vowels() - _initialized = True - - -def get_all_consonants(): - _initialize() - return sorted(_ALL_CONSONANTS_SET) - - -def get_all_vowels(): - _initialize() - return sorted(_ALL_VOWELS_SET) - - -def build_dictionary() -> dict: - _initialize() - return _dictionary - - -def build_phoneme_list() -> list: - _initialize() - return _phoneme_list + if self._multi_langs: + for group in merged_groups: + for phoneme in group: + if '/' not in phoneme: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + "should specify language by '/' prefix." + ) + lang, name = phoneme.split('/', maxsplit=1) + if lang not in dictionaries: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + f"unrecognized language name '{lang}'." 
+ ) + merged_groups = [set(phones) for phones in merged_groups if len(phones) > 1] + else: + _merged_groups = [] + for group in merged_groups: + _group = [] + for phoneme in group: + if '/' in phoneme: + lang, name = phoneme.split('/', maxsplit=1) + if lang not in dictionaries: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + f"unrecognized language name '{lang}'." + ) + _group.append(name) + else: + _group.append(phoneme) + _merged_groups.append(_group) + merged_groups = [set(phones) for phones in _merged_groups if len(phones) > 1] + merged_phonemes_inverted_index = {} + for idx, group in enumerate(merged_groups): + other_idx = None + for phoneme in group: + if phoneme in merged_phonemes_inverted_index: + other_idx = merged_phonemes_inverted_index[phoneme] + break + target_idx = idx if other_idx is None else other_idx + for phoneme in group: + merged_phonemes_inverted_index[phoneme] = target_idx + if other_idx is not None: + merged_groups[other_idx] |= group + phone_to_id = {} + id_to_phone = [] + idx = 1 + for phoneme in sorted(all_phonemes): + if phoneme in merged_phonemes_inverted_index: + has_assigned = True + for alias in merged_groups[merged_phonemes_inverted_index[phoneme]]: + if alias not in phone_to_id: + has_assigned = False + phone_to_id[alias] = idx + if not has_assigned: + id_to_phone.append(tuple(sorted(merged_groups[merged_phonemes_inverted_index[phoneme]]))) + idx += 1 + else: + phone_to_id[phoneme] = idx + id_to_phone.append(phoneme) + idx += 1 + self._phone_to_id: Dict[str, int] = phone_to_id + self._id_to_phone: List[Union[str, tuple]] = id_to_phone + + @property + def vocab_size(self): + return len(self._id_to_phone) + 1 + + def __len__(self): + return self.vocab_size + + def encode_one(self, phone, lang=None): + if lang is None or not self._multi_langs or phone in self._phone_to_id: + return self._phone_to_id[phone] + if '/' not in phone: + phone = f'{lang}/{phone}' + return self._phone_to_id[phone] + + def encode(self, sentence, lang=None): + phones = sentence.strip().split() if isinstance(sentence, str) else sentence + return [self.encode_one(phone, lang=lang) for phone in phones] + + def decode_one(self, idx, lang=None, scalar=True): + if idx <= 0: + return None + phone = self._id_to_phone[idx - 1] + if not scalar or isinstance(phone, str): + return phone + if lang is None or not self._multi_langs: + return phone[0] + for alias in phone: + if alias.startswith(f'{lang}/'): + return alias + return phone[0] + + def decode(self, ids, lang=None, scalar=True): + ids = list(ids) + return ' '.join([ + self.decode_one(i, lang=lang, scalar=scalar) + for i in ids + if i >= 1 + ]) + + def dump(self, filename): + with open(filename, 'w', encoding='utf8') as fp: + json.dump(self._phone_to_id, fp, ensure_ascii=False, indent=2) + + +_dictionary = None + + +def load_phoneme_dictionary() -> PhonemeDictionary: + if _dictionary is not None: + return _dictionary + config_dicts = hparams.get('dictionaries') + if config_dicts is not None: + dicts = {} + for lang, config_dict_path in config_dicts.items(): + config_dict_path = pathlib.Path(config_dict_path) + if not config_dict_path.exists(): + config_dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' + if not config_dict_path.exists(): + raise FileNotFoundError( + f"Could not locate dictionary for language '{lang}'." 
+ ) + dicts[lang] = config_dict_path + else: + config_dict_path = pathlib.Path(hparams['dictionary']) + if not config_dict_path.exists(): + config_dict_path = pathlib.Path(hparams['work_dir']) / 'dictionary.txt' + if not config_dict_path.exists(): + raise FileNotFoundError( + f"Could not locate dictionary file." + ) + dicts = { + 'default': config_dict_path + } + return PhonemeDictionary( + dictionaries=dicts, + merged_groups=hparams.get('merged_phoneme_groups') + ) + + +if __name__ == '__main__': + d = PhonemeDictionary( + dictionaries={ + 'zh': 'dictionaries/opencpop-extension.txt', + # 'en': 'dictionaries/opencpop-extension.txt', + }, + merged_groups=[ + ['zh/a', 'zh/b', 'c'], + ['a', 'd', 'e'], + ['e', 'f'] + ] + ) + ph_ids = d.encode('sh ir zh e j v y i b a SP', lang='en') + ph_seq = d.decode(ph_ids) + print(ph_ids) + print(ph_seq) diff --git a/utils/text_encoder.py b/utils/text_encoder.py deleted file mode 100644 index 4b7815c46..000000000 --- a/utils/text_encoder.py +++ /dev/null @@ -1,53 +0,0 @@ -import numpy as np - -PAD = '' -PAD_INDEX = 0 - - -class TokenTextEncoder: - """Encoder based on a user-supplied vocabulary (file or list).""" - - def __init__(self, vocab_list): - """Initialize from a file or list, one token per line. - - Handling of reserved tokens works as follows: - - When initializing from a list, we add reserved tokens to the vocab. - - Args: - vocab_list: If not None, a list of elements of the vocabulary. - """ - self.vocab_list = sorted(vocab_list) - - def encode(self, sentence): - """Converts a space-separated string of phones to a list of ids.""" - phones = sentence.strip().split() if isinstance(sentence, str) else sentence - return [self.vocab_list.index(ph) + 1 if ph != PAD else PAD_INDEX for ph in phones] - - def decode(self, ids, strip_padding=False): - if strip_padding: - ids = np.trim_zeros(ids) - ids = list(ids) - return ' '.join([ - self.vocab_list[_id - 1] if _id >= 1 else PAD - for _id in ids - ]) - - @property - def vocab_size(self): - return len(self.vocab_list) + 1 - - def __len__(self): - return self.vocab_size - - def store_to_file(self, filename): - """Write vocab file to disk. - - Vocab files have one token per line. The file ends in a newline. Reserved - tokens are written to the vocab file as well. - - Args: - filename: Full path of the file to store the vocab to. 
- """ - with open(filename, 'w', encoding='utf8') as f: - print(PAD, file=f) - [print(tok, file=f) for tok in self.vocab_list] From a151ecf5dc99d847fefe4518b905ff5e03b8c95e Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 16 Jul 2024 01:28:24 +0800 Subject: [PATCH 02/44] Fix lang_map.json copy --- basics/base_binarizer.py | 2 +- basics/base_task.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index 896bfd3a4..8ec22c279 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -198,7 +198,7 @@ def process(self): json.dump(self.spk_map, f, ensure_ascii=False) lang_map_fn = self.binary_data_dir / 'lang_map.json' with open(lang_map_fn, 'w', encoding='utf-8') as f: - json.dump(self.spk_map, f, ensure_ascii=False) + json.dump(self.lang_map, f, ensure_ascii=False) for lang, dict_path in hparams['dictionaries'].items(): shutil.copy(dict_path, self.binary_data_dir / f'dictionary-{lang}.txt') self.check_coverage() diff --git a/basics/base_task.py b/basics/base_task.py index 767f1458d..b53133d39 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -444,17 +444,18 @@ def start(cls): def train_payload_copy(): # Copy files to work_dir binary_dir = pathlib.Path(hparams['binary_data_dir']) - spk_map = work_dir / 'spk_map.json' + spk_map_dst = work_dir / 'spk_map.json' spk_map_src = binary_dir / 'spk_map.json' - if not spk_map.exists() and spk_map_src.exists(): - shutil.copy(spk_map_src, spk_map) - print(f'| Copied spk map to {spk_map}.') + shutil.copy(spk_map_src, spk_map_dst) + lang_map_dst = work_dir / 'lang_map.json' + lang_map_src = binary_dir / 'lang_map.json' + shutil.copy(lang_map_src, lang_map_dst) + print(f'| Copied spk map to {spk_map_dst}.') for lang in hparams['dictionaries'].keys(): dict_dst = work_dir / f'dictionary-{lang}.txt' dict_src = binary_dir / f'dictionary-{lang}.txt' - if not dict_dst.exists(): - shutil.copy(dict_src, dict_dst) - print(f'| Copied dictionary for language \'{lang}\' to {dict_dst}.') + shutil.copy(dict_src, dict_dst) + print(f'| Copied dictionary for language \'{lang}\' to {dict_dst}.') train_payload_copy() trainer.fit(task, ckpt_path=get_latest_checkpoint_path(work_dir)) From b5a876b8c65140a08d9434e3d3fb703d5aa54108 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 16 Jul 2024 16:28:34 +0800 Subject: [PATCH 03/44] Add language embed (inject to txt_embed) for acoustic models --- basics/base_binarizer.py | 4 +++- modules/fastspeech/acoustic_encoder.py | 9 ++++++++- preprocessing/acoustic_binarizer.py | 8 ++++++-- training/acoustic_task.py | 11 ++++++++++- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index 8ec22c279..b1bd6b97a 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -104,9 +104,11 @@ def build_lang_map(self): assert len(self.dictionaries.keys()) <= hparams['num_lang'], \ 'Number of languages must not be greater than num_lang!' 
- for lang_id, lang_name in enumerate(sorted(self.dictionaries.keys())): + for lang_id, lang_name in enumerate(sorted(self.dictionaries.keys()), start=1): self.lang_map[lang_name] = lang_id + print("| lang_map: ", self.lang_map) + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): raise NotImplementedError() diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index b1507837e..2b53ace20 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -56,6 +56,9 @@ def __init__(self, vocab_size): self.use_spk_id = hparams['use_spk_id'] if self.use_spk_id: self.spk_embed = Embedding(hparams['num_spk'], hparams['hidden_size']) + self.use_lang_id = hparams.get('use_lang_id', False) + if self.use_lang_id: + self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): if self.use_variance_embeds: @@ -78,9 +81,13 @@ def forward_variance_embedding(self, condition, key_shift=None, speed=None, **va def forward( self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, - spk_embed_id=None, **kwargs + spk_embed_id=None, languages=None, + **kwargs ): txt_embed = self.txt_embed(txt_tokens) + if self.use_lang_id: + lang_embed = self.lang_embed(languages) + txt_embed += lang_embed dur = mel2ph_to_dur(mel2ph, txt_tokens.shape[1]).float() dur_embed = self.dur_embed(dur[:, :, None]) encoder_out = self.encoder(txt_embed, dur_embed, txt_tokens == 0) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index bf799abe5..18a6cd478 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -36,6 +36,7 @@ ACOUSTIC_ITEM_ATTRIBUTES = [ 'spk_id', 'mel', + 'languages', 'tokens', 'mel2ph', 'f0', @@ -74,12 +75,14 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): item_name = utterance_label['name'] temp_dict = { 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), + 'lang_seq': [ + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + for p in utterance_label['ph_seq'].split() + ], 'ph_seq': self.phoneme_dictionary.encode(utterance_label['ph_seq'], lang=lang), 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], 'spk_id': self.spk_map[spk], 'spk_name': spk, - 'language_id': self.lang_map[lang], - 'language_name': lang, } assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
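
The `lang_seq` construction above deserves a worked example. Language IDs are now 1-based (sorted dictionary keys, `start=1`) so that ID 0 stays free as the padding index of the new `lang_embed`, whose output is added onto `txt_embed`. A phoneme picks its language from an explicit 'xx/' prefix when present and otherwise falls back to the dataset's default language; note that AP/SP simply take the default language's ID here. A minimal sketch with invented values:

    # lang_map as built from dictionaries = {'zh': ..., 'en': ...}:
    lang_map = {'en': 1, 'zh': 2}   # sorted keys, enumerate(..., start=1)
    lang = 'zh'                     # default language of this raw_data_dir
    ph_seq = 'SP sh ir en/aa SP'.split()
    lang_seq = [
        lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]]
        for p in ph_seq
    ]
    print(lang_seq)  # [2, 2, 2, 1, 2]; the 'en/' prefix overrides the default
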
@@ -108,6 +111,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'seconds': seconds, 'length': length, 'mel': mel, + 'languages': np.array(meta_data['lang_seq'], dtype=np.int64), 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64), 'ph_dur': np.array(meta_data['ph_dur']).astype(np.float32), } diff --git a/training/acoustic_task.py b/training/acoustic_task.py index 79f05003f..ca6a71c65 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -35,6 +35,7 @@ def __init__(self, prefix, preload=False): self.need_key_shift = hparams['use_key_shift_embed'] self.need_speed = hparams['use_speed_embed'] self.need_spk_id = hparams['use_spk_id'] + self.need_lang_id = hparams['use_lang_id'] def collater(self, samples): batch = super().collater(samples) @@ -60,6 +61,9 @@ def collater(self, samples): if self.need_spk_id: spk_ids = torch.LongTensor([s['spk_id'] for s in samples]) batch['spk_ids'] = spk_ids + if self.need_lang_id: + languages = utils.collate_nd([s['languages'] for s in samples], 0) + batch['languages'] = languages return batch @@ -128,9 +132,14 @@ def run_model(self, sample, infer=False): spk_embed_id = sample['spk_ids'] else: spk_embed_id = None + if hparams['use_lang_id']: + languages = sample['languages'] + else: + languages = None output: ShallowDiffusionOutput = self.model( txt_tokens, mel2ph=mel2ph, f0=f0, **variances, - key_shift=key_shift, speed=speed, spk_embed_id=spk_embed_id, + key_shift=key_shift, speed=speed, + spk_embed_id=spk_embed_id, languages=languages, gt_mel=target, infer=infer ) From d282e28edfb60d50bb4a1de044a67b3dab08fba6 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Thu, 18 Jul 2024 23:49:58 +0800 Subject: [PATCH 04/44] Save language sequence in variance preprocessing --- preprocessing/variance_binarizer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 99ed65c31..00027954e 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -30,6 +30,7 @@ os.environ["OMP_NUM_THREADS"] = "1" VARIANCE_ITEM_ATTRIBUTES = [ 'spk_id', # index number of dataset/speaker, int64 + 'languages', # index numbers of phoneme languages, int64[T_ph,] 'tokens', # index numbers of phonemes, int64[T_ph,] 'ph_dur', # durations of phonemes, in number of frames, int64[T_ph,] 'midi', # phoneme-level mean MIDI pitch, int64[T_ph,] @@ -135,7 +136,10 @@ def require(attr): 'language_id': self.lang_map[lang], 'language_name': lang, 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), - 'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), + 'lang_seq': [ + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + for p in utterance_label['ph_seq'].split() + ],'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), 'ph_dur': [float(x) for x in require('ph_dur').split()] } @@ -251,6 +255,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'spk_name': meta_data['spk_name'], 'seconds': seconds, 'length': length, + 'languages': np.array(meta_data['lang_seq'], dtype=np.int64), 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64) } From 70676ccc5cdcfa2b76adda823a9fc667ba6bb482 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 19 Jul 2024 02:19:24 +0800 Subject: [PATCH 05/44] Display merged phoneme groups properly in distribution plots --- basics/base_binarizer.py | 14 ++++++++------ utils/plot.py | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) 
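
Because merged groups map several aliases onto one ID, `check_coverage` decodes IDs with `scalar=False` and can get a tuple back for a merged entry. The diff below factors the label formatting into a `display_phoneme` helper; a sketch of its effect (the summary counts are invented):

    def display_phoneme(phoneme):
        if isinstance(phoneme, tuple):
            return f'({", ".join(phoneme)})'
        return phoneme

    print(display_phoneme('zh/ai'))            # zh/ai
    print(display_phoneme(('en/aa', 'zh/a')))  # (en/aa, zh/a)
    # A summary line then reads e.g.: (en/aa, zh/a): 356, zh/ai: 210, ...
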
diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index b1bd6b97a..b0342d43b 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -235,6 +235,11 @@ def check_coverage(self): for idx, count in ph_idx_count_map.items() } + def display_phoneme(phoneme): + if isinstance(phoneme, tuple): + return f'({", ".join(phoneme)})' + return phoneme + print('===== Phoneme Distribution Summary =====') keys = sorted(ph_count_map.keys(), key=lambda v: v[0] if isinstance(v, tuple) else v) for i, key in enumerate(keys): @@ -244,19 +249,16 @@ def check_coverage(self): end = ',\n' else: end = ', ' - if isinstance(key, tuple): - key_disp = '(' + ', '.join(key) + ')' - else: - key_disp = key + key_disp = display_phoneme(key) print(f'{key_disp}: {ph_count_map[key]}', end=end) # Draw graph. - xs = [str(k) for k in keys] + xs = [display_phoneme(k) for k in keys] ys = [ph_count_map[k] for k in keys] plt = distribution_to_figure( title='Phoneme Distribution Summary', x_label='Phoneme', y_label='Number of occurrences', - items=xs, values=ys + items=xs, values=ys, rotate=len(self.dictionaries) > 1 ) filename = self.binary_data_dir / 'phoneme_distribution.jpg' plt.savefig(fname=filename, diff --git a/utils/plot.py b/utils/plot.py index b76e0726c..48cb9c430 100644 --- a/utils/plot.py +++ b/utils/plot.py @@ -106,7 +106,7 @@ def curve_to_figure(curve_gt, curve_pred=None, curve_base=None, grid=None, title return fig -def distribution_to_figure(title, x_label, y_label, items: list, values: list, zoom=0.8): +def distribution_to_figure(title, x_label, y_label, items: list, values: list, zoom=0.8, rotate=False): fig = plt.figure(figsize=(int(len(items) * zoom), 10)) plt.bar(x=items, height=values) plt.tick_params(labelsize=15) @@ -117,4 +117,6 @@ def distribution_to_figure(title, x_label, y_label, items: list, values: list, z plt.title(title, fontsize=30) plt.xlabel(x_label, fontsize=20) plt.ylabel(y_label, fontsize=20) + if rotate: + fig.autofmt_xdate(rotation=45) return fig From 62c093ed3ccc65cfd7e1422ddecbb178cf740ed4 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 19 Jul 2024 18:07:47 +0800 Subject: [PATCH 06/44] Add multi-dictionary inference --- basics/base_svs_infer.py | 7 ++++++- inference/ds_acoustic.py | 27 ++++++++++++++++++++++++--- inference/ds_variance.py | 24 ++++++++++++++++++++++-- scripts/infer.py | 18 ++++++++++++++++-- 4 files changed, 68 insertions(+), 8 deletions(-) diff --git a/basics/base_svs_infer.py b/basics/base_svs_infer.py index e040993a7..2b23d0112 100644 --- a/basics/base_svs_infer.py +++ b/basics/base_svs_infer.py @@ -29,6 +29,7 @@ def __init__(self, device=None): self.device = device self.timestep = hparams['hop_size'] / hparams['audio_sample_rate'] self.spk_map = {} + self.lang_map = {} self.model: torch.nn.Module = None def build_model(self, ckpt_steps=None) -> torch.nn.Module: @@ -50,7 +51,11 @@ def load_speaker_mix(self, param_src: dict, summary_dst: dict, spk_mix_map = param_src.get(param_key) # { spk_name: value } or { spk_name: "value value value ..." } dynamic = False if spk_mix_map is None: - # Get the first speaker + assert len(self.spk_map) == 1, ( + "This is a multi-speaker model. " + "Please specify a speaker or speaker mix by --spk option." 
+ ) + # Get the only speaker for name in self.spk_map.keys(): spk_mix_map = {name: 1.0} break diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 7b93d38cb..d8dbcc13d 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -41,6 +41,10 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N self.spk_map = json.load(f) assert isinstance(self.spk_map, dict) and len(self.spk_map) > 0, 'Invalid or empty speaker map!' assert len(self.spk_map) == len(set(self.spk_map.values())), 'Duplicate speaker id in speaker map!' + lang_map_fn = pathlib.Path(hparams['work_dir']) / 'lang_map.json' + if lang_map_fn.exists(): + with open(lang_map_fn, 'r', encoding='utf8') as f: + self.lang_map = json.load(f) self.model = self.build_model(ckpt_steps=ckpt_steps) self.lr = LengthRegulator().to(self.device) if load_vocoder: @@ -71,8 +75,23 @@ def preprocess_input(self, param, idx=0): """ batch = {} summary = OrderedDict() + + lang = param.get('lang') + if lang is None: + assert len(self.lang_map) <= 1, ( + "This is a multilingual model. " + "Please specify a language by --lang option." + ) + else: + assert lang in self.lang_map, f'Unrecognized language name: \'{lang}\'.' + if hparams.get('use_lang_id', False): + languages = torch.LongTensor([ + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + for p in param['ph_seq'].split() + ]).to(self.device) # => [B, T_txt] + batch['languages'] = languages txt_tokens = torch.LongTensor([ - self.phoneme_dictionary.encode(param['ph_seq']) + self.phoneme_dictionary.encode(param['ph_seq'], lang=lang) ]).to(self.device) # => [B, T_txt] batch['tokens'] = txt_tokens @@ -175,9 +194,11 @@ def forward_model(self, sample): else: spk_mix_embed = None mel_pred: ShallowDiffusionOutput = self.model( - txt_tokens, mel2ph=sample['mel2ph'], f0=sample['f0'], **variances, + txt_tokens, languages=sample.get('languages'), + mel2ph=sample['mel2ph'], f0=sample['f0'], **variances, key_shift=sample.get('key_shift'), speed=sample.get('speed'), - spk_mix_embed=spk_mix_embed, infer=True + spk_mix_embed=spk_mix_embed, + infer=True ) return mel_pred.diff_out diff --git a/inference/ds_variance.py b/inference/ds_variance.py index 86abd1abd..f5a401c3e 100644 --- a/inference/ds_variance.py +++ b/inference/ds_variance.py @@ -38,6 +38,10 @@ def __init__( self.spk_map = json.load(f) assert isinstance(self.spk_map, dict) and len(self.spk_map) > 0, 'Invalid or empty speaker map!' assert len(self.spk_map) == len(set(self.spk_map.values())), 'Duplicate speaker id in speaker map!' + lang_map_fn = pathlib.Path(hparams['work_dir']) / 'lang_map.json' + if lang_map_fn.exists(): + with open(lang_map_fn, 'r', encoding='utf8') as f: + self.lang_map = json.load(f) self.model: DiffSingerVariance = self.build_model(ckpt_steps=ckpt_steps) self.lr = LengthRegulator() self.rr = RhythmRegulator() @@ -95,8 +99,23 @@ def preprocess_input( """ batch = {} summary = OrderedDict() + + lang = param.get('lang') + if lang is None: + assert len(self.lang_map) <= 1, ( + "This is a multilingual model. " + "Please specify a language by --lang option." + ) + else: + assert lang in self.lang_map, f'Unrecognized language name: \'{lang}\'.' 
+ if hparams.get('use_lang_id', False): + languages = torch.LongTensor([ + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + for p in param['ph_seq'].split() + ]).to(self.device) # [B=1, T_ph] + batch['languages'] = languages txt_tokens = torch.LongTensor([ - self.phoneme_dictionary.encode(param['ph_seq'].split()) + self.phoneme_dictionary.encode(param['ph_seq'], lang=lang) ]).to(self.device) # [B=1, T_ph] T_ph = txt_tokens.shape[1] batch['tokens'] = txt_tokens @@ -305,7 +324,8 @@ def forward_model(self, sample): ph_spk_mix_embed = spk_mix_embed = None dur_pred, pitch_pred, variance_pred = self.model( - txt_tokens, midi=midi, ph2word=ph2word, word_dur=word_dur, ph_dur=ph_dur, mel2ph=mel2ph, + txt_tokens, languages=sample.get('languages'), + midi=midi, ph2word=ph2word, word_dur=word_dur, ph_dur=ph_dur, mel2ph=mel2ph, note_midi=note_midi, note_rest=note_rest, note_dur=note_dur, note_glide=note_glide, mel2note=mel2note, base_pitch=base_pitch, pitch=pitch, pitch_expr=expr, ph_spk_mix_embed=ph_spk_mix_embed, spk_mix_embed=spk_mix_embed, diff --git a/scripts/infer.py b/scripts/infer.py index 83a5cabb7..ae08f5d12 100644 --- a/scripts/infer.py +++ b/scripts/infer.py @@ -61,6 +61,11 @@ def main(): required=False, help='Speaker name or mixture of speakers' ) +@click.option( + '--lang', type=click.STRING, + required=False, + help='Default language name' +) @click.option( '--out', type=click.Path( file_okay=False, dir_okay=True, path_type=pathlib.Path @@ -112,6 +117,7 @@ def acoustic( exp: str, ckpt: int, spk: str, + lang: str, out: pathlib.Path, title: str, num: int, @@ -195,9 +201,10 @@ def acoustic( for param in params: if gender is not None and hparams['use_key_shift_embed']: param['gender'] = gender - if spk_mix is not None: param['spk_mix'] = spk_mix + if lang is not None: + param['lang'] = lang from inference.ds_acoustic import DiffSingerAcousticInfer infer_ins = DiffSingerAcousticInfer(load_vocoder=not mel, ckpt_steps=ckpt) @@ -241,6 +248,11 @@ def acoustic( required=False, help='Speaker name or mixture of speakers' ) +@click.option( + '--lang', type=click.STRING, + required=False, + help='Default language name' +) @click.option( '--out', type=click.Path( file_okay=False, dir_okay=True, path_type=pathlib.Path @@ -282,6 +294,7 @@ def variance( exp: str, ckpt: int, spk: str, + lang: str, predict: Tuple[str], out: pathlib.Path, title: str, @@ -344,11 +357,12 @@ def variance( for param in params: if expr is not None: param['expr'] = expr - if spk_mix is not None: param['ph_spk_mix_backup'] = param.get('ph_spk_mix') param['spk_mix_backup'] = param.get('spk_mix') param['ph_spk_mix'] = param['spk_mix'] = spk_mix + if lang is not None: + param['lang'] = lang from inference.ds_variance import DiffSingerVarianceInfer infer_ins = DiffSingerVarianceInfer(ckpt_steps=ckpt, predictions=set(predict)) From 96b9a602cdbb85bdd4756efd29bb1be9d062696a Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 19 Jul 2024 20:31:32 +0800 Subject: [PATCH 07/44] Save original phoneme texts for duration plots --- basics/base_binarizer.py | 5 ++++- preprocessing/acoustic_binarizer.py | 6 ++++-- preprocessing/variance_binarizer.py | 9 ++++++--- training/variance_task.py | 3 ++- utils/phoneme_utils.py | 18 ------------------ 5 files changed, 16 insertions(+), 25 deletions(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index b0342d43b..92e583d61 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -281,7 +281,7 @@ def process_dataset(self, prefix, num_workers=0, 
apply_augmentation=False): builder = IndexedDatasetBuilder(self.binary_data_dir, prefix=prefix, allowed_attr=self.data_attrs) total_sec = {k: 0.0 for k in self.spk_map} total_raw_sec = {k: 0.0 for k in self.spk_map} - extra_info = {'names': {}, 'spk_ids': {}, 'spk_names': {}, 'lengths': {}} + extra_info = {'names': {}, 'ph_texts': {}, 'spk_ids': {}, 'spk_names': {}, 'lengths': {}} max_no = -1 for item_name, meta_data in self.meta_data_iterator(prefix): @@ -301,6 +301,7 @@ def postprocess(_item): extra_info[k] = {} extra_info[k][item_no] = v.shape[0] extra_info['names'][item_no] = _item['name'].split(':', 1)[-1] + extra_info['ph_texts'][item_no] = _item['ph_text'] extra_info['spk_ids'][item_no] = _item['spk_id'] extra_info['spk_names'][item_no] = _item['spk_name'] extra_info['lengths'][item_no] = _item['length'] @@ -317,6 +318,7 @@ def postprocess(_item): extra_info[k] = {} extra_info[k][aug_item_no] = v.shape[0] extra_info['names'][aug_item_no] = aug_item['name'].split(':', 1)[-1] + extra_info['ph_texts'][aug_item_no] = aug_item['ph_text'] extra_info['spk_ids'][aug_item_no] = aug_item['spk_id'] extra_info['spk_names'][aug_item_no] = aug_item['spk_name'] extra_info['lengths'][aug_item_no] = aug_item['length'] @@ -345,6 +347,7 @@ def postprocess(_item): builder.finalize() if prefix == "train": extra_info.pop("names") + extra_info.pop('ph_texts') extra_info.pop("spk_names") with open(self.binary_data_dir / f"{prefix}.meta", "wb") as f: # noinspection PyTypeChecker diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 18a6cd478..efb97a1ec 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -75,14 +75,15 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): item_name = utterance_label['name'] temp_dict = { 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), + 'spk_id': self.spk_map[spk], + 'spk_name': spk, 'lang_seq': [ self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] for p in utterance_label['ph_seq'].split() ], 'ph_seq': self.phoneme_dictionary.encode(utterance_label['ph_seq'], lang=lang), 'ph_dur': [float(x) for x in utterance_label['ph_dur'].split()], - 'spk_id': self.spk_map[spk], - 'spk_name': spk, + 'ph_text': utterance_label['ph_seq'], } assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ f'Lengths of ph_seq and ph_dur mismatch in \'{item_name}\'.' 
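
With this patch, the raw `ph_seq` string travels through binarization as `ph_text` and lands in the dataset metadata, so validation plots can label durations without decoding IDs back through merged groups (it is popped for the train set, where no plots are drawn). An illustrative sketch of the resulting metadata, with all values invented:

    extra_info = {
        'names':     {0: 'item_0001'},
        'ph_texts':  {0: 'SP sh ir en/aa SP'},  # raw phoneme text, valid set only
        'spk_ids':   {0: 0},
        'spk_names': {0: 'singer_a'},
        'lengths':   {0: 1376},
    }
    # plot_dur() can then label its bars with:
    ph_labels = extra_info['ph_texts'][0].split()
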
@@ -114,6 +115,7 @@ def process_item(self, item_name, meta_data, binarization_args): 'languages': np.array(meta_data['lang_seq'], dtype=np.int64), 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64), 'ph_dur': np.array(meta_data['ph_dur']).astype(np.float32), + 'ph_text': meta_data['ph_text'], } # get ground truth dur diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 00027954e..3feda896d 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -139,8 +139,10 @@ def require(attr): 'lang_seq': [ self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] for p in utterance_label['ph_seq'].split() - ],'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), - 'ph_dur': [float(x) for x in require('ph_dur').split()] + ], + 'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), + 'ph_dur': [float(x) for x in require('ph_dur').split()], + 'ph_text': require('ph_seq'), } assert len(temp_dict['ph_seq']) == len(temp_dict['ph_dur']), \ @@ -256,7 +258,8 @@ def process_item(self, item_name, meta_data, binarization_args): 'seconds': seconds, 'length': length, 'languages': np.array(meta_data['lang_seq'], dtype=np.int64), - 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64) + 'tokens': np.array(meta_data['ph_seq'], dtype=np.int64), + 'ph_text': meta_data['ph_text'], } ph_dur_sec = torch.FloatTensor(meta_data['ph_dur']).to(self.device) diff --git a/training/variance_task.py b/training/variance_task.py index 0a33301e6..e6e885944 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -295,7 +295,8 @@ def sample_get(key, idx, abs_idx): def plot_dur(self, data_idx, gt_dur, pred_dur, txt=None): gt_dur = gt_dur[0].cpu().numpy() pred_dur = pred_dur[0].cpu().numpy() - txt = self.phoneme_dictionary.decode(txt[0].cpu().numpy()).split() + if txt is None: + txt = self.valid_dataset.metadata['ph_texts'][data_idx].split() title_text = f"{self.valid_dataset.metadata['spk_names'][data_idx]} - {self.valid_dataset.metadata['names'][data_idx]}" self.logger.all_rank_experiment.add_figure(f'dur_{data_idx}', dur_to_figure( gt_dur, pred_dur, txt, title_text diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 1547bc9eb..1e8275330 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -170,21 +170,3 @@ def load_phoneme_dictionary() -> PhonemeDictionary: dictionaries=dicts, merged_groups=hparams.get('merged_phoneme_groups') ) - - -if __name__ == '__main__': - d = PhonemeDictionary( - dictionaries={ - 'zh': 'dictionaries/opencpop-extension.txt', - # 'en': 'dictionaries/opencpop-extension.txt', - }, - merged_groups=[ - ['zh/a', 'zh/b', 'c'], - ['a', 'd', 'e'], - ['e', 'f'] - ] - ) - ph_ids = d.encode('sh ir zh e j v y i b a SP', lang='en') - ph_seq = d.decode(ph_ids) - print(ph_ids) - print(ph_seq) From dbe3840c39c7285f47cb64a324c1d32b6d018153 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Fri, 19 Jul 2024 20:53:03 +0800 Subject: [PATCH 08/44] Fix duration plots displaying bug --- training/variance_task.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/training/variance_task.py b/training/variance_task.py index e6e885944..6a27a3b6d 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -262,7 +262,10 @@ def sample_get(key, idx, abs_idx): self.valid_metrics['ph_dur_acc'].update( pdur_pred=pred_dur, pdur_target=gt_dur, ph2word=ph2word, mask=mask ) - self.plot_dur(data_idx, gt_dur, pred_dur, tokens) + self.plot_dur( + 
data_idx, gt_dur, pred_dur, + txt=self.valid_dataset.metadata['ph_texts'][data_idx].split() + ) if pitch_preds is not None: pitch_len = self.valid_dataset.metadata['pitch'][data_idx] pred_pitch = sample_get('base_pitch', i, data_idx) + pitch_preds[i][:pitch_len].unsqueeze(0) @@ -295,8 +298,6 @@ def sample_get(key, idx, abs_idx): def plot_dur(self, data_idx, gt_dur, pred_dur, txt=None): gt_dur = gt_dur[0].cpu().numpy() pred_dur = pred_dur[0].cpu().numpy() - if txt is None: - txt = self.valid_dataset.metadata['ph_texts'][data_idx].split() title_text = f"{self.valid_dataset.metadata['spk_names'][data_idx]} - {self.valid_dataset.metadata['names'][data_idx]}" self.logger.all_rank_experiment.add_figure(f'dur_{data_idx}', dur_to_figure( gt_dur, pred_dur, txt, title_text From b5f20a587e5276064e8fef181bfdeb11866670de Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 20 Jul 2024 03:09:02 +0800 Subject: [PATCH 09/44] Explicit `languages` argument passing --- modules/toplevel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/toplevel.py b/modules/toplevel.py index 1976d09a9..777f42291 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -88,11 +88,12 @@ def __init__(self, vocab_size, out_dims): def forward( self, txt_tokens, mel2ph, f0, key_shift=None, speed=None, - spk_embed_id=None, gt_mel=None, infer=True, **kwargs + spk_embed_id=None, languages=None, gt_mel=None, infer=True, **kwargs ) -> ShallowDiffusionOutput: condition = self.fs2( txt_tokens, mel2ph, f0, key_shift=key_shift, speed=speed, - spk_embed_id=spk_embed_id, **kwargs + spk_embed_id=spk_embed_id, languages=languages, + **kwargs ) if infer: if self.use_shallow_diffusion: From a7dbb9340a0814680c5930f6f56f57742941590b Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 20 Jul 2024 20:57:37 +0800 Subject: [PATCH 10/44] Add language embed (inject to txt_embed) for variance models --- modules/fastspeech/variance_encoder.py | 14 +++++++++++++- training/variance_task.py | 7 ++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 2031d89ce..aca443352 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -16,8 +16,11 @@ def __init__(self, vocab_size): super().__init__() self.predict_dur = hparams['predict_dur'] self.linguistic_mode = 'word' if hparams['predict_dur'] else 'phoneme' + self.use_lang_id = hparams['use_lang_id'] self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) + if self.use_lang_id: + self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) if self.predict_dur: self.onset_embed = Embedding(2, hparams['hidden_size']) @@ -45,7 +48,12 @@ def __init__(self, vocab_size): dur_loss_type=dur_hparams['loss_type'] ) - def forward(self, txt_tokens, midi, ph2word, ph_dur=None, word_dur=None, spk_embed=None, infer=True): + def forward( + self, txt_tokens, midi, ph2word, + ph_dur=None, word_dur=None, + spk_embed=None, languages=None, + infer=True + ): """ :param txt_tokens: (train, infer) [B, T_ph] :param midi: (train, infer) [B, T_ph] @@ -53,10 +61,14 @@ def forward(self, txt_tokens, midi, ph2word, ph_dur=None, word_dur=None, spk_emb :param ph_dur: (train, [infer]) [B, T_ph] :param word_dur: (infer) [B, T_w] :param spk_embed: (train) [B, T_ph, H] + :param languages (train, infer) [B, T_ph] :param infer: whether inference :return: encoder_out, ph_dur_pred """ txt_embed = self.txt_embed(txt_tokens) + if 
self.use_lang_id: + lang_embed = self.lang_embed(languages) + txt_embed += lang_embed if self.linguistic_mode == 'word': b = txt_tokens.shape[0] onset = torch.diff(ph2word, dim=1, prepend=ph2word.new_zeros(b, 1)) > 0 diff --git a/training/variance_task.py b/training/variance_task.py index 6a27a3b6d..2fdc599f6 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -41,6 +41,8 @@ def collater(self, samples): if hparams['use_spk_id']: batch['spk_ids'] = torch.LongTensor([s['spk_id'] for s in samples]) + if hparams['use_lang_id']: + batch['languages'] = utils.collate_nd([s['languages'] for s in samples], 0) if hparams['predict_dur']: batch['ph2word'] = utils.collate_nd([s['ph2word'] for s in samples], 0) batch['midi'] = utils.collate_nd([s['midi'] for s in samples], 0) @@ -85,6 +87,7 @@ def __init__(self): self.diffusion_type = hparams['diffusion_type'] self.use_spk_id = hparams['use_spk_id'] + self.use_lang_id = hparams['use_lang_id'] self.predict_dur = hparams['predict_dur'] if self.predict_dur: @@ -154,6 +157,7 @@ def build_losses_and_metrics(self): def run_model(self, sample, infer=False): spk_ids = sample['spk_ids'] if self.use_spk_id else None # [B,] + languages = sample['languages'] if self.use_lang_id else None # [B,] txt_tokens = sample['tokens'] # [B, T_ph] ph_dur = sample['ph_dur'] # [B, T_ph] ph2word = sample.get('ph2word') # [B, T_ph] @@ -188,7 +192,8 @@ def run_model(self, sample, infer=False): } output = self.model( - txt_tokens, midi=midi, ph2word=ph2word, + txt_tokens, languages=languages, + midi=midi, ph2word=ph2word, ph_dur=ph_dur, mel2ph=mel2ph, note_midi=note_midi, note_rest=note_rest, note_dur=note_dur, note_glide=note_glide, mel2note=mel2note, From 2a175615b0ae5c518939e646a1e65b46ec98b76e Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 20 Jul 2024 21:02:39 +0800 Subject: [PATCH 11/44] Fix argument passing --- modules/toplevel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/toplevel.py b/modules/toplevel.py index 777f42291..6db01eaff 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -212,7 +212,8 @@ def forward( note_midi=None, note_rest=None, note_dur=None, note_glide=None, mel2note=None, base_pitch=None, pitch=None, pitch_expr=None, pitch_retake=None, variance_retake: Dict[str, Tensor] = None, - spk_id=None, infer=True, **kwargs + spk_id=None, languages=None, + infer=True, **kwargs ): if self.use_spk_id: ph_spk_mix_embed = kwargs.get('ph_spk_mix_embed') @@ -228,7 +229,8 @@ def forward( encoder_out, dur_pred_out = self.fs2( txt_tokens, midi=midi, ph2word=ph2word, ph_dur=ph_dur, word_dur=word_dur, - spk_embed=ph_spk_embed, infer=infer + spk_embed=ph_spk_embed, languages=languages, + infer=infer ) if not self.predict_pitch and not self.predict_variances: From 8b215dbe89a2b56fe8c2f36dfe779a29d73d67bd Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 21 Jul 2024 02:24:53 +0800 Subject: [PATCH 12/44] Add log for lang_map.json copy --- basics/base_task.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/basics/base_task.py b/basics/base_task.py index b53133d39..065f8273a 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -447,10 +447,11 @@ def train_payload_copy(): spk_map_dst = work_dir / 'spk_map.json' spk_map_src = binary_dir / 'spk_map.json' shutil.copy(spk_map_src, spk_map_dst) + print(f'| Copied spk map to {spk_map_dst}.') lang_map_dst = work_dir / 'lang_map.json' lang_map_src = binary_dir / 'lang_map.json' shutil.copy(lang_map_src, lang_map_dst) - print(f'| Copied spk 
map to {spk_map_dst}.') + print(f'| Copied lang map to {lang_map_dst}.') for lang in hparams['dictionaries'].keys(): dict_dst = work_dir / f'dictionary-{lang}.txt' dict_src = binary_dir / f'dictionary-{lang}.txt' From 6f80697e34d803874fd36f51909ea344d8263587 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 21 Jul 2024 03:22:20 +0800 Subject: [PATCH 13/44] Add language embedding scale --- modules/fastspeech/acoustic_encoder.py | 13 +++++++++---- modules/fastspeech/variance_encoder.py | 15 +++++++++------ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 2b53ace20..3395f0c0e 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -1,3 +1,5 @@ +import math + import torch import torch.nn as nn from torch.nn import functional as F @@ -58,6 +60,7 @@ def __init__(self, vocab_size): self.spk_embed = Embedding(hparams['num_spk'], hparams['hidden_size']) self.use_lang_id = hparams.get('use_lang_id', False) if self.use_lang_id: + self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): @@ -85,12 +88,14 @@ def forward( **kwargs ): txt_embed = self.txt_embed(txt_tokens) - if self.use_lang_id: - lang_embed = self.lang_embed(languages) - txt_embed += lang_embed dur = mel2ph_to_dur(mel2ph, txt_tokens.shape[1]).float() dur_embed = self.dur_embed(dur[:, :, None]) - encoder_out = self.encoder(txt_embed, dur_embed, txt_tokens == 0) + if self.use_lang_id: + lang_embed = self.lang_embed(languages) + extra_embed = dur_embed + lang_embed * self.lang_embed_scale + else: + extra_embed = dur_embed + encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) encoder_out = F.pad(encoder_out, [0, 0, 1, 0]) mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]]) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index aca443352..a5be5ec6f 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -1,3 +1,5 @@ +import math + import torch import torch.nn as nn from torch.nn import functional as F @@ -20,6 +22,7 @@ def __init__(self, vocab_size): self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) if self.use_lang_id: + self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) if self.predict_dur: @@ -66,9 +69,6 @@ def forward( :return: encoder_out, ph_dur_pred """ txt_embed = self.txt_embed(txt_tokens) - if self.use_lang_id: - lang_embed = self.lang_embed(languages) - txt_embed += lang_embed if self.linguistic_mode == 'word': b = txt_tokens.shape[0] onset = torch.diff(ph2word, dim=1, prepend=ph2word.new_zeros(b, 1)) > 0 @@ -80,11 +80,14 @@ def forward( )[:, 1:] # [B, T_ph] => [B, T_w] word_dur = torch.gather(F.pad(word_dur, [1, 0], value=0), 1, ph2word) # [B, T_w] => [B, T_ph] word_dur_embed = self.word_dur_embed(word_dur.float()[:, :, None]) - - encoder_out = self.encoder(txt_embed, onset_embed + word_dur_embed, txt_tokens == 0) + extra_embed = onset_embed + word_dur_embed else: ph_dur_embed = self.ph_dur_embed(ph_dur.float()[:, :, None]) - encoder_out = self.encoder(txt_embed, ph_dur_embed, txt_tokens == 0) + 
extra_embed = ph_dur_embed + if self.use_lang_id: + lang_embed = self.lang_embed(languages) + extra_embed += lang_embed * self.lang_embed_scale + encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) if self.predict_dur: midi_embed = self.midi_embed(midi) # => [B, T_ph, H] From 655e9ba9611861793297956e79ccbf329313d7f2 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Mon, 22 Jul 2024 02:14:20 +0800 Subject: [PATCH 14/44] Add language embedding type --- modules/fastspeech/acoustic_encoder.py | 6 +++++- modules/fastspeech/variance_encoder.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 3395f0c0e..3b82cfbce 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -62,6 +62,7 @@ def __init__(self, vocab_size): if self.use_lang_id: self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) + self.lang_embed_type = hparams.get('lang_embed_type', 'before') def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): if self.use_variance_embeds: @@ -90,12 +91,15 @@ def forward( txt_embed = self.txt_embed(txt_tokens) dur = mel2ph_to_dur(mel2ph, txt_tokens.shape[1]).float() dur_embed = self.dur_embed(dur[:, :, None]) - if self.use_lang_id: + if self.use_lang_id and self.lang_embed_type == 'before': lang_embed = self.lang_embed(languages) extra_embed = dur_embed + lang_embed * self.lang_embed_scale else: extra_embed = dur_embed encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) + if self.use_lang_id and self.lang_embed_type == 'after': + lang_embed = self.lang_embed(languages) + encoder_out = encoder_out + lang_embed * self.lang_embed_scale encoder_out = F.pad(encoder_out, [0, 0, 1, 0]) mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]]) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index a5be5ec6f..eccded276 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -24,6 +24,7 @@ def __init__(self, vocab_size): if self.use_lang_id: self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) + self.lang_embed_type = hparams.get('lang_embed_type', 'before') if self.predict_dur: self.onset_embed = Embedding(2, hparams['hidden_size']) @@ -84,10 +85,13 @@ def forward( else: ph_dur_embed = self.ph_dur_embed(ph_dur.float()[:, :, None]) extra_embed = ph_dur_embed - if self.use_lang_id: + if self.use_lang_id and self.lang_embed_type == 'before': lang_embed = self.lang_embed(languages) extra_embed += lang_embed * self.lang_embed_scale encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) + if self.use_lang_id and self.lang_embed_type == 'after': + lang_embed = self.lang_embed(languages) + encoder_out = encoder_out + lang_embed * self.lang_embed_scale if self.predict_dur: midi_embed = self.midi_embed(midi) # => [B, T_ph, H] From c6b96cf877b920eda98bb455fd6059ecc4af68f7 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 23 Jul 2024 00:39:53 +0800 Subject: [PATCH 15/44] Preprocessing: only apply lang embed on cross-lingual phonemes --- preprocessing/acoustic_binarizer.py | 6 +++++- preprocessing/variance_binarizer.py | 6 +++++- 
utils/phoneme_utils.py | 16 +++++++++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index efb97a1ec..99d0aaf68 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -78,7 +78,11 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): 'spk_id': self.spk_map[spk], 'spk_name': spk, 'lang_seq': [ - self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + ( + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + if self.phoneme_dictionary.is_cross_lingual(p) + else 0 + ) for p in utterance_label['ph_seq'].split() ], 'ph_seq': self.phoneme_dictionary.encode(utterance_label['ph_seq'], lang=lang), diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 3feda896d..30c175b2c 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -137,7 +137,11 @@ def require(attr): 'language_name': lang, 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), 'lang_seq': [ - self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + ( + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + if self.phoneme_dictionary.is_cross_lingual(p) + else 0 + ) for p in utterance_label['ph_seq'].split() ], 'ph_seq': self.phoneme_dictionary.encode(require('ph_seq'), lang=lang), diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 1e8275330..59a0924bc 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -73,8 +73,10 @@ def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[Li merged_phonemes_inverted_index[phoneme] = target_idx if other_idx is not None: merged_groups[other_idx] |= group + group.clear() phone_to_id = {} id_to_phone = [] + cross_lingual_phonemes = set() idx = 1 for phoneme in sorted(all_phonemes): if phoneme in merged_phonemes_inverted_index: @@ -84,14 +86,23 @@ def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[Li has_assigned = False phone_to_id[alias] = idx if not has_assigned: - id_to_phone.append(tuple(sorted(merged_groups[merged_phonemes_inverted_index[phoneme]]))) + merged_group = sorted(merged_groups[merged_phonemes_inverted_index[phoneme]]) + merged_from_langs = { + alias.split('/', maxsplit=1)[0] + for alias in merged_group + if '/' in alias + } + id_to_phone.append(tuple(merged_group)) idx += 1 + if len(merged_from_langs) > 1: + cross_lingual_phonemes.update(ph for ph in merged_group if '/' in ph) else: phone_to_id[phoneme] = idx id_to_phone.append(phoneme) idx += 1 self._phone_to_id: Dict[str, int] = phone_to_id self._id_to_phone: List[Union[str, tuple]] = id_to_phone + self._cross_lingual_phonemes = cross_lingual_phonemes @property def vocab_size(self): @@ -100,6 +111,9 @@ def vocab_size(self): def __len__(self): return self.vocab_size + def is_cross_lingual(self, phone): + return phone in self._cross_lingual_phonemes + def encode_one(self, phone, lang=None): if lang is None or not self._multi_langs or phone in self._phone_to_id: return self._phone_to_id[phone] From 8377728bd413452c62f3ab79fb6cf2050f9bea39 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 24 Jul 2024 23:30:44 +0800 Subject: [PATCH 16/44] Inference: only apply lang embed on cross-lingual phonemes --- inference/ds_acoustic.py | 6 +++++- inference/ds_variance.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/inference/ds_acoustic.py 
b/inference/ds_acoustic.py index d8dbcc13d..8b139f62f 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -86,7 +86,11 @@ def preprocess_input(self, param, idx=0): assert lang in self.lang_map, f'Unrecognized language name: \'{lang}\'.' if hparams.get('use_lang_id', False): languages = torch.LongTensor([ - self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + ( + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + if self.phoneme_dictionary.is_cross_lingual(p) + else 0 + ) for p in param['ph_seq'].split() ]).to(self.device) # => [B, T_txt] batch['languages'] = languages diff --git a/inference/ds_variance.py b/inference/ds_variance.py index f5a401c3e..aa74dcabd 100644 --- a/inference/ds_variance.py +++ b/inference/ds_variance.py @@ -110,7 +110,11 @@ def preprocess_input( assert lang in self.lang_map, f'Unrecognized language name: \'{lang}\'.' if hparams.get('use_lang_id', False): languages = torch.LongTensor([ - self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + ( + self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]] + if self.phoneme_dictionary.is_cross_lingual(p) + else 0 + ) for p in param['ph_seq'].split() ]).to(self.device) # [B=1, T_ph] batch['languages'] = languages From 3d0a9ba3eaa4e03efe371ceb349f8d3bd56e5c11 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Wed, 24 Jul 2024 23:32:06 +0800 Subject: [PATCH 17/44] Revert "Add language embedding type" This reverts commit 655e9ba9611861793297956e79ccbf329313d7f2. --- modules/fastspeech/acoustic_encoder.py | 6 +----- modules/fastspeech/variance_encoder.py | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 3b82cfbce..3395f0c0e 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -62,7 +62,6 @@ def __init__(self, vocab_size): if self.use_lang_id: self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) - self.lang_embed_type = hparams.get('lang_embed_type', 'before') def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): if self.use_variance_embeds: @@ -91,15 +90,12 @@ def forward( txt_embed = self.txt_embed(txt_tokens) dur = mel2ph_to_dur(mel2ph, txt_tokens.shape[1]).float() dur_embed = self.dur_embed(dur[:, :, None]) - if self.use_lang_id and self.lang_embed_type == 'before': + if self.use_lang_id: lang_embed = self.lang_embed(languages) extra_embed = dur_embed + lang_embed * self.lang_embed_scale else: extra_embed = dur_embed encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) - if self.use_lang_id and self.lang_embed_type == 'after': - lang_embed = self.lang_embed(languages) - encoder_out = encoder_out + lang_embed * self.lang_embed_scale encoder_out = F.pad(encoder_out, [0, 0, 1, 0]) mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]]) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index eccded276..a5be5ec6f 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -24,7 +24,6 @@ def __init__(self, vocab_size): if self.use_lang_id: self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) - 
self.lang_embed_type = hparams.get('lang_embed_type', 'before') if self.predict_dur: self.onset_embed = Embedding(2, hparams['hidden_size']) @@ -85,13 +84,10 @@ def forward( else: ph_dur_embed = self.ph_dur_embed(ph_dur.float()[:, :, None]) extra_embed = ph_dur_embed - if self.use_lang_id and self.lang_embed_type == 'before': + if self.use_lang_id: lang_embed = self.lang_embed(languages) extra_embed += lang_embed * self.lang_embed_scale encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) - if self.use_lang_id and self.lang_embed_type == 'after': - lang_embed = self.lang_embed(languages) - encoder_out = encoder_out + lang_embed * self.lang_embed_scale if self.predict_dur: midi_embed = self.midi_embed(midi) # => [B, T_ph, H] From 932c4f425d35cc6f4ea4acbf916132b33cf3be25 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 27 Jul 2024 17:31:51 +0800 Subject: [PATCH 18/44] Revert lang_embed_scale --- modules/fastspeech/acoustic_encoder.py | 11 ++++------- modules/fastspeech/variance_encoder.py | 5 +---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 3395f0c0e..6c4e54f8b 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -1,5 +1,3 @@ -import math - import torch import torch.nn as nn from torch.nn import functional as F @@ -17,6 +15,9 @@ class FastSpeech2Acoustic(nn.Module): def __init__(self, vocab_size): super().__init__() self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) + self.use_lang_id = hparams.get('use_lang_id', False) + if self.use_lang_id: + self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) self.dur_embed = Linear(1, hparams['hidden_size']) self.encoder = FastSpeech2Encoder( hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], @@ -58,10 +59,6 @@ def __init__(self, vocab_size): self.use_spk_id = hparams['use_spk_id'] if self.use_spk_id: self.spk_embed = Embedding(hparams['num_spk'], hparams['hidden_size']) - self.use_lang_id = hparams.get('use_lang_id', False) - if self.use_lang_id: - self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) - self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) def forward_variance_embedding(self, condition, key_shift=None, speed=None, **variances): if self.use_variance_embeds: @@ -92,7 +89,7 @@ def forward( dur_embed = self.dur_embed(dur[:, :, None]) if self.use_lang_id: lang_embed = self.lang_embed(languages) - extra_embed = dur_embed + lang_embed * self.lang_embed_scale + extra_embed = dur_embed + lang_embed else: extra_embed = dur_embed encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index a5be5ec6f..a02e6e010 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -1,5 +1,3 @@ -import math - import torch import torch.nn as nn from torch.nn import functional as F @@ -22,7 +20,6 @@ def __init__(self, vocab_size): self.txt_embed = Embedding(vocab_size, hparams['hidden_size'], PAD_INDEX) if self.use_lang_id: - self.lang_embed_scale = hparams.get('lang_embed_scale', math.sqrt(hparams['hidden_size'])) self.lang_embed = Embedding(hparams['num_lang'] + 1, hparams['hidden_size'], padding_idx=0) if self.predict_dur: @@ -86,7 +83,7 @@ def forward( extra_embed = ph_dur_embed if 
self.use_lang_id: lang_embed = self.lang_embed(languages) - extra_embed += lang_embed * self.lang_embed_scale + extra_embed += lang_embed encoder_out = self.encoder(txt_embed, extra_embed, txt_tokens == 0) if self.predict_dur: From a0ec7e3a2ee293cdec7b61fcfc7af81ef4eeeed7 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 27 Jul 2024 23:18:46 +0800 Subject: [PATCH 19/44] Adapt ONNX exporters for multi-language models --- basics/base_exporter.py | 12 +++++ deployment/exporters/acoustic_exporter.py | 32 +++++++++++-- deployment/exporters/variance_exporter.py | 47 +++++++++++++----- deployment/modules/fastspeech2.py | 58 +++++++++++++++++++---- deployment/modules/toplevel.py | 24 ++++++---- utils/onnx_helper.py | 46 ++++++++++++++++-- utils/phoneme_utils.py | 6 ++- 7 files changed, 185 insertions(+), 40 deletions(-) diff --git a/basics/base_exporter.py b/basics/base_exporter.py index e2e65f534..77e5805a8 100644 --- a/basics/base_exporter.py +++ b/basics/base_exporter.py @@ -33,6 +33,18 @@ def build_spk_map(self) -> dict: else: return {} + # noinspection PyMethodMayBeStatic + def build_lang_map(self) -> dict: + lang_map_fn = pathlib.Path(hparams['work_dir']) / 'lang_map.json' + if lang_map_fn.exists(): + with open(lang_map_fn, 'r', encoding='utf8') as f: + lang_map = json.load(f) + assert isinstance(lang_map, dict) and len(lang_map) > 0, 'Invalid or empty language map!' + assert len(lang_map) == len(set(lang_map.values())), 'Duplicate language id in language map!' + return lang_map + else: + return {} + def build_model(self) -> nn.Module: """ Creates an instance of nn.Module and load its state dict on the target device. diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 1f56a9ce4..9160af6e3 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import List, Union, Tuple, Dict @@ -30,6 +31,7 @@ def __init__( self.model_name: str = hparams['exp_name'] self.ckpt_steps: int = ckpt_steps self.spk_map: dict = self.build_spk_map() + self.lang_map: dict = self.build_lang_map() self.phoneme_dictionary = load_phoneme_dictionary() self.model = self.build_model() self.fs2_aux_cache_path = self.cache_dir / ( @@ -79,7 +81,11 @@ def __init__( def build_model(self) -> DiffSingerAcousticONNX: model = DiffSingerAcousticONNX( vocab_size=len(self.phoneme_dictionary), - out_dims=hparams['audio_num_mel_bins'] + out_dims=hparams['audio_num_mel_bins'], + cross_lingual_token_idx=sorted({ + self.phoneme_dictionary.encode_one(p) + for p in self.phoneme_dictionary.cross_lingual_phonemes + }) ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=self.ckpt_steps, prefix_in_ckpt='model', strict=True, device=self.device) @@ -110,14 +116,15 @@ def export_attachments(self, path: Path): self._perform_spk_mix(spk[1]) ) self.export_dictionaries(path) - self._export_phonemes(path / f'{self.model_name}.phonemes.txt') + self._export_phonemes(path) model_name = self.model_name if self.freeze_spk is not None: model_name += '.' 
+ self.freeze_spk[0] dsconfig = { # basic configs - 'phonemes': f'{self.model_name}.phonemes.txt', + 'phonemes': f'{self.model_name}.phonemes.json', + 'use_lang_id': hparams.get('use_lang_id', False), 'acoustic': f'{model_name}.onnx', 'hidden_size': hparams['hidden_size'], 'vocoder': 'nsf_hifigan_44.1k_hop512_128bin_2024.02', @@ -209,6 +216,12 @@ def _torch_export_model(self): dynamix_axes['spk_embed'] = { 1: 'n_frames' } + if hparams.get('use_lang_id'): + kwargs['languages'] = torch.zeros_like(tokens) + input_names.append('languages') + dynamix_axes['languages'] = { + 1: 'n_tokens' + } dynamix_axes['condition'] = { 1: 'n_frames' } @@ -332,6 +345,10 @@ def _optimize_fs2_aux_graph(self, fs2: onnx.ModelProto) -> onnx.ModelProto: print(f'Running ONNX Simplifier on {self.fs2_aux_class_name}...') fs2, check = onnxsim.simplify(fs2, include_subgraph=True) assert check, 'Simplified ONNX model could not be validated' + onnx_helper.model_reorder_io_list( + fs2, 'input', + target_name='languages', insert_after_name='tokens' + ) print(f'| optimize graph: {self.fs2_aux_class_name}') return fs2 @@ -394,5 +411,10 @@ def _export_spk_embed(self, path: Path, spk_embed: torch.Tensor): print(f'| export spk embed => {path}') def _export_phonemes(self, path: Path): - self.phoneme_dictionary.dump(path) - print(f'| export phonemes => {path}') + ph_path = path / f'{self.model_name}.phonemes.json' + self.phoneme_dictionary.dump(ph_path) + print(f'| export phonemes => {ph_path}') + lang_path = path / 'languages.json' + with open(lang_path, 'w', encoding='utf8') as f: + json.dump(self.lang_map, f, ensure_ascii=False, indent=2) + print(f'| export languages => {lang_path}') diff --git a/deployment/exporters/variance_exporter.py b/deployment/exporters/variance_exporter.py index 4e594c407..76d061834 100644 --- a/deployment/exporters/variance_exporter.py +++ b/deployment/exporters/variance_exporter.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import Union, List, Tuple, Dict @@ -30,6 +31,7 @@ def __init__( self.model_name: str = hparams['exp_name'] self.ckpt_steps: int = ckpt_steps self.spk_map: dict = self.build_spk_map() + self.lang_map: dict = self.build_lang_map() self.phoneme_dictionary = load_phoneme_dictionary() self.model = self.build_model() self.linguistic_encoder_cache_path = self.cache_dir / 'linguistic.onnx' @@ -81,7 +83,11 @@ def __init__( def build_model(self) -> DiffSingerVarianceONNX: model = DiffSingerVarianceONNX( - vocab_size=len(self.phoneme_dictionary) + vocab_size=len(self.phoneme_dictionary), + cross_lingual_token_idx=sorted({ + self.phoneme_dictionary.encode_one(p) + for p in self.phoneme_dictionary.cross_lingual_phonemes + }) ).eval().to(self.device) load_ckpt(model, hparams['work_dir'], ckpt_steps=self.ckpt_steps, prefix_in_ckpt='model', strict=True, device=self.device) @@ -141,14 +147,16 @@ def export_attachments(self, path: Path): self._perform_spk_mix(spk[1]) ) self.export_dictionaries(path) - self._export_phonemes((path / f'{self.model_name}.phonemes.txt')) + self._export_phonemes(path) model_name = self.model_name if self.freeze_spk is not None: model_name += '.' 
+ self.freeze_spk[0] dsconfig = { # basic configs - 'phonemes': f'{self.model_name}.phonemes.txt', + 'phonemes': f'{self.model_name}.phonemes.json', + 'languages': sorted(self.lang_map.keys()), + 'use_lang_id': hparams.get('use_lang_id', False), 'linguistic': f'{model_name}.linguistic.onnx', 'hidden_size': self.model.hidden_size, 'predict_dur': self.model.predict_dur, @@ -184,6 +192,7 @@ def _torch_export_model(self): ph_dur = torch.LongTensor([[3, 5, 2, 1, 4]]).to(self.device) word_div = torch.LongTensor([[2, 2, 1]]).to(self.device) word_dur = torch.LongTensor([[8, 3, 4]]).to(self.device) + languages = torch.LongTensor([[0] * 5]).to(self.device) encoder_out = torch.rand(1, 5, hparams['hidden_size'], dtype=torch.float32, device=self.device) x_masks = tokens == 0 ph_midi = torch.LongTensor([[60] * 5]).to(self.device) @@ -196,6 +205,7 @@ def _torch_export_model(self): 1: 'n_tokens' } } + input_lang_id = hparams.get('use_lang_id', False) input_spk_embed = hparams['use_spk_id'] and not self.freeze_spk print(f'Exporting {self.fs2_class_name}...') @@ -205,13 +215,15 @@ def _torch_export_model(self): ( tokens, word_div, - word_dur + word_dur, + *([languages] if input_lang_id else []) ), self.linguistic_encoder_cache_path, input_names=[ 'tokens', 'word_div', - 'word_dur' + 'word_dur', + *(['languages'] if input_lang_id else []) ], output_names=encoder_output_names, dynamic_axes={ @@ -224,7 +236,8 @@ def _torch_export_model(self): 'word_dur': { 1: 'n_words' }, - **encoder_common_axes + **encoder_common_axes, + **({'languages': {1: 'n_tokens'}} if input_lang_id else {}) }, opset_version=15 ) @@ -268,12 +281,14 @@ def _torch_export_model(self): self.model.view_as_linguistic_encoder(), ( tokens, - ph_dur + ph_dur, + *([languages] if input_lang_id else []) ), self.linguistic_encoder_cache_path, input_names=[ 'tokens', - 'ph_dur' + 'ph_dur', + *(['languages'] if input_lang_id else []) ], output_names=encoder_output_names, dynamic_axes={ @@ -283,7 +298,8 @@ def _torch_export_model(self): 'ph_dur': { 1: 'n_tokens' }, - **encoder_common_axes + **encoder_common_axes, + **({'languages': {1: 'n_tokens'}} if input_lang_id else {}) }, opset_version=15 ) @@ -635,6 +651,10 @@ def _optimize_linguistic_graph(self, linguistic: onnx.ModelProto) -> onnx.ModelP print(f'Running ONNX Simplifier on {self.fs2_class_name}...') linguistic, check = onnxsim.simplify(linguistic, include_subgraph=True) assert check, 'Simplified ONNX model could not be validated' + onnx_helper.model_reorder_io_list( + linguistic, 'input', + target_name='languages', insert_after_name='tokens' + ) print(f'| optimize graph: {self.fs2_class_name}') return linguistic @@ -770,5 +790,10 @@ def _export_spk_embed(self, path: Path, spk_embed: torch.Tensor): print(f'| export spk embed => {path}') def _export_phonemes(self, path: Path): - self.phoneme_dictionary.dump(path) - print(f'| export phonemes => {path}') + ph_path = path / f'{self.model_name}.phonemes.json' + self.phoneme_dictionary.dump(ph_path) + print(f'| export phonemes => {ph_path}') + lang_path = path / f'{self.model_name}.languages.json' + with open(lang_path, 'w', encoding='utf8') as fw: + json.dump(self.lang_map, fw, ensure_ascii=False, indent=2) + print(f'| export languages => {lang_path}') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index 48a3afb40..bae452aea 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -41,8 +41,13 @@ def forward(self, dur): class FastSpeech2AcousticONNX(FastSpeech2Acoustic): - def 
__init__(self, vocab_size): + def __init__(self, vocab_size, cross_lingual_token_idx=None): super().__init__(vocab_size=vocab_size) + self.register_buffer( + 'cross_lingual_token_idx', + torch.LongTensor(cross_lingual_token_idx), + persistent=False + ) # [N,] # for temporary compatibility; will be completely removed in the future self.f0_embed_type = hparams.get('f0_embed_type', 'continuous') @@ -56,14 +61,29 @@ def __init__(self, vocab_size): self.speed_min, self.speed_max = hparams['augmentation_args']['random_time_stretching']['range'] # noinspection PyMethodOverriding - def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity=None, spk_embed=None): + def forward( + self, tokens, durations, + f0, variances: dict, + gender=None, velocity=None, + spk_embed=None, + languages=None + ): txt_embed = self.txt_embed(tokens) durations = durations * (tokens > 0) mel2ph = self.lr(durations) f0 = f0 * (mel2ph > 0) mel2ph = mel2ph[..., None].repeat((1, 1, hparams['hidden_size'])) dur_embed = self.dur_embed(durations.float()[:, :, None]) - encoded = self.encoder(txt_embed, dur_embed, tokens == PAD_INDEX) + if self.use_lang_id: + lang_mask = torch.any( + tokens[..., None] == self.cross_lingual_token_idx[None, None], + dim=-1 + ) + lang_embed = self.lang_embed(languages * lang_mask) + extra_embed = dur_embed + lang_embed + else: + extra_embed = dur_embed + encoded = self.encoder(txt_embed, extra_embed, tokens == PAD_INDEX) encoded = F.pad(encoded, (0, 0, 1, 0)) condition = torch.gather(encoded, 1, mel2ph) @@ -109,25 +129,47 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity= class FastSpeech2VarianceONNX(FastSpeech2Variance): - def __init__(self, vocab_size): + def __init__(self, vocab_size, cross_lingual_token_idx=None): super().__init__(vocab_size=vocab_size) + self.register_buffer( + 'cross_lingual_token_idx', + torch.LongTensor(cross_lingual_token_idx), + persistent=False + ) self.lr = LengthRegulator() - def forward_encoder_word(self, tokens, word_div, word_dur): + def forward_encoder_word(self, tokens, word_div, word_dur, languages=None): txt_embed = self.txt_embed(tokens) ph2word = self.lr(word_div) onset = ph2word > F.pad(ph2word, [1, -1]) onset_embed = self.onset_embed(onset.long()) ph_word_dur = torch.gather(F.pad(word_dur, [1, 0]), 1, ph2word) word_dur_embed = self.word_dur_embed(ph_word_dur.float()[:, :, None]) + extra_embed = onset_embed + word_dur_embed + if self.use_lang_id: + lang_mask = torch.any( + tokens[..., None] == self.cross_lingual_token_idx[None, None], + dim=-1 + ) + lang_embed = self.lang_embed(languages * lang_mask) + extra_embed += lang_embed x_masks = tokens == PAD_INDEX - return self.encoder(txt_embed, onset_embed + word_dur_embed, x_masks), x_masks + return self.encoder(txt_embed, extra_embed, x_masks), x_masks - def forward_encoder_phoneme(self, tokens, ph_dur): + def forward_encoder_phoneme(self, tokens, ph_dur, languages=None): txt_embed = self.txt_embed(tokens) ph_dur_embed = self.ph_dur_embed(ph_dur.float()[:, :, None]) + if self.use_lang_id: + lang_mask = torch.any( + tokens[..., None] == self.cross_lingual_token_idx[None, None], + dim=-1 + ) + lang_embed = self.lang_embed(languages * lang_mask) + extra_embed = ph_dur_embed + lang_embed + else: + extra_embed = ph_dur_embed x_masks = tokens == PAD_INDEX - return self.encoder(txt_embed, ph_dur_embed, x_masks), x_masks + return self.encoder(txt_embed, extra_embed, x_masks), x_masks def forward_dur_predictor(self, encoder_out, x_masks, ph_midi, spk_embed=None): 
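        # Note on the two encoder entry points above: `languages` is gated by a
        # token mask, so only ids listed in cross_lingual_token_idx keep a
        # non-zero language id; all other positions fall back to id 0, the
        # padding row of lang_embed, i.e. an all-zero embedding. The broadcast
        # equality plus torch.any (rather than, say, torch.isin) is presumably
        # chosen for ONNX exportability. A toy trace with assumed values:
        #     tokens    = torch.LongTensor([[12, 3, 7]])   # token 7 is cross-lingual
        #     cross_idx = torch.LongTensor([7, 9])
        #     mask = torch.any(tokens[..., None] == cross_idx[None, None], dim=-1)
        #     # mask == [[False, False, True]]; languages * mask embeds only token 7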
midi_embed = self.midi_embed(ph_midi) diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py index 1dd4fe129..bfb281958 100644 --- a/deployment/modules/toplevel.py +++ b/deployment/modules/toplevel.py @@ -18,12 +18,13 @@ class DiffSingerAcousticONNX(DiffSingerAcoustic): - def __init__(self, vocab_size, out_dims): + def __init__(self, vocab_size, out_dims, cross_lingual_token_idx=None): super().__init__(vocab_size, out_dims) del self.fs2 del self.diffusion self.fs2 = FastSpeech2AcousticONNX( - vocab_size=vocab_size + vocab_size=vocab_size, + cross_lingual_token_idx=cross_lingual_token_idx ) if self.diffusion_type == 'ddpm': self.diffusion = GaussianDiffusionONNX( @@ -73,11 +74,13 @@ def forward_fs2_aux( variances: dict, gender: Tensor = None, velocity: Tensor = None, - spk_embed: Tensor = None + spk_embed: Tensor = None, + languages: Tensor = None ): condition = self.fs2( tokens, durations, f0, variances=variances, - gender=gender, velocity=velocity, spk_embed=spk_embed + gender=gender, velocity=velocity, spk_embed=spk_embed, + languages=languages ) if self.use_shallow_diffusion: aux_mel_pred = self.aux_decoder(condition, infer=True) @@ -135,11 +138,12 @@ def view_as_reflow(self) -> nn.Module: class DiffSingerVarianceONNX(DiffSingerVariance): - def __init__(self, vocab_size): + def __init__(self, vocab_size, cross_lingual_token_idx=None): super().__init__(vocab_size=vocab_size) del self.fs2 self.fs2 = FastSpeech2VarianceONNX( - vocab_size=vocab_size + vocab_size=vocab_size, + cross_lingual_token_idx=cross_lingual_token_idx ) self.hidden_size = hparams['hidden_size'] if self.predict_pitch: @@ -210,13 +214,13 @@ def embed_frozen_spk(self, encoder_out): encoder_out += self.frozen_spk_embed return encoder_out - def forward_linguistic_encoder_word(self, tokens, word_div, word_dur): - encoder_out, x_masks = self.fs2.forward_encoder_word(tokens, word_div, word_dur) + def forward_linguistic_encoder_word(self, tokens, word_div, word_dur, languages=None): + encoder_out, x_masks = self.fs2.forward_encoder_word(tokens, word_div, word_dur, languages=languages) encoder_out = self.embed_frozen_spk(encoder_out) return encoder_out, x_masks - def forward_linguistic_encoder_phoneme(self, tokens, ph_dur): - encoder_out, x_masks = self.fs2.forward_encoder_phoneme(tokens, ph_dur) + def forward_linguistic_encoder_phoneme(self, tokens, ph_dur, languages=None): + encoder_out, x_masks = self.fs2.forward_encoder_phoneme(tokens, ph_dur, languages=languages) encoder_out = self.embed_frozen_spk(encoder_out) return encoder_out, x_masks diff --git a/utils/onnx_helper.py b/utils/onnx_helper.py index 176df56dc..1470e47d6 100644 --- a/utils/onnx_helper.py +++ b/utils/onnx_helper.py @@ -1,5 +1,5 @@ import re -from typing import Dict, Tuple, Union +from typing import Dict, Tuple, Union, Literal import onnx from google.protobuf.internal.containers import RepeatedCompositeFieldContainer @@ -51,6 +51,42 @@ def _override_shapes( _override_shapes(model.graph.output, output_shapes) +def model_reorder_io_list( + model: ModelProto, + input_or_output: Literal['input', 'output'], + target_name: str, + insert_after_name: str, +): + """ + Reorder the input of the model graph by moving the target input after the specified input (in-place operation). + If the given names are not found, the operation will be ignored. 
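
    A hypothetical usage sketch (the IO names mirror how the exporters in this
    patch call it; the file name is assumed):

        import onnx
        model = onnx.load('linguistic.onnx')
        model_reorder_io_list(
            model, 'input',
            target_name='languages', insert_after_name='tokens'
        )
        onnx.save(model, 'linguistic.onnx')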
+ :param model: model to perform the operation on + :param input_or_output: 'input' or 'output' to specify the list to reorder + :param target_name: the name of the input to be reordered + :param insert_after_name: the name of the input to be inserted after (None for the first) + """ + def _reorder_input(input_list: RepeatedCompositeFieldContainer[ValueInfoProto]): + nonlocal input_or_output + target_idx = -1 + insert_after_idx = -1 + for i, value_info in enumerate(input_list): + if value_info.name == target_name: + target_idx = i + if value_info.name == insert_after_name: + insert_after_idx = i + if target_idx != -1 and insert_after_idx != -1: + target = input_list.pop(target_idx) + input_list.insert(insert_after_idx + 1, target) + _verbose(f'| reorder {input_or_output}: \'{target_name}\' after \'{insert_after_name}\'') + + if input_or_output == 'input': + _reorder_input(model.graph.input) + elif input_or_output == 'output': + _reorder_input(model.graph.output) + else: + raise ValueError('Argument \'input_or_output\' should be either \'input\' or \'output\'.') + + def model_add_prefixes( model: ModelProto, initializer_prefix=None, @@ -97,7 +133,7 @@ def _add_prefixes_recursive(subgraph): new_name = initializer_prefix + initializer.name _verbose('| add prefix:', initializer.name, '->', new_name) initializer.name = new_name - + for value_info in subgraph.value_info: if dim_prefix is not None: for dim in value_info.type.tensor_type.shape.dim: @@ -114,7 +150,7 @@ def _add_prefixes_recursive(subgraph): new_name = value_info_prefix + value_info.name _verbose('| add prefix:', value_info.name, '->', new_name) value_info.name = new_name - + if node_prefix is not None: for node in subgraph.node: if ignored_pattern is not None and re.match(ignored_pattern, node.name): @@ -122,7 +158,7 @@ def _add_prefixes_recursive(subgraph): new_name = node_prefix + node.name _verbose('| add prefix:', node.name, '->', new_name) node.name = new_name - + for node in subgraph.node: # For 'If' and 'Loop' nodes, add prefixes recursively if node.op_type == 'If': @@ -134,7 +170,7 @@ def _add_prefixes_recursive(subgraph): if attr.name == 'body': body = onnx.helper.get_attribute_value(attr) _add_prefixes_recursive(body) - + # For each node, rename its inputs and outputs for io_list in [node.input, node.output]: for i, io_value in enumerate(io_list): diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 59a0924bc..df40fd4b8 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -102,7 +102,7 @@ def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[Li idx += 1 self._phone_to_id: Dict[str, int] = phone_to_id self._id_to_phone: List[Union[str, tuple]] = id_to_phone - self._cross_lingual_phonemes = cross_lingual_phonemes + self._cross_lingual_phonemes = frozenset(cross_lingual_phonemes) @property def vocab_size(self): @@ -111,6 +111,10 @@ def vocab_size(self): def __len__(self): return self.vocab_size + @property + def cross_lingual_phonemes(self): + return self._cross_lingual_phonemes + def is_cross_lingual(self, phone): return phone in self._cross_lingual_phonemes From 4a4b2b0abbd51871eba0d77b4347ff46f69eb7bb Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 00:06:01 +0800 Subject: [PATCH 20/44] Refactor configuration schemas for datasets --- basics/base_binarizer.py | 81 +++++++++++++++-------------- preprocessing/acoustic_binarizer.py | 2 +- preprocessing/variance_binarizer.py | 4 +- 3 files changed, 46 insertions(+), 41 deletions(-) diff --git 
a/basics/base_binarizer.py b/basics/base_binarizer.py index 92e583d61..fb71614f6 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -43,13 +43,11 @@ class BaseBinarizer: the phoneme set. """ - def __init__(self, data_dir=None, data_attrs=None): - if data_dir is None: - data_dir = hparams['raw_data_dir'] - if not isinstance(data_dir, list): - data_dir = [data_dir] - - self.raw_data_dirs = [pathlib.Path(d) for d in data_dir] + def __init__(self, datasets=None, data_attrs=None): + if datasets is None: + datasets = hparams['datasets'] + self.datasets = datasets + self.raw_data_dirs = [pathlib.Path(ds['raw_data_dir']) for ds in self.datasets] self.binary_data_dir = pathlib.Path(hparams['binary_data_dir']) self.data_attrs = [] if data_attrs is None else data_attrs @@ -58,13 +56,11 @@ def __init__(self, data_dir=None, data_attrs=None): self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.spk_map = {} - self.spk_ids = hparams['spk_ids'] - self.speakers = hparams['speakers'] + self.spk_ids = None self.build_spk_map() self.lang_map = {} self.dictionaries = hparams['dictionaries'] - self.languages = hparams['languages'] self.build_lang_map() self.items = {} @@ -76,58 +72,58 @@ def __init__(self, data_dir=None, data_attrs=None): self.timestep = hparams['hop_size'] / hparams['audio_sample_rate'] def build_spk_map(self): - assert isinstance(self.speakers, list), 'Speakers must be a list' - assert len(self.speakers) == len(self.raw_data_dirs), \ - 'Number of raw data dirs must equal number of speaker names!' - if len(self.spk_ids) == 0: - self.spk_ids = list(range(len(self.raw_data_dirs))) - else: - assert len(self.spk_ids) == len(self.raw_data_dirs), \ - 'Length of explicitly given spk_ids must equal the number of raw datasets.' - assert max(self.spk_ids) < hparams['num_spk'], \ - f'Index in spk_id sequence {self.spk_ids} is out of range. All values should be smaller than num_spk.' - - for spk_name, spk_id in zip(self.speakers, self.spk_ids): + spk_ids = [ds.get('spk_id') for ds in self.datasets] + assigned_spk_ids = {spk_id for spk_id in spk_ids if spk_id is not None} + for i in range(len(spk_ids)): + if spk_ids[i] is not None: + continue + idx = 0 + while idx in assigned_spk_ids: + idx += 1 + spk_ids[i] = idx + assert max(spk_ids) < hparams['num_spk'], \ + f'Index in spk_id sequence {spk_ids} is out of range. All values should be smaller than num_spk.' + + for spk_id, dataset in zip(spk_ids, self.datasets): + spk_name = dataset['speaker'] if spk_name in self.spk_map and self.spk_map[spk_name] != spk_id: raise ValueError(f'Invalid speaker ID assignment. Name \'{spk_name}\' is assigned ' f'with different speaker IDs: {self.spk_map[spk_name]} and {spk_id}.') self.spk_map[spk_name] = spk_id + self.spk_ids = spk_ids print("| spk_map: ", self.spk_map) def build_lang_map(self): - assert isinstance(self.languages, list), 'Languages must be a list' - assert len(self.languages) == len(self.raw_data_dirs), \ - 'Number of raw data dirs must equal number of language names!' - for lang in self.languages: - assert lang in self.dictionaries, f'Unrecognized language name: {lang}' assert len(self.dictionaries.keys()) <= hparams['num_lang'], \ 'Number of languages must not be greater than num_lang!' 
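        # Each entry of hparams['datasets'] now bundles what previously lived in
        # the parallel speakers / spk_ids / languages lists. A sketch of one
        # entry (the field names come from this patch; the values are invented):
        #     {
        #         'raw_data_dir': 'data/opencpop/raw',
        #         'speaker': 'opencpop',
        #         'spk_id': 0,                  # optional; auto-assigned if omitted
        #         'language': 'zh',
        #         'test_prefixes': ['2044'],    # optional; defaults to []
        #     }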
+ for dataset in self.datasets: + assert dataset['language'] in self.dictionaries, f'Unrecognized language name: {dataset["language"]}' for lang_id, lang_name in enumerate(sorted(self.dictionaries.keys()), start=1): self.lang_map[lang_name] = lang_id print("| lang_map: ", self.lang_map) - def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): + def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang) -> dict: raise NotImplementedError() - def split_train_valid_set(self, item_names): + def split_train_valid_set(self, prefixes: list): """ Split the dataset into training set and validation set. :return: train_item_names, valid_item_names """ - prefixes = {str(pr): 1 for pr in hparams['test_prefixes']} + prefixes = {str(pr): 1 for pr in prefixes} valid_item_names = {} # Add prefixes that specified speaker index and matches exactly item name to test set for prefix in deepcopy(prefixes): - if prefix in item_names: + if prefix in self.item_names: valid_item_names[prefix] = 1 prefixes.pop(prefix) # Add prefixes that exactly matches item name without speaker id to test set for prefix in deepcopy(prefixes): matched = False - for name in item_names: + for name in self.item_names: if name.split(':')[-1] == prefix: valid_item_names[name] = 1 matched = True @@ -136,7 +132,7 @@ def split_train_valid_set(self, item_names): # Add names with one of the remaining prefixes to test set for prefix in deepcopy(prefixes): matched = False - for name in item_names: + for name in self.item_names: if name.startswith(prefix): valid_item_names[name] = 1 matched = True @@ -144,7 +140,7 @@ def split_train_valid_set(self, item_names): prefixes.pop(prefix) for prefix in deepcopy(prefixes): matched = False - for name in item_names: + for name in self.item_names: if name.split(':')[-1].startswith(prefix): valid_item_names[name] = 1 matched = True @@ -160,7 +156,7 @@ def split_train_valid_set(self, item_names): valid_item_names = list(valid_item_names.keys()) assert len(valid_item_names) > 0, 'Validation set is empty!' - train_item_names = [x for x in item_names if x not in set(valid_item_names)] + train_item_names = [x for x in self.item_names if x not in set(valid_item_names)] assert len(train_item_names) > 0, 'Training set is empty!' 
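        # Matching cascade for the collected test prefixes (a prefix is popped as
        # soon as one pass matches it):
        #     1. exact match on the full name 'ds_id:item_name';
        #     2. exact match on the bare item name after the ':';
        #     3. prefix match on the full name;
        #     4. prefix match on the bare item name.
        # With items ['0:2044_x', '1:2044_x'] (invented names), prefix '0:2044'
        # selects only the first via pass 3, while '2044' selects both via pass 4.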
return train_item_names, valid_item_names @@ -184,10 +180,19 @@ def meta_data_iterator(self, prefix): def process(self): # load each dataset - for ds_id, (data_dir, spk, lang) in enumerate(zip(self.raw_data_dirs, self.speakers, self.languages)): - self.load_meta_data(pathlib.Path(data_dir), ds_id=ds_id, spk=spk, lang=lang) + test_prefixes = [] + for ds_id, dataset in enumerate(self.datasets): + items = self.load_meta_data( + pathlib.Path(dataset['raw_data_dir']), + ds_id=ds_id, spk=dataset['speaker'], lang=dataset['language'] + ) + self.items.update(items) + test_prefixes.extend( + f'{ds_id}:{prefix}' + for prefix in dataset.get('test_prefixes', []) + ) self.item_names = sorted(list(self.items.keys())) - self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names) + self._train_item_names, self._valid_item_names = self.split_train_valid_set(test_prefixes) if self.binarization_args['shuffle']: random.shuffle(self.item_names) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 99d0aaf68..0455c4f94 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -95,7 +95,7 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang): f'Negative ph_dur found in \'{item_name}\'.' meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict - self.items.update(meta_data_dict) + return meta_data_dict @torch.no_grad() def process_item(self, item_name, meta_data, binarization_args): diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 30c175b2c..c88ae924c 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -132,7 +132,7 @@ def require(attr): temp_dict = { 'ds_idx': item_idx, 'spk_id': self.spk_map[spk], - 'spk_name': self.speakers[ds_id], + 'spk_name': spk, 'language_id': self.lang_map[lang], 'language_name': lang, 'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'), @@ -173,7 +173,7 @@ def require(attr): meta_data_dict[f'{ds_id}:{item_name}'] = temp_dict - self.items.update(meta_data_dict) + return meta_data_dict def check_coverage(self): super().check_coverage() From 678e3e6fb19aeeebe7b71263c119af960db840a7 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 15:17:09 +0800 Subject: [PATCH 21/44] Add check of existence for merged phonemes --- utils/phoneme_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index df40fd4b8..a9ddb160b 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -43,6 +43,12 @@ def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[Li f"Invalid phoneme tag '{phoneme}' in merged group: " f"unrecognized language name '{lang}'." ) + unique_name = phoneme if self._multi_langs else name + if unique_name not in all_phonemes: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + f"not found in phoneme set." 
+ ) merged_groups = [set(phones) for phones in merged_groups if len(phones) > 1] else: _merged_groups = [] From d0d7b7319990e16e88003a7a07dc3ef5a5cecf24 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 15:17:25 +0800 Subject: [PATCH 22/44] Fix spk_id assignment --- basics/base_binarizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index fb71614f6..d1f812015 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -74,13 +74,14 @@ def __init__(self, datasets=None, data_attrs=None): def build_spk_map(self): spk_ids = [ds.get('spk_id') for ds in self.datasets] assigned_spk_ids = {spk_id for spk_id in spk_ids if spk_id is not None} + idx = 0 for i in range(len(spk_ids)): if spk_ids[i] is not None: continue - idx = 0 while idx in assigned_spk_ids: idx += 1 spk_ids[i] = idx + assigned_spk_ids.add(idx) assert max(spk_ids) < hparams['num_spk'], \ f'Index in spk_id sequence {spk_ids} is out of range. All values should be smaller than num_spk.' From f3a969c886eaf871fd99a39fd04d8b6750ff2f0b Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 17:43:48 +0800 Subject: [PATCH 23/44] Fix languages.json filename --- deployment/exporters/acoustic_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 9160af6e3..287e35099 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -414,7 +414,7 @@ def _export_phonemes(self, path: Path): ph_path = path / f'{self.model_name}.phonemes.json' self.phoneme_dictionary.dump(ph_path) print(f'| export phonemes => {ph_path}') - lang_path = path / 'languages.json' + lang_path = path / f'{self.model_name}.languages.json' with open(lang_path, 'w', encoding='utf8') as f: json.dump(self.lang_map, f, ensure_ascii=False, indent=2) print(f'| export languages => {lang_path}') From bf44910f4e2f0f884f48f2dded1a01c51500be6d Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 28 Jul 2024 18:11:34 +0800 Subject: [PATCH 24/44] Fix `languages` key in dsconfig.yaml --- deployment/exporters/acoustic_exporter.py | 1 + deployment/exporters/variance_exporter.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 287e35099..0e31105e7 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -124,6 +124,7 @@ def export_attachments(self, path: Path): dsconfig = { # basic configs 'phonemes': f'{self.model_name}.phonemes.json', + 'languages': f'{self.model_name}.languages.json', 'use_lang_id': hparams.get('use_lang_id', False), 'acoustic': f'{model_name}.onnx', 'hidden_size': hparams['hidden_size'], diff --git a/deployment/exporters/variance_exporter.py b/deployment/exporters/variance_exporter.py index 76d061834..27e8e4d0b 100644 --- a/deployment/exporters/variance_exporter.py +++ b/deployment/exporters/variance_exporter.py @@ -155,7 +155,7 @@ def export_attachments(self, path: Path): dsconfig = { # basic configs 'phonemes': f'{self.model_name}.phonemes.json', - 'languages': sorted(self.lang_map.keys()), + 'languages': f'{self.model_name}.languages.json', 'use_lang_id': hparams.get('use_lang_id', False), 'linguistic': f'{model_name}.linguistic.onnx', 'hidden_size': self.model.hidden_size, From fb5f58922b62ddcf3533a0a3bfe5867b2ac8b175 Mon Sep 17 00:00:00 2001 From: 
yqzhishen Date: Thu, 1 Aug 2024 01:21:05 +0800 Subject: [PATCH 25/44] Set `use_lang_id` to false if there are no cross-lingual phonemes --- deployment/exporters/acoustic_exporter.py | 5 +++-- deployment/exporters/variance_exporter.py | 5 +++-- deployment/modules/fastspeech2.py | 4 ++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py index 0e31105e7..849dae5db 100644 --- a/deployment/exporters/acoustic_exporter.py +++ b/deployment/exporters/acoustic_exporter.py @@ -33,6 +33,7 @@ def __init__( self.spk_map: dict = self.build_spk_map() self.lang_map: dict = self.build_lang_map() self.phoneme_dictionary = load_phoneme_dictionary() + self.use_lang_id = hparams.get('use_lang_id', False) and len(self.phoneme_dictionary.cross_lingual_phonemes) > 0 self.model = self.build_model() self.fs2_aux_cache_path = self.cache_dir / ( 'fs2_aux.onnx' if self.model.use_shallow_diffusion else 'fs2.onnx' @@ -125,7 +126,7 @@ def export_attachments(self, path: Path): # basic configs 'phonemes': f'{self.model_name}.phonemes.json', 'languages': f'{self.model_name}.languages.json', - 'use_lang_id': hparams.get('use_lang_id', False), + 'use_lang_id': self.use_lang_id, 'acoustic': f'{model_name}.onnx', 'hidden_size': hparams['hidden_size'], 'vocoder': 'nsf_hifigan_44.1k_hop512_128bin_2024.02', @@ -217,7 +218,7 @@ def _torch_export_model(self): dynamix_axes['spk_embed'] = { 1: 'n_frames' } - if hparams.get('use_lang_id'): + if self.use_lang_id: kwargs['languages'] = torch.zeros_like(tokens) input_names.append('languages') dynamix_axes['languages'] = { diff --git a/deployment/exporters/variance_exporter.py b/deployment/exporters/variance_exporter.py index 27e8e4d0b..82808ec08 100644 --- a/deployment/exporters/variance_exporter.py +++ b/deployment/exporters/variance_exporter.py @@ -33,6 +33,7 @@ def __init__( self.spk_map: dict = self.build_spk_map() self.lang_map: dict = self.build_lang_map() self.phoneme_dictionary = load_phoneme_dictionary() + self.use_lang_id = hparams.get('use_lang_id', False) and len(self.phoneme_dictionary.cross_lingual_phonemes) > 0 self.model = self.build_model() self.linguistic_encoder_cache_path = self.cache_dir / 'linguistic.onnx' self.dur_predictor_cache_path = self.cache_dir / 'dur.onnx' @@ -156,7 +157,7 @@ def export_attachments(self, path: Path): # basic configs 'phonemes': f'{self.model_name}.phonemes.json', 'languages': f'{self.model_name}.languages.json', - 'use_lang_id': hparams.get('use_lang_id', False), + 'use_lang_id': self.use_lang_id, 'linguistic': f'{model_name}.linguistic.onnx', 'hidden_size': self.model.hidden_size, 'predict_dur': self.model.predict_dur, @@ -205,7 +206,7 @@ def _torch_export_model(self): 1: 'n_tokens' } } - input_lang_id = hparams.get('use_lang_id', False) + input_lang_id = self.use_lang_id input_spk_embed = hparams['use_spk_id'] and not self.freeze_spk print(f'Exporting {self.fs2_class_name}...') diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py index bae452aea..20dfdb0d7 100644 --- a/deployment/modules/fastspeech2.py +++ b/deployment/modules/fastspeech2.py @@ -48,6 +48,8 @@ def __init__(self, vocab_size, cross_lingual_token_idx=None): torch.LongTensor(cross_lingual_token_idx), persistent=False ) # [N,] + if len(cross_lingual_token_idx) == 0: + self.use_lang_id = False # for temporary compatibility; will be completely removed in the future self.f0_embed_type = hparams.get('f0_embed_type', 'continuous') @@ -136,6 +138,8 @@ def 
__init__(self, vocab_size, cross_lingual_token_idx=None): torch.LongTensor(cross_lingual_token_idx), persistent=False ) + if len(cross_lingual_token_idx) == 0: + self.use_lang_id = False self.lr = LengthRegulator() def forward_encoder_word(self, tokens, word_div, word_dur, languages=None): From 333d9ef8e939cb7ae101372a266e65cf46178619 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 3 Aug 2024 00:07:48 +0800 Subject: [PATCH 26/44] Support defining extra phonemes --- utils/phoneme_utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index a9ddb160b..012a65f35 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -8,8 +8,23 @@ class PhonemeDictionary: - def __init__(self, dictionaries: Dict[str, pathlib.Path], merged_groups: List[List[str]] = None): + def __init__( + self, + dictionaries: Dict[str, pathlib.Path], + extra_phonemes: List[str] = None, + merged_groups: List[List[str]] = None + ): all_phonemes = {'AP', 'SP'} + if extra_phonemes: + for ph in extra_phonemes: + if '/' in ph: + lang, name = ph.split('/', maxsplit=1) + if lang not in dictionaries: + raise ValueError( + f"Invalid phoneme tag '{ph}' in extra phonemes: " + f"unrecognized language name '{lang}'." + ) + all_phonemes.add(ph) self._multi_langs = len(dictionaries) > 1 for lang, dict_path in dictionaries.items(): with open(dict_path, 'r', encoding='utf8') as dict_file: @@ -192,5 +207,6 @@ def load_phoneme_dictionary() -> PhonemeDictionary: } return PhonemeDictionary( dictionaries=dicts, + extra_phonemes=hparams.get('extra_phonemes'), merged_groups=hparams.get('merged_phoneme_groups') ) From d3cd5cdb251798ee36fefc1dc86cfd3195405922 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 3 Aug 2024 00:45:53 +0800 Subject: [PATCH 27/44] Refactor configs --- configs/acoustic.yaml | 32 ++++++++++-------- configs/base.yaml | 2 +- configs/templates/config_acoustic.yaml | 46 +++++++++++++++++--------- configs/templates/config_variance.yaml | 44 +++++++++++++++--------- configs/variance.yaml | 39 ++++++++++++---------- 5 files changed, 102 insertions(+), 61 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 0364b5c15..f3cf127f4 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -2,17 +2,22 @@ base_config: - configs/base.yaml task_cls: training.acoustic_task.AcousticTask -num_spk: 1 -speakers: - - opencpop -spk_ids: [] -test_prefixes: [ - '2044', - '2086', - '2092', - '2093', - '2100', -] + +dictionaries: + zh: dictionaries/opencpop-extension.txt +extra_phonemes: [] +merged_phoneme_groups: [] +datasets: + - raw_data_dir: 'data/opencpop/raw' + speaker: opencpop + spk_id: 0 + language: zh + test_prefixes: + - '2044' + - '2086' + - '2092' + - '2093' + - '2100' vocoder: NsfHifiGAN vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt @@ -41,10 +46,8 @@ augmentation_args: range: [0.5, 2.] scale: 0.75 -raw_data_dir: 'data/opencpop/raw' binary_data_dir: 'data/opencpop/binary' binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer -dictionary: dictionaries/opencpop-extension.txt spec_min: [-12] spec_max: [0] mel_vmin: -14. 
@@ -55,7 +58,10 @@ breathiness_smooth_width: 0.12 voicing_smooth_width: 0.12 tension_smooth_width: 0.12 +use_lang_id: false +num_lang: 1 use_spk_id: false +num_spk: 1 use_energy_embed: false use_breathiness_embed: false use_voicing_embed: false diff --git a/configs/base.yaml b/configs/base.yaml index b2e610f95..ab33c5541 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -5,7 +5,7 @@ task_cls: null # dataset ############# sort_by_len: true -raw_data_dir: null +datasets: [] binary_data_dir: null binarizer_cls: null binarization_args: diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 198444bc7..21b8e5805 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -1,19 +1,33 @@ -base_config: configs/acoustic.yaml +base_config: + - configs/acoustic.yaml + +dictionaries: + zh: dictionaries/opencpop-extension.txt +extra_phonemes: [] +merged_phoneme_groups: [] + +datasets: + - raw_data_dir: data/xxx1/raw + speaker: speaker1 + spk_id: 0 + language: zh + test_prefixes: + - wav1 + - wav2 + - wav3 + - wav4 + - wav5 + - raw_data_dir: data/xxx2/raw + speaker: speaker2 + spk_id: 1 + language: zh + test_prefixes: + - wav1 + - wav2 + - wav3 + - wav4 + - wav5 -raw_data_dir: - - data/xxx1/raw - - data/xxx2/raw -speakers: - - speaker1 - - speaker2 -spk_ids: [] -test_prefixes: - - wav1 - - wav2 - - wav3 - - wav4 - - wav5 -dictionary: dictionaries/opencpop-extension.txt binary_data_dir: data/xxx/binary binarization_args: num_workers: 0 @@ -24,6 +38,8 @@ hnsep_ckpt: 'checkpoints/vr/model.pt' vocoder: NsfHifiGAN vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt +use_lang_id: false +num_lang: 1 use_spk_id: false num_spk: 1 diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index d75667797..952e994ae 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -1,29 +1,43 @@ base_config: - configs/variance.yaml -raw_data_dir: - - data/xxx1/raw - - data/xxx2/raw -speakers: - - speaker1 - - speaker2 -spk_ids: [] -test_prefixes: - - wav1 - - wav2 - - wav3 - - wav4 - - wav5 -dictionary: dictionaries/opencpop-extension.txt +dictionaries: + zh: dictionaries/opencpop-extension.txt +extra_phonemes: [] +merged_phoneme_groups: [] + +datasets: + - raw_data_dir: data/xxx1/raw + speaker: speaker1 + spk_id: 0 + language: zh + test_prefixes: + - wav1 + - wav2 + - wav3 + - wav4 + - wav5 + - raw_data_dir: data/xxx2/raw + speaker: speaker2 + spk_id: 1 + language: zh + test_prefixes: + - wav1 + - wav2 + - wav3 + - wav4 + - wav5 + binary_data_dir: data/xxx/binary binarization_args: num_workers: 0 - pe: parselmouth pe_ckpt: 'checkpoints/rmvpe/model.pt' hnsep: vr hnsep_ckpt: 'checkpoints/vr/model.pt' +use_lang_id: false +num_lang: 1 use_spk_id: false num_spk: 1 # NOTICE: before enabling variance modules, please read the docs at diff --git a/configs/variance.yaml b/configs/variance.yaml index 2c6d002da..9d7b59b4b 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -2,17 +2,22 @@ base_config: - configs/base.yaml task_cls: training.variance_task.VarianceTask -num_spk: 1 -speakers: - - opencpop -spk_ids: [] -test_prefixes: [ - '2044', - '2086', - '2092', - '2093', - '2100', -] + +dictionaries: + zh: dictionaries/opencpop-extension.txt +extra_phonemes: [] +merged_phoneme_groups: [] +datasets: + - raw_data_dir: 'data/opencpop/raw' + speaker: opencpop + spk_id: 0 + language: zh + test_prefixes: + - '2044' + - '2086' + - '2092' 
+ - '2093' + - '2100' audio_sample_rate: 44100 hop_size: 512 # Hop size. @@ -25,16 +30,13 @@ binarization_args: num_workers: 0 prefer_ds: false -raw_data_dir: 'data/opencpop_variance/raw' binary_data_dir: 'data/opencpop_variance/binary' binarizer_cls: preprocessing.variance_binarizer.VarianceBinarizer -dictionary: dictionaries/opencpop-extension.txt +use_lang_id: false +num_lang: 1 use_spk_id: false - -rel_pos: true -hidden_size: 256 - +num_spk: 1 predict_dur: true predict_pitch: true predict_energy: false @@ -42,6 +44,9 @@ predict_breathiness: false predict_voicing: false predict_tension: false +rel_pos: true +hidden_size: 256 + dur_prediction_args: arch: fs2 hidden_size: 512 From f729db859bea3ceab6785c7164644e88dbb55f1d Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 3 Aug 2024 22:52:36 +0800 Subject: [PATCH 28/44] Prefer file copies in work_dir when loading dictionaries --- utils/phoneme_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 012a65f35..584c55d84 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -186,9 +186,9 @@ def load_phoneme_dictionary() -> PhonemeDictionary: if config_dicts is not None: dicts = {} for lang, config_dict_path in config_dicts.items(): - config_dict_path = pathlib.Path(config_dict_path) + config_dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' if not config_dict_path.exists(): - config_dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' + config_dict_path = pathlib.Path(config_dict_path) if not config_dict_path.exists(): raise FileNotFoundError( f"Could not locate dictionary for language '{lang}'." From 453cb0fae5d6e03024f01695b95a548cfb4e15ce Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 4 Aug 2024 22:08:00 +0800 Subject: [PATCH 29/44] Fix cannot locate dictionary --- utils/phoneme_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 584c55d84..f1556068b 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -37,6 +37,8 @@ def __init__( f"Invalid phoneme tag '{phoneme}' in dictionary '{dict_path}': " f"should not contain the reserved character '/'." ) + if phoneme in all_phonemes: + continue if self._multi_langs: all_phonemes.add(f'{lang}/{phoneme}') else: @@ -186,10 +188,10 @@ def load_phoneme_dictionary() -> PhonemeDictionary: if config_dicts is not None: dicts = {} for lang, config_dict_path in config_dicts.items(): - config_dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' - if not config_dict_path.exists(): - config_dict_path = pathlib.Path(config_dict_path) - if not config_dict_path.exists(): + dict_path = pathlib.Path(hparams['work_dir']) / f'dictionary-{lang}.txt' + if not dict_path.exists(): + dict_path = pathlib.Path(config_dict_path) + if not dict_path.exists(): raise FileNotFoundError( f"Could not locate dictionary for language '{lang}'." 
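                     # reached when neither the work_dir copy nor the configured dictionary path exists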
) From 663db52b4cf4b19ba6d132af0568d3f9f7ba8692 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 17 Aug 2024 22:31:48 +0800 Subject: [PATCH 30/44] Fix unexpected loading error when dictionary changes --- utils/phoneme_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index f1556068b..50145979e 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -195,17 +195,17 @@ def load_phoneme_dictionary() -> PhonemeDictionary: raise FileNotFoundError( f"Could not locate dictionary for language '{lang}'." ) - dicts[lang] = config_dict_path + dicts[lang] = dict_path else: - config_dict_path = pathlib.Path(hparams['dictionary']) - if not config_dict_path.exists(): - config_dict_path = pathlib.Path(hparams['work_dir']) / 'dictionary.txt' - if not config_dict_path.exists(): + dict_path = pathlib.Path(hparams['work_dir']) / 'dictionary.txt' + if not dict_path.exists(): + dict_path = pathlib.Path(hparams['dictionary']) + if not dict_path.exists(): raise FileNotFoundError( f"Could not locate dictionary file." ) dicts = { - 'default': config_dict_path + 'default': dict_path } return PhonemeDictionary( dictionaries=dicts, From 6c7bb0836919e509e2c8fd8b8c5ed0d1731e0499 Mon Sep 17 00:00:00 2001 From: Anjo <87346264+AnAndroNerd@users.noreply.github.com> Date: Fri, 15 Nov 2024 22:28:09 -0700 Subject: [PATCH 31/44] Update toplevel.py (#219) --- modules/toplevel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/toplevel.py b/modules/toplevel.py index 5aedfed76..aceff1f70 100644 --- a/modules/toplevel.py +++ b/modules/toplevel.py @@ -154,7 +154,7 @@ def __init__(self, vocab_size): self.pitch_retake_embed = Embedding(2, hparams['hidden_size']) pitch_hparams = hparams['pitch_prediction_args'] self.pitch_backbone_type = compat.get_backbone_type(hparams, nested_config=pitch_hparams) - self.pitch_backbone_args = compat.get_backbone_args(hparams, backbone_type=self.pitch_backbone_type) + self.pitch_backbone_args = compat.get_backbone_args(pitch_hparams, backbone_type=self.pitch_backbone_type) if self.diffusion_type == 'ddpm': self.pitch_predictor = PitchDiffusion( vmin=pitch_hparams['pitd_norm_min'], From da79ef21653430d23ff2d9d7925929f15b115c92 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 4 Jan 2025 16:15:23 +0800 Subject: [PATCH 32/44] Fix unexpected config passing --- configs/acoustic.yaml | 15 ++------------- configs/variance.yaml | 15 ++------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 294471638..aad05ea15 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -3,21 +3,10 @@ base_config: task_cls: training.acoustic_task.AcousticTask -dictionaries: - zh: dictionaries/opencpop-extension.txt +dictionaries: {} extra_phonemes: [] merged_phoneme_groups: [] -datasets: - - raw_data_dir: 'data/opencpop/raw' - speaker: opencpop - spk_id: 0 - language: zh - test_prefixes: - - '2044' - - '2086' - - '2092' - - '2093' - - '2100' +datasets: [] vocoder: NsfHifiGAN vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt diff --git a/configs/variance.yaml b/configs/variance.yaml index 49e18ab7f..3e02e430e 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -3,21 +3,10 @@ base_config: task_cls: training.variance_task.VarianceTask -dictionaries: - zh: dictionaries/opencpop-extension.txt +dictionaries: {} extra_phonemes: [] merged_phoneme_groups: [] -datasets: - - raw_data_dir: 
'data/opencpop/raw' - speaker: opencpop - spk_id: 0 - language: zh - test_prefixes: - - '2044' - - '2086' - - '2092' - - '2093' - - '2100' +datasets: [] audio_sample_rate: 44100 hop_size: 512 # Hop size. From 5d5632914a9ccb3e83da0b6b41c76da9416eced4 Mon Sep 17 00:00:00 2001 From: yxlllc <33565655+yxlllc@users.noreply.github.com> Date: Fri, 17 Jan 2025 01:10:43 +0800 Subject: [PATCH 33/44] Update lynxnet backbone (#228) * Change the injection method of conditions on lynxnet (#225) * update configurations for new-lynxnet * update configurations for new-lynxnet * update configurations for new-lynxnet --------- Co-authored-by: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com> --- configs/acoustic.yaml | 1 + configs/templates/config_acoustic.yaml | 1 + configs/templates/config_variance.yaml | 4 +++ modules/backbones/lynxnet.py | 39 +++++++++++++++++--------- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index aad05ea15..99fd175bd 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -74,6 +74,7 @@ backbone_args: num_layers: 6 kernel_size: 31 dropout_rate: 0.0 + strong_cond: true main_loss_type: l2 main_loss_log_norm: false schedule_type: 'linear' diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 5bcfc2df4..263d936fa 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -80,6 +80,7 @@ backbone_args: num_layers: 6 kernel_size: 31 dropout_rate: 0.0 + strong_cond: true #backbone_type: 'wavenet' #backbone_args: # num_channels: 512 diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index a9e2a19d4..ad051c0ec 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -101,6 +101,8 @@ pitch_prediction_args: # backbone_args: # num_layers: 6 # num_channels: 512 +# dropout_rate: 0.0 +# strong_cond: true variances_prediction_args: total_repeat_bins: 48 @@ -113,6 +115,8 @@ variances_prediction_args: # backbone_args: # num_layers: 6 # num_channels: 384 +# dropout_rate: 0.0 +# strong_cond: true lambda_dur_loss: 1.0 lambda_pitch_loss: 1.0 diff --git a/modules/backbones/lynxnet.py b/modules/backbones/lynxnet.py index 744967c6b..18e7bf497 100644 --- a/modules/backbones/lynxnet.py +++ b/modules/backbones/lynxnet.py @@ -10,6 +10,12 @@ from utils.hparams import hparams +class Conv1d(torch.nn.Conv1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + nn.init.kaiming_normal_(self.weight) + + class SwiGLU(nn.Module): # Swish-Applies the gated linear unit function. 
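     # i.e. SwiGLU(x) = out * SiLU(gate), with out and gate the two halves of x
     # split along `dim` (Shazeer, "GLU Variants Improve Transformer", 2020).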
def __init__(self, dim=-1): @@ -39,7 +45,7 @@ def calc_same_padding(kernel_size): pad = kernel_size // 2 return pad, pad - (kernel_size + 1) % 2 - def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.): + def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.0): super().__init__() inner_dim = dim * expansion_factor activation_classes = { @@ -73,27 +79,30 @@ def forward(self, x): class LYNXNetResidualLayer(nn.Module): - def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.): + def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.0): super().__init__() self.diffusion_projection = nn.Conv1d(dim, dim, 1) self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1) self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size, activation=activation, dropout=dropout) - def forward(self, x, conditioner, diffusion_step): - res_x = x.transpose(1, 2) - x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner) - x = x.transpose(1, 2) - x = self.convmodule(x) # (#batch, dim, length) - x = x + res_x + def forward(self, x, conditioner, diffusion_step, front_cond_inject=False): + if front_cond_inject: + x = x + self.conditioner_projection(conditioner) + res_x = x + else: + res_x = x + x = x + self.conditioner_projection(conditioner) + x = x + self.diffusion_projection(diffusion_step) x = x.transpose(1, 2) - + x = self.convmodule(x) # (#batch, dim, length) + x = x.transpose(1, 2) + res_x return x # (#batch, length, dim) class LYNXNet(nn.Module): def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31, - activation='PReLU', dropout=0.): + activation='PReLU', dropout=0.0, strong_cond=False): """ LYNXNet(Linear Gated Depthwise Separable Convolution Network) TIPS:You can control the style of the generated results by modifying the 'activation', @@ -104,7 +113,7 @@ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansio super().__init__() self.in_dims = in_dims self.n_feats = n_feats - self.input_projection = nn.Conv1d(in_dims * n_feats, num_channels, 1) + self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1) self.diffusion_embedding = nn.Sequential( SinusoidalPosEmb(num_channels), nn.Linear(num_channels, num_channels * 4), @@ -125,7 +134,8 @@ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansio ] ) self.norm = nn.LayerNorm(num_channels) - self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1) + self.output_projection = Conv1d(num_channels, in_dims * n_feats, kernel_size=1) + self.strong_cond = strong_cond nn.init.zeros_(self.output_projection.weight) def forward(self, spec, diffusion_step, cond): @@ -142,12 +152,13 @@ def forward(self, spec, diffusion_step, cond): x = spec.flatten(start_dim=1, end_dim=2) # [B, F x M, T] x = self.input_projection(x) # x [B, residual_channel, T] - x = F.gelu(x) + if not self.strong_cond: + x = F.gelu(x) diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1) for layer in self.residual_layers: - x = layer(x, cond, diffusion_step) + x = layer(x, cond, diffusion_step, front_cond_inject=self.strong_cond) # post-norm x = self.norm(x.transpose(1, 2)).transpose(1, 2) From 3f8bc85280d66302389c8c86b7cee4df9b3e950e Mon Sep 17 00:00:00 2001 From: yxlllc <33565655+yxlllc@users.noreply.github.com> Date: Mon, 
10 Feb 2025 14:39:30 +0800 Subject: [PATCH 34/44] Improve fastspeech2 encoder using Rotary Position Embedding (RoPE) in multi-head self-attention (#234) * update multi-head self attention with RoPE * RoPE onnx (#230) * fix requirements.txt (#233) * fix rope for melody encoder * support swiglu activation for ffn * update dependencies --------- Co-authored-by: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com> --- configs/acoustic.yaml | 2 + configs/templates/config_acoustic.yaml | 2 + configs/templates/config_variance.yaml | 2 + configs/variance.yaml | 2 + modules/commons/common_layers.py | 125 +++++++-- modules/commons/rotary_embedding_torch.py | 320 ++++++++++++++++++++++ modules/fastspeech/acoustic_encoder.py | 3 +- modules/fastspeech/tts_modules.py | 38 +-- modules/fastspeech/variance_encoder.py | 6 +- requirements-onnx.txt | 1 + requirements.txt | 1 + 11 files changed, 465 insertions(+), 37 deletions(-) create mode 100644 modules/commons/rotary_embedding_torch.py diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 99fd175bd..9f27733f7 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -62,6 +62,8 @@ diffusion_type: reflow time_scale_factor: 1000 timesteps: 1000 max_beta: 0.02 +enc_ffn_kernel_size: 3 +use_rope: true rel_pos: true sampling_algorithm: euler sampling_steps: 20 diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 263d936fa..59778df99 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -69,6 +69,8 @@ augmentation_args: # diffusion and shallow diffusion diffusion_type: reflow +enc_ffn_kernel_size: 3 +use_rope: true use_shallow_diffusion: true T_start: 0.4 T_start_infer: 0.4 diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index ad051c0ec..7d5b211aa 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -63,6 +63,8 @@ voicing_db_max: -12.0 tension_logit_min: -10.0 tension_logit_max: 10.0 +enc_ffn_kernel_size: 3 +use_rope: true hidden_size: 256 dur_prediction_args: arch: fs2 diff --git a/configs/variance.yaml b/configs/variance.yaml index 3e02e430e..4f69e34f7 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -33,6 +33,8 @@ predict_breathiness: false predict_voicing: false predict_tension: false +enc_ffn_kernel_size: 3 +use_rope: true rel_pos: true hidden_size: 256 diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index b12cc7f96..3927cd272 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -1,7 +1,7 @@ from __future__ import annotations import math - +import numpy as np import torch import torch.nn.functional as F import torch.onnx.operators @@ -104,24 +104,43 @@ def max_positions(): return int(1e5) # an arbitrary large number +class SwiGLU(nn.Module): + # Swish-Applies the gated linear unit function. + def __init__(self, dim=-1): + super().__init__() + self.dim = dim + + def forward(self, x): + # out, gate = x.chunk(2, dim=self.dim) + # Using torch.split instead of chunk for ONNX export compatibility. 
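+        # Split the doubled feature dim into a value half and a gate half.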
+ out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim) + return out * F.silu(gate) + + class TransformerFFNLayer(nn.Module): def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0., act='gelu'): super().__init__() self.kernel_size = kernel_size self.dropout = dropout self.act = act - self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) + filter_size_1 = filter_size if self.act == 'relu': self.act_fn = ReLU() elif self.act == 'gelu': self.act_fn = GELU() elif self.act == 'swish': self.act_fn = SiLU() + elif self.act == 'swiglu': + self.act_fn = SwiGLU() + filter_size_1 = filter_size * 2 + else: + raise ValueError(f'{act} is not a valid activation') + self.ffn_1 = nn.Conv1d(hidden_size, filter_size_1, kernel_size, padding=kernel_size // 2) self.ffn_2 = XavierUniformInitLinear(filter_size, hidden_size) def forward(self, x): - # x: T x B x C - x = self.ffn_1(x.permute(1, 2, 0)).permute(2, 0, 1) + # x: B x T x C + x = self.ffn_1(x.transpose(1, 2)).transpose(1, 2) x = x * self.kernel_size ** -0.5 x = self.act_fn(x) @@ -130,15 +149,86 @@ def forward(self, x): return x +class MultiheadSelfAttentionWithRoPE(nn.Module): + def __init__(self, embed_dim, num_heads, dropout=0.1, bias=False, rotary_embed=None): + super().__init__() + assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads" + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + # Linear layers for Q, K, V projections + self.in_proj = nn.Linear(embed_dim, embed_dim * 3, bias=bias) + + # Final linear layer after concatenation + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + # Dropout layer + self.dropout = nn.Dropout(dropout) + + # Rotary Embeddings + self.rotary_embed = rotary_embed + + def forward(self, x, key_padding_mask=None): + # x: (B, L, C) + # key_padding_mask: (B, L) + batch_size, seq_len, embed_dim = x.size() + + # Project inputs to Q, K, V + Q, K, V = torch.split(self.in_proj(x), self.embed_dim, dim=-1) + + # Reshape Q, K, V for multi-head attention + Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) # (B, H, L, D) + K = K.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) # (B, H, L, D) + V = V.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) # (B, H, L, D) + + # Apply RoPE + if self.rotary_embed is not None: + Q = self.rotary_embed.rotate_queries_or_keys(Q) + K = self.rotary_embed.rotate_queries_or_keys(K) + + # Compute attention scores + scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.head_dim) # (B, H, L, L) + + # Apply key padding mask if provided + if key_padding_mask is not None: + # Expand mask to match attention scores shape + mask = key_padding_mask.unsqueeze(1).unsqueeze(1) # (B, 1, 1, L) + scores = scores.masked_fill(mask == 1, -np.inf) # Masked positions are set to -inf + + # Compute attention weights + attn_weights = F.softmax(scores, dim=-1) # (B, H, L, L) + attn_weights = self.dropout(attn_weights) + + # Apply attention weights to V + attn_output = torch.matmul(attn_weights, V) # (B, H, L, D) + + # Reshape and concatenate heads + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim) # (B, L, C) + + # Final linear projection + output = self.out_proj(attn_output) # (B, L, C) + + return output + + class EncSALayer(nn.Module): def __init__(self, c, num_heads, dropout, attention_dropout=0.1, - relu_dropout=0.1, kernel_size=9, 
act='gelu'): + relu_dropout=0.1, kernel_size=9, act='gelu', rotary_embed=None): super().__init__() self.dropout = dropout self.layer_norm1 = LayerNorm(c) - self.self_attn = MultiheadAttention( - c, num_heads, dropout=attention_dropout, bias=False, - ) + if rotary_embed is None: + self.self_attn = MultiheadAttention( + c, num_heads, dropout=attention_dropout, bias=False, batch_first=True + ) + self.use_rope = False + else: + self.self_attn = MultiheadSelfAttentionWithRoPE( + c, num_heads, dropout=attention_dropout, bias=False, rotary_embed=rotary_embed + ) + self.use_rope = True self.layer_norm2 = LayerNorm(c) self.ffn = TransformerFFNLayer( c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, act=act @@ -151,22 +241,25 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): self.layer_norm2.training = layer_norm_training residual = x x = self.layer_norm1(x) - x, _, = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=encoder_padding_mask - ) + if self.use_rope: + x = self.self_attn(x, key_padding_mask=encoder_padding_mask) + else: + x, _, = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask + ) x = F.dropout(x, self.dropout, training=self.training) x = residual + x - x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + x = x * (1 - encoder_padding_mask.float())[..., None] residual = x x = self.layer_norm2(x) x = self.ffn(x) x = F.dropout(x, self.dropout, training=self.training) x = residual + x - x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + x = x * (1 - encoder_padding_mask.float())[..., None] return x diff --git a/modules/commons/rotary_embedding_torch.py b/modules/commons/rotary_embedding_torch.py new file mode 100644 index 000000000..4efcb514f --- /dev/null +++ b/modules/commons/rotary_embedding_torch.py @@ -0,0 +1,320 @@ +from __future__ import annotations +from math import pi, log + +import torch +from torch.amp import autocast +from torch.nn import Module, ModuleList +from torch import nn, einsum, broadcast_tensors, Tensor + +from einops import rearrange, repeat + +from typing import Literal + +# helper functions + +def exists(val): + return val is not None + +def default(val, d): + return val if exists(val) else d + +# broadcat, as tortoise-tts was using it + +def broadcat(tensors, dim = -1): + broadcasted_tensors = broadcast_tensors(*tensors) + return torch.cat(broadcasted_tensors, dim = dim) + +def slice_at_dim(t, dim_slice: slice, *, dim): + dim += (t.ndim if dim < 0 else 0) + colons = [slice(None)] * t.ndim + colons[dim] = dim_slice + return t[tuple(colons)] + +# rotary embedding helper functions + +def rotate_half(x): + x = rearrange(x, '... (d r) -> ... d r', r = 2) + x1, x2 = x.unbind(dim = -1) + x = torch.stack((-x2, x1), dim = -1) + return rearrange(x, '... d r -> ... 
(d r)') + +@autocast('cuda', enabled = False) +def apply_rotary_emb( + freqs, + t, + start_index = 0, + scale = 1., + seq_dim = -2, + freqs_seq_dim = None +): + dtype = t.dtype + + if not exists(freqs_seq_dim): + if freqs.ndim == 2 or t.ndim == 3: + freqs_seq_dim = 0 + + if t.ndim == 3 or exists(freqs_seq_dim): + seq_len = t.shape[seq_dim] + freqs = slice_at_dim(freqs, slice(-seq_len, None), dim = freqs_seq_dim) + + rot_dim = freqs.shape[-1] + end_index = start_index + rot_dim + + assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}' + + # Split t into three parts: left, middle (to be transformed), and right + t_left = t[..., :start_index] + t_middle = t[..., start_index:end_index] + t_right = t[..., end_index:] + + # Apply rotary embeddings without modifying t in place + t_transformed = (t_middle * freqs.cos() * scale) + (rotate_half(t_middle) * freqs.sin() * scale) + + out = torch.cat((t_left, t_transformed, t_right), dim=-1) + + return out.type(dtype) + +# learned rotation helpers + +def apply_learned_rotations(rotations, t, start_index = 0, freq_ranges = None): + if exists(freq_ranges): + rotations = einsum('..., f -> ... f', rotations, freq_ranges) + rotations = rearrange(rotations, '... r f -> ... (r f)') + + rotations = repeat(rotations, '... n -> ... (n r)', r = 2) + return apply_rotary_emb(rotations, t, start_index = start_index) + +# classes + +class RotaryEmbedding(Module): + def __init__( + self, + dim, + custom_freqs: Tensor | None = None, + freqs_for: Literal['lang', 'pixel', 'constant'] = 'lang', + theta = 10000, + max_freq = 10, + num_freqs = 1, + learned_freq = False, + use_xpos = False, + xpos_scale_base = 512, + interpolate_factor = 1., + theta_rescale_factor = 1., + seq_before_head_dim = False, + cache_if_possible = True, + cache_max_seq_len = 8192 + ): + super().__init__() + # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning + # has some connection to NTK literature + # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + + theta *= theta_rescale_factor ** (dim / (dim - 2)) + + self.freqs_for = freqs_for + + if exists(custom_freqs): + freqs = custom_freqs + elif freqs_for == 'lang': + freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) + elif freqs_for == 'pixel': + freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi + elif freqs_for == 'constant': + freqs = torch.ones(num_freqs).float() + + self.cache_if_possible = cache_if_possible + self.cache_max_seq_len = cache_max_seq_len + + self.register_buffer('cached_freqs', torch.zeros(cache_max_seq_len, dim), persistent = False) + self.cached_freqs_seq_len = 0 + + self.freqs = nn.Parameter(freqs, requires_grad = learned_freq) + + self.learned_freq = learned_freq + + # dummy for device + + self.register_buffer('dummy', torch.tensor(0), persistent = False) + + # default sequence dimension + + self.seq_before_head_dim = seq_before_head_dim + self.default_seq_dim = -3 if seq_before_head_dim else -2 + + # interpolation factors + + assert interpolate_factor >= 1. 
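+        # interpolate_factor > 1 divides the position indices in get_seq_pos
+        # (position interpolation), allowing inference beyond the trained length.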
+ self.interpolate_factor = interpolate_factor + + # xpos + + self.use_xpos = use_xpos + + if not use_xpos: + return + + scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim) + self.scale_base = xpos_scale_base + + self.register_buffer('scale', scale, persistent = False) + self.register_buffer('cached_scales', torch.zeros(cache_max_seq_len, dim), persistent = False) + self.cached_scales_seq_len = 0 + + # add apply_rotary_emb as static method + + self.apply_rotary_emb = staticmethod(apply_rotary_emb) + + @property + def device(self): + return self.dummy.device + + def get_seq_pos(self, seq_len, device, dtype, offset = 0): + return (torch.arange(seq_len, device = device, dtype = dtype) + offset) / self.interpolate_factor + + def rotate_queries_or_keys(self, t, seq_dim = None, offset = 0, scale = None): + seq_dim = default(seq_dim, self.default_seq_dim) + + assert not self.use_xpos or exists(scale), 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings' + + device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim] + + seq = self.get_seq_pos(seq_len, device = device, dtype = dtype, offset = offset) + + freqs = self.forward(seq, seq_len = seq_len, offset = offset) + + if seq_dim == -3: + freqs = rearrange(freqs, 'n d -> n 1 d') + + return apply_rotary_emb(freqs, t, scale = default(scale, 1.), seq_dim = seq_dim) + + def rotate_queries_with_cached_keys(self, q, k, seq_dim = None, offset = 0): + dtype, device, seq_dim = q.dtype, q.device, default(seq_dim, self.default_seq_dim) + + q_len, k_len = q.shape[seq_dim], k.shape[seq_dim] + assert q_len <= k_len + + q_scale = k_scale = 1. + + if self.use_xpos: + seq = self.get_seq_pos(k_len, dtype = dtype, device = device) + + q_scale = self.get_scale(seq[-q_len:]).type(dtype) + k_scale = self.get_scale(seq).type(dtype) + + rotated_q = self.rotate_queries_or_keys(q, seq_dim = seq_dim, scale = q_scale, offset = k_len - q_len + offset) + rotated_k = self.rotate_queries_or_keys(k, seq_dim = seq_dim, scale = k_scale ** -1) + + rotated_q = rotated_q.type(q.dtype) + rotated_k = rotated_k.type(k.dtype) + + return rotated_q, rotated_k + + def rotate_queries_and_keys(self, q, k, seq_dim = None): + seq_dim = default(seq_dim, self.default_seq_dim) + + assert self.use_xpos + device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim] + + seq = self.get_seq_pos(seq_len, dtype = dtype, device = device) + + freqs = self.forward(seq, seq_len = seq_len) + scale = self.get_scale(seq, seq_len = seq_len).to(dtype) + + if seq_dim == -3: + freqs = rearrange(freqs, 'n d -> n 1 d') + scale = rearrange(scale, 'n d -> n 1 d') + + rotated_q = apply_rotary_emb(freqs, q, scale = scale, seq_dim = seq_dim) + rotated_k = apply_rotary_emb(freqs, k, scale = scale ** -1, seq_dim = seq_dim) + + rotated_q = rotated_q.type(q.dtype) + rotated_k = rotated_k.type(k.dtype) + + return rotated_q, rotated_k + + def get_scale( + self, + t: Tensor, + seq_len: int | None = None, + offset = 0 + ): + assert self.use_xpos + + should_cache = ( + self.cache_if_possible and + exists(seq_len) and + (offset + seq_len) <= self.cache_max_seq_len + ) + + if ( + should_cache and \ + exists(self.cached_scales) and \ + (seq_len + offset) <= self.cached_scales_seq_len + ): + return self.cached_scales[offset:(offset + seq_len)] + + scale = 1. 
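+        # xpos: length-dependent decay computed relative to the sequence midpoint.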
+ if self.use_xpos: + power = (t - len(t) // 2) / self.scale_base + scale = self.scale ** rearrange(power, 'n -> n 1') + scale = repeat(scale, 'n d -> n (d r)', r = 2) + + if should_cache and offset == 0: + self.cached_scales[:seq_len] = scale.detach() + self.cached_scales_seq_len = seq_len + + return scale + + def get_axial_freqs(self, *dims): + Colon = slice(None) + all_freqs = [] + + for ind, dim in enumerate(dims): + if self.freqs_for == 'pixel': + pos = torch.linspace(-1, 1, steps = dim, device = self.device) + else: + pos = torch.arange(dim, device = self.device) + + freqs = self.forward(pos, seq_len = dim) + + all_axis = [None] * len(dims) + all_axis[ind] = Colon + + new_axis_slice = (Ellipsis, *all_axis, Colon) + all_freqs.append(freqs[new_axis_slice]) + + all_freqs = broadcast_tensors(*all_freqs) + return torch.cat(all_freqs, dim = -1) + + @autocast('cuda', enabled = False) + def forward( + self, + t: Tensor, + seq_len: int | None = None, + offset = 0 + ): + should_cache = ( + self.cache_if_possible and + not self.learned_freq and + exists(seq_len) and + self.freqs_for != 'pixel' and + (offset + seq_len) <= self.cache_max_seq_len + ) + + if ( + should_cache and \ + exists(self.cached_freqs) and \ + (offset + seq_len) <= self.cached_freqs_seq_len + ): + return self.cached_freqs[offset:(offset + seq_len)].detach() + + freqs = self.freqs + + freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs) + freqs = repeat(freqs, '... n -> ... (n r)', r = 2) + + if should_cache and offset == 0: + self.cached_freqs[:seq_len] = freqs.detach() + self.cached_freqs_seq_len = seq_len + + return freqs diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index 6c4e54f8b..b6f986bb0 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -23,7 +23,8 @@ def __init__(self, vocab_size): hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], - use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] + use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams.get('rel_pos', False), + use_rope=hparams.get('use_rope', False) ) self.pitch_embed = Linear(1, hparams['hidden_size']) diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index 1dd164d17..391de11ab 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn from torch.nn import functional as F - +from modules.commons.rotary_embedding_torch import RotaryEmbedding from modules.commons.common_layers import SinusoidalPositionalEmbedding, EncSALayer from modules.commons.espnet_positional_embedding import RelPositionalEncoding @@ -12,13 +12,13 @@ class TransformerEncoderLayer(nn.Module): - def __init__(self, hidden_size, dropout, kernel_size=None, act='gelu', num_heads=2): + def __init__(self, hidden_size, dropout, kernel_size=None, act='gelu', num_heads=2, rotary_embed=None): super().__init__() self.op = EncSALayer( hidden_size, num_heads, dropout=dropout, attention_dropout=0.0, relu_dropout=dropout, kernel_size=kernel_size, - act=act + act=act, rotary_embed=rotary_embed ) def forward(self, x, **kwargs): @@ -353,18 +353,21 @@ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None): class FastSpeech2Encoder(nn.Module): def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, ffn_act='gelu', - dropout=None, 
num_heads=2, use_pos_embed=True, rel_pos=True): + dropout=None, num_heads=2, use_pos_embed=True, rel_pos=True, use_rope=False): super().__init__() self.num_layers = num_layers embed_dim = self.hidden_size = hidden_size self.dropout = dropout self.use_pos_embed = use_pos_embed - + if use_pos_embed and use_rope: + rotary_embed = RotaryEmbedding(dim = embed_dim // num_heads) + else: + rotary_embed = None self.layers = nn.ModuleList([ TransformerEncoderLayer( self.hidden_size, self.dropout, kernel_size=ffn_kernel_size, act=ffn_act, - num_heads=num_heads + num_heads=num_heads, rotary_embed=rotary_embed ) for _ in range(self.num_layers) ]) @@ -373,7 +376,9 @@ def __init__(self, hidden_size, num_layers, self.embed_scale = math.sqrt(hidden_size) self.padding_idx = 0 self.rel_pos = rel_pos - if self.rel_pos: + if use_rope: + self.embed_positions = None + elif self.rel_pos: self.embed_positions = RelPositionalEncoding(hidden_size, dropout_rate=0.0) else: self.embed_positions = SinusoidalPositionalEmbedding( @@ -385,7 +390,7 @@ def forward_embedding(self, main_embed, extra_embed=None, padding_mask=None): x = self.embed_scale * main_embed if extra_embed is not None: x = x + extra_embed - if self.use_pos_embed: + if self.use_pos_embed and self.embed_positions is not None: if self.rel_pos: x = self.embed_positions(x) else: @@ -396,7 +401,7 @@ def forward_embedding(self, main_embed, extra_embed=None, padding_mask=None): def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_hiddens=False): x = self.forward_embedding(main_embed, extra_embed, padding_mask=padding_mask) # [B, T, H] - nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None] # [T, B, 1] + nonpadding_mask_BT = 1 - padding_mask.float()[:, :, None] # [B, T, 1] # NOTICE: # The following codes are commented out because @@ -411,16 +416,13 @@ def forward(self, main_embed, extra_embed, padding_mask, attn_mask=None, return_ # x = x + positions # x = F.dropout(x, p=self.dropout, training=self.training) - # B x T x C -> T x B x C - x = x.transpose(0, 1) * nonpadding_mask_TB + x = x * nonpadding_mask_BT hiddens = [] for layer in self.layers: - x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB - hiddens.append(x) - x = self.layer_norm(x) * nonpadding_mask_TB + x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_BT + if return_hiddens: + hiddens.append(x) + x = self.layer_norm(x) * nonpadding_mask_BT if return_hiddens: - x = torch.stack(hiddens, 0) # [L, T, B, C] - x = x.transpose(1, 2) # [L, B, T, C] - else: - x = x.transpose(0, 1) # [B, T, C] + x = torch.stack(hiddens, 0) # [L, B, T, C] return x diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index a02e6e010..deab9ee84 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -32,7 +32,8 @@ def __init__(self, vocab_size): hidden_size=hparams['hidden_size'], num_layers=hparams['enc_layers'], ffn_kernel_size=hparams['enc_ffn_kernel_size'], ffn_act=hparams['ffn_act'], dropout=hparams['dropout'], num_heads=hparams['num_heads'], - use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams['rel_pos'] + use_pos_embed=hparams['use_pos_embed'], rel_pos=hparams.get('rel_pos', False), + use_rope=hparams.get('use_rope', False) ) dur_hparams = hparams['dur_prediction_args'] @@ -121,7 +122,8 @@ def get_hparam(key): hidden_size=hidden_size, num_layers=get_hparam('enc_layers'), 
ffn_kernel_size=get_hparam('enc_ffn_kernel_size'), ffn_act=get_hparam('ffn_act'), dropout=get_hparam('dropout'), num_heads=get_hparam('num_heads'), - use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos') + use_pos_embed=get_hparam('use_pos_embed'), rel_pos=get_hparam('rel_pos'), + use_rope=get_hparam('use_rope') ) self.out_proj = Linear(hidden_size, hparams['hidden_size']) diff --git a/requirements-onnx.txt b/requirements-onnx.txt index 976591f70..dda531484 100644 --- a/requirements-onnx.txt +++ b/requirements-onnx.txt @@ -3,6 +3,7 @@ # See instructions at https://pytorch.org/get-started/previous-versions/ click +einops>=0.7.0 h5py librosa<0.10.0 lightning~=2.1.0 diff --git a/requirements.txt b/requirements.txt index 90f3f9c5e..8f79e2382 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ # See instructions at https://pytorch.org/get-started/locally/ click +einops>=0.7.0 h5py librosa<0.10.0 lightning~=2.3.0 From 575d0aba7229397a8a0d3f9c784b7002a02500f6 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Sun, 18 Aug 2024 00:30:04 +0800 Subject: [PATCH 35/44] support mini-nsf-hifigan vocoder --- modules/nsf_hifigan/env.py | 2 + modules/nsf_hifigan/models.py | 84 +++++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/modules/nsf_hifigan/env.py b/modules/nsf_hifigan/env.py index ebb9486d3..04abfd9dc 100644 --- a/modules/nsf_hifigan/env.py +++ b/modules/nsf_hifigan/env.py @@ -18,6 +18,8 @@ def __setitem__(self, key, value): return super(AttrDict, self).__setitem__(key, value) def __getitem__(self, name): + if name not in super(AttrDict, self).keys(): + return None return super(AttrDict, self).__getitem__(name) def __delitem__(self, name): diff --git a/modules/nsf_hifigan/models.py b/modules/nsf_hifigan/models.py index cc21039f7..085907d94 100644 --- a/modules/nsf_hifigan/models.py +++ b/modules/nsf_hifigan/models.py @@ -209,46 +209,74 @@ def __init__(self, h): self.h = h self.num_kernels = len(h.resblock_kernel_sizes) self.num_upsamples = len(h.upsample_rates) - self.m_source = SourceModuleHnNSF( - sampling_rate=h.sampling_rate, - harmonic_num=8 - ) - self.noise_convs = nn.ModuleList() - self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) - resblock = ResBlock1 if h.resblock == '1' else ResBlock2 - + self.mini_nsf = h.mini_nsf + + if h.mini_nsf: + self.source_sr = h.sampling_rate / int(np.prod(h.upsample_rates[2: ])) + self.upp = int(np.prod(h.upsample_rates[: 2])) + else: + self.source_sr = h.sampling_rate + self.upp = int(np.prod(h.upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=h.sampling_rate, + harmonic_num=8 + ) + self.noise_convs = nn.ModuleList() + + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): - c_cur = h.upsample_initial_channel // (2 ** (i + 1)) - self.ups.append(weight_norm( - ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)), - k, u, padding=(k - u) // 2))) - if i + 1 < len(h.upsample_rates): # - stride_f0 = int(np.prod(h.upsample_rates[i + 1:])) - self.noise_convs.append(Conv1d( - 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) - else: - self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) self.resblocks = nn.ModuleList() + resblock = ResBlock1 if h.resblock == '1' else ResBlock2 ch = 
h.upsample_initial_channel - for i in range(len(self.ups)): + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): ch //= 2 + self.ups.append(weight_norm(ConvTranspose1d(2 * ch, ch, k, u, padding=(k - u) // 2))) for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): self.resblocks.append(resblock(h, ch, k, d)) + if not h.mini_nsf: + if i + 1 < len(h.upsample_rates): # + stride_f0 = int(np.prod(h.upsample_rates[i + 1:])) + self.noise_convs.append(Conv1d( + 1, ch, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) + else: + self.noise_convs.append(Conv1d(1, ch, kernel_size=1)) + elif i == 1: + self.source_conv = Conv1d(1, ch, 1) + self.source_conv.apply(init_weights) self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) self.conv_post.apply(init_weights) - self.upp = int(np.prod(h.upsample_rates)) - + + def fastsinegen(self, f0): + n = torch.arange(1, self.upp + 1, device=f0.device) + s0 = f0.unsqueeze(-1) / self.source_sr + ds0 = F.pad(s0[:, 1:, :] - s0[:, :-1, :], (0, 0, 0, 1)) + rad = s0 * n + 0.5 * ds0 * n * (n - 1) / self.upp + rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 + rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) + rad += F.pad(rad_acc, (0, 0, 1, -1)) + rad = rad.reshape(f0.shape[0], 1, -1) + sines = torch.sin(2 * np.pi * rad) + return sines + def forward(self, x, f0): - har_source = self.m_source(f0, self.upp).transpose(1, 2) + if self.mini_nsf: + har_source = self.fastsinegen(f0) + else: + har_source = self.m_source(f0, self.upp).transpose(1, 2) x = self.conv_pre(x) for i in range(self.num_upsamples): x = F.leaky_relu(x, LRELU_SLOPE) x = self.ups[i](x) - x_source = self.noise_convs[i](har_source) - x = x + x_source + if not self.mini_nsf: + x_source = self.noise_convs[i](har_source) + x = x + x_source + elif i == 1: + x_source = self.source_conv(har_source) + x = x + x_source xs = None for j in range(self.num_kernels): if xs is None: @@ -259,14 +287,14 @@ def forward(self, x, f0): x = F.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) - return x def remove_weight_norm(self): - rank_zero_info('Removing weight norm...') + # rank_zero_info('Removing weight norm...') + print('Removing weight norm...') for l in self.ups: remove_weight_norm(l) for l in self.resblocks: l.remove_weight_norm() remove_weight_norm(self.conv_pre) - remove_weight_norm(self.conv_post) + remove_weight_norm(self.conv_post) \ No newline at end of file From 51da9ec499b5ed8b02f74b9118ece035ab0b3cb1 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Sun, 18 Aug 2024 17:15:22 +0800 Subject: [PATCH 36/44] discard negative pad --- modules/nsf_hifigan/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nsf_hifigan/models.py b/modules/nsf_hifigan/models.py index 085907d94..1bdfa21e1 100644 --- a/modules/nsf_hifigan/models.py +++ b/modules/nsf_hifigan/models.py @@ -137,7 +137,7 @@ def _f02sine(self, f0, upp): rad = f0 / self.sampling_rate * torch.arange(1, upp + 1, device=f0.device) rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) - rad += F.pad(rad_acc, (0, 0, 1, -1)) + rad += F.pad(rad_acc[:, :-1, :], (0, 0, 1, 0)) rad = rad.reshape(f0.shape[0], -1, 1) rad = torch.multiply(rad, torch.arange(1, self.dim + 1, device=f0.device).reshape(1, 1, -1)) rand_ini = torch.rand(1, 1, self.dim, device=f0.device) @@ -257,7 +257,7 @@ def fastsinegen(self, f0): rad = s0 * n + 0.5 * ds0 * n * (n - 1) / self.upp rad2 = 
torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) - rad += F.pad(rad_acc, (0, 0, 1, -1)) + rad += F.pad(rad_acc[:, :-1, :], (0, 0, 1, 0)) rad = rad.reshape(f0.shape[0], 1, -1) sines = torch.sin(2 * np.pi * rad) return sines From 960bf90a6bc64c5252028165987a845376559d22 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Fri, 14 Feb 2025 11:55:41 +0800 Subject: [PATCH 37/44] fix MHA inference using low torch version --- modules/backbones/lynxnet.py | 15 +-------------- modules/commons/common_layers.py | 4 +++- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/modules/backbones/lynxnet.py b/modules/backbones/lynxnet.py index 18e7bf497..5dbd1d0a1 100644 --- a/modules/backbones/lynxnet.py +++ b/modules/backbones/lynxnet.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F -from modules.commons.common_layers import SinusoidalPosEmb +from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU from utils.hparams import hparams @@ -16,19 +16,6 @@ def __init__(self, *args, **kwargs): nn.init.kaiming_normal_(self.weight) -class SwiGLU(nn.Module): - # Swish-Applies the gated linear unit function. - def __init__(self, dim=-1): - super().__init__() - self.dim = dim - - def forward(self, x): - # out, gate = x.chunk(2, dim=self.dim) - # Using torch.split instead of chunk for ONNX export compatibility. - out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim) - return out * F.silu(gate) - - class Transpose(nn.Module): def __init__(self, dims): super().__init__() diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 3927cd272..bf4a2822c 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -221,7 +221,7 @@ def __init__(self, c, num_heads, dropout, attention_dropout=0.1, self.layer_norm1 = LayerNorm(c) if rotary_embed is None: self.self_attn = MultiheadAttention( - c, num_heads, dropout=attention_dropout, bias=False, batch_first=True + c, num_heads, dropout=attention_dropout, bias=False, batch_first=False ) self.use_rope = False else: @@ -244,12 +244,14 @@ def forward(self, x, encoder_padding_mask=None, **kwargs): if self.use_rope: x = self.self_attn(x, key_padding_mask=encoder_padding_mask) else: + x = x.transpose(0, 1) x, _, = self.self_attn( query=x, key=x, value=x, key_padding_mask=encoder_padding_mask ) + x = x.transpose(0, 1) x = F.dropout(x, self.dropout, training=self.training) x = residual + x x = x * (1 - encoder_padding_mask.float())[..., None] From 84b32ed2eb4b6e0477efc5f68dc6d0a287156967 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 16 Feb 2025 22:20:37 +0800 Subject: [PATCH 38/44] Fix missing phoneme list sorting --- basics/base_binarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py index d1f812015..397bd8305 100644 --- a/basics/base_binarizer.py +++ b/basics/base_binarizer.py @@ -279,7 +279,7 @@ def display_phoneme(phoneme): for idx in ph_idx_required.difference(ph_idx_occurred) }, key=lambda v: v[0] if isinstance(v, tuple) else v) raise BinarizationError( - f'The following phonemes are not covered in transcriptions: {sorted(missing_phones)}' + f'The following phonemes are not covered in transcriptions: {missing_phones}' ) def process_dataset(self, prefix, num_workers=0, apply_augmentation=False): From 7741b5555ccd7b3dfc97f4c0d17dd344128a5d83 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Mon, 17 Feb 2025 23:58:29 +0800 Subject: [PATCH 39/44] Fix 
single-language dictionary parsing language tag --- utils/phoneme_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 50145979e..7dc27afa6 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -142,6 +142,8 @@ def is_cross_lingual(self, phone): return phone in self._cross_lingual_phonemes def encode_one(self, phone, lang=None): + if '/' in phone: + lang, phone = phone.split('/', maxsplit=1) if lang is None or not self._multi_langs or phone in self._phone_to_id: return self._phone_to_id[phone] if '/' not in phone: From 58edd2fefc239d1229f82d735716dc373dedc161 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sun, 23 Mar 2025 00:04:00 +0800 Subject: [PATCH 40/44] Add `pitch_controllable` flag to vocoder exporter (cherry picked from commit a6deb6b5c3dcca554546e790328278d493bdf8e9) --- deployment/exporters/nsf_hifigan_exporter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deployment/exporters/nsf_hifigan_exporter.py b/deployment/exporters/nsf_hifigan_exporter.py index cbc052ce6..2f0a6b402 100644 --- a/deployment/exporters/nsf_hifigan_exporter.py +++ b/deployment/exporters/nsf_hifigan_exporter.py @@ -25,6 +25,7 @@ def __init__( super().__init__(device=device, cache_dir=cache_dir) self.model_path = model_path self.model_name = model_name + self.vocoder_pitch_controllable = False self.model = self.build_model() self.model_class_name = remove_suffix(self.model.__class__.__name__, 'ONNX') self.model_cache_path = (self.cache_dir / self.model_name).with_suffix('.onnx') @@ -38,6 +39,7 @@ def build_model(self) -> nn.Module: "See https://github.com/openvpi/DiffSinger/releases/tag/v2.3.0 for more details." ) model = NSFHiFiGANONNX(config).eval().to(self.device) + self.vocoder_pitch_controllable = config.get("pc_aug", False) load_ckpt(model.generator, str(self.model_path), prefix_in_ckpt=None, key_in_ckpt='generator', strict=True, device=self.device) @@ -73,6 +75,10 @@ def export_attachments(self, path: Path): 'mel_fmax': hparams['fmax'] if hparams['fmax'] is not None else hparams['audio_sample_rate'] / 2, 'mel_base': 'e', 'mel_scale': 'slaney', + 'pitch_controllable': self.vocoder_pitch_controllable, + # Some old vocoder versions may have severe performance issues on CUDA; + # the issues were fixed in newer versions, and this flag is to distinguish them + 'force_on_cpu': False, }, fw, sort_keys=False) print(f'| export configs => {config_path} **PLEASE EDIT BEFORE USE**') From 38335bf02f3d45335aca33f644b791baf2694ddb Mon Sep 17 00:00:00 2001 From: yxlllc Date: Sun, 23 Mar 2025 02:05:21 +0800 Subject: [PATCH 41/44] support noise injection --- modules/nsf_hifigan/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/nsf_hifigan/models.py b/modules/nsf_hifigan/models.py index 1bdfa21e1..084949886 100644 --- a/modules/nsf_hifigan/models.py +++ b/modules/nsf_hifigan/models.py @@ -210,7 +210,8 @@ def __init__(self, h): self.num_kernels = len(h.resblock_kernel_sizes) self.num_upsamples = len(h.upsample_rates) self.mini_nsf = h.mini_nsf - + self.noise_sigma = h.noise_sigma + if h.mini_nsf: self.source_sr = h.sampling_rate / int(np.prod(h.upsample_rates[2: ])) self.upp = int(np.prod(h.upsample_rates[: 2])) @@ -268,6 +269,8 @@ def forward(self, x, f0): else: har_source = self.m_source(f0, self.upp).transpose(1, 2) x = self.conv_pre(x) + if self.noise_sigma is not None and self.noise_sigma > 0: + x += self.noise_sigma * torch.randn_like(x) for i in range(self.num_upsamples): x = F.leaky_relu(x, 
LRELU_SLOPE) x = self.ups[i](x) From 4a56fce58d825ca8d1dba3158211ec5d0abdf50e Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 29 Mar 2025 00:20:30 +0800 Subject: [PATCH 42/44] Allow merging global phonemes and language-specific phonemes --- utils/phoneme_utils.py | 52 +++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index 7dc27afa6..c208be705 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -14,6 +14,7 @@ def __init__( extra_phonemes: List[str] = None, merged_groups: List[List[str]] = None ): + # Step 1: Collect all phonemes all_phonemes = {'AP', 'SP'} if extra_phonemes: for ph in extra_phonemes: @@ -43,47 +44,36 @@ def __init__( all_phonemes.add(f'{lang}/{phoneme}') else: all_phonemes.add(phoneme) + # Step 2: Parse merged phoneme groups if merged_groups is None: merged_groups = [] else: - if self._multi_langs: - for group in merged_groups: - for phoneme in group: - if '/' not in phoneme: - raise ValueError( - f"Invalid phoneme tag '{phoneme}' in merged group: " - "should specify language by '/' prefix." - ) + _merged_groups = [] + for group in merged_groups: + _group = [] + for phoneme in group: + if '/' in phoneme: lang, name = phoneme.split('/', maxsplit=1) if lang not in dictionaries: raise ValueError( f"Invalid phoneme tag '{phoneme}' in merged group: " f"unrecognized language name '{lang}'." ) - unique_name = phoneme if self._multi_langs else name - if unique_name not in all_phonemes: - raise ValueError( - f"Invalid phoneme tag '{phoneme}' in merged group: " - f"not found in phoneme set." - ) - merged_groups = [set(phones) for phones in merged_groups if len(phones) > 1] - else: - _merged_groups = [] - for group in merged_groups: - _group = [] - for phoneme in group: - if '/' in phoneme: - lang, name = phoneme.split('/', maxsplit=1) - if lang not in dictionaries: - raise ValueError( - f"Invalid phoneme tag '{phoneme}' in merged group: " - f"unrecognized language name '{lang}'." - ) - _group.append(name) + if self._multi_langs: + element = phoneme else: - _group.append(phoneme) - _merged_groups.append(_group) - merged_groups = [set(phones) for phones in _merged_groups if len(phones) > 1] + element = name + else: + element = phoneme + if element not in all_phonemes: + raise ValueError( + f"Invalid phoneme tag '{phoneme}' in merged group: " + f"not found in phoneme set." + ) + _group.append(element) + _merged_groups.append(_group) + merged_groups = [set(phones) for phones in _merged_groups if len(phones) > 1] + # Step 3: Build phoneme index merged_phonemes_inverted_index = {} for idx, group in enumerate(merged_groups): other_idx = None From 21a0f6bff93d50d911295214af9d132977165f3c Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 29 Mar 2025 02:25:33 +0800 Subject: [PATCH 43/44] Check for conflicts between short names and global tags --- utils/phoneme_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utils/phoneme_utils.py b/utils/phoneme_utils.py index c208be705..ca1af6203 100644 --- a/utils/phoneme_utils.py +++ b/utils/phoneme_utils.py @@ -25,6 +25,11 @@ def __init__( f"Invalid phoneme tag '{ph}' in extra phonemes: " f"unrecognized language name '{lang}'." ) + if name in all_phonemes: + raise ValueError( + f"Invalid phoneme tag '{ph}' in extra phonemes: " + f"short name conflicts with existing tag." 
+                )
         all_phonemes.add(ph)
         self._multi_langs = len(dictionaries) > 1

From 7b58b46fee3ba3c3cf250cc0b0c402ff7347b9c3 Mon Sep 17 00:00:00 2001
From: yqzhishen
Date: Sat, 29 Mar 2025 02:27:34 +0800
Subject: [PATCH 44/44] Finish documentation for multi-dictionary

---
 docs/BestPractices.md        | 173 ++++++++++++++++++++--------------
 docs/ConfigurationSchemas.md | 174 ++++++++++++++++++++++++-----------
 docs/GettingStarted.md       |   4 +-
 3 files changed, 225 insertions(+), 126 deletions(-)

diff --git a/docs/BestPractices.md b/docs/BestPractices.md
index 04426b836..cc9c26dd9 100644
--- a/docs/BestPractices.md
+++ b/docs/BestPractices.md
@@ -1,42 +1,126 @@
 # Best Practices
 
-## Materials for training and using models
+## Fundamental concepts and materials
 
-### Datasets
+### Configuration files
 
-A dataset mainly includes recordings and transcriptions, which is called a _raw dataset_. Raw datasets should be organized as the following folder structure:
+A configuration file is a YAML file that defines enabled features and model hyperparameters, and controls the behavior of the binarizer, trainer and inference. Almost all settings and controls in this repository, including the practices in this guidance, are achieved through configuration files.
 
-- my_raw_data/
-  - wavs/
-    - 001.wav
-    - 002.wav
-    - ... (more recording files)
-  - transcriptions.csv
+For more information on the configuration system and configurable attributes, see [Configuration Schemas](ConfigurationSchemas.md).
 
-In the example above, the _my_raw_data_ folder is the root directory of a raw dataset.
+### Languages
 
-The _transcriptions.csv_ file contains all labels of the recordings. The common column of the CSV file is `name`, which represents all recording items by their filenames **without extension**. Elements of sequence attributes should be split by `space`. Other required columns may vary according to the category of the model you are training, and will be introduced in the following sections.
+Each language you are dealing with should have a unique tag in the configuration file. **We highly recommend using ISO 639 language codes as language tags.** For example, `zh` and `zho` stand for Chinese (`cmn` specifically for Mandarin Chinese), `ja` and `jpn` for Japanese, `en` and `eng` for English, and `yue` for Cantonese (Yue). You can download a complete language code table from https://iso639-3.sil.org/code_tables/download_tables.
+
+### Phonemes
+
+Phonemes are the fundamental part of dictionaries and labels. There are two types of phonemes: language-specific phonemes and global phonemes.
+
+**Language-specific phonemes:** If there are multiple languages, all language-specific phonemes are prefixed with their language name, for example `zh/a`, `ja/o`, `en/eh`. These are called the **full names** of the phonemes, while `a`, `o`, `eh` are called the **short names**, which have a definite meaning only in a specific language context. If there is only one language, the short names are enough to determine each phoneme.
+
+**Global phonemes:** Some phonemes do not belong to any language. There are two reserved global phoneme tags: `SP` for space and `AP` for aspiration. There can also be other user-defined tags (`EP`, `GS`, `VF`, etc.). These tags are never prefixed with a language, and they take priority when phoneme names are resolved.
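To make the resolution rules above concrete, here is a minimal sketch of how a label could be mapped to its full name under a language context. This is not the repository's actual code; `GLOBAL_TAGS`, `DICTIONARIES` and `full_name` are illustrative names only.

```python
# A minimal sketch of the naming rules above; NOT the repository's actual
# implementation. GLOBAL_TAGS and DICTIONARIES are illustrative stand-ins.
GLOBAL_TAGS = {'SP', 'AP', 'EP'}   # reserved tags plus a user-defined one
DICTIONARIES = {'zh', 'ja', 'en'}  # language tags defined in the config

def full_name(phone: str, lang: str) -> str:
    """Resolve a phoneme label to its full name under a language context."""
    if phone in GLOBAL_TAGS:
        return phone                        # global tags are never prefixed
    if '/' in phone:
        tag, _ = phone.split('/', maxsplit=1)
        if tag not in DICTIONARIES:
            raise ValueError(f'unrecognized language tag: {tag}')
        return phone                        # already a full name
    return f'{lang}/{phone}'                # short name: apply the context

assert full_name('a', 'zh') == 'zh/a'
assert full_name('en/eh', 'zh') == 'en/eh'  # explicit prefix beats context
assert full_name('AP', 'zh') == 'AP'
```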
+
+Extra phonemes, including user-defined global phonemes and additional language-specific phonemes that are not present in the dictionaries, can be defined as a list in the configuration file (full names should be used):
+
+```yaml
+extra_phonemes: ['EP', 'ja/cl']
+```
+
+The phoneme set expands rapidly with the number of languages, but many similar phonemes across languages can be merged. Define the merging groups in your configuration file (full names should be used):
+
+```yaml
+merged_phoneme_groups:
+  - [zh/i, ja/i, en/iy]
+  - [zh/s, ja/s, en/s]
+  - [ja/cl, SP]  # global phonemes can also be merged
+  # ... (other groups omitted for brevity)
+use_lang_id: true  # whether to use language embedding; only takes effect if there are cross-lingual phonemes
+```
+
+Merging phonemes does not mean they become exactly the same to the dictionary. For cross-lingual merged phonemes, setting `use_lang_id` to true will still distinguish them by their language IDs.
+
+#### Phoneme naming principles
+
+- Short names of language-specific phonemes should not conflict with global phoneme names, including reserved ones.
+- `/` cannot be used because it is already used for splitting the language tag and the short name.
+- `-` and `+` cannot be used because they are defined as slur tags in most singing voice synthesis editors.
+- Other special characters, including but not limited to `@`, `#`, `&`, `|`, `<`, `>`, are not recommended because they may be used as special tags in future format changes.
+- ASCII characters are preferred for the best encoding compatibility, but all UTF-8 characters are acceptable.
 
 ### Dictionaries
 
-A dictionary is a .txt file, in which each line represents a mapping rule from one syllable to its phoneme sequence. The syllable and the phonemes are split by `tab`, and the phonemes are split by `space`:
+Each language should have a corresponding dictionary. Define languages and dictionaries in your configuration file:
+
+```yaml
+dictionaries:
+  zh: dictionaries/opencpop-extension.txt
+  ja: dictionaries/japanese_dict_full.txt
+  en: dictionaries/ds_cmudict-07b.txt
+num_lang: 3  # number of languages; should be >= number of defined languages
+```
+
+Each dictionary is a *.txt* file, in which each line represents a mapping rule from one syllable to its phoneme sequence. The syllable and the phonemes are split by `tab`, and the phonemes are split by `space`:
 
 ```
 <syllable>	<phoneme1> <phoneme2> ...
 ```
 
-Syllable names and phoneme names can be customized, but with the following limitations/suggestions:
+#### Syllable naming principles
 
-- `SP` (rest), `AP` (breath) and `<PAD>` (padding) cannot be used because they are reserved.
+- Try to use a standard writing or pronouncing system, for example pinyin for Mandarin Chinese, romaji for Japanese, and English words for English.
+- `AP` and `SP` cannot be used because they are reserved tags when using DiffSinger in editors.
+- `/` cannot be used because it is already used for splitting the language tag and the short name.
 - `-` and `+` cannot be used because they are defined as slur tags in most singing voice synthesis editors.
-Special characters including but not limited to `@`, `#`, `&`, `|`, `/`, `<`, `>`, etc. should be avoided because they may be used as special tags in the future format changes. Using them now is okay, and all modifications will be notified in advance.
+- Syllable names should not start with `.` because it may have special meanings in future editors.
+- Other special characters, including but not limited to `@`, `#`, `&`, `|`, `<`, `>`, are not recommended because they may be used as special tags in future format changes.
 - ASCII characters are preferred for the best encoding compatibility, but all UTF-8 characters are acceptable.
 
-There are some preset dictionaries in the [dictionaries/](../dictionaries) folder. For the guidance of using a custom dictionary, see [Using custom dictionaries](#using-custom-dictionaries).
+There are some example dictionaries in the [dictionaries/](../dictionaries) folder.
 
-### Configuration files
+### Datasets
 
-A configuration file is a YAML file that defines enabled features, model hyperparameters and controls the behavior of the binarizer, trainer and inference. For more information of the configuration system and configurable attributes, see [Configuration Schemas](ConfigurationSchemas.md).
+A dataset mainly includes recordings and transcriptions, which is called a _raw dataset_. Raw datasets should be organized as the following folder structure:
+
+- my_raw_data/
+  - wavs/
+    - 001.wav
+    - 002.wav
+    - ... (more recording files)
+  - transcriptions.csv
+
+In the example above, the _my_raw_data_ directory is the root directory of a raw dataset.
+
+The _transcriptions.csv_ file contains all labels of the recordings. The common column of the CSV file is `name`, which represents all recording items by their filenames **without extension**. Elements of sequence attributes should be split by `space`. Other required columns may vary according to the category of the model you are training, and will be introduced in the following sections.
+
+Each dataset should have a main language. If you have many recordings in multiple languages, it is recommended to separate them by language (you can merge their speaker IDs in the configuration). In each dataset, the main language is set as the language context, and phoneme labels in transcriptions.csv do not need a prefix (short names). It is also valid if there are phonemes from other languages, but all of them should be prefixed with their actual language (full names). Global phonemes should not be prefixed in any dataset.
+
+You can define your datasets in the configuration file like this:
+
+```yaml
+datasets:  # define all raw datasets
+  - raw_data_dir: data/spk1-zh/raw  # path to the root of a raw dataset
+    speaker: speaker1  # speaker name
+    spk_id: 0  # optional; use this to merge two datasets; otherwise automatically assigned
+    language: zh  # language tag (main language) of this dataset
+    test_prefixes:  # optional; validation samples from this dataset
+      - wav1
+      - wav2
+  - raw_data_dir: data/spk1-en/raw
+    speaker: speaker1
+    spk_id: 0  # specify the same speaker ID to merge into the previous one
+    language: en
+    test_prefixes:
+      - wav1
+      - wav2
+  - raw_data_dir: data/spk2/raw
+    speaker: speaker2
+    language: ja
+    test_prefixes:
+      - wav1
+      - wav2
+  # ... (other datasets omitted for brevity)
+num_spk: 2  # number of speakers; should be > maximum speaker ID
+```
 
 ### DS files
 
@@ -54,7 +138,7 @@ The [DiffSinger Community Vocoders Project](https://openvpi.github.io/vocoders)
 
 The pre-trained vocoder can be fine-tuned on your target dataset. It is highly recommended to do so because fine-tuned vocoder can generate much better results on specific (seen) datasets while does not need much computing resources. See the [vocoder training and fine-tuning repository](https://github.com/openvpi/SingingVocoders) for detailed instructions.
After you get the fine-tuned vocoder checkpoint, you can configure it by `vocoder_ckpt` key in your configuration file. The fine-tuned NSF-HiFiGAN vocoder checkpoints can be exported to ONNX format like other DiffSinger user models for further production purposes.
 
-Another unrecommended option: train a ultra-lightweight [DDSP vocoder](https://github.com/yxlllc/pc-ddsp) first by yourself, then configure it according to the relevant [instructions](https://github.com/yxlllc/pc-ddsp/blob/master/DiffSinger.md).
+Another unrecommended option: train an ultra-lightweight [DDSP vocoder](https://github.com/yxlllc/pc-ddsp) first by yourself, then configure it according to the relevant [instructions](https://github.com/yxlllc/pc-ddsp/blob/master/DiffSinger.md).
 
 #### Feature extractors or auxiliary models
 
@@ -108,57 +192,6 @@ Functionalities of variance models are defined by their outputs. There are three
 
 There may be some mutual influence between the modules above when they are enabled together. See [mutual influence between variance modules](#mutual-influence-between-variance-modules) for more details.
 
-## Using custom dictionaries
-
-This section is about using a custom grapheme-to-phoneme dictionary for any language(s).
-
-### Add a dictionary
-
-Assume that you have made a dictionary file named `my_dict.txt`. Edit your configuration file:
-
-```yaml
-dictionary: my_dict.txt
-```
-
-Then you can binarize your data as normal. The phonemes in your dataset must cover, and must only cover the phonemes appeared in your dictionary. Otherwise, the binarizer will raise an error:
-
-```
-AssertionError: transcriptions and dictionary mismatch.
- (+) ['E', 'En', 'i0', 'ir']
- (-) ['AP', 'SP']
-```
-
-This means there are 4 unexpected symbols in the data labels (`ir`, `i0`, `E`, `En`) and 2 missing phonemes that are not covered by the data labels (`AP`, `SP`).
-
-Once the coverage checks passed, a phoneme distribution summary will be saved into your binary data directory. Below is an example.
-
-![phoneme-distribution](resources/phoneme-distribution.jpg)
-
-During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. There are one padding index (marked as `<PAD>`).

diff --git a/docs/ConfigurationSchemas.md b/docs/ConfigurationSchemas.md
--- a/docs/ConfigurationSchemas.md
+++ b/docs/ConfigurationSchemas.md
 
-### dictionary
+### datasets
 
-Path to the word-phoneme mapping dictionary file. Training data must fully cover phonemes in the dictionary.
+List of dataset configs for preprocessing.
 
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | normal |
+| type | List[dict] |
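As a rough illustration of the constraints documented in the `datasets[].*` entries below, a loader might validate each entry like this. This is a sketch under the assumption that the parsed config is a plain dict named `config`; `validate_datasets` is a hypothetical helper, not part of the repository.

```python
# Sketch of validating `datasets` entries against the documented constraints;
# `config` and `validate_datasets` are hypothetical names.
def validate_datasets(config: dict) -> None:
    for i, ds in enumerate(config['datasets']):
        for key in ('raw_data_dir', 'speaker', 'language'):
            if key not in ds:
                raise ValueError(f'datasets[{i}] is missing required key: {key}')
        if ds['language'] not in config['dictionaries']:
            raise ValueError(
                f'datasets[{i}]: language {ds["language"]!r} is not a key of dictionaries'
            )

validate_datasets({
    'dictionaries': {'zh': 'dictionaries/opencpop-extension.txt'},
    'datasets': [{'raw_data_dir': 'data/spk1/raw', 'speaker': 'spk1', 'language': 'zh'}],
})  # passes silently
```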
+
+### datasets[].language
+
+Language context of this dataset. Must be a key of [dictionaries](#dictionaries).
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | required |
+| type | str |
+
+### datasets[].raw_data_dir
+
+Path to the root directory of this raw dataset, which contains the wave files, transcriptions, etc.
+
+| visibility | all |
+| scope | preprocessing |
+| customizability | required |
+| type | str |
+
+### datasets[].speaker
+
+The name of the speaker of this dataset. Speaker names are mapped to speaker indexes and stored into spk_map.json when preprocessing.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | required |
+| type | str |
+
+### datasets[].spk_id
+
+The speaker ID assigned to this dataset. It will be assigned automatically if not given. IDs can be duplicate or discontinuous, which allows merging multiple datasets into one speaker.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | normal |
+| type | int |
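A sketch of how the automatic assignment described above could behave; `assign_spk_ids` is a hypothetical helper, not the binarizer's actual logic.

```python
# Hypothetical sketch of automatic speaker ID assignment: explicit IDs win,
# a missing spk_id gets the next free value, and duplicate IDs merge datasets
# into one speaker.
def assign_spk_ids(datasets: list) -> dict:
    spk_map = {}
    next_id = 0
    for ds in datasets:
        spk = ds['speaker']
        if 'spk_id' in ds:
            spk_map[spk] = ds['spk_id']     # explicit assignment
        elif spk not in spk_map:
            while next_id in spk_map.values():
                next_id += 1                # skip IDs already taken
            spk_map[spk] = next_id
    return spk_map

print(assign_spk_ids([
    {'speaker': 'speaker1', 'spk_id': 0},
    {'speaker': 'speaker1', 'spk_id': 0},   # merged: same name, same ID
    {'speaker': 'speaker2'},                # auto-assigned ID 1
]))  # -> {'speaker1': 0, 'speaker2': 1}
```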
+
+### datasets[].test_prefixes
+
+List of data item names or name prefixes in this dataset for the validation set. For each string `s` in the list:
+
+- If `s` equals an actual item name, add that item to the validation set.
+- If `s` does not equal any item name, add all items whose names start with `s` to the validation set.
+
+| visibility | all |
+| scope | preprocessing |
+| customizability | required |
+| type | list |
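The two rules above translate almost directly into code. A small sketch, illustrative rather than the repository's actual selection logic:

```python
# The two matching rules above, transcribed into a sketch.
def select_valid_items(item_names: list, test_prefixes: list) -> set:
    valid = set()
    for s in test_prefixes:
        if s in item_names:
            valid.add(s)  # rule 1: exact item name
        else:             # rule 2: name prefix
            valid.update(name for name in item_names if name.startswith(s))
    return valid

names = ['wav1', 'wav2_a', 'wav2_b']
assert select_valid_items(names, ['wav1', 'wav2']) == {'wav1', 'wav2_a', 'wav2_b'}
```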
+
+### dictionaries
+
+Map from language names to their corresponding dictionary file paths. The phonemes in these dictionaries are combined into the final phoneme set, and each phoneme is assigned a phoneme ID. Training data must fully cover all phoneme IDs.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | required |
+| type | Dict[str, str] |
+| default | {} |
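A rough sketch of how the final phoneme set could be assembled from these dictionaries, assuming the `syllable<TAB>phonemes` file format described in Best Practices; `build_phoneme_set` is an illustrative name, not the repository's API.

```python
# Rough sketch of assembling the phoneme set from multiple dictionaries;
# full names are used whenever more than one language is defined.
def build_phoneme_set(dictionaries: dict) -> list:
    phonemes = {'AP', 'SP'}            # reserved global tags
    multi = len(dictionaries) > 1
    for lang, path in dictionaries.items():
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                _, _, ph_str = line.rstrip('\n').partition('\t')
                for ph in ph_str.split():
                    phonemes.add(f'{lang}/{ph}' if multi else ph)
    return sorted(phonemes)            # a stable order yields stable phoneme IDs
```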
+
 
 ### diff_accelerator
 
 DDPM sampling acceleration method. The following methods are currently available:
 
@@ -655,6 +724,18 @@ Length of sinusoidal smoothing convolution kernel (in seconds) on extracted energy
 
 | default | 0.12 |
 
+### extra_phonemes
+
+Extra phonemes to be added to the phoneme set. This list can be used to define custom global phoneme tags besides `AP` and `SP`, or to contain phonemes that are not present in any of the dictionaries.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | normal |
+| type | list |
+| default | [] |
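A hedged sketch of the validation implied above: a `/`-prefixed entry must use a defined language tag, and short names must not collide with global tags. Function and variable names here are illustrative.

```python
# Sketch of the checks implied above, not the repository's actual code.
def validate_extra_phonemes(extra_phonemes, dictionaries, reserved=('AP', 'SP')):
    global_tags = set(reserved)
    for ph in extra_phonemes:
        if '/' in ph:
            lang, name = ph.split('/', maxsplit=1)
            if lang not in dictionaries:
                raise ValueError(f"unrecognized language name: '{lang}'")
            if name in global_tags:
                raise ValueError(f"short name of '{ph}' conflicts with a global tag")
        else:
            global_tags.add(ph)  # a new user-defined global tag

validate_extra_phonemes(['EP', 'ja/cl'], {'zh': '...', 'ja': '...'})  # passes
```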
+
 
 ### f0_max
 
 Maximum base frequency (F0) in Hz for pitch extraction.
 
@@ -1122,6 +1203,18 @@ Arguments for melody encoder. Available sub-keys: `hidden_size`, `enc_layers`, ...
 
 | type | dict |
 
+### merged_phoneme_groups
+
+Phoneme groups to merge. Each group is a list of phoneme names. The merged phonemes share the same ID and thus the same phoneme embedding.
+
+| visibility | acoustic, variance |
+| scope | preprocessing |
+| customizability | required |
+| type | list |
+| default | [] |
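One way shared IDs could be assigned, shown as an illustrative sketch (the actual implementation may differ): every member of a merged group maps to the ID of the group's first-seen member, while ID 0 is assumed reserved for padding here.

```python
# Illustrative shared-ID assignment for merged groups.
def assign_phoneme_ids(all_phonemes: list, merged_groups: list) -> dict:
    group_of = {ph: gi for gi, group in enumerate(merged_groups) for ph in group}
    ph_to_id, group_ids = {}, {}
    next_id = 1
    for ph in sorted(all_phonemes):
        gi = group_of.get(ph)
        if gi is not None and gi in group_ids:
            ph_to_id[ph] = group_ids[gi]   # reuse the group's shared ID
            continue
        ph_to_id[ph] = next_id
        if gi is not None:
            group_ids[gi] = next_id
        next_id += 1
    return ph_to_id

ids = assign_phoneme_ids(['en/iy', 'ja/i', 'zh/i', 'zh/a'], [['zh/i', 'ja/i', 'en/iy']])
assert ids['zh/i'] == ids['ja/i'] == ids['en/iy'] != ids['zh/a']
```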
+
 
 ### midi_smooth_width
 
 Length of sinusoidal smoothing convolution kernel (in seconds) on the step function representing MIDI sequence for base pitch calculation.
 
@@ -1170,6 +1263,17 @@ The number of attention heads of `torch.nn.MultiheadAttention` in FastSpeech2 encoder
 
 | default | 2 |
 
+### num_lang
+
+Number of languages. This value is used to allocate language embeddings in the linguistic encoder.
+
+| visibility | acoustic, variance |
+| scope | nn |
+| customizability | required |
+| type | int |
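In practice, "allocating language embeddings" plausibly amounts to reserving an embedding matrix with `num_lang` rows. A minimal sketch, with the class name and `hidden_size` invented for illustration:

```python
import torch.nn as nn

# What allocating language embeddings might look like inside the encoder.
class LinguisticEncoderStub(nn.Module):
    def __init__(self, num_lang: int, hidden_size: int = 256):
        super().__init__()
        # one row per language ID, which is why num_lang must be at least
        # the number of languages defined in `dictionaries`
        self.lang_embed = nn.Embedding(num_lang, hidden_size)
```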
+
 
 ### num_sanity_val_steps
 
 Number of sanity validation steps at the beginning.
 
@@ -1499,17 +1603,6 @@ Whether to enable voicing prediction.
 
 | default | true |
 
-### raw_data_dir
-
-Path(s) to the raw dataset including wave files, transcriptions, etc.
-
-| visibility | all |
-| scope | preprocessing |
-| customizability | required |
-| type | str, List[str] |
-
 
 ### rel_pos
 
 Whether to use relative positional encoding in FastSpeech2 module.
 
@@ -1674,29 +1767,6 @@ Whether to apply the _sorting by similar length_ algorithm described in [sampler
 
 | default | true |
 
-### speakers
-
-The names of speakers in a multi-speaker model. Speaker names are mapped to speaker indexes and stored into spk_map.json when preprocessing.
-
-| visibility | acoustic, variance |
-| scope | preprocessing |
-| customizability | required |
-| type | list |
-
-### spk_ids
-
-The IDs of speakers in a multi-speaker model. If an empty list is given, speaker IDs will be automatically generated as $0,1,2,...,N_{spk}-1$. IDs can be duplicate or discontinuous.
-
-| visibility | acoustic, variance |
-| scope | preprocessing |
-| customizability | required |
-| type | List[int] |
-| default | [] |
-
 
 ### spec_min
 
 Minimum mel spectrogram value used for normalization to [-1, 1]. Different mel bins can have different minimum values.
 
@@ -1801,22 +1871,6 @@ Length of sinusoidal smoothing convolution kernel (in seconds) on extracted tension
 
 | default | 0.12 |
 
-### test_prefixes
-
-List of data item names or name prefixes for the validation set. For each string `s` in the list:
-
-- If `s` equals to an actual item name, add that item to validation set.
-- If `s` does not equal to any item names, add all items whose names start with `s` to validation set.
-
-For multi-speaker combined datasets, "ds_id:name_prefix" can be used to apply the rules above within data from a specific sub-dataset, where ds_id represents the dataset index.
-
-| visibility | all |
-| scope | preprocessing |
-| customizability | required |
-| type | list |
-
 
 ### time_scale_factor
 
 The scale factor that will be multiplied on the time $t$ of Rectified Flow before embedding into the model.
 
@@ -1891,6 +1945,18 @@ Whether to embed key shifting values introduced by random pitch shifting augmentation
 
 | constraints | Must be true if random pitch shifting is enabled. |
 
+### use_lang_id
+
+Whether to embed the language ID from a multilingual dataset. This option only takes effect for those cross-lingual phonemes in the merged groups.
+
+| visibility | acoustic, variance |
+| scope | nn, preprocessing, inference |
+| customizability | recommended |
+| type | bool |
+| default | false |
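A sketch of the conditioning this flag implies: two cross-lingual merged phonemes share one phoneme embedding row but receive different language embeddings, so they remain distinguishable. All names and sizes here are illustrative, not the repository's actual modules.

```python
import torch
import torch.nn as nn

ph_embed = nn.Embedding(64, 256)   # phoneme IDs, shared across merged groups
lang_embed = nn.Embedding(3, 256)  # one row per language (num_lang = 3)

def encode(ph_ids: torch.Tensor, lang_ids: torch.Tensor, use_lang_id: bool) -> torch.Tensor:
    x = ph_embed(ph_ids)
    if use_lang_id:
        # same merged phoneme ID + different language IDs -> different vectors
        x = x + lang_embed(lang_ids)
    return x
```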
+
 
 ### use_melody_encoder
 
 Whether to enable melody encoder for the pitch predictor.
 
@@ -1941,7 +2007,7 @@ Whether to embed speed values introduced by random time stretching augmentation.
 
 ### use_spk_id
 
-Whether embed the speaker id from a multi-speaker dataset.
+Whether to embed the speaker ID from a multi-speaker dataset.
 
 | visibility | acoustic, variance |

diff --git a/docs/GettingStarted.md b/docs/GettingStarted.md
index a3422c3f0..92ddb395f 100644
--- a/docs/GettingStarted.md
+++ b/docs/GettingStarted.md
@@ -14,9 +14,9 @@ DiffSinger requires Python 3.8 or later. We strongly recommend you create a virtual environment.
 
 pip install -r requirements.txt
 ```
 
-### Materials and assets
+### Concepts and materials
 
-Some essential materials and assets are needed before continuing with this repository. See [materials for training and using models](BestPractices.md#materials-for-training-and-using-models) for detailed instructions.
+Before you proceed, it is necessary to understand some fundamental concepts in this repository and prepare some materials and assets. See [fundamental concepts and materials](BestPractices.md#fundamental-concepts-and-materials) for detailed information.
 
 ## Configuration