diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index b9f4e1a75..cbd7dcb67 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -54,12 +54,14 @@ mel_vmax: 1.5 interp_uv: true energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 +voicing_smooth_width: 0.12 tension_smooth_width: 0.12 use_spk_id: false f0_embed_type: continuous use_energy_embed: false use_breathiness_embed: false +use_voicing_embed: false use_tension_embed: false use_key_shift_embed: false use_speed_embed: false diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index ef32ef5ac..4bd2ba314 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -22,6 +22,8 @@ use_spk_id: false num_spk: 1 use_energy_embed: false use_breathiness_embed: false +use_voicing_embed: false +use_tension_embed: false use_key_shift_embed: true use_speed_embed: true diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index a933603e3..f5deb1119 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -19,17 +19,26 @@ binary_data_dir: data/xxx/binary binarization_args: num_workers: 0 -energy_db_min: -96.0 -energy_db_max: -12.0 -breathiness_db_min: -96.0 -breathiness_db_max: -20.0 - use_spk_id: false num_spk: 1 predict_dur: true predict_pitch: true predict_energy: false predict_breathiness: false +predict_voicing: false +predict_tension: false + +energy_db_min: -96.0 +energy_db_max: -12.0 + +breathiness_db_min: -96.0 +breathiness_db_max: -20.0 + +voicing_db_min: -96.0 +voicing_db_max: -12.0 + +tension_logit_min: -10.0 +tension_logit_max: 10.0 hidden_size: 256 dur_prediction_args: diff --git a/configs/variance.yaml b/configs/variance.yaml index efbb6c79a..276ce3084 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -40,6 +40,7 @@ predict_dur: true predict_pitch: true predict_energy: false predict_breathiness: false 
+predict_voicing: false predict_tension: false dur_prediction_args: @@ -75,6 +76,7 @@ pitch_prediction_args: energy_db_min: -96.0 energy_db_max: -12.0 energy_smooth_width: 0.12 + breathiness_db_min: -96.0 breathiness_db_max: -20.0 breathiness_smooth_width: 0.12 @@ -82,6 +84,10 @@ tension_logit_min: -10.0 tension_logit_max: 10.0 tension_smooth_width: 0.12 +voicing_db_min: -96.0 +voicing_db_max: -12.0 +voicing_smooth_width: 0.12 + variances_prediction_args: total_repeat_bins: 48 residual_layers: 10 diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 1f828c9e7..02f6b3a92 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -32,6 +32,8 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N self.variances_to_embed.add('energy') if hparams.get('use_breathiness_embed', False): self.variances_to_embed.add('breathiness') + if hparams.get('use_voicing_embed', False): + self.variances_to_embed.add('voicing') if hparams.get('use_tension_embed', False): self.variances_to_embed.add('tension') diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index a9d4d849f..666e7f659 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -36,11 +36,14 @@ def __init__(self, vocab_size): self.variance_embed_list = [] self.use_energy_embed = hparams.get('use_energy_embed', False) self.use_breathiness_embed = hparams.get('use_breathiness_embed', False) + self.use_voicing_embed = hparams.get('use_voicing_embed', False) self.use_tension_embed = hparams.get('use_tension_embed', False) if self.use_energy_embed: self.variance_embed_list.append('energy') if self.use_breathiness_embed: self.variance_embed_list.append('breathiness') + if self.use_voicing_embed: + self.variance_embed_list.append('voicing') if self.use_tension_embed: self.variance_embed_list.append('tension')
diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py index e257186ab..fe526a87f 100644 --- a/modules/fastspeech/param_adaptor.py +++ b/modules/fastspeech/param_adaptor.py @@ -5,7 +5,7 @@ from modules.diffusion.ddpm import MultiVarianceDiffusion from utils.hparams import hparams -VARIANCE_CHECKLIST = ['energy', 'breathiness', 'tension'] +VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension'] class ParameterAdaptorModule(torch.nn.Module): @@ -14,11 +14,14 @@ def __init__(self): self.variance_prediction_list = [] self.predict_energy = hparams.get('predict_energy', False) self.predict_breathiness = hparams.get('predict_breathiness', False) + self.predict_voicing = hparams.get('predict_voicing', False) self.predict_tension = hparams.get('predict_tension', False) if self.predict_energy: self.variance_prediction_list.append('energy') if self.predict_breathiness: self.variance_prediction_list.append('breathiness') + if self.predict_voicing: + self.variance_prediction_list.append('voicing') if self.predict_tension: self.variance_prediction_list.append('tension') self.predict_variances = len(self.variance_prediction_list) > 0 @@ -41,6 +44,13 @@ def build_adaptor(self, cls=MultiVarianceDiffusion): )) clamps.append((hparams['breathiness_db_min'], 0.)) + if self.predict_voicing: + ranges.append(( + hparams['voicing_db_min'], + hparams['voicing_db_max'] + )) + clamps.append((hparams['voicing_db_min'], 0.)) + if self.predict_tension: ranges.append(( hparams['tension_logit_min'], diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 8c81abd52..6df8e7a55 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -26,6 +26,7 @@ get_mel2ph_torch, get_energy_librosa, get_breathiness_pyworld, + get_voicing_pyworld, get_tension_base_harmonic, ) from utils.hparams import hparams @@ -39,6 +40,7 @@ 'f0', 'energy', 'breathiness', + 'voicing', 'tension', 'key_shift', 
'speed', @@ -47,6 +49,7 @@ pitch_extractor: BasePE = None energy_smooth: SinusoidalSmoothingConv1d = None breathiness_smooth: SinusoidalSmoothingConv1d = None +voicing_smooth: SinusoidalSmoothingConv1d = None tension_smooth: SinusoidalSmoothingConv1d = None @@ -56,6 +59,7 @@ def __init__(self): self.lr = LengthRegulator() self.need_energy = hparams['use_energy_embed'] self.need_breathiness = hparams['use_breathiness_embed'] + self.need_voicing = hparams['use_voicing_embed'] self.need_tension = hparams['use_tension_embed'] def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id): @@ -158,6 +162,21 @@ def process_item(self, item_name, meta_data, binarization_args): processed_input['breathiness'] = breathiness.cpu().numpy() + if self.need_voicing: + # get ground truth voicing + voicing = get_voicing_pyworld( + dec_waveform, None, None, length=length + ) + + global voicing_smooth + if voicing_smooth is None: + voicing_smooth = SinusoidalSmoothingConv1d( + round(hparams['voicing_smooth_width'] / self.timestep) + ).eval().to(self.device) + voicing = voicing_smooth(torch.from_numpy(voicing).to(self.device)[None])[0] + + processed_input['voicing'] = voicing.cpu().numpy() + if self.need_tension: # get ground truth tension tension = get_tension_base_harmonic( diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 3478b508f..c3cba9476 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -19,6 +19,7 @@ get_mel2ph_torch, get_energy_librosa, get_breathiness_pyworld, + get_voicing_pyworld, get_tension_base_harmonic, ) from utils.hparams import hparams @@ -44,6 +45,7 @@ 'uv', # unvoiced masks (only for objective evaluation metrics), bool[T_s,] 'energy', # frame-level RMS (dB), float32[T_s,] 'breathiness', # frame-level RMS of aperiodic parts (dB), float32[T_s,] + 'voicing', # frame-level RMS of harmonic parts (dB), float32[T_s,] 'tension', # frame-level tension (logit), float32[T_s,] ] 
DS_INDEX_SEP = '#' @@ -54,6 +56,7 @@ midi_smooth: SinusoidalSmoothingConv1d = None energy_smooth: SinusoidalSmoothingConv1d = None breathiness_smooth: SinusoidalSmoothingConv1d = None +voicing_smooth: SinusoidalSmoothingConv1d = None tension_smooth: SinusoidalSmoothingConv1d = None @@ -74,8 +77,9 @@ def __init__(self): predict_energy = hparams['predict_energy'] predict_breathiness = hparams['predict_breathiness'] + predict_voicing = hparams['predict_voicing'] predict_tension = hparams['predict_tension'] - self.predict_variances = predict_energy or predict_breathiness or predict_tension + self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension self.lr = LengthRegulator().to(self.device) self.prefer_ds = self.binarization_args['prefer_ds'] self.cached_ds = {} @@ -417,6 +421,37 @@ def process_item(self, item_name, meta_data, binarization_args): processed_input['breathiness'] = breathiness + # Below: extract voicing + if hparams['predict_voicing']: + voicing = None + voicing_from_wav = False + if self.prefer_ds: + voicing_seq = self.load_attr_from_ds(ds_id, name, 'voicing', idx=ds_seg_idx) + if voicing_seq is not None: + voicing = resample_align_curve( + np.array(voicing_seq.split(), np.float32), + original_timestep=float(self.load_attr_from_ds( + ds_id, name, 'voicing_timestep', idx=ds_seg_idx + )), + target_timestep=self.timestep, + align_length=length + ) + if voicing is None: + voicing = get_voicing_pyworld( + dec_waveform, None, None, length=length + ) + voicing_from_wav = True + + if voicing_from_wav: + global voicing_smooth + if voicing_smooth is None: + voicing_smooth = SinusoidalSmoothingConv1d( + round(hparams['voicing_smooth_width'] / self.timestep) + ).eval().to(self.device) + voicing = voicing_smooth(torch.from_numpy(voicing).to(self.device)[None])[0].cpu().numpy() + + processed_input['voicing'] = voicing + # Below: extract tension if hparams['predict_tension']: tension = None diff --git 
a/training/acoustic_task.py b/training/acoustic_task.py index c903c9623..a8d3e5a05 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -27,6 +27,8 @@ def __init__(self, prefix, preload=False): self.required_variances['energy'] = 0.0 if hparams['use_breathiness_embed']: self.required_variances['breathiness'] = 0.0 + if hparams['use_voicing_embed']: + self.required_variances['voicing'] = 0.0 if hparams['use_tension_embed']: self.required_variances['tension'] = 0.0 @@ -80,6 +82,8 @@ def __init__(self): self.required_variances.append('energy') if hparams['use_breathiness_embed']: self.required_variances.append('breathiness') + if hparams['use_voicing_embed']: + self.required_variances.append('voicing') if hparams['use_tension_embed']: self.required_variances.append('tension') super()._finish_init() diff --git a/training/variance_task.py b/training/variance_task.py index 69fc21207..7bfdb5095 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -24,8 +24,9 @@ def __init__(self, prefix, preload=False): super(VarianceDataset, self).__init__(prefix, hparams['dataset_size_key'], preload) need_energy = hparams['predict_energy'] need_breathiness = hparams['predict_breathiness'] + need_voicing = hparams['predict_voicing'] need_tension = hparams['predict_tension'] - self.predict_variances = need_energy or need_breathiness or need_tension + self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension def collater(self, samples): batch = super().collater(samples) @@ -60,6 +61,8 @@ def collater(self, samples): batch['energy'] = utils.collate_nd([s['energy'] for s in samples], 0) if hparams['predict_breathiness']: batch['breathiness'] = utils.collate_nd([s['breathiness'] for s in samples], 0) + if hparams['predict_voicing']: + batch['voicing'] = utils.collate_nd([s['voicing'] for s in samples], 0) if hparams['predict_tension']: batch['tension'] = utils.collate_nd([s['tension'] for s in samples], 0) @@ -92,12 
+95,15 @@ def __init__(self): predict_energy = hparams['predict_energy'] predict_breathiness = hparams['predict_breathiness'] + predict_voicing = hparams['predict_voicing'] predict_tension = hparams['predict_tension'] self.variance_prediction_list = [] if predict_energy: self.variance_prediction_list.append('energy') if predict_breathiness: self.variance_prediction_list.append('breathiness') + if predict_voicing: + self.variance_prediction_list.append('voicing') if predict_tension: self.variance_prediction_list.append('tension') self.predict_variances = len(self.variance_prediction_list) > 0 @@ -153,6 +159,7 @@ def run_model(self, sample, infer=False): pitch = sample.get('pitch') # [B, T_s] energy = sample.get('energy') # [B, T_s] breathiness = sample.get('breathiness') # [B, T_s] + voicing = sample.get('voicing') # [B, T_s] tension = sample.get('tension') # [B, T_s] pitch_retake = variance_retake = None @@ -175,7 +182,7 @@ note_midi=note_midi, note_rest=note_rest, note_dur=note_dur, note_glide=note_glide, mel2note=mel2note, base_pitch=base_pitch, pitch=pitch, - energy=energy, breathiness=breathiness, tension=tension, + energy=energy, breathiness=breathiness, voicing=voicing, tension=tension, pitch_retake=pitch_retake, variance_retake=variance_retake, spk_id=spk_ids, infer=infer ) diff --git a/utils/binarizer_utils.py b/utils/binarizer_utils.py index a89ef433a..7cba11090 100644 --- a/utils/binarizer_utils.py +++ b/utils/binarizer_utils.py @@ -279,6 +279,35 @@ def get_breathiness_pyworld( return breathiness +def get_voicing_pyworld( + waveform: Union[np.ndarray, DecomposedWaveform], + samplerate, f0, length, + *, hop_size=None, fft_size=None, win_size=None +): + """ + Definition of voicing: RMS of the harmonic part, in dB representation + :param waveform: All other analysis parameters will not take effect if a DecomposedWaveform is given + :param samplerate: sampling rate + :param f0: reference f0 + :param length:
Expected number of frames + :param hop_size: Frame width, in number of samples + :param fft_size: Number of fft bins + :param win_size: Window size, in number of samples + :return: voicing + """ + if not isinstance(waveform, DecomposedWaveform): + waveform = DecomposedWaveform( + waveform=waveform, samplerate=samplerate, f0=f0, + hop_size=hop_size, fft_size=fft_size, win_size=win_size + ) + waveform_sp = waveform.harmonic() + voicing = get_energy_librosa( + waveform_sp, length=length, + hop_size=waveform.hop_size, win_size=waveform.win_size + ) + return voicing + + def get_tension_base_harmonic( waveform: Union[np.ndarray, DecomposedWaveform], samplerate, f0, length,