diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py
index 467b16ba..79feb248 100644
--- a/GPT_SoVITS/inference_cli.py
+++ b/GPT_SoVITS/inference_cli.py
@@ -8,7 +8,7 @@ from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights
 i18n = I18nAuto()
 
 def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_language, target_text, text_language, output_path, loudness_boost=False, gain=0, normalize=False,
-               energy_scale=1.0, volume_scale=1.0, strain_effect=0.0):
+               energy_scale=1.0, volume_scale=1.0, strain_effect=0.0, intensity=0):
 
     # Change model weights
     print("Changing model weights about to be called...")
@@ -31,7 +31,8 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_
         normalize=normalize,
         energy_scale=energy_scale,
         volume_scale=volume_scale,
-        strain_effect=strain_effect)
+        strain_effect=strain_effect,
+        intensity=intensity)
 
     result_list = list(synthesis_result)
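For reviewers, a minimal sketch of how the updated CLI entry point could be called once this patch lands. It is not part of the patch: the model paths, reference audio, transcripts, and language labels below are placeholders, and the language strings have to match whatever inference_webui accepts on your install.

    from GPT_SoVITS.inference_cli import synthesize

    # All paths and labels are hypothetical; adjust them to your local setup.
    synthesize(
        GPT_model_path="pretrained/gpt_weights.ckpt",       # placeholder path
        SoVITS_model_path="pretrained/sovits_weights.pth",  # placeholder path
        ref_audio_path="ref/sample.wav",
        ref_text="Reference transcript",
        ref_language="English",
        target_text="Text to synthesize",
        text_language="English",
        output_path="out",
        intensity=0.6,  # new parameter; clamped to [0, 1] inside get_tts_wav
    )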
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 7c5034af..e687b077 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -520,12 +520,16 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                 top_k=15, top_p=1, temperature=1, ref_free=False, speed=1, if_freeze=False, inp_refs=None,
                 sample_steps=16, if_sr=False, pause_second=0.3,
                 loudness_boost=False, gain=0, normalize=False, energy_scale=1.0,
-                volume_scale=1.0, strain_effect=0.0):
+                volume_scale=1.0, strain_effect=0.0, intensity=0):  # Added intensity parameter
     global cache
     if ref_wav_path: pass
     else: gr.Warning(i18n('请上传参考音频'))
     if text: pass
     else: gr.Warning(i18n('请填入推理文本'))
+
+    # Validate intensity parameter
+    intensity = max(0.0, min(1.0, intensity))  # Clamp to [0, 1]
+
     t = []
     if prompt_text is None or len(prompt_text) == 0:
         ref_free = True
@@ -623,6 +627,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         if(i_text in cache and if_freeze == True): pred_semantic = cache[i_text]
         else:
             with torch.no_grad():
+                # Apply intensity to temperature for more emotional variation
+                adjusted_temperature = temperature * (1.0 + intensity * 0.3)  # 0% to 30% increase
+
                 pred_semantic, idx = t2s_model.model.infer_panel(
                     all_phoneme_ids,
                     all_phoneme_len,
@@ -631,7 +638,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                     # prompt_phone_len=ph_offset,
                     top_k=top_k,
                     top_p=top_p,
-                    temperature=temperature,
+                    temperature=adjusted_temperature,  # Modified temperature
                     early_stop_num=hz * max_sec,
                 )
                 pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
@@ -648,7 +655,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                     except:
                         traceback.print_exc()
             if len(refers) == 0: refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
-            audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)[0][0]  # .cpu().detach().numpy()
+
+            # Apply intensity to speed for subtle tempo variation
+            adjusted_speed = speed * (1.0 + intensity * 0.1)  # 0% to 10% speed increase
+            audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=adjusted_speed)[0][0]
         else:
             refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
             phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
@@ -675,7 +685,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
             # print("fea_ref", fea_ref, fea_ref.shape)
             # print("mel2", mel2)
             mel2 = mel2.to(dtype)
-            fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed)
+
+            # Apply intensity to speed for v3
+            adjusted_speed = speed * (1.0 + intensity * 0.1)
+            fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, adjusted_speed)
             # print("fea_todo", fea_todo)
             # print("ge", ge.abs().mean())
             cfm_resss = []
@@ -705,33 +718,98 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         if max_audio > 1: audio = audio / max_audio
 
-        # Apply new parameters
+        # Apply intensity-based audio processing
         audio = audio.to(torch.float32)  # Ensure float32 for processing
+
+        # Creative intensity-based effects (careful not to create noise)
+        if intensity > 0:
+            # 1. Dynamic range compression (makes loud parts more consistent)
+            if intensity > 0.2:
+                compression_ratio = 1.0 + (intensity * 2.0)  # 1:1 to 3:1 compression
+                threshold = 0.5 - (intensity * 0.2)  # Lower threshold with higher intensity
+                above_threshold = torch.abs(audio) > threshold
+                audio[above_threshold] = torch.sign(audio[above_threshold]) * (
+                    threshold + (torch.abs(audio[above_threshold]) - threshold) / compression_ratio
+                )
+
+            # 2. Harmonic enhancement (adds brightness without noise)
+            if intensity > 0.3:
+                # Simple harmonic exciter using soft clipping
+                harmonic_amount = intensity * 0.15  # Max 15% harmonics
+                harmonics = torch.tanh(audio * 3.0) * harmonic_amount
+                audio = audio + harmonics
+
+            # 3. Formant shifting (subtle pitch character change)
+            if intensity > 0.4:
+                # Apply subtle pitch bend without using librosa (to avoid artifacts)
+                formant_shift = 1.0 + (intensity * 0.05)  # Max 5% formant shift
+                # This is a placeholder - actual formant shifting would need more complex processing
+
+            # 4. Energy boost (overall gain scaled by intensity)
+            if intensity > 0.1:
+                # Boost energy proportionally to intensity
+                energy_boost = 1.0 + (intensity * 0.4)  # Max 40% boost
+                audio = audio * energy_boost
+
+        # Apply original parameters with intensity scaling
         if loudness_boost:
-            # Boost loudness using RMS-based scaling (adjust multiplier as needed)
+            # Scale loudness boost with intensity
             rms = torch.sqrt(torch.mean(audio ** 2))
-            audio = audio * (rms * 1.5) if rms > 0 else audio
+            boost_factor = 1.5 + (intensity * 0.5)  # 1.5x to 2.0x based on intensity
+            audio = audio * (rms * boost_factor) if rms > 0 else audio
         if gain > 0:
-            # Apply gain in dB
-            audio = audio * (10 ** (gain / 20))
+            # Apply gain with intensity scaling
+            adjusted_gain = gain * (1.0 + intensity * 0.3)
+            audio = audio * (10 ** (adjusted_gain / 20))
         if normalize:
             # Normalize to [-1, 1]
             max_abs = torch.abs(audio).max()
             audio = audio / max_abs if max_abs > 0 else audio
         if energy_scale != 1.0:
-            # Scale energy
-            audio = audio * torch.sqrt(torch.tensor(energy_scale))
+            # Scale energy with intensity influence
+            adjusted_energy = energy_scale * (1.0 + intensity * 0.2)
+            audio = audio * torch.sqrt(torch.tensor(adjusted_energy))
         if volume_scale != 1.0:
-            # Direct volume scaling
-            audio = audio * volume_scale
+            # Direct volume scaling with intensity
+            adjusted_volume = volume_scale * (1.0 + intensity * 0.3)
+            audio = audio * adjusted_volume
         if strain_effect > 0.0:
-            # Add strain effect (basic distortion)
-            audio = audio + (audio ** 2 * strain_effect)
+            # Add strain effect with intensity (more careful to avoid noise)
+            adjusted_strain = strain_effect * (1.0 + intensity * 0.5)
+            # Use softer distortion to avoid harsh noise
+            audio = audio + (torch.tanh(audio * 2.0) * adjusted_strain * 0.3)
 
-        # Final clipping check after effects
+        # Advanced intensity-based post-processing
+        if intensity > 0.5:
+            # Subtle saturation for warmth (not harsh distortion)
+            saturation = intensity * 0.1
+            audio = audio * (1 - saturation) + torch.tanh(audio * 1.5) * saturation
+
+        # Final smart limiting to prevent clipping and maintain quality
+        # Use soft knee compression instead of hard clipping
+        soft_limit = 0.95
+        knee = 0.1
+
+        above_knee = torch.abs(audio) > (soft_limit - knee)
+        if torch.any(above_knee):
+            # Soft knee compression
+            audio_abs = torch.abs(audio)
+            audio_sign = torch.sign(audio)
+
+            # Calculate soft knee curve
+            knee_start = soft_limit - knee
+            compressed = torch.where(
+                audio_abs <= knee_start,
+                audio_abs,
+                knee_start + (audio_abs - knee_start) * 0.3  # Gentle compression above knee
+            )
+
+            audio = audio_sign * compressed
+
+        # Final safety check
         max_audio = torch.abs(audio).max()
-        if max_audio > 1:
-            audio = audio / max_audio
+        if max_audio > 0.99:
+            audio = audio * (0.99 / max_audio)
         audio_opt.append(audio)
         audio_opt.append(zero_wav_torch)  # zero_wav
@@ -740,6 +818,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     t1 = ttime()
     print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
 
+    print(f"Intensity applied: {intensity}")  # Log intensity level
+
     audio_opt = torch.cat(audio_opt, 0)  # np.concatenate
     sr = hps.data.sampling_rate if model_version != "v3" else 24000
     if if_sr == True and sr == 24000:
@@ -751,6 +831,34 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     audio_opt = audio_opt.cpu().detach().numpy()
     yield sr, (audio_opt * 32767).astype(np.int16)
 
+
+# Helper function for smooth intensity transitions
+def apply_intensity_envelope(audio, intensity, emotion_type=None):
+    """
+    Apply smooth intensity envelope to avoid abrupt changes
+    """
+    if intensity == 0:
+        return audio
+
+    # Create smooth envelope
+    length = audio.shape[0]
+    envelope = torch.ones_like(audio)
+
+    # Attack and release times based on intensity
+    attack_samples = int(0.01 * 24000 * (1 - intensity * 0.5))  # Faster attack with higher intensity
+    release_samples = int(0.05 * 24000)
+
+    # Apply envelope
+    if attack_samples > 0:
+        attack_curve = torch.linspace(0.7, 1.0, attack_samples)
+        envelope[:attack_samples] = attack_curve
+
+    if release_samples > 0 and length > release_samples:
+        release_curve = torch.linspace(1.0, 0.8, release_samples)
+        envelope[-release_samples:] = release_curve
+
+    return audio * envelope
+
+
 def split(todo_text):
     todo_text = todo_text.replace("……", "。").replace("——", ",")
     if todo_text[-1] not in splits:
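Because the new limiting stage sits inside a large hunk, here is the same soft-knee idea pulled out as a self-contained sketch for review. It mirrors the constants used above (0.95 limit, 0.1 knee, 0.3 slope) but is illustrative only, not code from the patch:

    import torch

    def soft_knee_limit(audio: torch.Tensor, soft_limit: float = 0.95, knee: float = 0.1) -> torch.Tensor:
        """Compress magnitudes above (soft_limit - knee) instead of hard clipping."""
        knee_start = soft_limit - knee
        audio_abs = torch.abs(audio)
        audio_sign = torch.sign(audio)
        compressed = torch.where(
            audio_abs <= knee_start,
            audio_abs,
            knee_start + (audio_abs - knee_start) * 0.3,  # gentle 0.3 slope above the knee
        )
        out = audio_sign * compressed
        max_audio = torch.abs(out).max()
        if max_audio > 0.99:  # final safety rescale, as in the patch
            out = out * (0.99 / max_audio)
        return out

    # Example: a 1.2-amplitude peak is pulled back to roughly 0.955.
    print(soft_knee_limit(torch.tensor([0.2, 0.9, 1.2])))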
"おはよう〜。今日はどんな1日過ごすー?くらりはね〜いつでもあなたの味方だよ" @@ -1073,7 +1076,8 @@ async def tts_endpoint( energy_scale=energy_scale, volume_scale=volume_scale, strain_effect=strain_effect, - shouting_type= shouting_type + shouting_type= shouting_type, + intesity=intesity ) if audio_buffer: