mirror of https://github.com/RVC-Boss/GPT-SoVITS.git, synced 2025-08-10 10:09:51 +08:00
add intensity param
This commit is contained in:
parent 8a0346e795
commit 598afaaad9

@@ -8,7 +8,7 @@ from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights
 i18n = I18nAuto()
 
 def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_language, target_text, text_language, output_path, loudness_boost=False, gain=0, normalize=False,
-               energy_scale=1.0, volume_scale=1.0, strain_effect=0.0):
+               energy_scale=1.0, volume_scale=1.0, strain_effect=0.0, intensity=0):
 
     # Change model weights
     print("Changing model weights about to be called...")
@@ -31,7 +31,8 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_
         normalize=normalize,
         energy_scale=energy_scale,
         volume_scale=volume_scale,
-        strain_effect=strain_effect)
+        strain_effect=strain_effect,
+        intensity=intensity)
 
     result_list = list(synthesis_result)
 
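Note: the CLI wrapper simply threads the new keyword through to the TTS core in GPT_SoVITS.inference_webui. A minimal sketch of that forwarding pattern, using stand-in stubs rather than the real functions (which need loaded model weights to run):

    def get_tts_wav_stub(text, intensity=0):
        return f"synthesized {text!r} at intensity {intensity}"

    def synthesize_stub(target_text, intensity=0):
        # The wrapper forwards the new keyword untouched, as in the hunk above.
        return get_tts_wav_stub(target_text, intensity=intensity)

    print(synthesize_stub("hello", intensity=0.7))  # synthesized 'hello' at intensity 0.7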
@@ -520,12 +520,16 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                 top_k=15, top_p=1, temperature=1, ref_free=False, speed=1, if_freeze=False,
                 inp_refs=None, sample_steps=16, if_sr=False, pause_second=0.3,
                 loudness_boost=False, gain=0, normalize=False, energy_scale=1.0,
-                volume_scale=1.0, strain_effect=0.0):
+                volume_scale=1.0, strain_effect=0.0, intensity=0):  # Added intensity parameter
     global cache
     if ref_wav_path: pass
     else: gr.Warning(i18n('请上传参考音频'))
     if text: pass
     else: gr.Warning(i18n('请填入推理文本'))
 
+    # Validate intensity parameter
+    intensity = max(0.0, min(1.0, intensity))  # Clamp to [0, 1]
+
     t = []
     if prompt_text is None or len(prompt_text) == 0:
         ref_free = True
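Note: the clamp coerces out-of-range API input to the nearest bound instead of rejecting it. A quick check of the expression in isolation:

    for raw in (-0.5, 0, 0.4, 1, 2.3):
        print(raw, "->", max(0.0, min(1.0, raw)))
    # -0.5 -> 0.0, 0 -> 0.0, 0.4 -> 0.4, 1 -> 1.0, 2.3 -> 1.0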
@@ -623,6 +627,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         if(i_text in cache and if_freeze == True): pred_semantic = cache[i_text]
         else:
             with torch.no_grad():
+                # Apply intensity to temperature for more emotional variation
+                adjusted_temperature = temperature * (1.0 + intensity * 0.3)  # 0% to 30% increase
+
                 pred_semantic, idx = t2s_model.model.infer_panel(
                     all_phoneme_ids,
                     all_phoneme_len,
@@ -631,7 +638,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                     # prompt_phone_len=ph_offset,
                     top_k=top_k,
                     top_p=top_p,
-                    temperature=temperature,
+                    temperature=adjusted_temperature,  # Modified temperature
                     early_stop_num=hz * max_sec,
                 )
             pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
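Note: the sampling temperature is the only decoder knob touched; at full intensity it runs 30% hotter, which flattens the token distribution for more varied prosody. The scaling on its own:

    temperature = 1.0
    for intensity in (0.0, 0.5, 1.0):
        print(intensity, temperature * (1.0 + intensity * 0.3))
    # 0.0 1.0, 0.5 1.15, 1.0 1.3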
@@ -648,7 +655,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
             except:
                 traceback.print_exc()
             if len(refers) == 0: refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
-            audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)[0][0]  # .cpu().detach().numpy()
+            # Apply intensity to speed for subtle tempo variation
+            adjusted_speed = speed * (1.0 + intensity * 0.1)  # 0% to 10% speed increase
+            audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=adjusted_speed)[0][0]
         else:
             refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
             phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
@@ -675,7 +685,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
             # print("fea_ref", fea_ref, fea_ref.shape)
             # print("mel2", mel2)
             mel2 = mel2.to(dtype)
-            fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed)
+            # Apply intensity to speed for v3
+            adjusted_speed = speed * (1.0 + intensity * 0.1)
+            fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, adjusted_speed)
             # print("fea_todo", fea_todo)
             # print("ge", ge.abs().mean())
             cfm_resss = []
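Note: both decode paths (the v1/v2 branch above and this v3 branch) share the same tempo formula, a speed-up of at most 10% at full intensity:

    speed = 1.0
    print([round(speed * (1.0 + i * 0.1), 2) for i in (0.0, 0.5, 1.0)])  # [1.0, 1.05, 1.1]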
@@ -705,33 +718,98 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         if max_audio > 1:
             audio = audio / max_audio
 
-        # Apply new parameters
+        # Apply intensity-based audio processing
         audio = audio.to(torch.float32)  # Ensure float32 for processing
 
+        # Creative intensity-based effects (careful not to create noise)
+        if intensity > 0:
+            # 1. Dynamic range compression (makes loud parts more consistent)
+            if intensity > 0.2:
+                compression_ratio = 1.0 + (intensity * 2.0)  # 1:1 to 3:1 compression
+                threshold = 0.5 - (intensity * 0.2)  # Lower threshold with higher intensity
+                above_threshold = torch.abs(audio) > threshold
+                audio[above_threshold] = torch.sign(audio[above_threshold]) * (
+                    threshold + (torch.abs(audio[above_threshold]) - threshold) / compression_ratio
+                )
+
+            # 2. Harmonic enhancement (adds brightness without noise)
+            if intensity > 0.3:
+                # Simple harmonic exciter using soft clipping
+                harmonic_amount = intensity * 0.15  # Max 15% harmonics
+                harmonics = torch.tanh(audio * 3.0) * harmonic_amount
+                audio = audio + harmonics
+
+            # 3. Formant shifting (subtle pitch character change)
+            if intensity > 0.4:
+                # Apply subtle pitch bend without using librosa (to avoid artifacts)
+                formant_shift = 1.0 + (intensity * 0.05)  # Max 5% formant shift
+                # This is a placeholder - actual formant shifting would need more complex processing
+
+            # 4. Energy boost (frequency-dependent)
+            if intensity > 0.1:
+                # Boost energy proportionally to intensity
+                energy_boost = 1.0 + (intensity * 0.4)  # Max 40% boost
+                audio = audio * energy_boost
+
+        # Apply original parameters with intensity scaling
         if loudness_boost:
-            # Boost loudness using RMS-based scaling (adjust multiplier as needed)
+            # Scale loudness boost with intensity
             rms = torch.sqrt(torch.mean(audio ** 2))
-            audio = audio * (rms * 1.5) if rms > 0 else audio
+            boost_factor = 1.5 + (intensity * 0.5)  # 1.5x to 2.0x based on intensity
+            audio = audio * (rms * boost_factor) if rms > 0 else audio
         if gain > 0:
-            # Apply gain in dB
-            audio = audio * (10 ** (gain / 20))
+            # Apply gain with intensity scaling
+            adjusted_gain = gain * (1.0 + intensity * 0.3)
+            audio = audio * (10 ** (adjusted_gain / 20))
         if normalize:
             # Normalize to [-1, 1]
             max_abs = torch.abs(audio).max()
             audio = audio / max_abs if max_abs > 0 else audio
         if energy_scale != 1.0:
-            # Scale energy
-            audio = audio * torch.sqrt(torch.tensor(energy_scale))
+            # Scale energy with intensity influence
+            adjusted_energy = energy_scale * (1.0 + intensity * 0.2)
+            audio = audio * torch.sqrt(torch.tensor(adjusted_energy))
         if volume_scale != 1.0:
-            # Direct volume scaling
-            audio = audio * volume_scale
+            # Direct volume scaling with intensity
+            adjusted_volume = volume_scale * (1.0 + intensity * 0.3)
+            audio = audio * adjusted_volume
         if strain_effect > 0.0:
-            # Add strain effect (basic distortion)
-            audio = audio + (audio ** 2 * strain_effect)
+            # Add strain effect with intensity (more careful to avoid noise)
+            adjusted_strain = strain_effect * (1.0 + intensity * 0.5)
+            # Use softer distortion to avoid harsh noise
+            audio = audio + (torch.tanh(audio * 2.0) * adjusted_strain * 0.3)
 
-        # Final clipping check after effects
+        # Advanced intensity-based post-processing
+        if intensity > 0.5:
+            # Subtle saturation for warmth (not harsh distortion)
+            saturation = intensity * 0.1
+            audio = audio * (1 - saturation) + torch.tanh(audio * 1.5) * saturation
+
+        # Final smart limiting to prevent clipping and maintain quality
+        # Use soft knee compression instead of hard clipping
+        soft_limit = 0.95
+        knee = 0.1
+
+        above_knee = torch.abs(audio) > (soft_limit - knee)
+        if torch.any(above_knee):
+            # Soft knee compression
+            audio_abs = torch.abs(audio)
+            audio_sign = torch.sign(audio)
+
+            # Calculate soft knee curve
+            knee_start = soft_limit - knee
+            compressed = torch.where(
+                audio_abs <= knee_start,
+                audio_abs,
+                knee_start + (audio_abs - knee_start) * 0.3  # Gentle compression above knee
+            )
+
+            audio = audio_sign * compressed
+
+        # Final safety check
         max_audio = torch.abs(audio).max()
-        if max_audio > 1:
-            audio = audio / max_audio
+        if max_audio > 0.99:
+            audio = audio * (0.99 / max_audio)
 
         audio_opt.append(audio)
         audio_opt.append(zero_wav_torch)  # zero_wav
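Note: the per-segment effects chain is easiest to sanity-check in isolation. A self-contained sketch of its two compression stages, assuming only torch (the function names here are illustrative, not from the codebase):

    import torch

    def intensity_compress(audio: torch.Tensor, intensity: float) -> torch.Tensor:
        # Stage 1 above: samples over an intensity-dependent threshold are
        # scaled down by a ratio that grows from 1:1 to 3:1.
        ratio = 1.0 + intensity * 2.0
        threshold = 0.5 - intensity * 0.2
        out = audio.clone()
        above = torch.abs(out) > threshold
        out[above] = torch.sign(out[above]) * (
            threshold + (torch.abs(out[above]) - threshold) / ratio
        )
        return out

    def soft_knee_limit(audio: torch.Tensor, soft_limit: float = 0.95, knee: float = 0.1) -> torch.Tensor:
        # Final stage above: a gentle 0.3x slope past the knee instead of hard
        # clipping; the 0.99 rescale afterwards still catches any overshoot.
        knee_start = soft_limit - knee
        audio_abs = torch.abs(audio)
        compressed = torch.where(
            audio_abs <= knee_start,
            audio_abs,
            knee_start + (audio_abs - knee_start) * 0.3,
        )
        return torch.sign(audio) * compressed

    x = torch.tensor([0.2, 0.7, 1.4, -1.2])
    print(soft_knee_limit(intensity_compress(x, intensity=0.8)))

Unlike this sketch, the hunk mutates audio in place through the boolean mask, which is safe there because audio is the per-segment working buffer.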
@@ -740,6 +818,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     t1 = ttime()
 
     print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
+    print(f"Intensity applied: {intensity}")  # Log intensity level
 
     audio_opt = torch.cat(audio_opt, 0)  # np.concatenate
     sr = hps.data.sampling_rate if model_version != "v3" else 24000
     if if_sr == True and sr == 24000:
@@ -751,6 +831,34 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     audio_opt = audio_opt.cpu().detach().numpy()
     yield sr, (audio_opt * 32767).astype(np.int16)
 
 
+# Helper function for smooth intensity transitions
+def apply_intensity_envelope(audio, intensity, emotion_type=None):
+    """
+    Apply smooth intensity envelope to avoid abrupt changes
+    """
+    if intensity == 0:
+        return audio
+
+    # Create smooth envelope
+    length = audio.shape[0]
+    envelope = torch.ones_like(audio)
+
+    # Attack and release times based on intensity
+    attack_samples = int(0.01 * 24000 * (1 - intensity * 0.5))  # Faster attack with higher intensity
+    release_samples = int(0.05 * 24000)
+
+    # Apply envelope
+    if attack_samples > 0:
+        attack_curve = torch.linspace(0.7, 1.0, attack_samples)
+        envelope[:attack_samples] = attack_curve
+
+    if release_samples > 0 and length > release_samples:
+        release_curve = torch.linspace(1.0, 0.8, release_samples)
+        envelope[-release_samples:] = release_curve
+
+    return audio * envelope
+
+
 def split(todo_text):
     todo_text = todo_text.replace("……", "。").replace("——", ",")
     if todo_text[-1] not in splits:
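Note: hypothetical usage of the new helper, assuming apply_intensity_envelope and torch are in scope; the 24000 inside the helper hard-codes the 24 kHz output rate, and the emotion_type argument is accepted but not yet used:

    import torch

    audio = torch.randn(24000) * 0.1  # one second at 24 kHz
    shaped = apply_intensity_envelope(audio, intensity=0.8)
    # attack: first int(240 * 0.6) = 144 samples ramp from 0.7 to 1.0
    # release: last 1200 samples taper from 1.0 to 0.8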
api.py (8 changed lines)
@@ -915,6 +915,7 @@ def version_4_cli(
     volume_scale=1.0,
     strain_effect=0.0,
     shouting_type="normal",  # normal, loud, soft, whisper
+    intensity=0.0,
 ):
     # Create a temporary buffer to store the audio
     audio_buffer = io.BytesIO()
@@ -976,7 +977,8 @@ def version_4_cli(
         normalize=normalize,
         energy_scale=energy_scale,
         volume_scale=volume_scale,
-        strain_effect=strain_effect
+        strain_effect=strain_effect,
+        intensity=intensity
     )
 
     # Get the last audio data and sample rate from synthesis result
@@ -1015,6 +1017,7 @@ async def tts_endpoint(
     volume_scale: str = "1.0",  # Accept as string from URL, convert to float
     strain_effect: str = "0.0",  # Accept as string from URL, convert to float
     shouting_type: str = "normal",  # Accept as string from URL, convert to appropriate type
+    intensity: float = 0.0,
 ):
     if character == "kurari" or character == "Kurari":
         prompt_text = "おはよう〜。今日はどんな1日過ごすー?くらりはね〜いつでもあなたの味方だよ"
@@ -1073,7 +1076,8 @@ async def tts_endpoint(
         energy_scale=energy_scale,
         volume_scale=volume_scale,
         strain_effect=strain_effect,
-        shouting_type=shouting_type
+        shouting_type=shouting_type,
+        intensity=intensity
     )
 
     if audio_buffer:
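Note: a hypothetical client call, assuming the app serves this endpoint over HTTP; the diff does not show the route decorator, so the host, port, and path below are placeholders, and only the character and intensity parameters are spelled out:

    import requests

    resp = requests.get(
        "http://127.0.0.1:9880/tts",  # placeholder host/port/route
        params={"character": "kurari", "intensity": 0.7},  # framework coerces intensity to float
    )
    with open("out.wav", "wb") as f:  # placeholder filename; format depends on the endpoint
        f.write(resp.content)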