add intensity param

samiabat 2025-07-23 23:09:35 +03:00
parent 8a0346e795
commit 598afaaad9
3 changed files with 135 additions and 22 deletions


@@ -8,7 +8,7 @@ from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights
i18n = I18nAuto()
def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_language, target_text, text_language, output_path, loudness_boost=False, gain=0, normalize=False,
energy_scale=1.0, volume_scale=1.0, strain_effect=0.0):
energy_scale=1.0, volume_scale=1.0, strain_effect=0.0, intensity=0):
# Change model weights
print("Changing model weights about to be called...")
@@ -31,7 +31,8 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_
normalize=normalize,
energy_scale=energy_scale,
volume_scale=volume_scale,
strain_effect=strain_effect)
strain_effect=strain_effect,
intensity=intensity)
result_list = list(synthesis_result)
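
A minimal call sketch for the updated signature (model paths, reference audio, and texts here are placeholders, not files from this repo):

    synthesize("GPT_weights/model.ckpt", "SoVITS_weights/model.pth",
               "ref.wav", "reference text", "English",
               "target text", "English", "output/",
               intensity=0.5)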


@@ -520,12 +520,16 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
top_k=15, top_p=1, temperature=1, ref_free=False, speed=1, if_freeze=False,
inp_refs=None, sample_steps=16, if_sr=False, pause_second=0.3,
loudness_boost=False, gain=0, normalize=False, energy_scale=1.0,
volume_scale=1.0, strain_effect=0.0):
volume_scale=1.0, strain_effect=0.0, intensity=0): # Added intensity parameter
global cache
if ref_wav_path: pass
else: gr.Warning(i18n('请上传参考音频'))
if text: pass
else: gr.Warning(i18n('请填入推理文本'))
# Validate intensity parameter
intensity = max(0.0, min(1.0, intensity)) # Clamp to [0, 1]
t = []
if prompt_text is None or len(prompt_text) == 0:
ref_free = True
@@ -623,6 +627,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
if(i_text in cache and if_freeze == True): pred_semantic = cache[i_text]
else:
with torch.no_grad():
# Apply intensity to temperature for more emotional variation
adjusted_temperature = temperature * (1.0 + intensity * 0.3) # 0% to 30% increase
pred_semantic, idx = t2s_model.model.infer_panel(
all_phoneme_ids,
all_phoneme_len,
@@ -631,7 +638,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
# prompt_phone_len=ph_offset,
top_k=top_k,
top_p=top_p,
temperature=temperature,
temperature=adjusted_temperature, # Modified temperature
early_stop_num=hz * max_sec,
)
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
@@ -648,7 +655,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
except:
traceback.print_exc()
if len(refers) == 0: refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)[0][0] # .cpu().detach().numpy()
# Apply intensity to speed for subtle tempo variation
adjusted_speed = speed * (1.0 + intensity * 0.1) # 0% to 10% speed increase
audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=adjusted_speed)[0][0]
else:
refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
@@ -675,7 +685,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
# print("fea_ref", fea_ref, fea_ref.shape)
# print("mel2", mel2)
mel2 = mel2.to(dtype)
fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed)
# Apply intensity to speed for v3
adjusted_speed = speed * (1.0 + intensity * 0.1)
fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, adjusted_speed)
# print("fea_todo", fea_todo)
# print("ge", ge.abs().mean())
cfm_resss = []
@@ -705,33 +718,98 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
if max_audio > 1:
audio = audio / max_audio
# Apply new parameters
# Apply intensity-based audio processing
audio = audio.to(torch.float32) # Ensure float32 for processing
# Creative intensity-based effects (careful not to create noise)
if intensity > 0:
# 1. Dynamic range compression (makes loud parts more consistent)
if intensity > 0.2:
compression_ratio = 1.0 + (intensity * 2.0) # 1:1 to 3:1 compression
threshold = 0.5 - (intensity * 0.2) # Lower threshold with higher intensity
above_threshold = torch.abs(audio) > threshold
audio[above_threshold] = torch.sign(audio[above_threshold]) * (
threshold + (torch.abs(audio[above_threshold]) - threshold) / compression_ratio
)
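# e.g. at intensity=0.5: compression_ratio=2.0, threshold=0.4,
# so a 0.8 sample maps to 0.4 + (0.8 - 0.4) / 2.0 = 0.6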
# 2. Harmonic enhancement (adds brightness without noise)
if intensity > 0.3:
# Simple harmonic exciter using soft clipping
harmonic_amount = intensity * 0.15 # Max 15% harmonics
harmonics = torch.tanh(audio * 3.0) * harmonic_amount
audio = audio + harmonics
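# tanh soft clipping adds mostly odd harmonics; the added component is
# bounded by harmonic_amount (at most 0.15 at intensity=1.0)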
# 3. Formant shifting (subtle pitch character change)
if intensity > 0.4:
# Placeholder: compute the intended shift factor without using librosa (to avoid artifacts)
formant_shift = 1.0 + (intensity * 0.05) # Max 5% formant shift; currently unused
# Actual formant shifting would need more complex processing (e.g. PSOLA or a phase vocoder)
# 4. Energy boost (frequency-dependent)
if intensity > 0.1:
# Boost energy proportionally to intensity
energy_boost = 1.0 + (intensity * 0.4) # Max 40% boost
audio = audio * energy_boost
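# Note: this boost stacks multiplicatively with the gain/volume stages below;
# the soft-knee limiter at the end of the chain catches any overshoot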
# Apply original parameters with intensity scaling
if loudness_boost:
# Boost loudness using RMS-based scaling (adjust multiplier as needed)
# Scale loudness boost with intensity
rms = torch.sqrt(torch.mean(audio ** 2))
audio = audio * (rms * 1.5) if rms > 0 else audio
boost_factor = 1.5 + (intensity * 0.5) # 1.5x to 2.0x based on intensity
audio = audio * (rms * boost_factor) if rms > 0 else audio
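# Note: scaling by the signal's own RMS amplifies material whose RMS exceeds
# 1/boost_factor and attenuates quieter material; a target-RMS scheme
# (target_rms / rms) would be a common alternative here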
if gain > 0:
# Apply gain in dB
audio = audio * (10 ** (gain / 20))
# Apply gain with intensity scaling
adjusted_gain = gain * (1.0 + intensity * 0.3)
audio = audio * (10 ** (adjusted_gain / 20))
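# e.g. gain=6 dB at intensity=0.5 becomes 7.8 dB,
# a linear factor of 10 ** (7.8 / 20) ≈ 2.45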
if normalize:
# Normalize to [-1, 1]
max_abs = torch.abs(audio).max()
audio = audio / max_abs if max_abs > 0 else audio
if energy_scale != 1.0:
# Scale energy
audio = audio * torch.sqrt(torch.tensor(energy_scale))
# Scale energy with intensity influence
adjusted_energy = energy_scale * (1.0 + intensity * 0.2)
audio = audio * torch.sqrt(torch.tensor(adjusted_energy))
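# sqrt because energy is proportional to amplitude squared:
# e.g. energy_scale=2.0 at intensity=0 scales amplitude by ≈ 1.414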
if volume_scale != 1.0:
# Direct volume scaling
audio = audio * volume_scale
# Direct volume scaling with intensity
adjusted_volume = volume_scale * (1.0 + intensity * 0.3)
audio = audio * adjusted_volume
if strain_effect > 0.0:
# Add strain effect (basic distortion)
audio = audio + (audio ** 2 * strain_effect)
# Add strain effect with intensity (more careful to avoid noise)
adjusted_strain = strain_effect * (1.0 + intensity * 0.5)
# Use softer distortion to avoid harsh noise
audio = audio + (torch.tanh(audio * 2.0) * adjusted_strain * 0.3)
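# torch.tanh bounds the added component to ±(adjusted_strain * 0.3),
# so the distortion stays bounded regardless of input level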
# Final clipping check after effects
# Advanced intensity-based post-processing
if intensity > 0.5:
# Subtle saturation for warmth (not harsh distortion)
saturation = intensity * 0.1
audio = audio * (1 - saturation) + torch.tanh(audio * 1.5) * saturation
# Final smart limiting to prevent clipping and maintain quality
# Use soft knee compression instead of hard clipping
soft_limit = 0.95
knee = 0.1
above_knee = torch.abs(audio) > (soft_limit - knee)
if torch.any(above_knee):
# Soft knee compression
audio_abs = torch.abs(audio)
audio_sign = torch.sign(audio)
# Calculate soft knee curve
knee_start = soft_limit - knee
compressed = torch.where(
audio_abs <= knee_start,
audio_abs,
knee_start + (audio_abs - knee_start) * 0.3 # Gentle compression above knee
)
audio = audio_sign * compressed
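# e.g. with knee_start = 0.95 - 0.1 = 0.85, a 1.0 peak maps to
# 0.85 + (1.0 - 0.85) * 0.3 = 0.895, below the 0.95 soft limit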
# Final safety check
max_audio = torch.abs(audio).max()
if max_audio > 1:
audio = audio / max_audio
if max_audio > 0.99:
audio = audio * (0.99 / max_audio)
audio_opt.append(audio)
audio_opt.append(zero_wav_torch) # zero_wav
@@ -740,6 +818,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
t1 = ttime()
print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
print(f"Intensity applied: {intensity}") # Log intensity level
audio_opt = torch.cat(audio_opt, 0) # np.concatenate
sr = hps.data.sampling_rate if model_version != "v3" else 24000
if if_sr == True and sr == 24000:
@@ -751,6 +831,34 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
audio_opt = audio_opt.cpu().detach().numpy()
yield sr, (audio_opt * 32767).astype(np.int16)
# Helper function for smooth intensity transitions
def apply_intensity_envelope(audio, intensity, emotion_type=None):
"""
Apply smooth intensity envelope to avoid abrupt changes
"""
if intensity == 0:
return audio
# Create smooth envelope
length = audio.shape[0]
envelope = torch.ones_like(audio)
# Attack and release times based on intensity (assumes the 24 kHz output rate)
attack_samples = int(0.01 * 24000 * (1 - intensity * 0.5)) # Faster attack with higher intensity
release_samples = int(0.05 * 24000)
# Apply envelope; create the curves on the audio's device/dtype to avoid a mismatch
if attack_samples > 0 and length > attack_samples:
attack_curve = torch.linspace(0.7, 1.0, attack_samples, device=audio.device, dtype=audio.dtype)
envelope[:attack_samples] = attack_curve
if release_samples > 0 and length > release_samples:
release_curve = torch.linspace(1.0, 0.8, release_samples, device=audio.device, dtype=audio.dtype)
envelope[-release_samples:] = release_curve
return audio * envelope
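# Hypothetical usage (the helper is not wired into get_tts_wav by this commit):
# audio = apply_intensity_envelope(audio, intensity)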
def split(todo_text):
todo_text = todo_text.replace("……", "").replace("——", "")
if todo_text[-1] not in splits:

api.py

@@ -915,6 +915,7 @@ def version_4_cli(
volume_scale=1.0,
strain_effect=0.0,
shouting_type="normal", # normal, loud, soft, whisper
intensity=0,
):
# Create a temporary buffer to store the audio
audio_buffer = io.BytesIO()
@@ -976,7 +977,8 @@ def version_4_cli(
normalize=normalize,
energy_scale=energy_scale,
volume_scale=volume_scale,
strain_effect=strain_effect
strain_effect=strain_effect,
intensity=intensity
)
# Get the last audio data and sample rate from synthesis result
@@ -1015,6 +1017,7 @@ async def tts_endpoint(
volume_scale: str = "1.0", # Accept as string from URL, convert to float
strain_effect: str = "0.0", # Accept as string from URL, convert to float
shouting_type: str = "normal", # Accept as string from URL, convert to appropriate type
intensity: float = 0.0,
):
if character == "kurari" or character == "Kurari":
prompt_text = "おはよう〜。今日はどんな1日過ごすーくらりはね〜いつでもあなたの味方だよ"
@@ -1073,7 +1076,8 @@ async def tts_endpoint(
energy_scale=energy_scale,
volume_scale=volume_scale,
strain_effect=strain_effect,
shouting_type= shouting_type
shouting_type=shouting_type,
intensity=intensity
)
if audio_buffer: