mirror of https://github.com/RVC-Boss/GPT-SoVITS.git, synced 2025-08-10 10:09:51 +08:00
add intensity param
This commit is contained in:
parent 8a0346e795
commit 598afaaad9

@@ -8,7 +8,7 @@ from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights
 i18n = I18nAuto()
 
 def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_language, target_text, text_language, output_path, loudness_boost=False, gain=0, normalize=False,
-               energy_scale=1.0, volume_scale=1.0, strain_effect=0.0):
+               energy_scale=1.0, volume_scale=1.0, strain_effect=0.0, intensity=0):
 
     # Change model weights
     print("Changing model weights about to be called...")
@@ -31,7 +31,8 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_
         normalize=normalize,
         energy_scale=energy_scale,
         volume_scale=volume_scale,
-        strain_effect=strain_effect)
+        strain_effect=strain_effect,
+        intensity=intensity)
 
     result_list = list(synthesis_result)
 
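Note: the CLI wrapper simply threads the new keyword through to the TTS core in GPT_SoVITS.inference_webui. A minimal sketch of that forwarding pattern, using stand-in stubs rather than the real functions (which need loaded model weights to run):

    def get_tts_wav_stub(text, intensity=0):
        return f"synthesized {text!r} at intensity {intensity}"

    def synthesize_stub(target_text, intensity=0):
        # The wrapper forwards the new keyword untouched, as in the hunk above.
        return get_tts_wav_stub(target_text, intensity=intensity)

    print(synthesize_stub("hello", intensity=0.7))  # synthesized 'hello' at intensity 0.7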
@@ -520,12 +520,16 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                 top_k=15, top_p=1, temperature=1, ref_free=False, speed=1, if_freeze=False,
                 inp_refs=None, sample_steps=16, if_sr=False, pause_second=0.3,
                 loudness_boost=False, gain=0, normalize=False, energy_scale=1.0,
-                volume_scale=1.0, strain_effect=0.0):
+                volume_scale=1.0, strain_effect=0.0, intensity=0):  # Added intensity parameter
     global cache
     if ref_wav_path: pass
     else: gr.Warning(i18n('请上传参考音频'))
     if text: pass
     else: gr.Warning(i18n('请填入推理文本'))
 
+    # Validate intensity parameter
+    intensity = max(0.0, min(1.0, intensity))  # Clamp to [0, 1]
+
     t = []
     if prompt_text is None or len(prompt_text) == 0:
         ref_free = True
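Note: the clamp coerces out-of-range API input to the nearest bound instead of rejecting it. A quick check of the expression in isolation:

    for raw in (-0.5, 0, 0.4, 1, 2.3):
        print(raw, "->", max(0.0, min(1.0, raw)))
    # -0.5 -> 0.0, 0 -> 0.0, 0.4 -> 0.4, 1 -> 1.0, 2.3 -> 1.0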
@@ -623,6 +627,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         if(i_text in cache and if_freeze == True): pred_semantic = cache[i_text]
         else:
             with torch.no_grad():
+                # Apply intensity to temperature for more emotional variation
+                adjusted_temperature = temperature * (1.0 + intensity * 0.3)  # 0% to 30% increase
+
                 pred_semantic, idx = t2s_model.model.infer_panel(
                     all_phoneme_ids,
                     all_phoneme_len,
@@ -631,7 +638,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                     # prompt_phone_len=ph_offset,
                     top_k=top_k,
                     top_p=top_p,
-                    temperature=temperature,
+                    temperature=adjusted_temperature,  # Modified temperature
                     early_stop_num=hz * max_sec,
                 )
             pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
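Note: the sampling temperature is the only decoder knob touched; at full intensity it runs 30% hotter, which flattens the token distribution for more varied prosody. The scaling on its own:

    temperature = 1.0
    for intensity in (0.0, 0.5, 1.0):
        print(intensity, temperature * (1.0 + intensity * 0.3))
    # 0.0 1.0, 0.5 1.15, 1.0 1.3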
@@ -648,7 +655,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
             except:
                 traceback.print_exc()
             if len(refers) == 0: refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
-            audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)[0][0]  # .cpu().detach().numpy()
+            # Apply intensity to speed for subtle tempo variation
+            adjusted_speed = speed * (1.0 + intensity * 0.1)  # 0% to 10% speed increase
+            audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=adjusted_speed)[0][0]
         else:
             refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
             phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
@@ -675,7 +685,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
             # print("fea_ref", fea_ref, fea_ref.shape)
             # print("mel2", mel2)
             mel2 = mel2.to(dtype)
-            fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed)
+            # Apply intensity to speed for v3
+            adjusted_speed = speed * (1.0 + intensity * 0.1)
+            fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, adjusted_speed)
             # print("fea_todo", fea_todo)
             # print("ge", ge.abs().mean())
             cfm_resss = []
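Note: both decode paths (the v1/v2 branch above and this v3 branch) share the same tempo formula, a speed-up of at most 10% at full intensity:

    speed = 1.0
    print([round(speed * (1.0 + i * 0.1), 2) for i in (0.0, 0.5, 1.0)])  # [1.0, 1.05, 1.1]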
@@ -705,33 +718,98 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         if max_audio > 1:
             audio = audio / max_audio
 
-        # Apply new parameters
+        # Apply intensity-based audio processing
         audio = audio.to(torch.float32)  # Ensure float32 for processing
 
+        # Creative intensity-based effects (careful not to create noise)
+        if intensity > 0:
+            # 1. Dynamic range compression (makes loud parts more consistent)
+            if intensity > 0.2:
+                compression_ratio = 1.0 + (intensity * 2.0)  # 1:1 to 3:1 compression
+                threshold = 0.5 - (intensity * 0.2)  # Lower threshold with higher intensity
+                above_threshold = torch.abs(audio) > threshold
+                audio[above_threshold] = torch.sign(audio[above_threshold]) * (
+                    threshold + (torch.abs(audio[above_threshold]) - threshold) / compression_ratio
+                )
+
+            # 2. Harmonic enhancement (adds brightness without noise)
+            if intensity > 0.3:
+                # Simple harmonic exciter using soft clipping
+                harmonic_amount = intensity * 0.15  # Max 15% harmonics
+                harmonics = torch.tanh(audio * 3.0) * harmonic_amount
+                audio = audio + harmonics
+
+            # 3. Formant shifting (subtle pitch character change)
+            if intensity > 0.4:
+                # Apply subtle pitch bend without using librosa (to avoid artifacts)
+                formant_shift = 1.0 + (intensity * 0.05)  # Max 5% formant shift
+                # This is a placeholder - actual formant shifting would need more complex processing
+
+            # 4. Energy boost (frequency-dependent)
+            if intensity > 0.1:
+                # Boost energy proportionally to intensity
+                energy_boost = 1.0 + (intensity * 0.4)  # Max 40% boost
+                audio = audio * energy_boost
+
+        # Apply original parameters with intensity scaling
         if loudness_boost:
-            # Boost loudness using RMS-based scaling (adjust multiplier as needed)
+            # Scale loudness boost with intensity
             rms = torch.sqrt(torch.mean(audio ** 2))
-            audio = audio * (rms * 1.5) if rms > 0 else audio
+            boost_factor = 1.5 + (intensity * 0.5)  # 1.5x to 2.0x based on intensity
+            audio = audio * (rms * boost_factor) if rms > 0 else audio
         if gain > 0:
-            # Apply gain in dB
-            audio = audio * (10 ** (gain / 20))
+            # Apply gain with intensity scaling
+            adjusted_gain = gain * (1.0 + intensity * 0.3)
+            audio = audio * (10 ** (adjusted_gain / 20))
         if normalize:
             # Normalize to [-1, 1]
             max_abs = torch.abs(audio).max()
             audio = audio / max_abs if max_abs > 0 else audio
         if energy_scale != 1.0:
-            # Scale energy
-            audio = audio * torch.sqrt(torch.tensor(energy_scale))
+            # Scale energy with intensity influence
+            adjusted_energy = energy_scale * (1.0 + intensity * 0.2)
+            audio = audio * torch.sqrt(torch.tensor(adjusted_energy))
         if volume_scale != 1.0:
-            # Direct volume scaling
-            audio = audio * volume_scale
+            # Direct volume scaling with intensity
+            adjusted_volume = volume_scale * (1.0 + intensity * 0.3)
+            audio = audio * adjusted_volume
         if strain_effect > 0.0:
-            # Add strain effect (basic distortion)
-            audio = audio + (audio ** 2 * strain_effect)
+            # Add strain effect with intensity (more careful to avoid noise)
+            adjusted_strain = strain_effect * (1.0 + intensity * 0.5)
+            # Use softer distortion to avoid harsh noise
+            audio = audio + (torch.tanh(audio * 2.0) * adjusted_strain * 0.3)
 
-        # Final clipping check after effects
+        # Advanced intensity-based post-processing
+        if intensity > 0.5:
+            # Subtle saturation for warmth (not harsh distortion)
+            saturation = intensity * 0.1
+            audio = audio * (1 - saturation) + torch.tanh(audio * 1.5) * saturation
+
+        # Final smart limiting to prevent clipping and maintain quality
+        # Use soft knee compression instead of hard clipping
+        soft_limit = 0.95
+        knee = 0.1
+
+        above_knee = torch.abs(audio) > (soft_limit - knee)
+        if torch.any(above_knee):
+            # Soft knee compression
+            audio_abs = torch.abs(audio)
+            audio_sign = torch.sign(audio)
+
+            # Calculate soft knee curve
+            knee_start = soft_limit - knee
+            compressed = torch.where(
+                audio_abs <= knee_start,
+                audio_abs,
+                knee_start + (audio_abs - knee_start) * 0.3  # Gentle compression above knee
+            )
+
+            audio = audio_sign * compressed
+
+        # Final safety check
         max_audio = torch.abs(audio).max()
-        if max_audio > 1:
-            audio = audio / max_audio
+        if max_audio > 0.99:
+            audio = audio * (0.99 / max_audio)
 
         audio_opt.append(audio)
         audio_opt.append(zero_wav_torch)  # zero_wav
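Note: the per-segment effects chain is easiest to sanity-check in isolation. A self-contained sketch of its two compression stages, assuming only torch (the function names here are illustrative, not from the codebase):

    import torch

    def intensity_compress(audio: torch.Tensor, intensity: float) -> torch.Tensor:
        # Stage 1 above: samples over an intensity-dependent threshold are
        # scaled down by a ratio that grows from 1:1 to 3:1.
        ratio = 1.0 + intensity * 2.0
        threshold = 0.5 - intensity * 0.2
        out = audio.clone()
        above = torch.abs(out) > threshold
        out[above] = torch.sign(out[above]) * (
            threshold + (torch.abs(out[above]) - threshold) / ratio
        )
        return out

    def soft_knee_limit(audio: torch.Tensor, soft_limit: float = 0.95, knee: float = 0.1) -> torch.Tensor:
        # Final stage above: a gentle 0.3x slope past the knee instead of hard
        # clipping; the 0.99 rescale afterwards still catches any overshoot.
        knee_start = soft_limit - knee
        audio_abs = torch.abs(audio)
        compressed = torch.where(
            audio_abs <= knee_start,
            audio_abs,
            knee_start + (audio_abs - knee_start) * 0.3,
        )
        return torch.sign(audio) * compressed

    x = torch.tensor([0.2, 0.7, 1.4, -1.2])
    print(soft_knee_limit(intensity_compress(x, intensity=0.8)))

Unlike this sketch, the hunk mutates audio in place through the boolean mask, which is safe there because audio is the per-segment working buffer.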
@@ -740,6 +818,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     t1 = ttime()
 
     print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
+    print(f"Intensity applied: {intensity}")  # Log intensity level
 
     audio_opt = torch.cat(audio_opt, 0)  # np.concatenate
     sr = hps.data.sampling_rate if model_version != "v3" else 24000
     if if_sr == True and sr == 24000:
@@ -751,6 +831,34 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     audio_opt = audio_opt.cpu().detach().numpy()
     yield sr, (audio_opt * 32767).astype(np.int16)
 
 
+# Helper function for smooth intensity transitions
+def apply_intensity_envelope(audio, intensity, emotion_type=None):
+    """
+    Apply smooth intensity envelope to avoid abrupt changes
+    """
+    if intensity == 0:
+        return audio
+
+    # Create smooth envelope
+    length = audio.shape[0]
+    envelope = torch.ones_like(audio)
+
+    # Attack and release times based on intensity
+    attack_samples = int(0.01 * 24000 * (1 - intensity * 0.5))  # Faster attack with higher intensity
+    release_samples = int(0.05 * 24000)
+
+    # Apply envelope
+    if attack_samples > 0:
+        attack_curve = torch.linspace(0.7, 1.0, attack_samples)
+        envelope[:attack_samples] = attack_curve
+
+    if release_samples > 0 and length > release_samples:
+        release_curve = torch.linspace(1.0, 0.8, release_samples)
+        envelope[-release_samples:] = release_curve
+
+    return audio * envelope
+
+
 def split(todo_text):
     todo_text = todo_text.replace("……", "。").replace("——", ",")
     if todo_text[-1] not in splits:
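Note: hypothetical usage of the new helper, assuming apply_intensity_envelope and torch are in scope; the 24000 inside the helper hard-codes the 24 kHz output rate, and the emotion_type argument is accepted but not yet used:

    import torch

    audio = torch.randn(24000) * 0.1  # one second at 24 kHz
    shaped = apply_intensity_envelope(audio, intensity=0.8)
    # attack: first int(240 * 0.6) = 144 samples ramp from 0.7 to 1.0
    # release: last 1200 samples taper from 1.0 to 0.8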
api.py (8 changed lines)
@@ -915,6 +915,7 @@ def version_4_cli(
     volume_scale=1.0,
     strain_effect=0.0,
     shouting_type="normal",  # normal, loud, soft, whisper
+    intensity=0.0,
 ):
     # Create a temporary buffer to store the audio
     audio_buffer = io.BytesIO()
@@ -976,7 +977,8 @@ def version_4_cli(
         normalize=normalize,
         energy_scale=energy_scale,
         volume_scale=volume_scale,
-        strain_effect=strain_effect
+        strain_effect=strain_effect,
+        intensity=intensity
     )
 
     # Get the last audio data and sample rate from synthesis result
@@ -1015,6 +1017,7 @@ async def tts_endpoint(
     volume_scale: str = "1.0",  # Accept as string from URL, convert to float
     strain_effect: str = "0.0",  # Accept as string from URL, convert to float
     shouting_type: str = "normal",  # Accept as string from URL, convert to appropriate type
+    intensity: float = 0.0,
 ):
     if character == "kurari" or character == "Kurari":
         prompt_text = "おはよう〜。今日はどんな1日過ごすー?くらりはね〜いつでもあなたの味方だよ"
@@ -1073,7 +1076,8 @@ async def tts_endpoint(
         energy_scale=energy_scale,
         volume_scale=volume_scale,
         strain_effect=strain_effect,
-        shouting_type=shouting_type
+        shouting_type=shouting_type,
+        intensity=intensity
     )
 
     if audio_buffer:
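Note: a hypothetical client call, assuming the app serves this endpoint over HTTP; the diff does not show the route decorator, so the host, port, and path below are placeholders, and only the character and intensity parameters are spelled out:

    import requests

    resp = requests.get(
        "http://127.0.0.1:9880/tts",  # placeholder host/port/route
        params={"character": "kurari", "intensity": 0.7},  # framework coerces intensity to float
    )
    with open("out.wav", "wb") as f:  # placeholder filename; format depends on the endpoint
        f.write(resp.content)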