mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-08-26 23:40:00 +08:00
Merge pull request #3 from JarodMica/upstream-sync
update streaming and add reference code
This commit is contained in:
commit
df6da098f5
1
.gitignore
vendored
1
.gitignore
vendored
@ -21,6 +21,7 @@ ffprobe*
|
|||||||
cfg.json
|
cfg.json
|
||||||
speakers.json
|
speakers.json
|
||||||
ref_audios
|
ref_audios
|
||||||
|
local_files/
|
||||||
tools/AP_BWE_main/24kto48k/*
|
tools/AP_BWE_main/24kto48k/*
|
||||||
!tools/AP_BWE_main/24kto48k/readme.txt
|
!tools/AP_BWE_main/24kto48k/readme.txt
|
||||||
|
|
||||||
|
@ -941,6 +941,8 @@ class Text2SemanticDecoder(nn.Module):
|
|||||||
prompts: torch.LongTensor,
|
prompts: torch.LongTensor,
|
||||||
bert_feature: torch.LongTensor,
|
bert_feature: torch.LongTensor,
|
||||||
cumulation_amount: int,
|
cumulation_amount: int,
|
||||||
|
dynamic_cumulatation: bool,
|
||||||
|
dynamic_cumulatation_amount: int,
|
||||||
top_k: int = -100,
|
top_k: int = -100,
|
||||||
top_p: int = 100,
|
top_p: int = 100,
|
||||||
early_stop_num: int = -1,
|
early_stop_num: int = -1,
|
||||||
@ -1038,6 +1040,8 @@ class Text2SemanticDecoder(nn.Module):
|
|||||||
|
|
||||||
if tokens_since_last_yield >= cumulation_amount:
|
if tokens_since_last_yield >= cumulation_amount:
|
||||||
generated_tokens = y[:, last_yield_idx:]
|
generated_tokens = y[:, last_yield_idx:]
|
||||||
|
if dynamic_cumulatation:
|
||||||
|
cumulation_amount += dynamic_cumulatation_amount
|
||||||
yield generated_tokens
|
yield generated_tokens
|
||||||
last_yield_idx = y.shape[1]
|
last_yield_idx = y.shape[1]
|
||||||
tokens_since_last_yield = 0
|
tokens_since_last_yield = 0
|
||||||
|
@ -1604,6 +1604,8 @@ class TTS:
|
|||||||
search_length = inputs.get("search_length", 32000 * 5)
|
search_length = inputs.get("search_length", 32000 * 5)
|
||||||
num_zeroes = inputs.get("num_zeroes", 5)
|
num_zeroes = inputs.get("num_zeroes", 5)
|
||||||
cumulation_amount = inputs.get("cumulation_amount", 50)
|
cumulation_amount = inputs.get("cumulation_amount", 50)
|
||||||
|
dynamic_cumulatation = inputs.get("dynamic_cumulatation", False)
|
||||||
|
dynamic_cumulatation_amount = inputs.get("dynamic_cumulatation_amount", 10)
|
||||||
# Prepare reference audio
|
# Prepare reference audio
|
||||||
if ref_audio_path and ref_audio_path != self.prompt_cache["ref_audio_path"]:
|
if ref_audio_path and ref_audio_path != self.prompt_cache["ref_audio_path"]:
|
||||||
if not os.path.exists(ref_audio_path):
|
if not os.path.exists(ref_audio_path):
|
||||||
@ -1672,6 +1674,8 @@ class TTS:
|
|||||||
prompt_sem,
|
prompt_sem,
|
||||||
all_bert.unsqueeze(0).to(self.configs.device),
|
all_bert.unsqueeze(0).to(self.configs.device),
|
||||||
cumulation_amount=cumulation_amount,
|
cumulation_amount=cumulation_amount,
|
||||||
|
dynamic_cumulatation=dynamic_cumulatation,
|
||||||
|
dynamic_cumulatation_amount=dynamic_cumulatation_amount,
|
||||||
top_k=top_k,
|
top_k=top_k,
|
||||||
top_p=top_p,
|
top_p=top_p,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
|
@ -684,8 +684,12 @@ def get_tts_wav(
|
|||||||
sample_steps=8,
|
sample_steps=8,
|
||||||
if_sr=False,
|
if_sr=False,
|
||||||
pause_second=0.3,
|
pause_second=0.3,
|
||||||
|
seed_checkbox=False,
|
||||||
|
seed_text_box=None,
|
||||||
):
|
):
|
||||||
global cache
|
global cache
|
||||||
|
if seed_checkbox:
|
||||||
|
set_seed(seed_text_box)
|
||||||
if ref_wav_path:
|
if ref_wav_path:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
@ -1204,6 +1208,20 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
|||||||
temperature = gr.Slider(
|
temperature = gr.Slider(
|
||||||
minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1
|
minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1
|
||||||
)
|
)
|
||||||
|
seed_checkbox = gr.Checkbox(
|
||||||
|
label="Seed",
|
||||||
|
value=False,
|
||||||
|
interactive=True,
|
||||||
|
scale=1,
|
||||||
|
)
|
||||||
|
seed_text_box = gr.Textbox(
|
||||||
|
label="Seed",
|
||||||
|
value="-1",
|
||||||
|
lines=1,
|
||||||
|
max_lines=1,
|
||||||
|
scale=1,
|
||||||
|
)
|
||||||
|
|
||||||
# with gr.Column():
|
# with gr.Column():
|
||||||
# gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。"))
|
# gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。"))
|
||||||
# phoneme=gr.Textbox(label=i18n("音素框"), value="")
|
# phoneme=gr.Textbox(label=i18n("音素框"), value="")
|
||||||
@ -1231,6 +1249,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
|||||||
sample_steps,
|
sample_steps,
|
||||||
if_sr_Checkbox,
|
if_sr_Checkbox,
|
||||||
pause_second_slider,
|
pause_second_slider,
|
||||||
|
seed_checkbox,
|
||||||
|
seed_text_box,
|
||||||
],
|
],
|
||||||
[output],
|
[output],
|
||||||
)
|
)
|
||||||
|
BIN
GPT_SoVITS/text/ja_userdic/user.dict
Normal file
BIN
GPT_SoVITS/text/ja_userdic/user.dict
Normal file
Binary file not shown.
1
GPT_SoVITS/text/ja_userdic/userdict.md5
Normal file
1
GPT_SoVITS/text/ja_userdic/userdict.md5
Normal file
@ -0,0 +1 @@
|
|||||||
|
878b3caf4d1cd7c2927c26e85072a2f5
|
65
call_example.py
Normal file
65
call_example.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
'''
|
||||||
|
If you want to generate a single audio file with GPT-SoVITS, you can use this script.
|
||||||
|
The run() method of the TTS class in GPT_SoVITS/TTS_infer_pack/TTS.py generates the audio; it is a generator, so it must be consumed in a loop.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import queue
|
||||||
|
import threading
|
||||||
|
import numpy as np
|
||||||
|
import sounddevice as sd
|
||||||
|
import wave
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
now_dir = os.getcwd()
|
||||||
|
sys.path.append(now_dir)
|
||||||
|
sys.path.append(os.path.join(now_dir, 'GPT_SoVITS'))
|
||||||
|
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
|
||||||
|
|
||||||
|
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
|
||||||
|
|
||||||
|
def main():
    """Interactively synthesize the sample text and save every fragment to disk.

    The TTS pipeline is loaded once. Afterwards the function loops forever:
    each time Enter is pressed, ``pipeline.run`` is consumed and every
    ``(sample_rate, audio)`` pair it yields is written to the first unused
    ``tts_outputs/inference_<n>.wav`` path.
    """
    # create output directory for inference outputs
    output_dir = "tts_outputs"
    os.makedirs(output_dir, exist_ok=True)

    config_path = 'GPT_SoVITS/configs/tts_infer.yaml'  # path to the config file
    t2s_ckpt = 'GPT_SoVITS/pretrained_models/s1v3.ckpt'  # path to the t2s checkpoint
    vits_ckpt = 'GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth'  # path to the vits checkpoint
    ref_audio = 'local_files/test.wav'  # path to the reference audio file
    prompt_text = 'Flesh rots, carrion feeds the scavengers, and the bones remain. All part of the cycle of life.'  # prompt text
    text = "Hey there! This is a test of the TTS streaming. Is there anything that I can do to help you out? Or maybe you'd just like a quick snack... If not, that's okay too. I'm just here to chat and maybe become friends as I don't meet many people in this world."  # text to be converted to audio
    seed = 1  # -1 is random seed

    # Build the pipeline and load both checkpoints.
    pipeline = TTS(TTS_Config(config_path))
    pipeline.init_t2s_weights(t2s_ckpt)
    pipeline.init_vits_weights(vits_ckpt)

    inputs = {
        "text": text,
        "text_lang": "en",
        "ref_audio_path": ref_audio,
        "prompt_text": prompt_text,
        "prompt_lang": "en",
        "top_k": 5,
        "top_p": 1.0,
        "temperature": 1.0,
        "sample_steps": 10,
        "seed": seed,
    }

    def first_free_path(start):
        """Return ``(index, path)`` of the first non-existing output file at or after ``start``."""
        n = start
        candidate = os.path.join(output_dir, f"inference_{n}.wav")
        while os.path.exists(candidate):
            n += 1
            candidate = os.path.join(output_dir, f"inference_{n}.wav")
        return n, candidate

    while True:
        input("Enter to generate audio")
        gen = pipeline.run(inputs)
        # The search restarts from 0 on every run; files from earlier runs
        # are skipped, never overwritten.
        idx = 0
        for sr, fragment in gen:
            idx, out_path = first_free_path(idx)
            sf.write(out_path, fragment, sr)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
112
zc_streaming_example.py
Normal file
112
zc_streaming_example.py
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import queue
|
||||||
|
import threading
|
||||||
|
import numpy as np
|
||||||
|
import sounddevice as sd
|
||||||
|
import wave
|
||||||
|
import time
|
||||||
|
|
||||||
|
now_dir = os.getcwd()
|
||||||
|
sys.path.append(now_dir)
|
||||||
|
sys.path.append(os.path.join(now_dir, 'GPT_SoVITS'))
|
||||||
|
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
|
||||||
|
|
||||||
|
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
|
||||||
|
|
||||||
|
def audio_playback_thread(audio_queue: queue.Queue, sample_rate: int):
    """
    Play audio fragments from ``audio_queue`` on one continuous output stream.

    Meant to run in a background thread. Fragments are written, in arrival
    order, to a mono float32 ``sd.OutputStream``; a ``None`` item is the
    sentinel that ends playback. ``task_done()`` is called for every item
    taken from the queue (the sentinel included), so a producer may call
    ``audio_queue.join()``.

    Args:
        audio_queue: Queue of NumPy audio arrays, terminated by ``None``.
        sample_rate: Sample rate in Hz used to open the output stream.
    """
    try:
        # Configure this stream explicitly instead of mutating the
        # process-wide ``sd.default`` settings. The context manager starts
        # the stream on entry and stops/closes it on exit, even on error.
        with sd.OutputStream(samplerate=sample_rate, channels=1, dtype='float32') as stream:
            while True:
                audio_fragment = audio_queue.get()
                try:
                    if audio_fragment is None:
                        # Sentinel received, end thread
                        break
                    data = np.asarray(audio_fragment)
                    if np.issubdtype(data.dtype, np.integer):
                        # Integer PCM: scale by the int16 full-scale value
                        # to land in [-1, 1].
                        data = data.astype(np.float32) / 32768.0
                    else:
                        # Floating-point audio is only cast; dividing it by
                        # 32768 as well would make it inaudible.
                        # NOTE(review): assumes float fragments are already
                        # in [-1, 1] - confirm against the producer.
                        data = data.astype(np.float32)
                    stream.write(data)
                finally:
                    audio_queue.task_done()
    finally:
        print("Playback finished")
|
||||||
|
|
||||||
|
def main():
    """Interactively stream synthesized speech to the default audio device.

    The TTS pipeline is loaded once. Each time Enter is pressed, the
    ``(sample_rate, fragment)`` pairs yielded by ``pipeline.run_generator``
    are handed to a background playback thread as soon as they are
    produced. The loop (and the program) ends when the generator yields
    nothing at all.
    """
    config_path = 'GPT_SoVITS/configs/tts_infer.yaml'  # path to the config file
    t2s_ckpt = 'GPT_SoVITS/pretrained_models/s1v3.ckpt'  # path to the t2s checkpoint
    vits_ckpt = 'GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth'  # path to the vits checkpoint
    ref_audio = 'local_files/test.wav'  # path to the reference audio file
    prompt_text = 'Flesh rots, carrion feeds the scavengers, and the bones remain. All part of the cycle of life.'  # reference_audio transcription
    text = "Today we are going to be testing TTS streaming audio. Is there anything that I can do to help you out? Or maybe you'd just like a quick snack... If not, that's okay too. I'm just here to chat and maybe become friends as I don't meet many people in this world."  # text to be converted to audio
    seed = 1  # -1 is random seed

    # Initialize the pipeline
    cfg = TTS_Config(config_path)
    pipeline = TTS(cfg)
    pipeline.init_t2s_weights(t2s_ckpt)
    pipeline.init_vits_weights(vits_ckpt)

    inputs = {
        "text": text,
        "text_lang": "en",
        "ref_audio_path": ref_audio,
        "prompt_text": prompt_text,
        "prompt_lang": "en",
        "top_k": 5,
        "top_p": 1.0,
        "temperature": 1.0,
        "cumulation_amount": 10,
        "search_length": 32000*2,
        "num_zeroes": 5,
        "sample_steps": 8,
        "dynamic_cumulatation": True,
        "dynamic_cumulatation_amount": 20,
        "seed": seed,
    }

    while True:
        input("enter to continue")
        # Initialize generator and fetch first fragment to get sample rate
        gen = pipeline.run_generator(inputs)
        start = time.time()
        try:
            sr, fragment = next(gen)
        except StopIteration:
            print("No audio fragments generated.")
            break

        # Create audio playback queue and start thread with sample rate
        audio_queue = queue.Queue()
        playback_thread = threading.Thread(
            target=audio_playback_thread, args=(audio_queue, sr)
        )
        playback_thread.start()

        try:
            # Queue the first fragment immediately. Deferring it until the
            # second fragment arrives would add latency and would drop the
            # audio entirely when only one fragment is generated.
            audio_queue.put(fragment)
            end = time.time()
            print(f"Time taken to put first fragment: {end - start}")

            for _, fragment in gen:
                audio_queue.put(fragment)
        finally:
            # Always send the sentinel, even if generation failed, so the
            # non-daemon playback thread cannot block forever on get().
            audio_queue.put(None)
            # The thread exits only after consuming everything up to the
            # sentinel, so joining it also waits for all queued audio; it
            # additionally returns if the thread died on a playback error.
            playback_thread.join()

        print("Audio playback complete")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
Loading…
x
Reference in New Issue
Block a user