diff --git a/.gitignore b/.gitignore
index 0bb4e0bf..68b58f54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ ffprobe*
 cfg.json
 speakers.json
 ref_audios
+local_files/
 tools/AP_BWE_main/24kto48k/*
 !tools/AP_BWE_main/24kto48k/readme.txt
diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py
index 6c2bf9ec..a5e567cc 100644
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@@ -941,6 +941,8 @@ class Text2SemanticDecoder(nn.Module):
         prompts: torch.LongTensor,
         bert_feature: torch.LongTensor,
         cumulation_amount: int,
+        dynamic_cumulation: bool,
+        dynamic_cumulation_amount: int,
         top_k: int = -100,
         top_p: int = 100,
         early_stop_num: int = -1,
@@ -1038,6 +1040,9 @@ class Text2SemanticDecoder(nn.Module):
             if tokens_since_last_yield >= cumulation_amount:
                 generated_tokens = y[:, last_yield_idx:]
+                if dynamic_cumulation:
+                    # grow the accumulation window so each later fragment is longer
+                    cumulation_amount += dynamic_cumulation_amount
                 yield generated_tokens
                 last_yield_idx = y.shape[1]
                 tokens_since_last_yield = 0
diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index 201cb045..641774a9 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -1604,6 +1604,8 @@ class TTS:
         search_length = inputs.get("search_length", 32000 * 5)
         num_zeroes = inputs.get("num_zeroes", 5)
         cumulation_amount = inputs.get("cumulation_amount", 50)
+        dynamic_cumulation = inputs.get("dynamic_cumulation", False)
+        dynamic_cumulation_amount = inputs.get("dynamic_cumulation_amount", 10)
         # Prepare reference audio
         if ref_audio_path and ref_audio_path != self.prompt_cache["ref_audio_path"]:
             if not os.path.exists(ref_audio_path):
@@ -1672,6 +1674,8 @@ class TTS:
             prompt_sem,
             all_bert.unsqueeze(0).to(self.configs.device),
             cumulation_amount=cumulation_amount,
+            dynamic_cumulation=dynamic_cumulation,
+            dynamic_cumulation_amount=dynamic_cumulation_amount,
             top_k=top_k,
             top_p=top_p,
             temperature=temperature,
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 0ca03f5d..0554dabc 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -684,8 +684,12 @@ def get_tts_wav(
     sample_steps=8,
     if_sr=False,
     pause_second=0.3,
+    seed_checkbox=False,
+    seed_text_box=None,
 ):
     global cache
+    if seed_checkbox:
+        set_seed(int(seed_text_box))
     if ref_wav_path:
         pass
     else:
@@ -1204,6 +1208,20 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                     temperature = gr.Slider(
                         minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1
                     )
+                    seed_checkbox = gr.Checkbox(
+                        label="Set seed",
+                        value=False,
+                        interactive=True,
+                        scale=1,
+                    )
+                    seed_text_box = gr.Textbox(
+                        label="Seed",
+                        value="-1",
+                        lines=1,
+                        max_lines=1,
+                        scale=1,
+                    )
+
                 # with gr.Column():
                 #     gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。"))
                 #     phoneme=gr.Textbox(label=i18n("音素框"), value="")
@@ -1231,6 +1249,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
             sample_steps,
             if_sr_Checkbox,
             pause_second_slider,
+            seed_checkbox,
+            seed_text_box,
         ],
         [output],
     )
diff --git a/GPT_SoVITS/text/ja_userdic/user.dict b/GPT_SoVITS/text/ja_userdic/user.dict
new file mode 100644
index 00000000..6ddcfef6
Binary files /dev/null and b/GPT_SoVITS/text/ja_userdic/user.dict differ
diff --git a/GPT_SoVITS/text/ja_userdic/userdict.md5 b/GPT_SoVITS/text/ja_userdic/userdict.md5
new file mode 100644
index 00000000..d38206b3
--- /dev/null
+++ b/GPT_SoVITS/text/ja_userdic/userdict.md5
@@ -0,0 +1 @@
+878b3caf4d1cd7c2927c26e85072a2f5
\ No newline at end of file
diff --git a/call_example.py b/call_example.py
new file mode 100644
index 00000000..aed2d9e4
--- /dev/null
+++ b/call_example.py
@@ -0,0 +1,61 @@
+'''
+If you want to generate a single audio file with GPT-SoVITS, you can use this script.
+The TTS.run() method in GPT_SoVITS/TTS_infer_pack/TTS.py generates the audio; it is a generator, so it must be consumed in a loop.
+'''
+
+import os
+import sys
+import soundfile as sf
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+sys.path.append(os.path.join(now_dir, 'GPT_SoVITS'))
+# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+
+from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
+
+def main():
+    # create output directory for inference outputs
+    output_dir = "tts_outputs"
+    os.makedirs(output_dir, exist_ok=True)
+
+    config_path = 'GPT_SoVITS/configs/tts_infer.yaml'  # path to the config file
+    t2s_ckpt = 'GPT_SoVITS/pretrained_models/s1v3.ckpt'  # path to the t2s checkpoint
+    vits_ckpt = 'GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth'  # path to the vits checkpoint
+    ref_audio = 'local_files/test.wav'  # path to the reference audio file
+    prompt_text = 'Flesh rots, carrion feeds the scavengers, and the bones remain. All part of the cycle of life.'  # transcription of the reference audio
+    text = "Hey there! This is a test of the TTS streaming. Is there anything that I can do to help you out? Or maybe you'd just like a quick snack... If not, that's okay too. I'm just here to chat and maybe become friends as I don't meet many people in this world."  # text to be converted to audio
+    seed = 1  # -1 is a random seed
+
+    cfg = TTS_Config(config_path)
+    pipeline = TTS(cfg)
+    pipeline.init_t2s_weights(t2s_ckpt)
+    pipeline.init_vits_weights(vits_ckpt)
+
+    inputs = {
+        "text": text,
+        "text_lang": "en",
+        "ref_audio_path": ref_audio,
+        "prompt_text": prompt_text,
+        "prompt_lang": "en",
+        "top_k": 5,
+        "top_p": 1.0,
+        "temperature": 1.0,
+        "sample_steps": 10,
+        "seed": seed
+    }
+
+    while True:
+        input("Press Enter to generate audio")
+        gen = pipeline.run(inputs)
+        idx = 0
+        for sr, fragment in gen:
+            # find the next unused output filename before writing the fragment
+            out_path = os.path.join(output_dir, f"inference_{idx}.wav")
+            while os.path.exists(out_path):
+                idx += 1
+                out_path = os.path.join(output_dir, f"inference_{idx}.wav")
+            sf.write(out_path, fragment, sr)
+
+if __name__ == '__main__':
+    main()
diff --git a/zc_streaming_example.py b/zc_streaming_example.py
new file mode 100644
index 00000000..3b8cf014
--- /dev/null
+++ b/zc_streaming_example.py
@@ -0,0 +1,116 @@
+import os
+import sys
+import queue
+import threading
+import numpy as np
+import sounddevice as sd
+import time
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+sys.path.append(os.path.join(now_dir, 'GPT_SoVITS'))
+# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+
+from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
+
+def audio_playback_thread(audio_queue: queue.Queue, sample_rate: int):
+    """
+    A background thread that plays audio fragments as they become available
+    in the queue, using a continuous OutputStream for smooth playback.
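+    Fragments are assumed to arrive as int16 numpy arrays (as yielded by the
+    TTS pipeline) and are scaled to float32 in [-1, 1] before being written.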
+ """ + sd.default.samplerate = sample_rate + sd.default.channels = 1 + stream = sd.OutputStream(dtype='float32') + stream.start() + try: + while True: + audio_fragment = audio_queue.get() + try: + if audio_fragment is None: + # Sentinel received, end thread + break + # ensure float32 in [-1,1] + data = audio_fragment.astype(np.float32) / 32768.0 + stream.write(data) + finally: + audio_queue.task_done() + finally: + stream.stop() + stream.close() + print("Playback finished") + +def main(): + config_path = 'GPT_SoVITS/configs/tts_infer.yaml' # path to the config file + t2s_ckpt = 'GPT_SoVITS/pretrained_models/s1v3.ckpt' # path to the t2s checkpoint + vits_ckpt = 'GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth' # path to the vits checkpoint + ref_audio = 'local_files/test.wav' # path to the reference audio file + prompt_text = 'Flesh rots, carrion feeds the scavengers, and the bones remain. All part of the cycle of life.' # reference_audio transcription + text = "Today we are going to be testing TTS streaming audio. Is there anything that I can do to help you out? Or maybe you'd just like a quick snack... If not, that's okay too. I'm just here to chat and maybe become friends as I don't meet many people in this world." # text to be converted to audio + seed = 1 # -1 is random seed + + # Initialize the pipeline + cfg = TTS_Config(config_path) + pipeline = TTS(cfg) + pipeline.init_t2s_weights(t2s_ckpt) + pipeline.init_vits_weights(vits_ckpt) + + inputs = { + "text": text, + "text_lang": "en", + "ref_audio_path": ref_audio, + "prompt_text": prompt_text, + "prompt_lang": "en", + "top_k": 5, + "top_p": 1.0, + "temperature": 1.0, + "cumulation_amount":10, + "search_length": 32000*2, + "num_zeroes": 5, + "sample_steps": 8, + "dynamic_cumulatation": True, + "dynamic_cumulatation_amount": 20, + "seed" : seed + } + + while True: + input("enter to continue") + fragments = [] + # Initialize generator and fetch first fragment to get sample rate + gen = pipeline.run_generator(inputs) + start = time.time() + try: + sr, fragment = next(gen) + fragments.append(fragment) + except StopIteration: + print("No audio fragments generated.") + break + + # Create audio playback queue and start thread with sample rate + audio_queue = queue.Queue() + playback_thread = threading.Thread( + target=audio_playback_thread, args=(audio_queue, sr) + ) + playback_thread.start() + + for sr, fragment in gen: + if len(fragments) == 1: + audio_queue.put(fragments[0]) + end = time.time() + print(f"Time taken to put first fragment: {end - start}") + audio_queue.put(fragment) + fragments.append(fragment) + + # Signal playback thread to finish and wait + audio_queue.put(None) + audio_queue.join() + playback_thread.join() + + print("Audio playback complete") + +if __name__ == '__main__': + main()