From 61453b59b21bee4c30501ce3ea4772b781b238d1 Mon Sep 17 00:00:00 2001 From: chasonjiang <1440499136@qq.com> Date: Sat, 9 Mar 2024 02:05:03 +0800 Subject: [PATCH] =?UTF-8?q?=09=E6=B7=BB=E5=8A=A0=E9=9F=B3=E9=A2=91?= =?UTF-8?q?=E5=80=8D=E9=80=9F=E6=94=AF=E6=8C=81:=20=20=20GPT=5FSoVITS/TTS?= =?UTF-8?q?=5Finfer=5Fpack/TTS.py=20=09=E6=B7=BB=E5=8A=A0=E9=9F=B3?= =?UTF-8?q?=E9=A2=91=E5=80=8D=E9=80=9F=E6=94=AF=E6=8C=81:=20=20=20GPT=5FSo?= =?UTF-8?q?VITS/inference=5Fwebui.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPT_SoVITS/TTS_infer_pack/TTS.py | 40 ++++++++++++++++++++++++++++---- GPT_SoVITS/inference_webui.py | 6 +++-- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 09f3175..70d0cc9 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -1,5 +1,6 @@ import os, sys +import ffmpeg now_dir = os.getcwd() sys.path.append(now_dir) import os @@ -405,7 +406,8 @@ class TTS: "temperature": 0.6, "text_split_method": "", "batch_size": 1, - "batch_threshold": 0.75 + "batch_threshold": 0.75, + "speed_factor":1.0, } returns: tulpe[int, np.ndarray]: sampling rate and audio data. @@ -421,6 +423,7 @@ class TTS: text_split_method:str = inputs.get("text_split_method", "") batch_size = inputs.get("batch_size", 1) batch_threshold = inputs.get("batch_threshold", 0.75) + speed_factor = inputs.get("speed_factor", 1.0) no_prompt_text = False if prompt_text in [None, ""]: @@ -548,7 +551,34 @@ class TTS: audio = self.recovery_order(audio, batch_index_list) print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t_34, t_45)) - yield self.configs.sampling_rate, (np.concatenate(audio, 0) * 32768).astype( - np.int16 - ) - \ No newline at end of file + + audio = np.concatenate(audio, 0) + audio = (audio * 32768).astype(np.int16) + if speed_factor != 1.0: + audio = speed_change(audio, speed=speed_factor, sr=int(self.configs.sampling_rate)) + + yield self.configs.sampling_rate, audio + + + + +def speed_change(input_audio:np.ndarray, speed:float, sr:int): + # 将 NumPy 数组转换为原始 PCM 流 + raw_audio = input_audio.astype(np.int16).tobytes() + + # 设置 ffmpeg 输入流 + input_stream = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ar=str(sr), ac=1) + + # 变速处理 + output_stream = input_stream.filter('atempo', speed) + + # 输出流到管道 + out, _ = ( + output_stream.output('pipe:', format='s16le', acodec='pcm_s16le') + .run(input=raw_audio, capture_stdout=True, capture_stderr=True) + ) + + # 将管道输出解码为 NumPy 数组 + processed_audio = np.frombuffer(out, np.int16) + + return processed_audio \ No newline at end of file diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 68a2136..f0336bb 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -68,7 +68,7 @@ tts_pipline = TTS(tts_config) gpt_path = tts_config.t2s_weights_path sovits_path = tts_config.vits_weights_path -def inference(text, text_lang, ref_audio_path, prompt_text, prompt_lang, top_k, top_p, temperature, text_split_method, batch_size): +def inference(text, text_lang, ref_audio_path, prompt_text, prompt_lang, top_k, top_p, temperature, text_split_method, batch_size, speed_factor): inputs={ "text": text, "text_lang": dict_language[text_lang], @@ -80,6 +80,7 @@ def inference(text, text_lang, ref_audio_path, prompt_text, prompt_lang, top_k, "temperature": temperature, "text_split_method": cut_method[text_split_method], "batch_size":int(batch_size), + "speed_factor":float(speed_factor) } yield next(tts_pipline.run(inputs)) @@ -154,6 +155,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: with gr.Row(): gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低):")) batch_size = gr.Slider(minimum=1,maximum=20,step=1,label=i18n("batch_size"),value=1,interactive=True) + speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="speed_factor",value=1.0,interactive=True) top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) @@ -165,7 +167,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: inference_button.click( inference, - [text,text_language, inp_ref, prompt_text, prompt_language, top_k, top_p, temperature, how_to_cut, batch_size], + [text,text_language, inp_ref, prompt_text, prompt_language, top_k, top_p, temperature, how_to_cut, batch_size, speed_factor], [output], )