From d2142eef85b553bcfb7c325b4f4ae8c3251ea534 Mon Sep 17 00:00:00 2001 From: samiabat Date: Thu, 22 May 2025 15:33:46 +0300 Subject: [PATCH] add version 4 kurari --- GPT_SoVITS/inference_cli.py | 15 ++++-------- api.py | 47 +++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py index bd987aaf..6a57ca2a 100644 --- a/GPT_SoVITS/inference_cli.py +++ b/GPT_SoVITS/inference_cli.py @@ -7,15 +7,8 @@ from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights i18n = I18nAuto() -def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path): - # Read reference text - with open(ref_text_path, 'r', encoding='utf-8') as file: - ref_text = file.read() - - # Read target text - with open(target_text_path, 'r', encoding='utf-8') as file: - target_text = file.read() - +def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text, ref_language, target_text, text_language, output_path): + # Change model weights change_gpt_weights(gpt_path=GPT_model_path) change_sovits_weights(sovits_path=SoVITS_model_path) @@ -25,10 +18,12 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, prompt_text=ref_text, prompt_language=i18n(ref_language), text=target_text, - text_language=i18n(target_language), top_p=1, temperature=1) + text_language=i18n(text_language), top_p=1, temperature=1) result_list = list(synthesis_result) + return result_list + if result_list: last_sampling_rate, last_audio_data = result_list[-1] output_wav_path = os.path.join(output_path, "output.wav") diff --git a/api.py b/api.py index 7721a663..045c1825 100644 --- a/api.py +++ b/api.py @@ -1087,6 +1087,53 @@ async def tts_endpoint(request: Request): json_post_raw.get("if_sr", False) ) +from GPT_SoVITS.inference_cli import synthesize +import soundfile as sf +import io +from 
fastapi.responses import StreamingResponse
+
+@app.get("/version-4")
+async def version_4(
+    GPT_model_path = "GPT_SoVITS/pretrained_models/kurari-e40.ckpt",
+    SoVITS_model_path = "GPT_SoVITS/pretrained_models/kurari_e20_s1800_l32.pth",
+    ref_text: str = "おはよう〜。今日はどんな1日過ごすー?くらりはね〜いつでもあなたの味方だよ",
+    ref_language: str = "ja",
+    target_text: str = None,
+    text_language: str = "ja",
+    output_path: str = None
+):
+    """Synthesize ``target_text`` with the kurari reference clip and return it as a WAV download.
+
+    All arguments are forwarded to ``synthesize``; the reference audio path is fixed.
+    Responds with HTTP 400 when ``target_text`` is empty or no audio is produced.
+    """
+    # NOTE(review): the model paths are caller-controllable here -- confirm that is intended.
+    if not target_text:
+        return JSONResponse({"error": "target_text is required"}, status_code=400)
+
+    result_list = list(synthesize(
+        GPT_model_path=GPT_model_path,
+        SoVITS_model_path=SoVITS_model_path,
+        ref_audio_path="idols/kurari/kurari.wav",
+        ref_text=ref_text,
+        ref_language=ref_language,
+        target_text=target_text,
+        text_language=text_language,  # keyword must match synthesize()'s parameter name
+        output_path=output_path,
+    ))
+    if not result_list:
+        return JSONResponse({"error": "Failed to generate audio"}, status_code=400)
+    # Only the final (sampling_rate, audio) pair is sent back to the client.
+    last_sampling_rate, last_audio_data = result_list[-1]
+    audio_buffer = io.BytesIO()
+    # soundfile cannot infer the container from a BytesIO, so the format is named explicitly.
+    sf.write(audio_buffer, last_audio_data, last_sampling_rate, format="WAV")
+    audio_buffer.seek(0)
+    return StreamingResponse(
+        audio_buffer,
+        media_type="audio/wav",
+        headers={"Content-Disposition": "attachment; filename=output.wav"},
+    )
 
 @app.get("/")
 async def tts_endpoint(