diff --git a/.dockerignore b/.dockerignore index bf36b884..acdf1cec 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,3 @@ -GPT_SoVITS/pretrained_models/* -tools/asr/models/* -tools/uvr5/uvr5_weights/* .git .DS_Store @@ -11,10 +8,7 @@ runtime .idea output logs -SoVITS_weights*/ -GPT_weights*/ TEMP -weight.json ffmpeg* ffprobe* cfg.json diff --git a/Docker/install_wrapper.sh b/Docker/install_wrapper.sh index 6dd93e5a..a28d3839 100644 --- a/Docker/install_wrapper.sh +++ b/Docker/install_wrapper.sh @@ -18,7 +18,7 @@ ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretr ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel -bash install.sh --device "CU${CUDA_VERSION//./}" --source HF +bash install.sh --device "MPS" --source HF pip cache purge diff --git a/Dockerfile b/Dockerfile index 71bf6fa1..3f17f8ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,62 +1,20 @@ -ARG CUDA_VERSION=12.6 -ARG TORCH_BASE=full +FROM python:3.10.18-bullseye -FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE} - -LABEL maintainer="XXXXRT" -LABEL version="V4" +LABEL version="V2pro" LABEL description="Docker image for GPT-SoVITS" -ARG CUDA_VERSION=12.6 +WORKDIR /GPT-SoVITS +COPY requirements.txt /GPT-SoVITS +RUN pip install -r requirements.txt -ENV CUDA_VERSION=${CUDA_VERSION} +COPY GPT_SoVITS /GPT-SoVITS/GPT_SoVITS +COPY tools /GPT-SoVITS/tools +COPY api.py /GPT-SoVITS +COPY api_v2.py /GPT-SoVITS +COPY config.py /GPT-SoVITS +COPY webui.py /GPT-SoVITS +COPY ref_audio /GPT-SoVITS/ref_audio -SHELL ["/bin/bash", "-c"] +EXPOSE 9871 9872 9873 9874 9880 8001 8002 -WORKDIR /workspace/GPT-SoVITS - -COPY Docker /workspace/GPT-SoVITS/Docker/ - -ARG LITE=false -ENV LITE=${LITE} - -ARG WORKFLOW=false -ENV WORKFLOW=${WORKFLOW} - -ARG TARGETPLATFORM -ENV TARGETPLATFORM=${TARGETPLATFORM} - -RUN bash Docker/miniconda_install.sh - -COPY extra-req.txt /workspace/GPT-SoVITS/ - -COPY requirements.txt /workspace/GPT-SoVITS/ - -COPY install.sh /workspace/GPT-SoVITS/ - -RUN bash Docker/install_wrapper.sh - -EXPOSE 9871 9872 9873 9874 9880 - -ENV PYTHONPATH="/workspace/GPT-SoVITS" - -RUN conda init bash && echo "conda activate base" >> ~/.bashrc - -WORKDIR /workspace - -RUN rm -rf /workspace/GPT-SoVITS - -WORKDIR /workspace/GPT-SoVITS - -COPY . /workspace/GPT-SoVITS - -CMD ["/bin/bash", "-c", "\ - rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ - rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ - rm -rf /workspace/GPT-SoVITS/tools/asr/models && \ - rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ - ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ - ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ - ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \ - ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ - exec bash"] \ No newline at end of file +CMD ["/bin/bash", "-c", "python GPT_SoVITS/inference_webui_api.py"] \ No newline at end of file diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml index 1ae466c9..35f3b71e 100644 --- a/GPT_SoVITS/configs/tts_infer.yaml +++ b/GPT_SoVITS/configs/tts_infer.yaml @@ -3,9 +3,9 @@ custom: cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_weights_v2ProPlus/111-e15.ckpt - version: v2Pro - vits_weights_path: SoVITS_weights_v2ProPlus/111_e8_s136.pth + t2s_weights_path: GPT_SoVITS/pretrained_models/meiv2pp-e15.ckpt + version: v2 + vits_weights_path: GPT_SoVITS/pretrained_models/meiv2pp_e8_s232.pth v1: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base diff --git a/GPT_SoVITS/inference_webui_api.py b/GPT_SoVITS/inference_webui_api.py index 5dbab69d..758a4ce1 100644 --- a/GPT_SoVITS/inference_webui_api.py +++ b/GPT_SoVITS/inference_webui_api.py @@ -525,29 +525,30 @@ import tempfile import shutil import os from pydantic import BaseModel +import soundfile as sf app = FastAPI() class InferenceRequest(BaseModel): text: str - text_lang: str = "中文" + text_lang: str = i18n("中文") ref_audio: str # 这里是base64编码的音频文件内容 - prompt_text: str = "" - prompt_lang: str = "中文" - top_k: int = 5 - top_p: float = 1 - temperature: float = 1 - text_split_method: str = "按标点符号切" + prompt_text: str + prompt_lang: str = i18n("中文") + top_k: int = 6 + top_p: float = 0.9 + temperature: float = 0.95 + text_split_method: str = i18n("按标点符号切") batch_size: int = 20 speed_factor: float = 1.1 - ref_text_free: bool = True + ref_text_free: bool = False split_bucket: bool = True fragment_interval: float = 0.3 seed: int = -1 keep_random: bool = True parallel_infer: bool = True - repetition_penalty: float = 1.35 + repetition_penalty: float = 1.45 sample_steps: int = 32 super_sampling: bool = False @@ -632,24 +633,21 @@ def wav_chunk_streamer(infer_gen): wav_file.writeframes(audio.tobytes()) return buffer.getvalue() - for wav_data, _ in infer_gen: - sr, audio = wav_data - if not isinstance(audio, np.ndarray): - audio = np.array(audio) - if audio.dtype != np.int16: - audio = (audio * 32768).astype(np.int16) - yield encode_wav_chunk(sr, audio) # 每段 WAV 数据 + for audio, _ in infer_gen: + audio_data = audio[0] if isinstance(audio[0], np.ndarray) else audio[1] + yield encode_wav_chunk(32000, audio_data) # 每段 WAV 数据 + @app.post("/tts_stream") async def api_inference(req: InferenceRequest): try: infer_gen = inference( text=req.text, - text_lang=req.text_lang, + text_lang=i18n(req.text_lang), ref_audio_path=req.ref_audio, aux_ref_audio_paths=[], prompt_text=req.prompt_text, - prompt_lang=req.prompt_lang, + prompt_lang=i18n(req.prompt_lang), top_k=req.top_k, top_p=req.top_p, temperature=req.temperature, @@ -683,4 +681,5 @@ async def api_inference(req: InferenceRequest): if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8001) \ No newline at end of file + port = int(os.environ.get("PORT", 8001)) # 默认端口8001 + uvicorn.run(app, host="0.0.0.0", port=port) \ No newline at end of file diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index a473b915..6687a235 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -31,6 +31,11 @@ import torch import logging import time +import numpy + +# 在文件开头添加输出目录配置 +output_dir = os.environ.get("output_dir", "outputs") +os.makedirs(output_dir, exist_ok=True) now_dir = os.getcwd() sys.path.append(now_dir) @@ -206,8 +211,19 @@ def inference( start_time = time.time() - for item in tts_pipeline.run(inputs): - yield item, actual_seed + for audio in tts_pipeline.run(inputs): + if isinstance(audio, tuple): + # 保存到本地 + output_filename = f"tts_{int(time.time())}.wav" + output_path = os.path.join(output_dir, output_filename) + audio_data = audio[0] if isinstance(audio[0], numpy.ndarray) else audio[1] + import soundfile as sf + sf.write(output_path, audio_data, 32000) + logging.info(f"音频已保存至: {output_path}") + # 返回原始音频数据给 Gradio + yield audio, actual_seed + else: + yield audio, actual_seed logging.info( f"TTS请求耗时: {time.time() - start_time:.3f}s | 文本: {text}" diff --git a/GPT_SoVITS/text/ja_userdic/user.dict b/GPT_SoVITS/text/ja_userdic/user.dict new file mode 100644 index 00000000..6ddcfef6 Binary files /dev/null and b/GPT_SoVITS/text/ja_userdic/user.dict differ diff --git a/GPT_SoVITS/text/ja_userdic/userdict.md5 b/GPT_SoVITS/text/ja_userdic/userdict.md5 new file mode 100644 index 00000000..7848c979 --- /dev/null +++ b/GPT_SoVITS/text/ja_userdic/userdict.md5 @@ -0,0 +1 @@ +d36bd5ffba62f195d22bf4f1a41cd08f \ No newline at end of file diff --git a/ref_audio/1.wav b/ref_audio/1.wav new file mode 100644 index 00000000..66894ac5 Binary files /dev/null and b/ref_audio/1.wav differ diff --git a/ref_audio/2.wav b/ref_audio/2.wav new file mode 100644 index 00000000..50ef77e9 Binary files /dev/null and b/ref_audio/2.wav differ diff --git a/ref_audio/3.wav b/ref_audio/3.wav new file mode 100644 index 00000000..04f68f01 Binary files /dev/null and b/ref_audio/3.wav differ