流式API

2026-06-04 05:01:27 +08:00 · 2025-07-15 11:38:15 +08:00 · 2025-07-15 11:38:15 +08:00 · 0d405f2d2a
commit 0d405f2d2a
parent bed73febf3
11 changed files with 55 additions and 87 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,3 @@
-GPT_SoVITS/pretrained_models/*
-tools/asr/models/*
-tools/uvr5/uvr5_weights/*

 .git
 .DS_Store
@@ -11,10 +8,7 @@ runtime
 .idea
 output
 logs
-SoVITS_weights*/
-GPT_weights*/
 TEMP
-weight.json
 ffmpeg*
 ffprobe*
 cfg.json
--- a/Docker/install_wrapper.sh
+++ b/Docker/install_wrapper.sh
@@ -18,7 +18,7 @@ ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretr

 ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel

-bash install.sh --device "CU${CUDA_VERSION//./}" --source HF
+bash install.sh --device "MPS" --source HF

 pip cache purge

--- a/70
+++ b/70
@@ -1,62 +1,20 @@
-ARG CUDA_VERSION=12.6
-ARG TORCH_BASE=full
+FROM python:3.10.18-bullseye

-FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE}
-
-LABEL maintainer="XXXXRT"
-LABEL version="V4"
+LABEL version="V2pro"
 LABEL description="Docker image for GPT-SoVITS"

-ARG CUDA_VERSION=12.6
+WORKDIR /GPT-SoVITS
+COPY requirements.txt /GPT-SoVITS
+RUN pip install -r requirements.txt

-ENV CUDA_VERSION=${CUDA_VERSION}
+COPY GPT_SoVITS /GPT-SoVITS/GPT_SoVITS
+COPY tools /GPT-SoVITS/tools
+COPY api.py /GPT-SoVITS
+COPY api_v2.py /GPT-SoVITS
+COPY config.py /GPT-SoVITS
+COPY webui.py /GPT-SoVITS
+COPY ref_audio /GPT-SoVITS/ref_audio

-SHELL ["/bin/bash", "-c"]
+EXPOSE 9871 9872 9873 9874 9880 8001 8002

-WORKDIR /workspace/GPT-SoVITS
-
-COPY Docker /workspace/GPT-SoVITS/Docker/
-
-ARG LITE=false
-ENV LITE=${LITE}
-
-ARG WORKFLOW=false
-ENV WORKFLOW=${WORKFLOW}
-
-ARG TARGETPLATFORM
-ENV TARGETPLATFORM=${TARGETPLATFORM}
-
-RUN bash Docker/miniconda_install.sh
-
-COPY extra-req.txt /workspace/GPT-SoVITS/
-
-COPY requirements.txt /workspace/GPT-SoVITS/
-
-COPY install.sh /workspace/GPT-SoVITS/
-
-RUN bash Docker/install_wrapper.sh
-
-EXPOSE 9871 9872 9873 9874 9880
-
-ENV PYTHONPATH="/workspace/GPT-SoVITS"
-
-RUN conda init bash && echo "conda activate base" >> ~/.bashrc
-
-WORKDIR /workspace
-
-RUN rm -rf /workspace/GPT-SoVITS
-
-WORKDIR /workspace/GPT-SoVITS
-
-COPY . /workspace/GPT-SoVITS
-
-CMD ["/bin/bash", "-c", "\
-  rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
-  rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
-  rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
-  rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
-  ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
-  ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
-  ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
-  ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
-  exec bash"]
+CMD ["/bin/bash", "-c", "python GPT_SoVITS/inference_webui_api.py"]
--- a/GPT_SoVITS/configs/tts_infer.yaml
+++ b/GPT_SoVITS/configs/tts_infer.yaml
@@ -3,9 +3,9 @@ custom:
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
-  t2s_weights_path: GPT_weights_v2ProPlus/111-e15.ckpt
-  version: v2Pro
-  vits_weights_path: SoVITS_weights_v2ProPlus/111_e8_s136.pth
+  t2s_weights_path: GPT_SoVITS/pretrained_models/meiv2pp-e15.ckpt
+  version: v2
+  vits_weights_path: GPT_SoVITS/pretrained_models/meiv2pp_e8_s232.pth
 v1:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
--- a/GPT_SoVITS/inference_webui_api.py
+++ b/GPT_SoVITS/inference_webui_api.py
@@ -525,29 +525,30 @@ import tempfile
 import shutil
 import os
 from pydantic import BaseModel
+import soundfile as sf

 app = FastAPI()


 class InferenceRequest(BaseModel):
    text: str
-    text_lang: str = "中文"
+    text_lang: str = i18n("中文")
    ref_audio: str   # 这里是base64编码的音频文件内容
-    prompt_text: str = ""
-    prompt_lang: str = "中文"
-    top_k: int = 5
-    top_p: float = 1
-    temperature: float = 1
-    text_split_method: str = "按标点符号切"
+    prompt_text: str
+    prompt_lang: str = i18n("中文")
+    top_k: int = 6
+    top_p: float = 0.9
+    temperature: float = 0.95
+    text_split_method: str = i18n("按标点符号切")
    batch_size: int = 20
    speed_factor: float = 1.1
-    ref_text_free: bool = True
+    ref_text_free: bool = False
    split_bucket: bool = True
    fragment_interval: float = 0.3
    seed: int = -1
    keep_random: bool = True
    parallel_infer: bool = True
-    repetition_penalty: float = 1.35
+    repetition_penalty: float = 1.45
    sample_steps: int = 32
    super_sampling: bool = False

@@ -632,24 +633,21 @@ def wav_chunk_streamer(infer_gen):
            wav_file.writeframes(audio.tobytes())
        return buffer.getvalue()

-    for wav_data, _ in infer_gen:
-        sr, audio = wav_data
-        if not isinstance(audio, np.ndarray):
-            audio = np.array(audio)
-        if audio.dtype != np.int16:
-            audio = (audio * 32768).astype(np.int16)
-        yield encode_wav_chunk(sr, audio)  # 每段 WAV 数据
+    for audio, _ in infer_gen:
+        audio_data = audio[0] if isinstance(audio[0], np.ndarray) else audio[1]
+        yield encode_wav_chunk(32000, audio_data)  # 每段 WAV 数据
+

 @app.post("/tts_stream")
 async def api_inference(req: InferenceRequest):
    try:
        infer_gen = inference(
            text=req.text,
-            text_lang=req.text_lang,
+            text_lang=i18n(req.text_lang),
            ref_audio_path=req.ref_audio,
            aux_ref_audio_paths=[],
            prompt_text=req.prompt_text,
-            prompt_lang=req.prompt_lang,
+            prompt_lang=i18n(req.prompt_lang),
            top_k=req.top_k,
            top_p=req.top_p,
            temperature=req.temperature,
@@ -683,4 +681,5 @@ async def api_inference(req: InferenceRequest):

 if __name__ == "__main__":
    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8001)
+    port = int(os.environ.get("PORT", 8001))  # 默认端口8001
+    uvicorn.run(app, host="0.0.0.0", port=port)
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@@ -31,6 +31,11 @@ import torch

 import logging
 import time
+import numpy
+
+# 在文件开头添加输出目录配置
+output_dir = os.environ.get("output_dir", "outputs")
+os.makedirs(output_dir, exist_ok=True)

 now_dir = os.getcwd()
 sys.path.append(now_dir)
@@ -206,8 +211,19 @@ def inference(

        start_time = time.time()

-        for item in tts_pipeline.run(inputs):
-            yield item, actual_seed
+        for audio in tts_pipeline.run(inputs):
+            if isinstance(audio, tuple):
+                # 保存到本地
+                output_filename = f"tts_{int(time.time())}.wav"
+                output_path = os.path.join(output_dir, output_filename)
+                audio_data = audio[0] if isinstance(audio[0], numpy.ndarray) else audio[1]
+                import soundfile as sf
+                sf.write(output_path, audio_data, 32000)
+                logging.info(f"音频已保存至: {output_path}")
+                # 返回原始音频数据给 Gradio
+                yield audio, actual_seed
+            else:
+                yield audio, actual_seed

        logging.info(
            f"TTS请求耗时: {time.time() - start_time:.3f}s | 文本: {text}"
--- a/GPT_SoVITS/text/ja_userdic/user.dict
+++ b/GPT_SoVITS/text/ja_userdic/user.dict
--- a/GPT_SoVITS/text/ja_userdic/userdict.md5
+++ b/GPT_SoVITS/text/ja_userdic/userdict.md5
@@ -0,0 +1 @@
+d36bd5ffba62f195d22bf4f1a41cd08f
--- a/ref_audio/1.wav
+++ b/ref_audio/1.wav
--- a/ref_audio/2.wav
+++ b/ref_audio/2.wav
--- a/ref_audio/3.wav
+++ b/ref_audio/3.wav