add np_utils.py and api.md

2025-10-07 15:19:59 +08:00 · 2024-05-21 05:23:09 -10:00 · 2024-05-21 05:23:09 -10:00 · d46a29b35b
commit d46a29b35b
parent 6c172903ca
7 changed files with 147 additions and 162 deletions
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -17,7 +17,6 @@ logging.getLogger("httpx").setLevel(logging.ERROR)
 logging.getLogger("asyncio").setLevel(logging.ERROR)
 logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
 logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
-import pdb
 import torch

 if os.path.exists("./gweight.txt"):
@ -66,7 +65,7 @@ from text import cleaned_text_to_sequence
 from text.cleaner import clean_text
 from time import time as ttime
 from module.mel_processing import spectrogram_torch
-from my_utils import load_audio
+from pyutils.np_utils import load_audio
 from tools.i18n.i18n import I18nAuto

 i18n = I18nAuto()
--- a/GPT_SoVITS/module/data_utils.py
+++ b/GPT_SoVITS/module/data_utils.py
@ -1,23 +1,14 @@
-import time
-import logging
 import os
 import random
 import traceback
-import numpy as np
 import torch
 import torch.utils.data
 from tqdm import tqdm

-from module import commons
 from module.mel_processing import spectrogram_torch
 from text import cleaned_text_to_sequence
-from utils import load_wav_to_torch, load_filepaths_and_text
 import torch.nn.functional as F
-from functools import lru_cache
-import requests
-from scipy.io import wavfile
-from io import BytesIO
-from my_utils import load_audio
+from pyutils.np_utils import load_audio

 # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
 class TextAudioSpeakerLoader(torch.utils.data.Dataset):
--- a/GPT_SoVITS/onnx_export.py
+++ b/GPT_SoVITS/onnx_export.py
@ -9,7 +9,6 @@ cnhubert.cnhubert_base_path=cnhubert_base_path
 ssl_model = cnhubert.get_model()
 from text import cleaned_text_to_sequence
 import soundfile
-from my_utils import load_audio
 import os
 import json

--- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
@ -12,12 +12,12 @@ opt_dir=                            os.environ.get("opt_dir")
 cnhubert.cnhubert_base_path=                os.environ.get("cnhubert_base_dir")
 is_half=eval(os.environ.get("is_half","True"))

-import pdb,traceback,numpy as np,logging
+import traceback,numpy as np
 from scipy.io import wavfile
 import librosa,torch
 now_dir = os.getcwd()
 sys.path.append(now_dir)
-from my_utils import load_audio
+from pyutils.np_utils import load_audio

 # from config import cnhubert_base_path
 # cnhubert.cnhubert_base_path=cnhubert_base_path
--- a/GPT_SoVITS/pyutils/np_utils.py
+++ b/GPT_SoVITS/pyutils/np_utils.py
@ -1,21 +1,21 @@
-import ffmpeg
-import numpy as np
-
-
-def load_audio(file, sr):
-    try:
-        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
-        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
-        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-        file = (
-            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
-        )  # 防止小白拷路径头尾带了空格和"和回车
-        out, _ = (
-            ffmpeg.input(file, threads=0)
-            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
-            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
-        )
-    except Exception as e:
-        raise RuntimeError(f"Failed to load audio: {e}")
-
-    return np.frombuffer(out, np.float32).flatten()
+import ffmpeg
+import numpy as np
+
+
+def load_audio(file, sr):
+    try:
+        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        file = (
+            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+        )  # 防止小白拷路径头尾带了空格和"和回车
+        out, _ = (
+            ffmpeg.input(file, threads=0)
+            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+    except Exception as e:
+        raise RuntimeError(f"Failed to load audio: {e}")
+
+    return np.frombuffer(out, np.float32).flatten()
--- a/api.md
+++ b/api.md
@ -0,0 +1,115 @@
+# api
+
+` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
+
+## 执行参数:
+
+`-s` - `SoVITS模型路径, 可在 config.py 中指定`  
+`-g` - `GPT模型路径, 可在 config.py 中指定`  
+
+调用请求缺少参考音频时使用:  
+`-dr` - `默认参考音频路径`  
+`-dt` - `默认参考音频文本`  
+`-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"`  
+
+`-d` - `推理设备, "cuda","cpu"`  
+`-a` - `绑定地址, 默认"127.0.0.1"`  
+`-p` - `绑定端口, 默认9880, 可在 config.py 中指定`  
+`-fp` - `覆盖 config.py 使用全精度`  
+`-hp` - `覆盖 config.py 使用半精度`  
+`-sm` - `流式返回模式, 默认不启用, "close","c", "normal","n", "keepalive","k"`  
+`-mt` - `返回的音频编码格式, 流式默认ogg, 非流式默认wav, "wav", "ogg", "aac"`  
+`-cp` - `文本切分符号设定, 默认为空, 以",.，。"字符串的方式传入`  
+
+`-hb` - `cnhubert路径`  
+`-b` - `bert路径`  
+
+## 调用:
+
+### 推理
+
+endpoint: `/`
+
+使用执行参数指定的参考音频:
+- GET:
+    `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。&text_language=zh`
+
+- POST:
+```json
+{
+    "text": "先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。",
+    "text_language": "zh"
+}
+```
+
+使用执行参数指定的参考音频并设定分割符号:
+- GET:
+    `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。&text_language=zh&cut_punc=，。`
+- POST:
+```json
+{
+    "text": "先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。",
+    "text_language": "zh",
+    "cut_punc": "，。"
+}
+```
+
+手动指定当次推理所使用的参考音频:
+- GET:
+    `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。&text_language=zh`
+- POST:
+```json
+{
+    "refer_wav_path": "123.wav",
+    "prompt_text": "一二三。",
+    "prompt_language": "zh",
+    "text": "先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。",
+    "text_language": "zh"
+}
+```
+
+RESP:
+- 成功: 直接返回 wav 音频流， http code 200
+- 失败: 返回包含错误信息的 json, http code 400
+
+
+### 更换默认参考音频
+
+endpoint: `/change_refer`
+
+key与推理端一样
+
+- GET:
+    `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
+- POST:
+```json
+{
+    "refer_wav_path": "123.wav",
+    "prompt_text": "一二三。",
+    "prompt_language": "zh"
+}
+```
+
+RESP:
+- 成功: json, http code 200
+- 失败: json, http code 400
+
+
+### 命令控制
+
+endpoint: `/control`
+
+command:
+- "restart": 重新运行
+- "exit": 结束运行
+
+- GET:
+    `http://127.0.0.1:9880/control?command=restart`
+- POST:
+```json
+{
+    "command": "restart"
+}
+```
+
+RESP: 无
--- a/api.py
+++ b/api.py
@ -1,129 +1,10 @@
-"""
-# api.py usage
-
-` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
-
-## 执行参数:
-
-`-s` - `SoVITS模型路径, 可在 config.py 中指定`
-`-g` - `GPT模型路径, 可在 config.py 中指定`
-
-调用请求缺少参考音频时使用
-`-dr` - `默认参考音频路径`
-`-dt` - `默认参考音频文本`
-`-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"`
-
-`-d` - `推理设备, "cuda","cpu"`
-`-a` - `绑定地址, 默认"127.0.0.1"`
-`-p` - `绑定端口, 默认9880, 可在 config.py 中指定`
-`-fp` - `覆盖 config.py 使用全精度`
-`-hp` - `覆盖 config.py 使用半精度`
-`-sm` - `流式返回模式, 默认不启用, "close","c", "normal","n", "keepalive","k"`
-·-mt` - `返回的音频编码格式, 流式默认ogg, 非流式默认wav, "wav", "ogg", "aac"`
-·-cp` - `文本切分符号设定, 默认为空, 以",.，。"字符串的方式传入`
-
-`-hb` - `cnhubert路径`
-`-b` - `bert路径`
-
-## 调用:
-
-### 推理
-
-endpoint: `/`
-
-使用执行参数指定的参考音频:
-GET:
-    `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。&text_language=zh`
-POST:
-```json
-{
-    "text": "先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。",
-    "text_language": "zh"
-}
-```
-
-使用执行参数指定的参考音频并设定分割符号:
-GET:
-    `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。&text_language=zh&cut_punc=，。`
-POST:
-```json
-{
-    "text": "先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。",
-    "text_language": "zh",
-    "cut_punc": "，。",
-}
-```
-
-手动指定当次推理所使用的参考音频:
-GET:
-    `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。&text_language=zh`
-POST:
-```json
-{
-    "refer_wav_path": "123.wav",
-    "prompt_text": "一二三。",
-    "prompt_language": "zh",
-    "text": "先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。",
-    "text_language": "zh"
-}
-```
-
-RESP:
-成功: 直接返回 wav 音频流， http code 200
-失败: 返回包含错误信息的 json, http code 400
-
-
-### 更换默认参考音频
-
-endpoint: `/change_refer`
-
-key与推理端一样
-
-GET:
-    `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
-POST:
-```json
-{
-    "refer_wav_path": "123.wav",
-    "prompt_text": "一二三。",
-    "prompt_language": "zh"
-}
-```
-
-RESP:
-成功: json, http code 200
-失败: json, 400
-
-
-### 命令控制
-
-endpoint: `/control`
-
-command:
-"restart": 重新运行
-"exit": 结束运行
-
-GET:
-    `http://127.0.0.1:9880/control?command=restart`
-POST:
-```json
-{
-    "command": "restart"
-}
-```
-
-RESP: 无
-
-"""
-
-
 import argparse
 import os,re
 import sys

-now_dir = os.getcwd()
-sys.path.append(now_dir)
-sys.path.append("%s/GPT_SoVITS" % (now_dir))
+current_project_dir = os.getcwd()
+sys.path.append(current_project_dir)
+sys.path.append("%s/GPT_SoVITS" % (current_project_dir))

 import signal
 import LangSegment
@ -131,7 +12,7 @@ from time import time as ttime
 import torch
 import librosa
 import soundfile as sf
-from fastapi import FastAPI, Request, HTTPException
+from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse, JSONResponse
 import uvicorn
 from transformers import AutoModelForMaskedLM, AutoTokenizer
@ -143,7 +24,7 @@ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 from text import cleaned_text_to_sequence
 from text.cleaner import clean_text
 from module.mel_processing import spectrogram_torch
-from my_utils import load_audio
+from pyutils.np_utils import load_audio
 import config as global_config
 import logging
 import subprocess
@ -159,7 +40,7 @@ class DefaultRefer:
        return is_full(self.path, self.text, self.language)


-def is_empty(*items):  # 任意一项不为空返回False
+def is_not_empty(*items):  # 任意一项不为空返回False
    for item in items:
        if item is not None and item != "":
            return False
@ -496,7 +377,7 @@ def handle_control(command):


 def handle_change(path, text, language):
-    if is_empty(path, text, language):
+    if is_not_empty(path, text, language):
        return JSONResponse({"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400)

    if path != "" or path is not None: