turn it into a package

Jarod Mica · 2024-11-16 02:30:20 -08:00
commit 71b2fe69ea (parent a70e1ad30c)
84 changed files with 643 additions and 563 deletions

GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -1,60 +1,66 @@
 from copy import deepcopy
 import math
 import os, sys, gc
-import random
-import traceback
-from tqdm import tqdm
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 import ffmpeg
-import os
-from typing import Generator, List, Tuple, Union
+import librosa
 import numpy as np
+import random
 import torch
 import torch.nn.functional as F
+import traceback
 import yaml
-from transformers import AutoModelForMaskedLM, AutoTokenizer
+from huggingface_hub import snapshot_download, hf_hub_download
+from importlib.resources import files
+from time import time as ttime
+from typing import Generator, List, Tuple, Union
+from tqdm import tqdm
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 from feature_extractor.cnhubert import CNHubert
 from module.models import SynthesizerTrn
-import librosa
-from time import time as ttime
+from module.mel_processing import spectrogram_torch
 from tools.i18n.i18n import I18nAuto, scan_language_list
 from tools.my_utils import load_audio
-from module.mel_processing import spectrogram_torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
 from TTS_infer_pack.text_segmentation_method import splits
 from TTS_infer_pack.TextPreprocessor import TextPreprocessor
 
 language=os.environ.get("language","Auto")
 language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
 i18n = I18nAuto(language=language)
 
+LIBRARY_NAME = "GPT_SoVITS"
+
 # configs/tts_infer.yaml
 """
 custom:
-  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
-  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+  bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
+  cnhuhbert_base_path: pretrained_models/chinese-hubert-base
   device: cpu
   is_half: false
-  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
-  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+  t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
+  vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
   version: v2
 default:
-  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
-  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+  bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
+  cnhuhbert_base_path: pretrained_models/chinese-hubert-base
   device: cpu
   is_half: false
-  t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
-  vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
+  t2s_weights_path: pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+  vits_weights_path: pretrained_models/s2G488k.pth
   version: v1
 default_v2:
-  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
-  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+  bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
+  cnhuhbert_base_path: pretrained_models/chinese-hubert-base
   device: cpu
   is_half: false
-  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
-  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+  t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
+  vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
   version: v2
 """
@@ -86,19 +92,19 @@ class TTS_Config:
         "device": "cpu",
         "is_half": False,
         "version": "v1",
-        "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
-        "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth",
-        "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
-        "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
+        "t2s_weights_path": "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
+        "vits_weights_path": "pretrained_models/s2G488k.pth",
+        "cnhuhbert_base_path": "pretrained_models/chinese-hubert-base",
+        "bert_base_path": "pretrained_models/chinese-roberta-wwm-ext-large",
     },
     "default_v2":{
         "device": "cpu",
         "is_half": False,
         "version": "v2",
-        "t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
-        "vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
-        "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
-        "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
+        "t2s_weights_path": "pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
+        "vits_weights_path": "pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
+        "cnhuhbert_base_path": "pretrained_models/chinese-hubert-base",
+        "bert_base_path": "pretrained_models/chinese-roberta-wwm-ext-large",
     },
 }
 configs:dict = None
@@ -120,7 +126,7 @@ class TTS_Config:
     def __init__(self, configs: Union[dict, str]=None):
         # set the default config file path
-        configs_base_path:str = "GPT_SoVITS/configs/"
+        configs_base_path:str = "configs/"
         os.makedirs(configs_base_path, exist_ok=True)
         self.configs_path:str = os.path.join(configs_base_path, "tts_infer.yaml")
@@ -152,23 +158,24 @@ class TTS_Config:
         self.bert_base_path = self.configs.get("bert_base_path", None)
         self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None)
         self.languages = self.v2_languages if self.version=="v2" else self.v1_languages
 
         if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)):
-            self.t2s_weights_path = self.default_configs[default_config_key]['t2s_weights_path']
+            self.t2s_weights_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['t2s_weights_path']))
             print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}")
         if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)):
-            self.vits_weights_path = self.default_configs[default_config_key]['vits_weights_path']
+            self.vits_weights_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['vits_weights_path']))
             print(f"fall back to default vits_weights_path: {self.vits_weights_path}")
         if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)):
-            self.bert_base_path = self.default_configs[default_config_key]['bert_base_path']
+            self.bert_base_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['bert_base_path']))
             print(f"fall back to default bert_base_path: {self.bert_base_path}")
         if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)):
-            self.cnhuhbert_base_path = self.default_configs[default_config_key]['cnhuhbert_base_path']
+            self.cnhuhbert_base_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['cnhuhbert_base_path']))
             print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}")
+        repo_name="lj1995/GPT-SoVITS"
+        snapshot_download(repo_id=repo_name, local_dir=os.path.dirname(self.bert_base_path))
 
         self.update_configs()
 
         self.max_sec = None
         self.hz:int = 50
         self.semantic_frame_rate:str = "25hz"
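The fallback block above is the heart of the packaging change: default weight paths are now resolved inside the installed GPT_SoVITS package via importlib.resources, and snapshot_download then mirrors the pretrained models from the Hugging Face Hub into that location. A minimal sketch of the same mechanism (the subpath is illustrative, not a guaranteed file):

    from importlib.resources import files
    from huggingface_hub import snapshot_download

    # Resolve a directory relative to the installed package (illustrative subpath)
    target_dir = str(files("GPT_SoVITS").joinpath("pretrained_models"))
    # Mirror the model repo into that directory; files already present are reused
    snapshot_download(repo_id="lj1995/GPT-SoVITS", local_dir=target_dir)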

GPT_SoVITS/api_v2.py

@@ -114,6 +114,8 @@ from fastapi import FastAPI, Request, HTTPException, Response
 from fastapi.responses import StreamingResponse, JSONResponse
 from fastapi import FastAPI, UploadFile, File
 import uvicorn
+from importlib.resources import files
 from io import BytesIO
 from tools.i18n.i18n import I18nAuto
 from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
@@ -125,7 +127,7 @@ i18n = I18nAuto()
 cut_method_names = get_cut_method_names()
 
 parser = argparse.ArgumentParser(description="GPT-SoVITS api")
-parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径")
+parser.add_argument("-c", "--tts_config", type=str, default=None, help="tts_infer路径")
 parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
 parser.add_argument("-p", "--port", type=int, default="9880", help="default: 9880")
 args = parser.parse_args()
@@ -136,7 +138,7 @@ host = args.bind_addr
 argv = sys.argv
 
 if config_path in [None, ""]:
-    config_path = "GPT-SoVITS/configs/tts_infer.yaml"
+    config_path = str(files("GPT_SoVITS").joinpath("configs/tts_infer.yaml"))
 
 tts_config = TTS_Config(config_path)
 print(tts_config)
@@ -394,7 +396,7 @@ async def tts_get_endpoint(
 
 @APP.post("/tts")
 async def tts_post_endpoint(request: TTS_Request):
-    req = request.dict()
+    req = request.model_dump()
     return await tts_handle(req)
@@ -449,7 +451,8 @@ async def set_sovits_weights(weights_path: str = None):
-if __name__ == "__main__":
+def main():
+    global port, host, argv
     try:
         if host == 'None':   # passing -a None lets the API listen on a dual stack (IPv4 + IPv6)
             host = None
 
@@ -458,3 +461,6 @@
         traceback.print_exc()
         os.kill(os.getpid(), signal.SIGTERM)
         exit(0)
+
+if __name__ == "__main__":
+    main()
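Two small API-level notes on this file: request.model_dump() is the Pydantic v2 replacement for the deprecated .dict(), and wrapping startup in main() gives the gpt_sovits_api console script declared later in pyproject.toml an entry point. A hypothetical client call once the server is running — the field names follow the TTS_Request schema, which is not shown in this diff, so treat them as assumptions:

    import requests

    payload = {
        "text": "Hello there",              # text to synthesize (assumed field)
        "text_lang": "en",                  # assumed field
        "ref_audio_path": "reference.wav",  # assumed field
        "prompt_lang": "en",                # assumed field
    }
    r = requests.post("http://127.0.0.1:9880/tts", json=payload)
    with open("out.wav", "wb") as f:
        f.write(r.content)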

GPT_SoVITS/configs/tts_infer.yaml

@@ -1,24 +1,24 @@
 custom:
-  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
-  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+  bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
+  cnhuhbert_base_path: pretrained_models/chinese-hubert-base
   device: cuda
   is_half: true
-  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
+  t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
   version: v2
-  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+  vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
 default:
-  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
-  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+  bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
+  cnhuhbert_base_path: pretrained_models/chinese-hubert-base
   device: cpu
   is_half: false
-  t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+  t2s_weights_path: pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
   version: v1
-  vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
+  vits_weights_path: pretrained_models/s2G488k.pth
 default_v2:
-  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
-  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+  bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
+  cnhuhbert_base_path: pretrained_models/chinese-hubert-base
   device: cpu
   is_half: false
-  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
+  t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
   version: v2
-  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+  vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth

tools/cmd-denoise.py (content identical on both sides of the diff)

@@ -1,33 +1,33 @@
 import os,argparse
 import traceback
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from tqdm import tqdm
 
 path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
 path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
 ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise)
 def execute_denoise(input_folder,output_folder):
     os.makedirs(output_folder,exist_ok=True)
     # print(input_folder)
     # print(list(os.listdir(input_folder).sort()))
     for name in tqdm(os.listdir(input_folder)):
         try:
             ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name))
         except:
             traceback.print_exc()
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("-i", "--input_folder", type=str, required=True,
                         help="Path to the folder containing WAV files.")
     parser.add_argument("-o", "--output_folder", type=str, required=True,
                         help="Output folder to store transcriptions.")
     parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
                         help="fp16 or fp32")  # not wired up yet
     cmd = parser.parse_args()
     execute_denoise(
         input_folder = cmd.input_folder,
         output_folder = cmd.output_folder,
     )
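The denoiser is a thin wrapper over a ModelScope pipeline; the same call works standalone (file names here are placeholders):

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # Same pipeline the script builds, falling back to the hub model id
    ans = pipeline(Tasks.acoustic_noise_suppression, model="damo/speech_frcrn_ans_cirm_16k")
    ans("noisy.wav", output_path="clean.wav")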

tools/my_utils.py (content identical on both sides of the diff)

@@ -1,115 +1,115 @@
 import platform,os,traceback
 import ffmpeg
 import numpy as np
 import gradio as gr
 from tools.i18n.i18n import I18nAuto
 import pandas as pd
 i18n = I18nAuto(language=os.environ.get('language','Auto'))
 
 def load_audio(file, sr):
     try:
         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
         # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
         file = clean_path(file)  # guard against copied paths with stray spaces, quotes, or newlines
         if os.path.exists(file) == False:
             raise RuntimeError(
                 "You input a wrong audio path that does not exists, please fix it!"
             )
         out, _ = (
             ffmpeg.input(file, threads=0)
             .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
             .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
         )
     except Exception as e:
         traceback.print_exc()
         raise RuntimeError(i18n("音频加载失败"))
 
     return np.frombuffer(out, np.float32).flatten()
 
 def clean_path(path_str:str):
     if path_str.endswith(('\\','/')):
         return clean_path(path_str[0:-1])
     path_str = path_str.replace('/', os.sep).replace('\\', os.sep)
     return path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")
 
 def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False):
     files_status=[]
     if is_train == True and file_list:
         file_list.append(os.path.join(file_list[0],'2-name2text.txt'))
         file_list.append(os.path.join(file_list[0],'3-bert'))
         file_list.append(os.path.join(file_list[0],'4-cnhubert'))
         file_list.append(os.path.join(file_list[0],'5-wav32k'))
         file_list.append(os.path.join(file_list[0],'6-name2semantic.tsv'))
     for file in file_list:
         if os.path.exists(file):files_status.append(True)
         else:files_status.append(False)
     if sum(files_status)!=len(files_status):
         if is_train:
             for file,status in zip(file_list,files_status):
                 if status:pass
                 else:gr.Warning(file)
             gr.Warning(i18n('以下文件或文件夹不存在'))
             return False
         elif is_dataset_processing:
             if files_status[0]:
                 return True
             elif not files_status[0]:
                 gr.Warning(file_list[0])
             elif not files_status[1] and file_list[1]:
                 gr.Warning(file_list[1])
             gr.Warning(i18n('以下文件或文件夹不存在'))
             return False
         else:
             if file_list[0]:
                 gr.Warning(file_list[0])
                 gr.Warning(i18n('以下文件或文件夹不存在'))
             else:
                 gr.Warning(i18n('路径不能为空'))
             return False
     return True
 
 def check_details(path_list=None,is_train=False,is_dataset_processing=False):
     if is_dataset_processing:
         list_path, audio_path = path_list
         if (not list_path.endswith('.list')):
             gr.Warning(i18n('请填入正确的List路径'))
             return
         if audio_path:
             if not os.path.isdir(audio_path):
                 gr.Warning(i18n('请填入正确的音频文件夹路径'))
                 return
         with open(list_path,"r",encoding="utf8")as f:
             line=f.readline().strip("\n").split("\n")
         wav_name, _, __, ___ = line[0].split("|")
         wav_name=clean_path(wav_name)
         if (audio_path != "" and audio_path != None):
             wav_name = os.path.basename(wav_name)
             wav_path = "%s/%s"%(audio_path, wav_name)
         else:
             wav_path=wav_name
         if os.path.exists(wav_path):
             ...
         else:
             gr.Warning(i18n('路径错误'))
             return
     if is_train:
         path_list.append(os.path.join(path_list[0],'2-name2text.txt'))
         path_list.append(os.path.join(path_list[0],'4-cnhubert'))
         path_list.append(os.path.join(path_list[0],'5-wav32k'))
         path_list.append(os.path.join(path_list[0],'6-name2semantic.tsv'))
         phone_path, hubert_path, wav_path, semantic_path = path_list[1:]
         with open(phone_path,'r',encoding='utf-8') as f:
             if f.read(1):...
             else:gr.Warning(i18n('缺少音素数据集'))
         if os.listdir(hubert_path):...
         else:gr.Warning(i18n('缺少Hubert数据集'))
         if os.listdir(wav_path):...
         else:gr.Warning(i18n('缺少音频数据集'))
         df = pd.read_csv(
             semantic_path, delimiter="\t", encoding="utf-8"
         )
         if len(df) >= 1:...
         else:gr.Warning(i18n('缺少语义数据集'))
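load_audio shells out to the ffmpeg CLI and returns a mono float32 array at the requested rate; a minimal usage sketch (the file name is a placeholder):

    from tools.my_utils import load_audio

    samples = load_audio("speech.wav", 32000)  # 1-D numpy float32 array at 32 kHz
    print(samples.shape, samples.min(), samples.max())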

tools/slice_audio.py (content identical on both sides of the diff)

@@ -1,48 +1,48 @@
 import os,sys,numpy as np
 import traceback
 from scipy.io import wavfile
 # parent_directory = os.path.dirname(os.path.abspath(__file__))
 # sys.path.append(parent_directory)
 from tools.my_utils import load_audio
 from slicer2 import Slicer
 
 def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part):
     os.makedirs(opt_root,exist_ok=True)
     if os.path.isfile(inp):
         input=[inp]
     elif os.path.isdir(inp):
         input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))]
     else:
         return "输入路径存在但既不是文件也不是文件夹"
     slicer = Slicer(
         sr=32000,  # sample rate of the long audio
         threshold=    int(threshold),  # volume below this is treated as a candidate silence cut point
         min_length=   int(min_length),  # minimum segment length; a too-short first segment keeps merging with the next until it exceeds this
         min_interval= int(min_interval),  # minimum interval between cuts
         hop_size=     int(hop_size),  # granularity of the volume curve; smaller means finer resolution but more compute (finer is not necessarily better)
         max_sil_kept= int(max_sil_kept),  # maximum silence kept after slicing
     )
     _max=float(_max)
     alpha=float(alpha)
     for inp_path in input[int(i_part)::int(all_part)]:
         # print(inp_path)
         try:
             name = os.path.basename(inp_path)
             audio = load_audio(inp_path, 32000)
             # print(audio.shape)
             for chunk, start, end in slicer.slice(audio):  # start and end are frame offsets
                 tmp_max = np.abs(chunk).max()
                 if(tmp_max>1):chunk/=tmp_max
                 chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
                 wavfile.write(
                     "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
                     32000,
                     # chunk.astype(np.float32),
                     (chunk * 32767).astype(np.int16),
                 )
         except:
             print(inp_path,"->fail->",traceback.format_exc())
     return "执行完毕,请检查输出文件"
 
 print(slice(*sys.argv[1:]))
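Because the module calls print(slice(*sys.argv[1:])) at import time, it is meant to be run as a script with positional arguments mirroring slice()'s signature; a hypothetical invocation (argument values are illustrative):

    import subprocess, sys

    subprocess.run([
        sys.executable, "tools/slice_audio.py",
        "input_wavs", "sliced_out",          # inp, opt_root
        "-34", "4000", "300", "10", "500",   # threshold, min_length, min_interval, hop_size, max_sil_kept
        "0.9", "0.25",                       # _max, alpha (loudness normalization mix)
        "0", "1",                            # i_part, all_part (shard index / shard count)
    ], check=True)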

tools/slicer2.py (content identical on both sides of the diff)

@@ -1,261 +1,261 @@
 import numpy as np
 
 # This function is obtained from librosa.
 def get_rms(
     y,
     frame_length=2048,
     hop_length=512,
     pad_mode="constant",
 ):
     padding = (int(frame_length // 2), int(frame_length // 2))
     y = np.pad(y, padding, mode=pad_mode)
 
     axis = -1
     # put our new within-frame axis at the end for now
     out_strides = y.strides + tuple([y.strides[axis]])
     # Reduce the shape on the framing axis
     x_shape_trimmed = list(y.shape)
     x_shape_trimmed[axis] -= frame_length - 1
     out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
     xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
     if axis < 0:
         target_axis = axis - 1
     else:
         target_axis = axis + 1
     xw = np.moveaxis(xw, -1, target_axis)
     # Downsample along the target axis
     slices = [slice(None)] * xw.ndim
     slices[axis] = slice(0, None, hop_length)
     x = xw[tuple(slices)]
 
     # Calculate power
     power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
 
     return np.sqrt(power)
 
 
 class Slicer:
     def __init__(
         self,
         sr: int,
         threshold: float = -40.0,
         min_length: int = 5000,
         min_interval: int = 300,
         hop_size: int = 20,
         max_sil_kept: int = 5000,
     ):
         if not min_length >= min_interval >= hop_size:
             raise ValueError(
                 "The following condition must be satisfied: min_length >= min_interval >= hop_size"
             )
         if not max_sil_kept >= hop_size:
             raise ValueError(
                 "The following condition must be satisfied: max_sil_kept >= hop_size"
             )
         min_interval = sr * min_interval / 1000
         self.threshold = 10 ** (threshold / 20.0)
         self.hop_size = round(sr * hop_size / 1000)
         self.win_size = min(round(min_interval), 4 * self.hop_size)
         self.min_length = round(sr * min_length / 1000 / self.hop_size)
         self.min_interval = round(min_interval / self.hop_size)
         self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
 
     def _apply_slice(self, waveform, begin, end):
         if len(waveform.shape) > 1:
             return waveform[
                 :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
             ]
         else:
             return waveform[
                 begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
             ]
 
     # @timeit
     def slice(self, waveform):
         if len(waveform.shape) > 1:
             samples = waveform.mean(axis=0)
         else:
             samples = waveform
         if samples.shape[0] <= self.min_length:
             return [waveform]
         rms_list = get_rms(
             y=samples, frame_length=self.win_size, hop_length=self.hop_size
         ).squeeze(0)
         sil_tags = []
         silence_start = None
         clip_start = 0
         for i, rms in enumerate(rms_list):
             # Keep looping while frame is silent.
             if rms < self.threshold:
                 # Record start of silent frames.
                 if silence_start is None:
                     silence_start = i
                 continue
             # Keep looping while frame is not silent and silence start has not been recorded.
             if silence_start is None:
                 continue
             # Clear recorded silence start if interval is not enough or clip is too short
             is_leading_silence = silence_start == 0 and i > self.max_sil_kept
             need_slice_middle = (
                 i - silence_start >= self.min_interval
                 and i - clip_start >= self.min_length
             )
             if not is_leading_silence and not need_slice_middle:
                 silence_start = None
                 continue
             # Need slicing. Record the range of silent frames to be removed.
             if i - silence_start <= self.max_sil_kept:
                 pos = rms_list[silence_start : i + 1].argmin() + silence_start
                 if silence_start == 0:
                     sil_tags.append((0, pos))
                 else:
                     sil_tags.append((pos, pos))
                 clip_start = pos
             elif i - silence_start <= self.max_sil_kept * 2:
                 pos = rms_list[
                     i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
                 ].argmin()
                 pos += i - self.max_sil_kept
                 pos_l = (
                     rms_list[
                         silence_start : silence_start + self.max_sil_kept + 1
                     ].argmin()
                     + silence_start
                 )
                 pos_r = (
                     rms_list[i - self.max_sil_kept : i + 1].argmin()
                     + i
                     - self.max_sil_kept
                 )
                 if silence_start == 0:
                     sil_tags.append((0, pos_r))
                     clip_start = pos_r
                 else:
                     sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                     clip_start = max(pos_r, pos)
             else:
                 pos_l = (
                     rms_list[
                         silence_start : silence_start + self.max_sil_kept + 1
                     ].argmin()
                     + silence_start
                 )
                 pos_r = (
                     rms_list[i - self.max_sil_kept : i + 1].argmin()
                     + i
                     - self.max_sil_kept
                 )
                 if silence_start == 0:
                     sil_tags.append((0, pos_r))
                 else:
                     sil_tags.append((pos_l, pos_r))
                 clip_start = pos_r
             silence_start = None
         # Deal with trailing silence.
         total_frames = rms_list.shape[0]
         if (
             silence_start is not None
             and total_frames - silence_start >= self.min_interval
         ):
             silence_end = min(total_frames, silence_start + self.max_sil_kept)
             pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
             sil_tags.append((pos, total_frames + 1))
         # Apply and return slices.
         #### audio + start time + end time
         if len(sil_tags) == 0:
             return [[waveform, 0, int(total_frames * self.hop_size)]]
         else:
             chunks = []
             if sil_tags[0][0] > 0:
                 chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)])
             for i in range(len(sil_tags) - 1):
                 chunks.append(
                     [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), int(sil_tags[i][1] * self.hop_size), int(sil_tags[i + 1][0] * self.hop_size)]
                 )
             if sil_tags[-1][1] < total_frames:
                 chunks.append(
                     [self._apply_slice(waveform, sil_tags[-1][1], total_frames), int(sil_tags[-1][1] * self.hop_size), int(total_frames * self.hop_size)]
                 )
             return chunks
 
 
 def main():
     import os.path
     from argparse import ArgumentParser
 
     import librosa
     import soundfile
 
     parser = ArgumentParser()
     parser.add_argument("audio", type=str, help="The audio to be sliced")
     parser.add_argument(
         "--out", type=str, help="Output directory of the sliced audio clips"
     )
     parser.add_argument(
         "--db_thresh",
         type=float,
         required=False,
         default=-40,
         help="The dB threshold for silence detection",
     )
     parser.add_argument(
         "--min_length",
         type=int,
         required=False,
         default=5000,
         help="The minimum milliseconds required for each sliced audio clip",
     )
     parser.add_argument(
         "--min_interval",
         type=int,
         required=False,
         default=300,
         help="The minimum milliseconds for a silence part to be sliced",
     )
     parser.add_argument(
         "--hop_size",
         type=int,
         required=False,
         default=10,
         help="Frame length in milliseconds",
     )
     parser.add_argument(
         "--max_sil_kept",
         type=int,
         required=False,
         default=500,
         help="The maximum silence length kept around the sliced clip, presented in milliseconds",
     )
     args = parser.parse_args()
     out = args.out
     if out is None:
         out = os.path.dirname(os.path.abspath(args.audio))
     audio, sr = librosa.load(args.audio, sr=None, mono=False)
     slicer = Slicer(
         sr=sr,
         threshold=args.db_thresh,
         min_length=args.min_length,
         min_interval=args.min_interval,
         hop_size=args.hop_size,
         max_sil_kept=args.max_sil_kept,
     )
     chunks = slicer.slice(audio)
     if not os.path.exists(out):
         os.makedirs(out)
     for i, chunk in enumerate(chunks):
         if len(chunk.shape) > 1:
             chunk = chunk.T
         soundfile.write(
             os.path.join(
                 out,
                 f"%s_%d.wav"
                 % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
             ),
             chunk,
             sr,
         )
 
 
 if __name__ == "__main__":
     main()
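Worth noting when reusing this class: slice() returns [chunk, start_sample, end_sample] triples (except the short-input path near the top, which returns a bare [waveform]), and that triple form is what tools/slice_audio.py unpacks. A minimal sketch under that assumption, with placeholder file names:

    import librosa
    import soundfile
    from slicer2 import Slicer

    audio, sr = librosa.load("long_take.wav", sr=32000, mono=True)
    slicer = Slicer(sr=sr, threshold=-40, min_length=5000,
                    min_interval=300, hop_size=20, max_sil_kept=5000)
    for i, (chunk, start, end) in enumerate(slicer.slice(audio)):
        # start/end are sample offsets into the original audio
        soundfile.write(f"clip_{i:03d}.wav", chunk, sr)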

model parameter JSON (filename not captured; content identical on both sides of the diff)

@@ -1,54 +1,54 @@
 {
   "bins": 672,
   "unstable_bins": 8,
   "reduction_bins": 530,
   "band": {
     "1": {
       "sr": 7350,
       "hl": 80,
       "n_fft": 640,
       "crop_start": 0,
       "crop_stop": 85,
       "lpf_start": 25,
       "lpf_stop": 53,
       "res_type": "polyphase"
     },
     "2": {
       "sr": 7350,
       "hl": 80,
       "n_fft": 320,
       "crop_start": 4,
       "crop_stop": 87,
       "hpf_start": 25,
       "hpf_stop": 12,
       "lpf_start": 31,
       "lpf_stop": 62,
       "res_type": "polyphase"
     },
     "3": {
       "sr": 14700,
       "hl": 160,
       "n_fft": 512,
       "crop_start": 17,
       "crop_stop": 216,
       "hpf_start": 48,
       "hpf_stop": 24,
       "lpf_start": 139,
       "lpf_stop": 210,
       "res_type": "polyphase"
     },
     "4": {
       "sr": 44100,
       "hl": 480,
       "n_fft": 960,
       "crop_start": 78,
       "crop_stop": 383,
       "hpf_start": 130,
       "hpf_stop": 86,
       "res_type": "kaiser_fast"
     }
   },
   "sr": 44100,
   "pre_filter_start": 668,
   "pre_filter_stop": 672
 }

MANIFEST.in (new file)

@@ -0,0 +1 @@
+include GPT_SoVITS/configs/*
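MANIFEST.in covers source distributions; depending on the setuptools configuration, wheels may additionally need package-data settings for these files to ship. A quick check that the packaged config resolves once the package is installed:

    from importlib.resources import files

    cfg = files("GPT_SoVITS").joinpath("configs/tts_infer.yaml")
    print(cfg, cfg.is_file())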

README.md

@@ -1,3 +1,6 @@
+# Jarod's NOTE
+Working on turning this into a package. Right now, the API *does in fact* work to make requests to, and this can be installed. Will get instructions out sometime in the future.
+
 <div align="center">

pyproject.toml (new file, 63 lines)

@@ -0,0 +1,63 @@
+[build-system]
+requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "GPT_SoVITS"
+dynamic = ["version"]
+description = ""
+readme = "README.md"
+license = {text = "MIT License"}
+classifiers = [
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+]
+
+dependencies = [
+    "numpy==1.26.4",
+    "scipy",
+    "tensorboard",
+    "librosa==0.9.2",
+    "numba",
+    "pytorch-lightning",
+    "gradio>=4.0,<=4.24.0",
+    "ffmpeg-python",
+    "onnxruntime; sys_platform == 'darwin'",
+    "onnxruntime-gpu; sys_platform != 'darwin'",
+    "tqdm",
+    "funasr==1.0.27",
+    "cn2an",
+    "pypinyin",
+    "pyopenjtalk>=0.3.4",
+    "g2p_en",
+    "torchaudio",
+    "modelscope==1.10.0",
+    "sentencepiece",
+    "transformers",
+    "chardet",
+    "PyYAML",
+    "psutil",
+    "jieba_fast",
+    "jieba",
+    "LangSegment>=0.2.0",
+    "Faster_Whisper",
+    "wordsegment",
+    "rotary_embedding_torch",
+    "pyjyutping",
+    "g2pk2",
+    "ko_pron",
+    "opencc; sys_platform != 'linux'",
+    "opencc==1.1.1; sys_platform == 'linux'",
+    "python_mecab_ko; sys_platform != 'win32'",
+    "fastapi<0.112.2",
+]
+
+[project.urls]
+Homepage = "https://github.com/RVC-Boss/GPT-SoVITS"
+
+[project.scripts]
+"gpt_sovits_api" = "GPT_SoVITS.api_v2:main"
+
+[tool.setuptools.packages.find]
+include = ["GPT_SoVITS"]
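The [project.scripts] table maps the gpt_sovits_api command to GPT_SoVITS.api_v2:main, so after pip install . the console script is equivalent to:

    # What the gpt_sovits_api console script effectively runs:
    from GPT_SoVITS.api_v2 import main

    main()  # parses sys.argv; with no -c flag it falls back to the packaged tts_infer.yaml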

requirements.txt

@@ -1,8 +1,8 @@
-numpy==1.23.4
+numpy==1.26.4
 scipy
 tensorboard
 librosa==0.9.2
-numba==0.56.4
+numba
 pytorch-lightning
 gradio>=4.0,<=4.24.0
 ffmpeg-python