From 71b2fe69ea577e04e168a0182e9b7ba96768d24a Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Sat, 16 Nov 2024 02:30:20 -0800 Subject: [PATCH 01/13] turn it into a package --- GPT_SoVITS/TTS_infer_pack/TTS.py | 79 +-- api_v2.py => GPT_SoVITS/api_v2.py | 14 +- GPT_SoVITS/configs/tts_infer.yaml | 24 +- {tools => GPT_SoVITS/tools}/__init__.py | 0 {tools => GPT_SoVITS/tools}/asr/config.py | 0 .../tools}/asr/fasterwhisper_asr.py | 0 {tools => GPT_SoVITS/tools}/asr/funasr_asr.py | 0 .../tools}/asr/models/.gitignore | 0 {tools => GPT_SoVITS/tools}/cmd-denoise.py | 64 +-- .../tools}/denoise-model/.gitignore | 0 {tools => GPT_SoVITS/tools}/i18n/i18n.py | 0 .../tools}/i18n/locale/en_US.json | 0 .../tools}/i18n/locale/es_ES.json | 0 .../tools}/i18n/locale/fr_FR.json | 0 .../tools}/i18n/locale/it_IT.json | 0 .../tools}/i18n/locale/ja_JP.json | 0 .../tools}/i18n/locale/ko_KR.json | 0 .../tools}/i18n/locale/pt_BR.json | 0 .../tools}/i18n/locale/ru_RU.json | 0 .../tools}/i18n/locale/tr_TR.json | 0 .../tools}/i18n/locale/zh_CN.json | 0 .../tools}/i18n/locale/zh_HK.json | 0 .../tools}/i18n/locale/zh_SG.json | 0 .../tools}/i18n/locale/zh_TW.json | 0 {tools => GPT_SoVITS/tools}/i18n/scan_i18n.py | 0 {tools => GPT_SoVITS/tools}/my_utils.py | 230 ++++---- {tools => GPT_SoVITS/tools}/slice_audio.py | 96 ++-- {tools => GPT_SoVITS/tools}/slicer2.py | 522 +++++++++--------- {tools => GPT_SoVITS/tools}/subfix_webui.py | 0 .../tools}/uvr5/bs_roformer/__init__.py | 0 .../tools}/uvr5/bs_roformer/attend.py | 0 .../tools}/uvr5/bs_roformer/bs_roformer.py | 0 .../tools}/uvr5/bsroformer.py | 0 .../tools}/uvr5/lib/lib_v5/dataset.py | 0 .../tools}/uvr5/lib/lib_v5/layers.py | 0 .../tools}/uvr5/lib/lib_v5/layers_123812KB.py | 0 .../tools}/uvr5/lib/lib_v5/layers_123821KB.py | 0 .../tools}/uvr5/lib/lib_v5/layers_33966KB.py | 0 .../tools}/uvr5/lib/lib_v5/layers_537227KB.py | 0 .../tools}/uvr5/lib/lib_v5/layers_537238KB.py | 0 .../tools}/uvr5/lib/lib_v5/layers_new.py | 0 .../uvr5/lib/lib_v5/model_param_init.py | 0 .../modelparams/1band_sr16000_hl512.json | 0 .../modelparams/1band_sr32000_hl512.json | 0 .../modelparams/1band_sr33075_hl384.json | 0 .../modelparams/1band_sr44100_hl1024.json | 0 .../modelparams/1band_sr44100_hl256.json | 0 .../modelparams/1band_sr44100_hl512.json | 0 .../modelparams/1band_sr44100_hl512_cut.json | 0 .../lib/lib_v5/modelparams/2band_32000.json | 0 .../lib_v5/modelparams/2band_44100_lofi.json | 0 .../lib/lib_v5/modelparams/2band_48000.json | 0 .../lib/lib_v5/modelparams/3band_44100.json | 0 .../lib_v5/modelparams/3band_44100_mid.json | 0 .../lib_v5/modelparams/3band_44100_msb2.json | 0 .../lib/lib_v5/modelparams/4band_44100.json | 0 .../lib_v5/modelparams/4band_44100_mid.json | 0 .../lib_v5/modelparams/4band_44100_msb.json | 0 .../lib_v5/modelparams/4band_44100_msb2.json | 0 .../modelparams/4band_44100_reverse.json | 0 .../lib_v5/modelparams/4band_44100_sw.json | 0 .../uvr5/lib/lib_v5/modelparams/4band_v2.json | 0 .../lib/lib_v5/modelparams/4band_v2_sn.json | 0 .../uvr5/lib/lib_v5/modelparams/4band_v3.json | 106 ++-- .../uvr5/lib/lib_v5/modelparams/ensemble.json | 0 .../tools}/uvr5/lib/lib_v5/nets.py | 0 .../tools}/uvr5/lib/lib_v5/nets_123812KB.py | 0 .../tools}/uvr5/lib/lib_v5/nets_123821KB.py | 0 .../tools}/uvr5/lib/lib_v5/nets_33966KB.py | 0 .../tools}/uvr5/lib/lib_v5/nets_537227KB.py | 0 .../tools}/uvr5/lib/lib_v5/nets_537238KB.py | 0 .../tools}/uvr5/lib/lib_v5/nets_61968KB.py | 0 .../tools}/uvr5/lib/lib_v5/nets_new.py | 0 .../tools}/uvr5/lib/lib_v5/spec_utils.py | 0 
.../tools}/uvr5/lib/name_params.json | 0 {tools => GPT_SoVITS/tools}/uvr5/lib/utils.py | 0 {tools => GPT_SoVITS/tools}/uvr5/mdxnet.py | 0 .../tools}/uvr5/uvr5_weights/.gitignore | 0 {tools => GPT_SoVITS/tools}/uvr5/vr.py | 0 {tools => GPT_SoVITS/tools}/uvr5/webui.py | 0 MANIFEST.in | 1 + README.md | 3 + pyproject.toml | 63 +++ requirements.txt | 4 +- 84 files changed, 643 insertions(+), 563 deletions(-) rename api_v2.py => GPT_SoVITS/api_v2.py (98%) rename {tools => GPT_SoVITS/tools}/__init__.py (100%) rename {tools => GPT_SoVITS/tools}/asr/config.py (100%) rename {tools => GPT_SoVITS/tools}/asr/fasterwhisper_asr.py (100%) rename {tools => GPT_SoVITS/tools}/asr/funasr_asr.py (100%) rename {tools => GPT_SoVITS/tools}/asr/models/.gitignore (100%) rename {tools => GPT_SoVITS/tools}/cmd-denoise.py (97%) rename {tools => GPT_SoVITS/tools}/denoise-model/.gitignore (100%) rename {tools => GPT_SoVITS/tools}/i18n/i18n.py (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/en_US.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/es_ES.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/fr_FR.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/it_IT.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/ja_JP.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/ko_KR.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/pt_BR.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/ru_RU.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/tr_TR.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/zh_CN.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/zh_HK.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/zh_SG.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/locale/zh_TW.json (100%) rename {tools => GPT_SoVITS/tools}/i18n/scan_i18n.py (100%) rename {tools => GPT_SoVITS/tools}/my_utils.py (97%) rename {tools => GPT_SoVITS/tools}/slice_audio.py (97%) rename {tools => GPT_SoVITS/tools}/slicer2.py (97%) rename {tools => GPT_SoVITS/tools}/subfix_webui.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/bs_roformer/__init__.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/bs_roformer/attend.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/bs_roformer/bs_roformer.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/bsroformer.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/dataset.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/layers.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/layers_123812KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/layers_123821KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/layers_33966KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/layers_537227KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/layers_537238KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/layers_new.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/model_param_init.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json (100%) rename {tools => 
GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/2band_32000.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/2band_48000.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/3band_44100.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_44100.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_v2.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/4band_v3.json (94%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/modelparams/ensemble.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/nets.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/nets_123812KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/nets_123821KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/nets_33966KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/nets_537227KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/nets_537238KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/nets_61968KB.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/nets_new.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/lib_v5/spec_utils.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/name_params.json (100%) rename {tools => GPT_SoVITS/tools}/uvr5/lib/utils.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/mdxnet.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/uvr5_weights/.gitignore (100%) rename {tools => GPT_SoVITS/tools}/uvr5/vr.py (100%) rename {tools => GPT_SoVITS/tools}/uvr5/webui.py (100%) create mode 100644 MANIFEST.in create mode 100644 pyproject.toml diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index a1eeb28c..809601b3 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -1,60 +1,66 @@ from copy import deepcopy import math import os, sys, gc -import random -import traceback -from tqdm import tqdm + now_dir = os.getcwd() sys.path.append(now_dir) import ffmpeg -import os -from typing import Generator, List, Tuple, Union +import librosa import numpy as np +import random import torch import torch.nn.functional as F +import traceback import yaml -from transformers import AutoModelForMaskedLM, AutoTokenizer + +from huggingface_hub import snapshot_download, hf_hub_download +from importlib.resources import files +from time import time as ttime +from typing import Generator, List, Tuple, Union +from tqdm import tqdm + from 
AR.models.t2s_lightning_module import Text2SemanticLightningModule from feature_extractor.cnhubert import CNHubert from module.models import SynthesizerTrn -import librosa -from time import time as ttime +from module.mel_processing import spectrogram_torch from tools.i18n.i18n import I18nAuto, scan_language_list from tools.my_utils import load_audio -from module.mel_processing import spectrogram_torch +from transformers import AutoModelForMaskedLM, AutoTokenizer from TTS_infer_pack.text_segmentation_method import splits from TTS_infer_pack.TextPreprocessor import TextPreprocessor + language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) +LIBRARY_NAME = "GPT_SoVITS" # configs/tts_infer.yaml """ custom: - bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large - cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt - vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth + t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt + vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth version: v2 default: - bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large - cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt - vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth + t2s_weights_path: pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt + vits_weights_path: pretrained_models/s2G488k.pth version: v1 default_v2: - bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large - cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt - vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth + t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt + vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth version: v2 """ @@ -86,19 +92,19 @@ class TTS_Config: "device": "cpu", "is_half": False, "version": "v1", - "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", - "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth", - "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", - "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + "t2s_weights_path": "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "vits_weights_path": "pretrained_models/s2G488k.pth", + "cnhuhbert_base_path": "pretrained_models/chinese-hubert-base", + "bert_base_path": 
"pretrained_models/chinese-roberta-wwm-ext-large", }, "default_v2":{ "device": "cpu", "is_half": False, "version": "v2", - "t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", - "vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", - "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", - "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + "t2s_weights_path": "pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "vits_weights_path": "pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "cnhuhbert_base_path": "pretrained_models/chinese-hubert-base", + "bert_base_path": "pretrained_models/chinese-roberta-wwm-ext-large", }, } configs:dict = None @@ -120,7 +126,7 @@ class TTS_Config: def __init__(self, configs: Union[dict, str]=None): # 设置默认配置文件路径 - configs_base_path:str = "GPT_SoVITS/configs/" + configs_base_path:str = "configs/" os.makedirs(configs_base_path, exist_ok=True) self.configs_path:str = os.path.join(configs_base_path, "tts_infer.yaml") @@ -152,23 +158,24 @@ class TTS_Config: self.bert_base_path = self.configs.get("bert_base_path", None) self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None) self.languages = self.v2_languages if self.version=="v2" else self.v1_languages - if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)): - self.t2s_weights_path = self.default_configs[default_config_key]['t2s_weights_path'] + self.t2s_weights_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['t2s_weights_path'])) print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}") if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)): - self.vits_weights_path = self.default_configs[default_config_key]['vits_weights_path'] + self.vits_weights_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['vits_weights_path'])) print(f"fall back to default vits_weights_path: {self.vits_weights_path}") if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)): - self.bert_base_path = self.default_configs[default_config_key]['bert_base_path'] + self.bert_base_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['bert_base_path'])) print(f"fall back to default bert_base_path: {self.bert_base_path}") if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)): - self.cnhuhbert_base_path = self.default_configs[default_config_key]['cnhuhbert_base_path'] + self.cnhuhbert_base_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['cnhuhbert_base_path'])) print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}") + + repo_name="lj1995/GPT-SoVITS" + snapshot_download(repo_id=repo_name, local_dir=os.path.dirname(self.bert_base_path)) self.update_configs() - self.max_sec = None self.hz:int = 50 self.semantic_frame_rate:str = "25hz" diff --git a/api_v2.py b/GPT_SoVITS/api_v2.py similarity index 98% rename from api_v2.py rename to GPT_SoVITS/api_v2.py index 92a18f37..5dfbebec 100644 --- a/api_v2.py +++ b/GPT_SoVITS/api_v2.py @@ -114,6 +114,8 @@ from fastapi import FastAPI, Request, HTTPException, Response from fastapi.responses import StreamingResponse, JSONResponse from fastapi import FastAPI, UploadFile, File import uvicorn + +from 
importlib.resources import files from io import BytesIO from tools.i18n.i18n import I18nAuto from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config @@ -125,7 +127,7 @@ i18n = I18nAuto() cut_method_names = get_cut_method_names() parser = argparse.ArgumentParser(description="GPT-SoVITS api") -parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径") +parser.add_argument("-c", "--tts_config", type=str, default=None, help="tts_infer路径") parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1") parser.add_argument("-p", "--port", type=int, default="9880", help="default: 9880") args = parser.parse_args() @@ -136,7 +138,7 @@ host = args.bind_addr argv = sys.argv if config_path in [None, ""]: - config_path = "GPT-SoVITS/configs/tts_infer.yaml" + config_path = str(files("GPT_SoVITS").joinpath("configs/tts_infer.yaml")) tts_config = TTS_Config(config_path) print(tts_config) @@ -394,7 +396,7 @@ async def tts_get_endpoint( @APP.post("/tts") async def tts_post_endpoint(request: TTS_Request): - req = request.dict() + req = request.model_dump() return await tts_handle(req) @@ -449,7 +451,8 @@ async def set_sovits_weights(weights_path: str = None): -if __name__ == "__main__": +def main(): + global port, host, argv try: if host == 'None': # 在调用时使用 -a None 参数,可以让api监听双栈 host = None @@ -458,3 +461,6 @@ if __name__ == "__main__": traceback.print_exc() os.kill(os.getpid(), signal.SIGTERM) exit(0) + +if __name__ == "__main__": + main() diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml index 66f1193d..fe63fa2a 100644 --- a/GPT_SoVITS/configs/tts_infer.yaml +++ b/GPT_SoVITS/configs/tts_infer.yaml @@ -1,24 +1,24 @@ custom: - bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large - cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: pretrained_models/chinese-hubert-base device: cuda is_half: true - t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt + t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt version: v2 - vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth + vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth default: - bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large - cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt + t2s_weights_path: pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt version: v1 - vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth + vits_weights_path: pretrained_models/s2G488k.pth default_v2: - bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large - cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt + t2s_weights_path: 
pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt version: v2 - vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth + vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth diff --git a/tools/__init__.py b/GPT_SoVITS/tools/__init__.py similarity index 100% rename from tools/__init__.py rename to GPT_SoVITS/tools/__init__.py diff --git a/tools/asr/config.py b/GPT_SoVITS/tools/asr/config.py similarity index 100% rename from tools/asr/config.py rename to GPT_SoVITS/tools/asr/config.py diff --git a/tools/asr/fasterwhisper_asr.py b/GPT_SoVITS/tools/asr/fasterwhisper_asr.py similarity index 100% rename from tools/asr/fasterwhisper_asr.py rename to GPT_SoVITS/tools/asr/fasterwhisper_asr.py diff --git a/tools/asr/funasr_asr.py b/GPT_SoVITS/tools/asr/funasr_asr.py similarity index 100% rename from tools/asr/funasr_asr.py rename to GPT_SoVITS/tools/asr/funasr_asr.py diff --git a/tools/asr/models/.gitignore b/GPT_SoVITS/tools/asr/models/.gitignore similarity index 100% rename from tools/asr/models/.gitignore rename to GPT_SoVITS/tools/asr/models/.gitignore diff --git a/tools/cmd-denoise.py b/GPT_SoVITS/tools/cmd-denoise.py similarity index 97% rename from tools/cmd-denoise.py rename to GPT_SoVITS/tools/cmd-denoise.py index 1fdcab6d..29128966 100644 --- a/tools/cmd-denoise.py +++ b/GPT_SoVITS/tools/cmd-denoise.py @@ -1,33 +1,33 @@ -import os,argparse -import traceback - -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from tqdm import tqdm - -path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' -path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" -ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) -def execute_denoise(input_folder,output_folder): - os.makedirs(output_folder,exist_ok=True) - # print(input_folder) - # print(list(os.listdir(input_folder).sort())) - for name in tqdm(os.listdir(input_folder)): - try: - ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) - except: - traceback.print_exc() - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32")#还没接入 - cmd = parser.parse_args() - execute_denoise( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, +import os,argparse +import traceback + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from tqdm import tqdm + +path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' +path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" +ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) +def execute_denoise(input_folder,output_folder): + os.makedirs(output_folder,exist_ok=True) + # print(input_folder) + # print(list(os.listdir(input_folder).sort())) + for name in tqdm(os.listdir(input_folder)): + try: + ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) + except: + traceback.print_exc() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_folder", type=str, 
required=True, + help="Path to the folder containing WAV files.") + parser.add_argument("-o", "--output_folder", type=str, required=True, + help="Output folder to store transcriptions.") + parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], + help="fp16 or fp32")#还没接入 + cmd = parser.parse_args() + execute_denoise( + input_folder = cmd.input_folder, + output_folder = cmd.output_folder, ) \ No newline at end of file diff --git a/tools/denoise-model/.gitignore b/GPT_SoVITS/tools/denoise-model/.gitignore similarity index 100% rename from tools/denoise-model/.gitignore rename to GPT_SoVITS/tools/denoise-model/.gitignore diff --git a/tools/i18n/i18n.py b/GPT_SoVITS/tools/i18n/i18n.py similarity index 100% rename from tools/i18n/i18n.py rename to GPT_SoVITS/tools/i18n/i18n.py diff --git a/tools/i18n/locale/en_US.json b/GPT_SoVITS/tools/i18n/locale/en_US.json similarity index 100% rename from tools/i18n/locale/en_US.json rename to GPT_SoVITS/tools/i18n/locale/en_US.json diff --git a/tools/i18n/locale/es_ES.json b/GPT_SoVITS/tools/i18n/locale/es_ES.json similarity index 100% rename from tools/i18n/locale/es_ES.json rename to GPT_SoVITS/tools/i18n/locale/es_ES.json diff --git a/tools/i18n/locale/fr_FR.json b/GPT_SoVITS/tools/i18n/locale/fr_FR.json similarity index 100% rename from tools/i18n/locale/fr_FR.json rename to GPT_SoVITS/tools/i18n/locale/fr_FR.json diff --git a/tools/i18n/locale/it_IT.json b/GPT_SoVITS/tools/i18n/locale/it_IT.json similarity index 100% rename from tools/i18n/locale/it_IT.json rename to GPT_SoVITS/tools/i18n/locale/it_IT.json diff --git a/tools/i18n/locale/ja_JP.json b/GPT_SoVITS/tools/i18n/locale/ja_JP.json similarity index 100% rename from tools/i18n/locale/ja_JP.json rename to GPT_SoVITS/tools/i18n/locale/ja_JP.json diff --git a/tools/i18n/locale/ko_KR.json b/GPT_SoVITS/tools/i18n/locale/ko_KR.json similarity index 100% rename from tools/i18n/locale/ko_KR.json rename to GPT_SoVITS/tools/i18n/locale/ko_KR.json diff --git a/tools/i18n/locale/pt_BR.json b/GPT_SoVITS/tools/i18n/locale/pt_BR.json similarity index 100% rename from tools/i18n/locale/pt_BR.json rename to GPT_SoVITS/tools/i18n/locale/pt_BR.json diff --git a/tools/i18n/locale/ru_RU.json b/GPT_SoVITS/tools/i18n/locale/ru_RU.json similarity index 100% rename from tools/i18n/locale/ru_RU.json rename to GPT_SoVITS/tools/i18n/locale/ru_RU.json diff --git a/tools/i18n/locale/tr_TR.json b/GPT_SoVITS/tools/i18n/locale/tr_TR.json similarity index 100% rename from tools/i18n/locale/tr_TR.json rename to GPT_SoVITS/tools/i18n/locale/tr_TR.json diff --git a/tools/i18n/locale/zh_CN.json b/GPT_SoVITS/tools/i18n/locale/zh_CN.json similarity index 100% rename from tools/i18n/locale/zh_CN.json rename to GPT_SoVITS/tools/i18n/locale/zh_CN.json diff --git a/tools/i18n/locale/zh_HK.json b/GPT_SoVITS/tools/i18n/locale/zh_HK.json similarity index 100% rename from tools/i18n/locale/zh_HK.json rename to GPT_SoVITS/tools/i18n/locale/zh_HK.json diff --git a/tools/i18n/locale/zh_SG.json b/GPT_SoVITS/tools/i18n/locale/zh_SG.json similarity index 100% rename from tools/i18n/locale/zh_SG.json rename to GPT_SoVITS/tools/i18n/locale/zh_SG.json diff --git a/tools/i18n/locale/zh_TW.json b/GPT_SoVITS/tools/i18n/locale/zh_TW.json similarity index 100% rename from tools/i18n/locale/zh_TW.json rename to GPT_SoVITS/tools/i18n/locale/zh_TW.json diff --git a/tools/i18n/scan_i18n.py b/GPT_SoVITS/tools/i18n/scan_i18n.py similarity index 100% rename from tools/i18n/scan_i18n.py rename to 
GPT_SoVITS/tools/i18n/scan_i18n.py diff --git a/tools/my_utils.py b/GPT_SoVITS/tools/my_utils.py similarity index 97% rename from tools/my_utils.py rename to GPT_SoVITS/tools/my_utils.py index cdb95e0b..c1469234 100644 --- a/tools/my_utils.py +++ b/GPT_SoVITS/tools/my_utils.py @@ -1,115 +1,115 @@ -import platform,os,traceback -import ffmpeg -import numpy as np -import gradio as gr -from tools.i18n.i18n import I18nAuto -import pandas as pd -i18n = I18nAuto(language=os.environ.get('language','Auto')) - -def load_audio(file, sr): - try: - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. - file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 - if os.path.exists(file) == False: - raise RuntimeError( - "You input a wrong audio path that does not exists, please fix it!" - ) - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) - except Exception as e: - traceback.print_exc() - raise RuntimeError(i18n("音频加载失败")) - - return np.frombuffer(out, np.float32).flatten() - - -def clean_path(path_str:str): - if path_str.endswith(('\\','/')): - return clean_path(path_str[0:-1]) - path_str = path_str.replace('/', os.sep).replace('\\', os.sep) - return path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a") - - -def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False): - files_status=[] - if is_train == True and file_list: - file_list.append(os.path.join(file_list[0],'2-name2text.txt')) - file_list.append(os.path.join(file_list[0],'3-bert')) - file_list.append(os.path.join(file_list[0],'4-cnhubert')) - file_list.append(os.path.join(file_list[0],'5-wav32k')) - file_list.append(os.path.join(file_list[0],'6-name2semantic.tsv')) - for file in file_list: - if os.path.exists(file):files_status.append(True) - else:files_status.append(False) - if sum(files_status)!=len(files_status): - if is_train: - for file,status in zip(file_list,files_status): - if status:pass - else:gr.Warning(file) - gr.Warning(i18n('以下文件或文件夹不存在')) - return False - elif is_dataset_processing: - if files_status[0]: - return True - elif not files_status[0]: - gr.Warning(file_list[0]) - elif not files_status[1] and file_list[1]: - gr.Warning(file_list[1]) - gr.Warning(i18n('以下文件或文件夹不存在')) - return False - else: - if file_list[0]: - gr.Warning(file_list[0]) - gr.Warning(i18n('以下文件或文件夹不存在')) - else: - gr.Warning(i18n('路径不能为空')) - return False - return True - -def check_details(path_list=None,is_train=False,is_dataset_processing=False): - if is_dataset_processing: - list_path, audio_path = path_list - if (not list_path.endswith('.list')): - gr.Warning(i18n('请填入正确的List路径')) - return - if audio_path: - if not os.path.isdir(audio_path): - gr.Warning(i18n('请填入正确的音频文件夹路径')) - return - with open(list_path,"r",encoding="utf8")as f: - line=f.readline().strip("\n").split("\n") - wav_name, _, __, ___ = line[0].split("|") - wav_name=clean_path(wav_name) - if (audio_path != "" and audio_path != None): - wav_name = os.path.basename(wav_name) - wav_path = "%s/%s"%(audio_path, wav_name) - else: - wav_path=wav_name - if os.path.exists(wav_path): - ... 
- else: - gr.Warning(i18n('路径错误')) - return - if is_train: - path_list.append(os.path.join(path_list[0],'2-name2text.txt')) - path_list.append(os.path.join(path_list[0],'4-cnhubert')) - path_list.append(os.path.join(path_list[0],'5-wav32k')) - path_list.append(os.path.join(path_list[0],'6-name2semantic.tsv')) - phone_path, hubert_path, wav_path, semantic_path = path_list[1:] - with open(phone_path,'r',encoding='utf-8') as f: - if f.read(1):... - else:gr.Warning(i18n('缺少音素数据集')) - if os.listdir(hubert_path):... - else:gr.Warning(i18n('缺少Hubert数据集')) - if os.listdir(wav_path):... - else:gr.Warning(i18n('缺少音频数据集')) - df = pd.read_csv( - semantic_path, delimiter="\t", encoding="utf-8" - ) - if len(df) >= 1:... - else:gr.Warning(i18n('缺少语义数据集')) +import platform,os,traceback +import ffmpeg +import numpy as np +import gradio as gr +from tools.i18n.i18n import I18nAuto +import pandas as pd +i18n = I18nAuto(language=os.environ.get('language','Auto')) + +def load_audio(file, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 + if os.path.exists(file) == False: + raise RuntimeError( + "You input a wrong audio path that does not exists, please fix it!" + ) + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + traceback.print_exc() + raise RuntimeError(i18n("音频加载失败")) + + return np.frombuffer(out, np.float32).flatten() + + +def clean_path(path_str:str): + if path_str.endswith(('\\','/')): + return clean_path(path_str[0:-1]) + path_str = path_str.replace('/', os.sep).replace('\\', os.sep) + return path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a") + + +def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False): + files_status=[] + if is_train == True and file_list: + file_list.append(os.path.join(file_list[0],'2-name2text.txt')) + file_list.append(os.path.join(file_list[0],'3-bert')) + file_list.append(os.path.join(file_list[0],'4-cnhubert')) + file_list.append(os.path.join(file_list[0],'5-wav32k')) + file_list.append(os.path.join(file_list[0],'6-name2semantic.tsv')) + for file in file_list: + if os.path.exists(file):files_status.append(True) + else:files_status.append(False) + if sum(files_status)!=len(files_status): + if is_train: + for file,status in zip(file_list,files_status): + if status:pass + else:gr.Warning(file) + gr.Warning(i18n('以下文件或文件夹不存在')) + return False + elif is_dataset_processing: + if files_status[0]: + return True + elif not files_status[0]: + gr.Warning(file_list[0]) + elif not files_status[1] and file_list[1]: + gr.Warning(file_list[1]) + gr.Warning(i18n('以下文件或文件夹不存在')) + return False + else: + if file_list[0]: + gr.Warning(file_list[0]) + gr.Warning(i18n('以下文件或文件夹不存在')) + else: + gr.Warning(i18n('路径不能为空')) + return False + return True + +def check_details(path_list=None,is_train=False,is_dataset_processing=False): + if is_dataset_processing: + list_path, audio_path = path_list + if (not list_path.endswith('.list')): + gr.Warning(i18n('请填入正确的List路径')) + return + if audio_path: + if not os.path.isdir(audio_path): + gr.Warning(i18n('请填入正确的音频文件夹路径')) + return + with open(list_path,"r",encoding="utf8")as f: + 
line=f.readline().strip("\n").split("\n") + wav_name, _, __, ___ = line[0].split("|") + wav_name=clean_path(wav_name) + if (audio_path != "" and audio_path != None): + wav_name = os.path.basename(wav_name) + wav_path = "%s/%s"%(audio_path, wav_name) + else: + wav_path=wav_name + if os.path.exists(wav_path): + ... + else: + gr.Warning(i18n('路径错误')) + return + if is_train: + path_list.append(os.path.join(path_list[0],'2-name2text.txt')) + path_list.append(os.path.join(path_list[0],'4-cnhubert')) + path_list.append(os.path.join(path_list[0],'5-wav32k')) + path_list.append(os.path.join(path_list[0],'6-name2semantic.tsv')) + phone_path, hubert_path, wav_path, semantic_path = path_list[1:] + with open(phone_path,'r',encoding='utf-8') as f: + if f.read(1):... + else:gr.Warning(i18n('缺少音素数据集')) + if os.listdir(hubert_path):... + else:gr.Warning(i18n('缺少Hubert数据集')) + if os.listdir(wav_path):... + else:gr.Warning(i18n('缺少音频数据集')) + df = pd.read_csv( + semantic_path, delimiter="\t", encoding="utf-8" + ) + if len(df) >= 1:... + else:gr.Warning(i18n('缺少语义数据集')) diff --git a/tools/slice_audio.py b/GPT_SoVITS/tools/slice_audio.py similarity index 97% rename from tools/slice_audio.py rename to GPT_SoVITS/tools/slice_audio.py index 8a06292d..b9912ae0 100644 --- a/tools/slice_audio.py +++ b/GPT_SoVITS/tools/slice_audio.py @@ -1,48 +1,48 @@ -import os,sys,numpy as np -import traceback -from scipy.io import wavfile -# parent_directory = os.path.dirname(os.path.abspath(__file__)) -# sys.path.append(parent_directory) -from tools.my_utils import load_audio -from slicer2 import Slicer - -def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): - os.makedirs(opt_root,exist_ok=True) - if os.path.isfile(inp): - input=[inp] - elif os.path.isdir(inp): - input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] - else: - return "输入路径存在但既不是文件也不是文件夹" - slicer = Slicer( - sr=32000, # 长音频采样率 - threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 - min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 - min_interval= int(min_interval), # 最短切割间隔 - hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) - max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 - ) - _max=float(_max) - alpha=float(alpha) - for inp_path in input[int(i_part)::int(all_part)]: - # print(inp_path) - try: - name = os.path.basename(inp_path) - audio = load_audio(inp_path, 32000) - # print(audio.shape) - for chunk, start, end in slicer.slice(audio): # start和end是帧数 - tmp_max = np.abs(chunk).max() - if(tmp_max>1):chunk/=tmp_max - chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk - wavfile.write( - "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), - 32000, - # chunk.astype(np.float32), - (chunk * 32767).astype(np.int16), - ) - except: - print(inp_path,"->fail->",traceback.format_exc()) - return "执行完毕,请检查输出文件" - -print(slice(*sys.argv[1:])) - +import os,sys,numpy as np +import traceback +from scipy.io import wavfile +# parent_directory = os.path.dirname(os.path.abspath(__file__)) +# sys.path.append(parent_directory) +from tools.my_utils import load_audio +from slicer2 import Slicer + +def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): + os.makedirs(opt_root,exist_ok=True) + if os.path.isfile(inp): + input=[inp] + elif os.path.isdir(inp): + input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] + else: + return "输入路径存在但既不是文件也不是文件夹" + slicer = Slicer( + sr=32000, # 长音频采样率 + threshold= int(threshold), 
# 音量小于这个值视作静音的备选切割点 + min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 + min_interval= int(min_interval), # 最短切割间隔 + hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) + max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 + ) + _max=float(_max) + alpha=float(alpha) + for inp_path in input[int(i_part)::int(all_part)]: + # print(inp_path) + try: + name = os.path.basename(inp_path) + audio = load_audio(inp_path, 32000) + # print(audio.shape) + for chunk, start, end in slicer.slice(audio): # start和end是帧数 + tmp_max = np.abs(chunk).max() + if(tmp_max>1):chunk/=tmp_max + chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk + wavfile.write( + "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), + 32000, + # chunk.astype(np.float32), + (chunk * 32767).astype(np.int16), + ) + except: + print(inp_path,"->fail->",traceback.format_exc()) + return "执行完毕,请检查输出文件" + +print(slice(*sys.argv[1:])) + diff --git a/tools/slicer2.py b/GPT_SoVITS/tools/slicer2.py similarity index 97% rename from tools/slicer2.py rename to GPT_SoVITS/tools/slicer2.py index ba6794b6..176781af 100644 --- a/tools/slicer2.py +++ b/GPT_SoVITS/tools/slicer2.py @@ -1,261 +1,261 @@ -import numpy as np - - -# This function is obtained from librosa. -def get_rms( - y, - frame_length=2048, - hop_length=512, - pad_mode="constant", -): - padding = (int(frame_length // 2), int(frame_length // 2)) - y = np.pad(y, padding, mode=pad_mode) - - axis = -1 - # put our new within-frame axis at the end for now - out_strides = y.strides + tuple([y.strides[axis]]) - # Reduce the shape on the framing axis - x_shape_trimmed = list(y.shape) - x_shape_trimmed[axis] -= frame_length - 1 - out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) - xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) - if axis < 0: - target_axis = axis - 1 - else: - target_axis = axis + 1 - xw = np.moveaxis(xw, -1, target_axis) - # Downsample along the target axis - slices = [slice(None)] * xw.ndim - slices[axis] = slice(0, None, hop_length) - x = xw[tuple(slices)] - - # Calculate power - power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) - - return np.sqrt(power) - - -class Slicer: - def __init__( - self, - sr: int, - threshold: float = -40.0, - min_length: int = 5000, - min_interval: int = 300, - hop_size: int = 20, - max_sil_kept: int = 5000, - ): - if not min_length >= min_interval >= hop_size: - raise ValueError( - "The following condition must be satisfied: min_length >= min_interval >= hop_size" - ) - if not max_sil_kept >= hop_size: - raise ValueError( - "The following condition must be satisfied: max_sil_kept >= hop_size" - ) - min_interval = sr * min_interval / 1000 - self.threshold = 10 ** (threshold / 20.0) - self.hop_size = round(sr * hop_size / 1000) - self.win_size = min(round(min_interval), 4 * self.hop_size) - self.min_length = round(sr * min_length / 1000 / self.hop_size) - self.min_interval = round(min_interval / self.hop_size) - self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) - - def _apply_slice(self, waveform, begin, end): - if len(waveform.shape) > 1: - return waveform[ - :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) - ] - else: - return waveform[ - begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) - ] - - # @timeit - def slice(self, waveform): - if len(waveform.shape) > 1: - samples = waveform.mean(axis=0) - else: - samples = waveform - if samples.shape[0] <= self.min_length: - return [waveform] - rms_list = get_rms( - y=samples, 
frame_length=self.win_size, hop_length=self.hop_size - ).squeeze(0) - sil_tags = [] - silence_start = None - clip_start = 0 - for i, rms in enumerate(rms_list): - # Keep looping while frame is silent. - if rms < self.threshold: - # Record start of silent frames. - if silence_start is None: - silence_start = i - continue - # Keep looping while frame is not silent and silence start has not been recorded. - if silence_start is None: - continue - # Clear recorded silence start if interval is not enough or clip is too short - is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = ( - i - silence_start >= self.min_interval - and i - clip_start >= self.min_length - ) - if not is_leading_silence and not need_slice_middle: - silence_start = None - continue - # Need slicing. Record the range of silent frames to be removed. - if i - silence_start <= self.max_sil_kept: - pos = rms_list[silence_start : i + 1].argmin() + silence_start - if silence_start == 0: - sil_tags.append((0, pos)) - else: - sil_tags.append((pos, pos)) - clip_start = pos - elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[ - i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 - ].argmin() - pos += i - self.max_sil_kept - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) - if silence_start == 0: - sil_tags.append((0, pos_r)) - clip_start = pos_r - else: - sil_tags.append((min(pos_l, pos), max(pos_r, pos))) - clip_start = max(pos_r, pos) - else: - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) - if silence_start == 0: - sil_tags.append((0, pos_r)) - else: - sil_tags.append((pos_l, pos_r)) - clip_start = pos_r - silence_start = None - # Deal with trailing silence. - total_frames = rms_list.shape[0] - if ( - silence_start is not None - and total_frames - silence_start >= self.min_interval - ): - silence_end = min(total_frames, silence_start + self.max_sil_kept) - pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start - sil_tags.append((pos, total_frames + 1)) - # Apply and return slices. 
- ####音频+起始时间+终止时间 - if len(sil_tags) == 0: - return [[waveform,0,int(total_frames*self.hop_size)]] - else: - chunks = [] - if sil_tags[0][0] > 0: - chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)]) - for i in range(len(sil_tags) - 1): - chunks.append( - [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)] - ) - if sil_tags[-1][1] < total_frames: - chunks.append( - [self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)] - ) - return chunks - - -def main(): - import os.path - from argparse import ArgumentParser - - import librosa - import soundfile - - parser = ArgumentParser() - parser.add_argument("audio", type=str, help="The audio to be sliced") - parser.add_argument( - "--out", type=str, help="Output directory of the sliced audio clips" - ) - parser.add_argument( - "--db_thresh", - type=float, - required=False, - default=-40, - help="The dB threshold for silence detection", - ) - parser.add_argument( - "--min_length", - type=int, - required=False, - default=5000, - help="The minimum milliseconds required for each sliced audio clip", - ) - parser.add_argument( - "--min_interval", - type=int, - required=False, - default=300, - help="The minimum milliseconds for a silence part to be sliced", - ) - parser.add_argument( - "--hop_size", - type=int, - required=False, - default=10, - help="Frame length in milliseconds", - ) - parser.add_argument( - "--max_sil_kept", - type=int, - required=False, - default=500, - help="The maximum silence length kept around the sliced clip, presented in milliseconds", - ) - args = parser.parse_args() - out = args.out - if out is None: - out = os.path.dirname(os.path.abspath(args.audio)) - audio, sr = librosa.load(args.audio, sr=None, mono=False) - slicer = Slicer( - sr=sr, - threshold=args.db_thresh, - min_length=args.min_length, - min_interval=args.min_interval, - hop_size=args.hop_size, - max_sil_kept=args.max_sil_kept, - ) - chunks = slicer.slice(audio) - if not os.path.exists(out): - os.makedirs(out) - for i, chunk in enumerate(chunks): - if len(chunk.shape) > 1: - chunk = chunk.T - soundfile.write( - os.path.join( - out, - f"%s_%d.wav" - % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), - ), - chunk, - sr, - ) - - -if __name__ == "__main__": - main() +import numpy as np + + +# This function is obtained from librosa. 
+def get_rms( + y, + frame_length=2048, + hop_length=512, + pad_mode="constant", +): + padding = (int(frame_length // 2), int(frame_length // 2)) + y = np.pad(y, padding, mode=pad_mode) + + axis = -1 + # put our new within-frame axis at the end for now + out_strides = y.strides + tuple([y.strides[axis]]) + # Reduce the shape on the framing axis + x_shape_trimmed = list(y.shape) + x_shape_trimmed[axis] -= frame_length - 1 + out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) + xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) + if axis < 0: + target_axis = axis - 1 + else: + target_axis = axis + 1 + xw = np.moveaxis(xw, -1, target_axis) + # Downsample along the target axis + slices = [slice(None)] * xw.ndim + slices[axis] = slice(0, None, hop_length) + x = xw[tuple(slices)] + + # Calculate power + power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) + + return np.sqrt(power) + + +class Slicer: + def __init__( + self, + sr: int, + threshold: float = -40.0, + min_length: int = 5000, + min_interval: int = 300, + hop_size: int = 20, + max_sil_kept: int = 5000, + ): + if not min_length >= min_interval >= hop_size: + raise ValueError( + "The following condition must be satisfied: min_length >= min_interval >= hop_size" + ) + if not max_sil_kept >= hop_size: + raise ValueError( + "The following condition must be satisfied: max_sil_kept >= hop_size" + ) + min_interval = sr * min_interval / 1000 + self.threshold = 10 ** (threshold / 20.0) + self.hop_size = round(sr * hop_size / 1000) + self.win_size = min(round(min_interval), 4 * self.hop_size) + self.min_length = round(sr * min_length / 1000 / self.hop_size) + self.min_interval = round(min_interval / self.hop_size) + self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) + + def _apply_slice(self, waveform, begin, end): + if len(waveform.shape) > 1: + return waveform[ + :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) + ] + else: + return waveform[ + begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) + ] + + # @timeit + def slice(self, waveform): + if len(waveform.shape) > 1: + samples = waveform.mean(axis=0) + else: + samples = waveform + if samples.shape[0] <= self.min_length: + return [waveform] + rms_list = get_rms( + y=samples, frame_length=self.win_size, hop_length=self.hop_size + ).squeeze(0) + sil_tags = [] + silence_start = None + clip_start = 0 + for i, rms in enumerate(rms_list): + # Keep looping while frame is silent. + if rms < self.threshold: + # Record start of silent frames. + if silence_start is None: + silence_start = i + continue + # Keep looping while frame is not silent and silence start has not been recorded. + if silence_start is None: + continue + # Clear recorded silence start if interval is not enough or clip is too short + is_leading_silence = silence_start == 0 and i > self.max_sil_kept + need_slice_middle = ( + i - silence_start >= self.min_interval + and i - clip_start >= self.min_length + ) + if not is_leading_silence and not need_slice_middle: + silence_start = None + continue + # Need slicing. Record the range of silent frames to be removed. 
+ if i - silence_start <= self.max_sil_kept: + pos = rms_list[silence_start : i + 1].argmin() + silence_start + if silence_start == 0: + sil_tags.append((0, pos)) + else: + sil_tags.append((pos, pos)) + clip_start = pos + elif i - silence_start <= self.max_sil_kept * 2: + pos = rms_list[ + i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 + ].argmin() + pos += i - self.max_sil_kept + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + clip_start = pos_r + else: + sil_tags.append((min(pos_l, pos), max(pos_r, pos))) + clip_start = max(pos_r, pos) + else: + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + else: + sil_tags.append((pos_l, pos_r)) + clip_start = pos_r + silence_start = None + # Deal with trailing silence. + total_frames = rms_list.shape[0] + if ( + silence_start is not None + and total_frames - silence_start >= self.min_interval + ): + silence_end = min(total_frames, silence_start + self.max_sil_kept) + pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start + sil_tags.append((pos, total_frames + 1)) + # Apply and return slices. + ####音频+起始时间+终止时间 + if len(sil_tags) == 0: + return [[waveform,0,int(total_frames*self.hop_size)]] + else: + chunks = [] + if sil_tags[0][0] > 0: + chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)]) + for i in range(len(sil_tags) - 1): + chunks.append( + [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)] + ) + if sil_tags[-1][1] < total_frames: + chunks.append( + [self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)] + ) + return chunks + + +def main(): + import os.path + from argparse import ArgumentParser + + import librosa + import soundfile + + parser = ArgumentParser() + parser.add_argument("audio", type=str, help="The audio to be sliced") + parser.add_argument( + "--out", type=str, help="Output directory of the sliced audio clips" + ) + parser.add_argument( + "--db_thresh", + type=float, + required=False, + default=-40, + help="The dB threshold for silence detection", + ) + parser.add_argument( + "--min_length", + type=int, + required=False, + default=5000, + help="The minimum milliseconds required for each sliced audio clip", + ) + parser.add_argument( + "--min_interval", + type=int, + required=False, + default=300, + help="The minimum milliseconds for a silence part to be sliced", + ) + parser.add_argument( + "--hop_size", + type=int, + required=False, + default=10, + help="Frame length in milliseconds", + ) + parser.add_argument( + "--max_sil_kept", + type=int, + required=False, + default=500, + help="The maximum silence length kept around the sliced clip, presented in milliseconds", + ) + args = parser.parse_args() + out = args.out + if out is None: + out = os.path.dirname(os.path.abspath(args.audio)) + audio, sr = librosa.load(args.audio, sr=None, mono=False) + slicer = Slicer( + sr=sr, + threshold=args.db_thresh, + min_length=args.min_length, + min_interval=args.min_interval, + 
hop_size=args.hop_size, + max_sil_kept=args.max_sil_kept, + ) + chunks = slicer.slice(audio) + if not os.path.exists(out): + os.makedirs(out) + for i, chunk in enumerate(chunks): + if len(chunk.shape) > 1: + chunk = chunk.T + soundfile.write( + os.path.join( + out, + f"%s_%d.wav" + % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + ), + chunk, + sr, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/subfix_webui.py b/GPT_SoVITS/tools/subfix_webui.py similarity index 100% rename from tools/subfix_webui.py rename to GPT_SoVITS/tools/subfix_webui.py diff --git a/tools/uvr5/bs_roformer/__init__.py b/GPT_SoVITS/tools/uvr5/bs_roformer/__init__.py similarity index 100% rename from tools/uvr5/bs_roformer/__init__.py rename to GPT_SoVITS/tools/uvr5/bs_roformer/__init__.py diff --git a/tools/uvr5/bs_roformer/attend.py b/GPT_SoVITS/tools/uvr5/bs_roformer/attend.py similarity index 100% rename from tools/uvr5/bs_roformer/attend.py rename to GPT_SoVITS/tools/uvr5/bs_roformer/attend.py diff --git a/tools/uvr5/bs_roformer/bs_roformer.py b/GPT_SoVITS/tools/uvr5/bs_roformer/bs_roformer.py similarity index 100% rename from tools/uvr5/bs_roformer/bs_roformer.py rename to GPT_SoVITS/tools/uvr5/bs_roformer/bs_roformer.py diff --git a/tools/uvr5/bsroformer.py b/GPT_SoVITS/tools/uvr5/bsroformer.py similarity index 100% rename from tools/uvr5/bsroformer.py rename to GPT_SoVITS/tools/uvr5/bsroformer.py diff --git a/tools/uvr5/lib/lib_v5/dataset.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/dataset.py similarity index 100% rename from tools/uvr5/lib/lib_v5/dataset.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/dataset.py diff --git a/tools/uvr5/lib/lib_v5/layers.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers.py diff --git a/tools/uvr5/lib/lib_v5/layers_123812KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_123812KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_123812KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_123812KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_123821KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_123821KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_123821KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_123821KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_33966KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_33966KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_33966KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_33966KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_537227KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_537227KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_537227KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_537227KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_537238KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_537238KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_537238KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_537238KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_new.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_new.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_new.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_new.py diff --git a/tools/uvr5/lib/lib_v5/model_param_init.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/model_param_init.py similarity index 100% rename from tools/uvr5/lib/lib_v5/model_param_init.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/model_param_init.py diff 
--git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/2band_32000.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/2band_48000.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/3band_44100.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json similarity index 100% 
rename from tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_v2.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json similarity index 94% rename from tools/uvr5/lib/lib_v5/modelparams/4band_v3.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json index 2a73bc97..edb908b8 100644 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json +++ b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json @@ -1,54 +1,54 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 530, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 
4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 530, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 } \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/ensemble.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/ensemble.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/ensemble.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/ensemble.json diff --git a/tools/uvr5/lib/lib_v5/nets.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets.py diff --git a/tools/uvr5/lib/lib_v5/nets_123812KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_123812KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_123812KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_123812KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_123821KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_123821KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_123821KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_123821KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_33966KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_33966KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_33966KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_33966KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_537227KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_537227KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_537227KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_537227KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_537238KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_537238KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_537238KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_537238KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_61968KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_61968KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_61968KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_61968KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_new.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_new.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_new.py rename to 
GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_new.py diff --git a/tools/uvr5/lib/lib_v5/spec_utils.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/spec_utils.py similarity index 100% rename from tools/uvr5/lib/lib_v5/spec_utils.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/spec_utils.py diff --git a/tools/uvr5/lib/name_params.json b/GPT_SoVITS/tools/uvr5/lib/name_params.json similarity index 100% rename from tools/uvr5/lib/name_params.json rename to GPT_SoVITS/tools/uvr5/lib/name_params.json diff --git a/tools/uvr5/lib/utils.py b/GPT_SoVITS/tools/uvr5/lib/utils.py similarity index 100% rename from tools/uvr5/lib/utils.py rename to GPT_SoVITS/tools/uvr5/lib/utils.py diff --git a/tools/uvr5/mdxnet.py b/GPT_SoVITS/tools/uvr5/mdxnet.py similarity index 100% rename from tools/uvr5/mdxnet.py rename to GPT_SoVITS/tools/uvr5/mdxnet.py diff --git a/tools/uvr5/uvr5_weights/.gitignore b/GPT_SoVITS/tools/uvr5/uvr5_weights/.gitignore similarity index 100% rename from tools/uvr5/uvr5_weights/.gitignore rename to GPT_SoVITS/tools/uvr5/uvr5_weights/.gitignore diff --git a/tools/uvr5/vr.py b/GPT_SoVITS/tools/uvr5/vr.py similarity index 100% rename from tools/uvr5/vr.py rename to GPT_SoVITS/tools/uvr5/vr.py diff --git a/tools/uvr5/webui.py b/GPT_SoVITS/tools/uvr5/webui.py similarity index 100% rename from tools/uvr5/webui.py rename to GPT_SoVITS/tools/uvr5/webui.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..084bdb35 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include GPT_SoVITS/configs/* \ No newline at end of file diff --git a/README.md b/README.md index 1fa36a79..f6fe3d77 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +# Jarod's NOTE +Working on turning this into a package. Right now, the API *does in fact* work to make requests to and this can be installed. Will get instrucitons out sometime in the future. +
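The README note above states that the package is installable and that the API accepts requests once installed; the pyproject.toml added next wires this up by registering a `gpt_sovits_api` console script that points at `GPT_SoVITS.api_v2:main`, which requires `api_v2.py` to expose a `main()` callable that can run with no arguments. A minimal sketch of that entry-point pattern follows, purely for illustration; the flag names, default port, and the `APP` import string are assumptions, not the actual contents of `api_v2.py`:

```python
# Hypothetical shape of a console-script entry point; the real api_v2.main() may differ.
import argparse

import uvicorn


def main():
    parser = argparse.ArgumentParser(description="Run the GPT-SoVITS API server")
    parser.add_argument("-a", "--bind_addr", default="127.0.0.1", help="address to listen on")
    parser.add_argument("-p", "--port", type=int, default=9880, help="port to listen on")
    args = parser.parse_args()
    # "GPT_SoVITS.api_v2:APP" is an assumed import string for the FastAPI app object.
    uvicorn.run("GPT_SoVITS.api_v2:APP", host=args.bind_addr, port=args.port)


if __name__ == "__main__":
    main()
```

Whatever the real signature looks like, the constraint imposed by `[project.scripts]` is simply that `GPT_SoVITS.api_v2` must be importable and must define `main`.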
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..2ee1d234 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "GPT_SoVITS" +dynamic = ["version"] +description = "" +readme = "README.md" +license = {text = "MIT License"} +classifiers = [ + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", +] +dependencies = [ + "numpy==1.26.4", + "scipy", + "tensorboard", + "librosa==0.9.2", + "numba", + "pytorch-lightning", + "gradio>=4.0,<=4.24.0", + "ffmpeg-python", + "onnxruntime; sys_platform == 'darwin'", + "onnxruntime-gpu; sys_platform != 'darwin'", + "tqdm", + "funasr==1.0.27", + "cn2an", + "pypinyin", + "pyopenjtalk>=0.3.4", + "g2p_en", + "torchaudio", + "modelscope==1.10.0", + "sentencepiece", + "transformers", + "chardet", + "PyYAML", + "psutil", + "jieba_fast", + "jieba", + "LangSegment>=0.2.0", + "Faster_Whisper", + "wordsegment", + "rotary_embedding_torch", + "pyjyutping", + "g2pk2", + "ko_pron", + "opencc; sys_platform != 'linux'", + "opencc==1.1.1; sys_platform == 'linux'", + "python_mecab_ko; sys_platform != 'win32'", + "fastapi<0.112.2", +] + + +[project.urls] +Homepage = "https://github.com/RVC-Boss/GPT-SoVITS" + +[project.scripts] +"gpt_sovits_api" = "GPT_SoVITS.api_v2:main" + +[tool.setuptools.packages.find] +include = ["GPT_SoVITS"] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 280d9d99..5230906f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -numpy==1.23.4 +numpy==1.26.4 scipy tensorboard librosa==0.9.2 -numba==0.56.4 +numba pytorch-lightning gradio>=4.0,<=4.24.0 ffmpeg-python From 9db1d9fa438fd4973f6c66f4f624fc25176e8cb6 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Sat, 16 Nov 2024 02:56:29 -0800 Subject: [PATCH 02/13] cleanup for package --- .dockerignore | 8 - Dockerfile | 42 -- GPT_SoVITS_Inference.ipynb | 152 ------ README.md | 347 +----------- colab_webui.ipynb | 97 ---- docker-compose.yaml | 32 -- dockerbuild.sh | 21 - go-webui.bat | 2 - go-webui.ps1 | 4 - gpt-sovits_kaggle.ipynb | 218 -------- install.sh | 6 - pyproject.toml | 5 +- test.py | 35 ++ test_streaming.py | 89 +++ webui.py | 1054 ------------------------------------ 15 files changed, 144 insertions(+), 1968 deletions(-) delete mode 100644 .dockerignore delete mode 100644 Dockerfile delete mode 100644 GPT_SoVITS_Inference.ipynb delete mode 100644 colab_webui.ipynb delete mode 100644 docker-compose.yaml delete mode 100755 dockerbuild.sh delete mode 100644 go-webui.bat delete mode 100644 go-webui.ps1 delete mode 100644 gpt-sovits_kaggle.ipynb delete mode 100644 install.sh create mode 100644 test.py create mode 100644 test_streaming.py delete mode 100644 webui.py diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index 4eca27be..00000000 --- a/.dockerignore +++ /dev/null @@ -1,8 +0,0 @@ -docs -logs -output -reference -SoVITS_weights -GPT_weights -TEMP -.git diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 80cd9f3a..00000000 --- a/Dockerfile +++ /dev/null @@ -1,42 +0,0 @@ -# Base CUDA image -FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04 - -LABEL maintainer="breakstring@hotmail.com" -LABEL version="dev-20240209" -LABEL description="Docker image for GPT-SoVITS" - - -# Install 3rd party apps -ENV DEBIAN_FRONTEND=noninteractive -ENV TZ=Etc/UTC -RUN apt-get update && \ - apt-get 
install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \ - git lfs install && \ - rm -rf /var/lib/apt/lists/* - -# Copy only requirements.txt initially to leverage Docker cache -WORKDIR /workspace -COPY requirements.txt /workspace/ -RUN pip install --no-cache-dir -r requirements.txt - -# Define a build-time argument for image type -ARG IMAGE_TYPE=full - -# Conditional logic based on the IMAGE_TYPE argument -# Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite" -COPY ./Docker /workspace/Docker -# elite 类型的镜像里面不包含额外的模型 -RUN if [ "$IMAGE_TYPE" != "elite" ]; then \ - chmod +x /workspace/Docker/download.sh && \ - /workspace/Docker/download.sh && \ - python /workspace/Docker/download.py && \ - python -m nltk.downloader averaged_perceptron_tagger cmudict; \ - fi - - -# Copy the rest of the application -COPY . /workspace - -EXPOSE 9871 9872 9873 9874 9880 - -CMD ["python", "webui.py"] diff --git a/GPT_SoVITS_Inference.ipynb b/GPT_SoVITS_Inference.ipynb deleted file mode 100644 index a5b55325..00000000 --- a/GPT_SoVITS_Inference.ipynb +++ /dev/null @@ -1,152 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)" - ], - "metadata": { - "id": "himHYZmra7ix" - } - }, - { - "cell_type": "code", - "metadata": { - "id": "e9b7iFV3dm1f" - }, - "source": [ - "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", - "%cd GPT-SoVITS\n", - "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n", - "!pip install -r requirements.txt" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title Download pretrained models 下载预训练模型\n", - "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n", - "!mkdir -p /content/GPT-SoVITS/tools/uvr5\n", - "%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n", - "%cd /content/GPT-SoVITS/tools/damo_asr/models\n", - "!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n", - "# @title UVR5 pretrains 安装uvr5模型\n", - "%cd /content/GPT-SoVITS/tools/uvr5\n", - "!git clone https://huggingface.co/Delik/uvr5_weights\n", - "!git config core.sparseCheckout true\n", - "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/" - ], - "metadata": { - "id": "0NgxXg5sjv7z", - "cellView": "form" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#@title Create folder models 创建文件夹模型\n", - "import os\n", - "base_directory = \"/content/GPT-SoVITS\"\n", - "folder_names = [\"SoVITS_weights\", \"GPT_weights\"]\n", - "\n", - "for folder_name in folder_names:\n", - " if os.path.exists(os.path.join(base_directory, folder_name)):\n", - " print(f\"The folder '{folder_name}' already exists. 
(文件夹'{folder_name}'已经存在。)\")\n", - " else:\n", - " os.makedirs(os.path.join(base_directory, folder_name))\n", - " print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n", - "\n", - "print(\"All folders have been created. (所有文件夹均已创建。)\")" - ], - "metadata": { - "cellView": "form", - "id": "cPDEH-9czOJF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import requests\n", - "import zipfile\n", - "import shutil\n", - "import os\n", - "\n", - "#@title Import model 导入模型 (HuggingFace)\n", - "hf_link = 'https://huggingface.co/modelloosrvcc/Nagisa_Shingetsu_GPT-SoVITS/resolve/main/Nagisa.zip' #@param {type: \"string\"}\n", - "\n", - "output_path = '/content/'\n", - "\n", - "response = requests.get(hf_link)\n", - "with open(output_path + 'file.zip', 'wb') as file:\n", - " file.write(response.content)\n", - "\n", - "with zipfile.ZipFile(output_path + 'file.zip', 'r') as zip_ref:\n", - " zip_ref.extractall(output_path)\n", - "\n", - "os.remove(output_path + \"file.zip\")\n", - "\n", - "source_directory = output_path\n", - "SoVITS_destination_directory = '/content/GPT-SoVITS/SoVITS_weights'\n", - "GPT_destination_directory = '/content/GPT-SoVITS/GPT_weights'\n", - "\n", - "for filename in os.listdir(source_directory):\n", - " if filename.endswith(\".pth\"):\n", - " source_path = os.path.join(source_directory, filename)\n", - " destination_path = os.path.join(SoVITS_destination_directory, filename)\n", - " shutil.move(source_path, destination_path)\n", - "\n", - "for filename in os.listdir(source_directory):\n", - " if filename.endswith(\".ckpt\"):\n", - " source_path = os.path.join(source_directory, filename)\n", - " destination_path = os.path.join(GPT_destination_directory, filename)\n", - " shutil.move(source_path, destination_path)\n", - "\n", - "print(f'Model downloaded. (模型已下载。)')" - ], - "metadata": { - "cellView": "form", - "id": "vbZY-LnM0tzq" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title launch WebUI 启动WebUI\n", - "!/usr/local/bin/pip install ipykernel\n", - "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n", - "%cd /content/GPT-SoVITS/\n", - "!/usr/local/bin/python webui.py" - ], - "metadata": { - "id": "4oRGUzkrk8C7", - "cellView": "form" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/README.md b/README.md index f6fe3d77..36ee3958 100644 --- a/README.md +++ b/README.md @@ -1,153 +1,25 @@ # Jarod's NOTE -Working on turning this into a package. Right now, the API *does in fact* work to make requests to and this can be installed. Will get instrucitons out sometime in the future. +Working on turning this into a package. Right now, the API *does in fact* work to make requests to and this can be installed. -
+## Quick Install and Usage +Ideally, do this all inside of a venv for package isolation. +1. Install by running: + ``` + pip install git+https://github.com/JarodMica/GPT-SoVITS.git + ``` +2. Make sure torch is installed with CUDA enabled. It's recommended to run `pip uninstall torch` to remove the existing torch, then reinstall it with the following (a quick check to confirm the CUDA build is shown after these steps). I chose 2.4.0+cu121: + ``` + pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 + ``` - -
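To confirm the CUDA-enabled build was actually picked up before moving on, a minimal check along these lines can be run (this snippet is a suggested verification, not part of the upstream instructions):

```python
# Minimal sanity check that the reinstalled torch is a CUDA build and can see a GPU.
import torch

print(torch.__version__)          # expected to report a +cu121 build if the CUDA wheel installed
print(torch.cuda.is_available())  # True when a CUDA device and driver are usable
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```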

GPT-SoVITS-WebUI

-A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

- -[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) - -RVC-Boss%2FGPT-SoVITS | Trendshift - - - -[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) -[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) - -**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md) - -
- ---- - -## Features: - -1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion. - -2. **Few-shot TTS:** Fine-tune the model with just 1 minute of training data for improved voice similarity and realism. - -3. **Cross-lingual Support:** Inference in languages different from the training dataset, currently supporting English, Japanese, Korean, Cantonese and Chinese. - -4. **WebUI Tools:** Integrated tools include voice accompaniment separation, automatic training set segmentation, Chinese ASR, and text labeling, assisting beginners in creating training datasets and GPT/SoVITS models. - -**Check out our [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw) here!** - -Unseen speakers few-shot fine-tuning demo: - -https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb - -**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** - -## Installation - -For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online. - -### Tested Environments - -- Python 3.9, PyTorch 2.0.1, CUDA 11 -- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3 -- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon) -- Python 3.9, PyTorch 2.2.2, CPU devices - -_Note: numba==0.56.4 requires py<3.11_ - -### Windows - -If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI. - -**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).** - -### Linux - -```bash -conda create -n GPTSoVits python=3.9 -conda activate GPTSoVits -bash install.sh +Now to use it, so far I've only tested it with the api_v2.py. Given that the install above went fine, you should now be able to run: ``` - -### macOS - -**Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.** - -1. Install Xcode command-line tools by running `xcode-select --install`. -2. Install FFmpeg by running `brew install ffmpeg`. -3. Install the program by running the following commands: - -```bash -conda create -n GPTSoVits python=3.9 -conda activate GPTSoVits -pip install -r requirements.txt -``` - -### Install Manually - -#### Install FFmpeg - -##### Conda Users - -```bash -conda install ffmpeg -``` - -##### Ubuntu/Debian Users - -```bash -sudo apt install ffmpeg -sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' -``` - -##### Windows Users - -Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root. - -Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only) - -##### MacOS Users -```bash -brew install ffmpeg -``` - -#### Install Dependences - -```bash -pip install -r requirements.txt -``` - -### Using Docker - -#### docker-compose.yaml configuration - -0. 
Regarding image tags: Due to rapid updates in the codebase and the slow process of packaging and testing images, please check [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) for the currently packaged latest images and select as per your situation, or alternatively, build locally using a Dockerfile according to your own needs. -1. Environment Variables: - -- is_half: Controls half-precision/double-precision. This is typically the cause if the content under the directories 4-cnhubert/5-wav32k is not generated correctly during the "SSL extracting" step. Adjust to True or False based on your actual situation. - -2. Volumes Configuration,The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content. -3. shm_size: The default available memory for Docker Desktop on Windows is too small, which can cause abnormal operations. Adjust according to your own situation. -4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances. - -#### Running with docker compose - -``` -docker compose -f "docker-compose.yaml" up -d -``` - -#### Running with docker command - -As above, modify the corresponding parameters based on your actual situation, then run the following command: - -``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +gpt_sovits_api ``` +Which will bootup local server that you can make requests to. Checkout `test.py` and `test_streaming.py` to get an idea for how you might be able to use the API. ## Pretrained Models - -**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).** +Probably don't need to follow the instructions for the below, these are just kept here for reference for now. 1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`. @@ -159,193 +31,8 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker 5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have the similar effect with smaller disk footprint. -## Dataset Format - -The TTS annotation .list file format: - -``` -vocal_path|speaker_name|language|text -``` - -Language dictionary: - -- 'zh': Chinese -- 'ja': Japanese -- 'en': English -- 'ko': Korean -- 'yue': Cantonese - -Example: - -``` -D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. 
-``` - -## Finetune and inference - - ### Open WebUI - - #### Integrated Package Users - - Double-click `go-webui.bat`or use `go-webui.ps1` - if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-v1.ps1` - - #### Others - - ```bash - python webui.py - ``` - - if you want to switch to V1,then - - ```bash - python webui.py v1 - ``` -Or maunally switch version in WebUI - - ### Finetune - - #### Path Auto-filling is now supported - - 1.Fill in the audio path - - 2.Slice the audio into small chunks - - 3.Denoise(optinal) - - 4.ASR - - 5.Proofreading ASR transcriptions - - 6.Go to the next Tab, then finetune the model - - ### Open Inference WebUI - - #### Integrated Package Users - - Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference` - - #### Others - - ```bash - python GPT_SoVITS/inference_webui.py - ``` - OR - - ```bash - python webui.py - ``` -then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference` - - ## V2 Release Notes - -New Features: - -1. Support Korean and Cantonese - -2. An optimized text frontend - -3. Pre-trained model extended from 2k hours to 5k hours - -4. Improved synthesis quality for low-quality reference audio - - [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7) ) - -Use v2 from v1 environment: - -1. `pip install -r requirements.txt` to update some packages - -2. Clone the latest codes from github. - -3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`. - - Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`. - -## Todo List - -- [x] **High Priority:** - - - [x] Localization in Japanese and English. - - [x] User guide. - - [x] Japanese and English dataset fine tune training. - -- [ ] **Features:** - - [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min). - - [x] TTS speaking speed control. - - [ ] ~~Enhanced TTS emotion control.~~ - - [ ] Experiment with changing SoVITS token inputs to probability distribution of GPT vocabs (transformer latent). - - [x] Improve English and Japanese text frontend. - - [ ] Develop tiny and larger-sized TTS models. - - [x] Colab scripts. - - [ ] Try expand training dataset (2k hours -> 10k hours). 
- - [x] better sovits base model (enhanced audio quality) - - [ ] model mix - -## (Additional) Method for running from the command line -Use the command line to open the WebUI for UVR5 -``` -python tools/uvr5/webui.py "" -``` - -This is how the audio segmentation of the dataset is done using the command line -``` -python audio_slicer.py \ - --input_path "" \ - --output_root "" \ - --threshold \ - --min_length \ - --min_interval - --hop_size -``` -This is how dataset ASR processing is done using the command line(Only Chinese) -``` -python tools/asr/funasr_asr.py -i -o -``` -ASR processing is performed through Faster_Whisper(ASR marking except Chinese) - -(No progress bars, GPU performance may cause time delays) -``` -python ./tools/asr/fasterwhisper_asr.py -i -o -l -p -``` -A custom list save path is enabled - ## Credits -Special thanks to the following projects and contributors: +Special thanks to the RVC-Boss for getting this wonderful tool up and going, as well as all of the other attributions used to build it: -### Theoretical Research -- [ar-vits](https://github.com/innnky/ar-vits) -- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) -- [vits](https://github.com/jaywalnut310/vits) -- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556) -- [contentvec](https://github.com/auspicious3000/contentvec/) -- [hifi-gan](https://github.com/jik876/hifi-gan) -- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) -### Pretrained Models -- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) -- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) -### Text Frontend for Inference -- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) -- [LangSegment](https://github.com/juntaosun/LangSegment) -- [g2pW](https://github.com/GitYCC/g2pW) -- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) -- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) -### WebUI Tools -- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) -- [audio-slicer](https://github.com/openvpi/audio-slicer) -- [SubFix](https://github.com/cronrpc/SubFix) -- [FFmpeg](https://github.com/FFmpeg/FFmpeg) -- [gradio](https://github.com/gradio-app/gradio) -- [faster-whisper](https://github.com/SYSTRAN/faster-whisper) -- [FunASR](https://github.com/alibaba-damo-academy/FunASR) - -Thankful to @Naozumi520 for providing the Cantonese training set and for the guidance on Cantonese-related knowledge. 
- -## Thanks to all contributors for their efforts - - - - +**Original Repo:** https://github.com/RVC-Boss/GPT-SoVITS \ No newline at end of file diff --git a/colab_webui.ipynb b/colab_webui.ipynb deleted file mode 100644 index 838f8264..00000000 --- a/colab_webui.ipynb +++ /dev/null @@ -1,97 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "环境配置 environment" - ], - "metadata": { - "id": "_o6a8GS2lWQM" - } - }, - { - "cell_type": "code", - "metadata": { - "id": "e9b7iFV3dm1f" - }, - "source": [ - "!pip install -q condacolab\n", - "# Setting up condacolab and installing packages\n", - "import condacolab\n", - "condacolab.install_from_url(\"https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh\")\n", - "%cd -q /content\n", - "!git clone https://github.com/RVC-Boss/GPT-SoVITS\n", - "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n", - "%cd -q /content/GPT-SoVITS\n", - "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n", - "!/usr/local/bin/pip install -r requirements.txt" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title Download pretrained models 下载预训练模型\n", - "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n", - "!mkdir -p /content/GPT-SoVITS/tools/uvr5\n", - "%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n", - "%cd /content/GPT-SoVITS/tools/damo_asr/models\n", - "!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n", - "# @title UVR5 pretrains 安装uvr5模型\n", - "%cd /content/GPT-SoVITS/tools/uvr5\n", - "%rm -r uvr5_weights\n", - "!git clone https://huggingface.co/Delik/uvr5_weights\n", - "!git config core.sparseCheckout true\n", - "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/" - ], - "metadata": { - "id": "0NgxXg5sjv7z" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title launch WebUI 启动WebUI\n", - "!/usr/local/bin/pip install ipykernel\n", - "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n", - "%cd /content/GPT-SoVITS/\n", - "!/usr/local/bin/python webui.py" - ], - "metadata": { - "id": "4oRGUzkrk8C7" - }, - "execution_count": null, - "outputs": [] - } - ] -} diff --git a/docker-compose.yaml b/docker-compose.yaml deleted file mode 100644 index aca8ab9e..00000000 --- a/docker-compose.yaml +++ /dev/null @@ -1,32 +0,0 @@ -version: '3.8' - -services: - gpt-sovits: - image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. 
You will need to download them yourself and map them into the container. - container_name: gpt-sovits-container - environment: - - is_half=False - - is_share=False - volumes: - - ./output:/workspace/output - - ./logs:/workspace/logs - - ./SoVITS_weights:/workspace/SoVITS_weights - - ./reference:/workspace/reference - working_dir: /workspace - ports: - - "9880:9880" - - "9871:9871" - - "9872:9872" - - "9873:9873" - - "9874:9874" - shm_size: 16G - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: "all" - capabilities: [gpu] - stdin_open: true - tty: true - restart: unless-stopped diff --git a/dockerbuild.sh b/dockerbuild.sh deleted file mode 100755 index 3a4a1e18..00000000 --- a/dockerbuild.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# 获取当前日期,格式为 YYYYMMDD -DATE=$(date +%Y%m%d) -# 获取最新的 Git commit 哈希值的前 7 位 -COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7) - -# 构建 full 版本的镜像 -docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest . -# 为同一个镜像添加带日期的标签 -docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE -# 为同一个镜像添加带当前代码库Commit哈希值的标签 -docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH - - -# 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器) -docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite . -# 为同一个镜像添加带日期的标签 -docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite -# 为同一个镜像添加带当前代码库Commit哈希值的标签 -docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite diff --git a/go-webui.bat b/go-webui.bat deleted file mode 100644 index 398f6d9b..00000000 --- a/go-webui.bat +++ /dev/null @@ -1,2 +0,0 @@ -runtime\python.exe webui.py zh_CN -pause diff --git a/go-webui.ps1 b/go-webui.ps1 deleted file mode 100644 index 8962732a..00000000 --- a/go-webui.ps1 +++ /dev/null @@ -1,4 +0,0 @@ -$ErrorActionPreference = "SilentlyContinue" -chcp 65001 -& "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\webui.py zh_CN" -pause diff --git a/gpt-sovits_kaggle.ipynb b/gpt-sovits_kaggle.ipynb deleted file mode 100644 index 84ecd89c..00000000 --- a/gpt-sovits_kaggle.ipynb +++ /dev/null @@ -1,218 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "45857cb2", - "metadata": { - "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", - "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", - "execution": { - "iopub.execute_input": "2024-02-18T14:43:46.735480Z", - "iopub.status.busy": "2024-02-18T14:43:46.735183Z", - "iopub.status.idle": "2024-02-18T14:48:10.724175Z", - "shell.execute_reply": "2024-02-18T14:48:10.723059Z" - }, - "papermill": { - "duration": 263.994935, - "end_time": "2024-02-18T14:48:10.726613", - "exception": false, - "start_time": "2024-02-18T14:43:46.731678", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", - "%cd GPT-SoVITS\n", - "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n", - "!pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9d346b4", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-18T14:48:10.815802Z", - "iopub.status.busy": "2024-02-18T14:48:10.814899Z", - "iopub.status.idle": "2024-02-18T14:50:31.253276Z", - "shell.execute_reply": "2024-02-18T14:50:31.252024Z" - }, - "papermill": { - "duration": 140.484893, - "end_time": 
"2024-02-18T14:50:31.255720", - "exception": false, - "start_time": "2024-02-18T14:48:10.770827", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# @title Download pretrained models 下载预训练模型\n", - "!mkdir -p /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!mkdir -p /kaggle/working/GPT-SoVITS/tools/asr/models\n", - "!mkdir -p /kaggle/working/GPT-SoVITS/tools/uvr5\n", - "%cd /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n", - "%cd /kaggle/working/GPT-SoVITS/tools/asr/models\n", - "!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n", - "# # @title UVR5 pretrains 安装uvr5模型\n", - "%cd /kaggle/working/GPT-SoVITS/tools/uvr5\n", - "!git clone https://huggingface.co/Delik/uvr5_weights\n", - "!git config core.sparseCheckout true\n", - "!mv /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea94d245", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-18T14:29:01.071549Z", - "iopub.status.busy": "2024-02-18T14:29:01.070592Z", - "iopub.status.idle": "2024-02-18T14:40:45.318368Z", - "shell.execute_reply": "2024-02-18T14:40:45.317130Z", - "shell.execute_reply.started": "2024-02-18T14:29:01.071512Z" - }, - "papermill": { - "duration": null, - "end_time": null, - "exception": false, - "start_time": "2024-02-18T14:50:31.309013", - "status": "running" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# @title launch WebUI 启动WebUI\n", - "%cd /kaggle/working/GPT-SoVITS/\n", - "!npm install -g localtunnel\n", - "import subprocess\n", - "import threading\n", - "import time\n", - "import socket\n", - "import urllib.request\n", - "def iframe_thread(port):\n", - " while True:\n", - " time.sleep(0.5)\n", - " sock= socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", - " if result == 0:\n", - " break\n", - " sock.close()\n", - "\n", - " from colorama import Fore, Style\n", - " print (Fore.GREEN + \"\\nIP: \", Fore. RED, urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"), \"\\n\", Style. 
RESET_ALL)\n", - " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", - " for line in p.stdout:\n", - " print(line.decode(), end='')\n", - "threading.Thread (target=iframe_thread, daemon=True, args=(9874,)).start()\n", - "\n", - "!python webui.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dda88a6d", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-18T14:40:56.880608Z", - "iopub.status.busy": "2024-02-18T14:40:56.879879Z" - }, - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# 开启推理页面\n", - "%cd /kaggle/working/GPT-SoVITS/\n", - "!npm install -g localtunnel\n", - "import subprocess\n", - "import threading\n", - "import time\n", - "import socket\n", - "import urllib.request\n", - "def iframe_thread(port):\n", - " while True:\n", - " time.sleep(0.5)\n", - " sock= socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", - " if result == 0:\n", - " break\n", - " sock.close()\n", - "\n", - " from colorama import Fore, Style\n", - " print (Fore.GREEN + \"\\nIP: \", Fore. RED, urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"), \"\\n\", Style. RESET_ALL)\n", - " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", - " for line in p.stdout:\n", - " print(line.decode(), end='')\n", - "threading.Thread (target=iframe_thread, daemon=True, args=(9872,)).start()\n", - "\n", - "!python ./GPT_SoVITS/inference_webui.py" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "nvidiaTeslaT4", - "dataSources": [ - { - "datasetId": 4459328, - "sourceId": 7649639, - "sourceType": "datasetVersion" - } - ], - "dockerImageVersionId": 30646, - "isGpuEnabled": true, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": null, - "end_time": null, - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-02-18T14:43:44.011910", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/install.sh b/install.sh deleted file mode 100644 index 8dadb26d..00000000 --- a/install.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -conda install -c conda-forge gcc -conda install -c conda-forge gxx -conda install ffmpeg cmake -conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia -pip install -r requirements.txt diff --git a/pyproject.toml b/pyproject.toml index 2ee1d234..cd500100 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "GPT_SoVITS" dynamic = ["version"] -description = "" +description = "A packaged version of GPT_SoVITS designed to be installable" readme = "README.md" license = {text = "MIT License"} classifiers = [ @@ -50,11 +50,12 @@ dependencies = [ "opencc==1.1.1; sys_platform == 
'linux'", "python_mecab_ko; sys_platform != 'win32'", "fastapi<0.112.2", + "sounddevice" ] [project.urls] -Homepage = "https://github.com/RVC-Boss/GPT-SoVITS" +Homepage = "https://github.com/JarodMica/GPT-SoVITS/tree/main" [project.scripts] "gpt_sovits_api" = "GPT_SoVITS.api_v2:main" diff --git a/test.py b/test.py new file mode 100644 index 00000000..ecc007e2 --- /dev/null +++ b/test.py @@ -0,0 +1,35 @@ +import requests + +# API configuration +host = '127.0.0.1' +port = 9880 +url = f'http://{host}:{port}/tts' + +# Parameters for the request +params = { + 'text': 'ははは、それはいいよね!でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか', + 'text_lang': 'ja', + 'ref_audio_path': 'test.wav', + 'prompt_lang': 'ja', + 'prompt_text': 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか', + 'text_split_method': 'cut0', + 'batch_size': 1, + 'media_type': 'wav', + 'streaming_mode': False, +} + +try: + # Send the GET request + response = requests.get(url, params=params) + + # Check if the request was successful + if response.status_code == 200: + # Save the audio content to a file + with open('output.wav', 'wb') as f: + f.write(response.content) + print('Audio saved to output.wav') + else: + print(f'Error: {response.status_code}') + print(response.json()) +except requests.exceptions.RequestException as e: + print(f'An error occurred: {e}') diff --git a/test_streaming.py b/test_streaming.py new file mode 100644 index 00000000..ca667096 --- /dev/null +++ b/test_streaming.py @@ -0,0 +1,89 @@ +import requests +import wave +import io +import sounddevice as sd + +# API configuration +host = '127.0.0.1' +port = 9880 +url = f'http://{host}:{port}/tts' + +# Parameters for the request +params = { + 'text': 'ふふふ、それはいいよね!でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか', + 'text_lang': 'ja', + 'ref_audio_path': 'test.wav', + 'prompt_lang': 'ja', + 'prompt_text': 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか', + 'text_split_method': 'cut0', + 'batch_size': 1, + 'media_type': 'wav', + 'streaming_mode': True, +} + +while True: + input("Waiting for enter") + try: + # Send the GET request with streaming enabled + response = requests.get(url, params=params, stream=True) + + # Check if the request was successful + if response.status_code == 200: + buffer = b'' # Buffer to hold data until header is processed + header_size = 44 # Standard WAV header size + header_parsed = False + stream = None + + for chunk in response.iter_content(chunk_size=4096): + if chunk: + if not header_parsed: + buffer += chunk + if len(buffer) >= header_size: + # Parse WAV header + wav_header = buffer[:header_size] + wav_file = wave.open(io.BytesIO(wav_header), 'rb') + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + sample_rate = wav_file.getframerate() + wav_file.close() + + # Map sample_width to dtype + dtype_map = {1: 'int8', 2: 'int16', 3: 'int24', 4: 'int32'} + dtype = dtype_map.get(sample_width, 'int16') + + # Open audio stream + stream = sd.RawOutputStream( + samplerate=sample_rate, + channels=channels, + dtype=dtype, + blocksize=0, # Use default block size + ) + + stream.start() + + # Write any remaining data after the header + data = buffer[header_size:] + if data: + stream.write(data) + header_parsed = True + buffer = b'' # Clear buffer + else: + # Play audio data + if stream: + stream.write(chunk) + # Clean up + if stream: + stream.stop() + stream.close() + print('Audio playback completed.') + else: + print(f'Error: {response.status_code}') + # Print the error message from the API + try: + print(response.json()) + except 
ValueError: + print(response.text) + except requests.exceptions.RequestException as e: + print(f'An error occurred: {e}') + except Exception as e: + print(f'An error occurred: {e}') diff --git a/webui.py b/webui.py deleted file mode 100644 index d510370f..00000000 --- a/webui.py +++ /dev/null @@ -1,1054 +0,0 @@ -import os,sys -if len(sys.argv)==1:sys.argv.append('v2') -version="v1"if sys.argv[1]=="v1" else"v2" -os.environ["version"]=version -now_dir = os.getcwd() -sys.path.insert(0, now_dir) -import warnings -warnings.filterwarnings("ignore") -import json,yaml,torch,pdb,re,shutil -import platform -import psutil -import signal -torch.manual_seed(233333) -tmp = os.path.join(now_dir, "TEMP") -os.makedirs(tmp, exist_ok=True) -os.environ["TEMP"] = tmp -if(os.path.exists(tmp)): - for name in os.listdir(tmp): - if(name=="jieba.cache"):continue - path="%s/%s"%(tmp,name) - delete=os.remove if os.path.isfile(path) else shutil.rmtree - try: - delete(path) - except Exception as e: - print(str(e)) - pass -import site -import traceback -site_packages_roots = [] -for path in site.getsitepackages(): - if "packages" in path: - site_packages_roots.append(path) -if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir] -#os.environ["OPENBLAS_NUM_THREADS"] = "4" -os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" -os.environ["all_proxy"] = "" -for site_packages_root in site_packages_roots: - if os.path.exists(site_packages_root): - try: - with open("%s/users.pth" % (site_packages_root), "w") as f: - f.write( - "%s\n%s/tools\n%s/tools/asr\n%s/GPT_SoVITS\n%s/tools/uvr5" - % (now_dir, now_dir, now_dir, now_dir, now_dir) - ) - break - except PermissionError as e: - traceback.print_exc() -from tools import my_utils -import shutil -import pdb -from subprocess import Popen -import signal -from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share -from tools.i18n.i18n import I18nAuto, scan_language_list -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" -os.environ["language"]=language -i18n = I18nAuto(language=language) -from scipy.io import wavfile -from tools.my_utils import load_audio, check_for_existance, check_details -from multiprocessing import cpu_count -# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu -try: - import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... 
-import gradio as gr -n_cpu=cpu_count() - -ngpu = torch.cuda.device_count() -gpu_infos = [] -mem = [] -if_gpu_ok = False - -# 判断是否有能用来训练和加速推理的N卡 -ok_gpu_keywords={"10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060","H"} -set_gpu_numbers=set() -if torch.cuda.is_available() or ngpu != 0: - for i in range(ngpu): - gpu_name = torch.cuda.get_device_name(i) - if any(value in gpu_name.upper()for value in ok_gpu_keywords): - # A10#A100#V100#A40#P40#M40#K80#A4500 - if_gpu_ok = True # 至少有一张能用的N卡 - gpu_infos.append("%s\t%s" % (i, gpu_name)) - set_gpu_numbers.add(i) - mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4)) -# # 判断是否支持mps加速 -# if torch.backends.mps.is_available(): -# if_gpu_ok = True -# gpu_infos.append("%s\t%s" % ("0", "Apple GPU")) -# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存 - -if if_gpu_ok and len(gpu_infos) > 0: - gpu_info = "\n".join(gpu_infos) - default_batch_size = min(mem) // 2 -else: - gpu_info = ("%s\t%s" % ("0", "CPU")) - gpu_infos.append("%s\t%s" % ("0", "CPU")) - set_gpu_numbers.add(0) - default_batch_size = int(psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2) -gpus = "-".join([i[0] for i in gpu_infos]) -default_gpu_numbers=str(sorted(list(set_gpu_numbers))[0]) -def fix_gpu_number(input):#将越界的number强制改到界内 - try: - if(int(input)not in set_gpu_numbers):return default_gpu_numbers - except:return input - return input -def fix_gpu_numbers(inputs): - output=[] - try: - for input in inputs.split(","):output.append(str(fix_gpu_number(input))) - return ",".join(output) - except: - return inputs - -pretrained_sovits_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", "GPT_SoVITS/pretrained_models/s2G488k.pth"] -pretrained_gpt_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"] - -pretrained_model_list = (pretrained_sovits_name[-int(version[-1])+2],pretrained_sovits_name[-int(version[-1])+2].replace("s2G","s2D"),pretrained_gpt_name[-int(version[-1])+2],"GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large","GPT_SoVITS/pretrained_models/chinese-hubert-base") - -_='' -for i in pretrained_model_list: - if os.path.exists(i):... 
- else:_+=f'\n {i}' -if _: - print("warning:",i18n('以下模型不存在:')+_) - -_ =[[],[]] -for i in range(2): - if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i]) - else:_[0].append("")##没有下pretrained模型的,说不定他们是想自己从零训底模呢 - if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i]) - else:_[-1].append("") -pretrained_gpt_name,pretrained_sovits_name = _ - -SoVITS_weight_root=["SoVITS_weights_v2","SoVITS_weights"] -GPT_weight_root=["GPT_weights_v2","GPT_weights"] -for root in SoVITS_weight_root+GPT_weight_root: - os.makedirs(root,exist_ok=True) -def get_weights_names(): - SoVITS_names = [name for name in pretrained_sovits_name if name!=""] - for path in SoVITS_weight_root: - for name in os.listdir(path): - if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (path, name)) - GPT_names = [name for name in pretrained_gpt_name if name!=""] - for path in GPT_weight_root: - for name in os.listdir(path): - if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (path, name)) - return SoVITS_names, GPT_names - -SoVITS_names,GPT_names = get_weights_names() -for path in SoVITS_weight_root+GPT_weight_root: - os.makedirs(path,exist_ok=True) - - -def custom_sort_key(s): - # 使用正则表达式提取字符串中的数字部分和非数字部分 - parts = re.split('(\d+)', s) - # 将数字部分转换为整数,非数字部分保持不变 - parts = [int(part) if part.isdigit() else part for part in parts] - return parts - -def change_choices(): - SoVITS_names, GPT_names = get_weights_names() - return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"} - -p_label=None -p_uvr5=None -p_asr=None -p_denoise=None -p_tts_inference=None - -def kill_proc_tree(pid, including_parent=True): - try: - parent = psutil.Process(pid) - except psutil.NoSuchProcess: - # Process already terminated - return - - children = parent.children(recursive=True) - for child in children: - try: - os.kill(child.pid, signal.SIGTERM) # or signal.SIGKILL - except OSError: - pass - if including_parent: - try: - os.kill(parent.pid, signal.SIGTERM) # or signal.SIGKILL - except OSError: - pass - -system=platform.system() -def kill_process(pid): - if(system=="Windows"): - cmd = "taskkill /t /f /pid %s" % pid - os.system(cmd) - else: - kill_proc_tree(pid) - - -def change_label(path_list): - global p_label - if(p_label==None): - check_for_existance([path_list]) - path_list=my_utils.clean_path(path_list) - cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share) - yield i18n("打标工具WebUI已开启"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} - print(cmd) - p_label = Popen(cmd, shell=True) - elif(p_label!=None): - kill_process(p_label.pid) - p_label=None - yield i18n("打标工具WebUI已关闭"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} - -def change_uvr5(): - global p_uvr5 - if(p_uvr5==None): - cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share) - yield i18n("UVR5已开启"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} - print(cmd) - p_uvr5 = Popen(cmd, shell=True) - elif(p_uvr5!=None): - kill_process(p_uvr5.pid) - p_uvr5=None - yield i18n("UVR5已关闭"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} - -def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path, batched_infer_enabled): - global p_tts_inference - if batched_infer_enabled: - cmd 
= '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language) - else: - cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) - if(p_tts_inference==None): - os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path) - os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path) - os.environ["cnhubert_base_path"]=cnhubert_base_path - os.environ["bert_path"]=bert_path - os.environ["_CUDA_VISIBLE_DEVICES"]=fix_gpu_number(gpu_number) - os.environ["is_half"]=str(is_half) - os.environ["infer_ttswebui"]=str(webui_port_infer_tts) - os.environ["is_share"]=str(is_share) - yield i18n("TTS推理进程已开启"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} - print(cmd) - p_tts_inference = Popen(cmd, shell=True) - elif(p_tts_inference!=None): - kill_process(p_tts_inference.pid) - p_tts_inference=None - yield i18n("TTS推理进程已关闭"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} - -from tools.asr.config import asr_dict -def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang, asr_precision): - global p_asr - if(p_asr==None): - asr_inp_dir=my_utils.clean_path(asr_inp_dir) - asr_opt_dir=my_utils.clean_path(asr_opt_dir) - check_for_existance([asr_inp_dir]) - cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}' - cmd += f' -i "{asr_inp_dir}"' - cmd += f' -o "{asr_opt_dir}"' - cmd += f' -s {asr_model_size}' - cmd += f' -l {asr_lang}' - cmd += f" -p {asr_precision}" - output_file_name = os.path.basename(asr_inp_dir) - output_folder = asr_opt_dir or "output/asr_opt" - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') - yield "ASR任务开启:%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True}, {"__type__":"update"}, {"__type__":"update"}, {"__type__":"update"} - print(cmd) - p_asr = Popen(cmd, shell=True) - p_asr.wait() - p_asr=None - yield f"ASR任务完成, 查看终端进行下一步", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__":"update","value":output_file_path}, {"__type__":"update","value":output_file_path}, {"__type__":"update","value":asr_inp_dir} - else: - yield "已有正在进行的ASR任务,需先终止才能开启下一次任务", {"__type__":"update","visible":False}, {"__type__":"update","visible":True}, {"__type__":"update"}, {"__type__":"update"}, {"__type__":"update"} - # return None - -def close_asr(): - global p_asr - if(p_asr!=None): - kill_process(p_asr.pid) - p_asr=None - return "已终止ASR进程", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} -def open_denoise(denoise_inp_dir, denoise_opt_dir): - global p_denoise - if(p_denoise==None): - denoise_inp_dir=my_utils.clean_path(denoise_inp_dir) - denoise_opt_dir=my_utils.clean_path(denoise_opt_dir) - check_for_existance([denoise_inp_dir]) - cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32") - - yield "语音降噪任务开启:%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True}, {"__type__":"update"}, {"__type__":"update"} - print(cmd) - p_denoise = Popen(cmd, shell=True) - p_denoise.wait() - p_denoise=None - yield f"语音降噪任务完成, 查看终端进行下一步", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__":"update","value":denoise_opt_dir}, {"__type__":"update","value":denoise_opt_dir} - else: - yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务", {"__type__":"update","visible":False}, 
{"__type__":"update","visible":True}, {"__type__":"update"}, {"__type__":"update"} - # return None - -def close_denoise(): - global p_denoise - if(p_denoise!=None): - kill_process(p_denoise.pid) - p_denoise=None - return "已终止语音降噪进程", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} - -p_train_SoVITS=None -def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D): - global p_train_SoVITS - if(p_train_SoVITS==None): - with open("GPT_SoVITS/configs/s2.json")as f: - data=f.read() - data=json.loads(data) - s2_dir="%s/%s"%(exp_root,exp_name) - os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True) - if check_for_existance([s2_dir],is_train=True): - check_details([s2_dir],is_train=True) - if(is_half==False): - data["train"]["fp16_run"]=False - batch_size=max(1,batch_size//2) - data["train"]["batch_size"]=batch_size - data["train"]["epochs"]=total_epoch - data["train"]["text_low_lr_rate"]=text_low_lr_rate - data["train"]["pretrained_s2G"]=pretrained_s2G - data["train"]["pretrained_s2D"]=pretrained_s2D - data["train"]["if_save_latest"]=if_save_latest - data["train"]["if_save_every_weights"]=if_save_every_weights - data["train"]["save_every_epoch"]=save_every_epoch - data["train"]["gpu_numbers"]=gpu_numbers1Ba - data["model"]["version"]=version - data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir - data["save_weight_dir"]=SoVITS_weight_root[-int(version[-1])+2] - data["name"]=exp_name - data["version"]=version - tmp_config_path="%s/tmp_s2.json"%tmp - with open(tmp_config_path,"w")as f:f.write(json.dumps(data)) - - cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path) - yield "SoVITS训练开始:%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True} - print(cmd) - p_train_SoVITS = Popen(cmd, shell=True) - p_train_SoVITS.wait() - p_train_SoVITS=None - yield "SoVITS训练完成", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} - else: - yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务", {"__type__":"update","visible":False}, {"__type__":"update","visible":True} - -def close1Ba(): - global p_train_SoVITS - if(p_train_SoVITS!=None): - kill_process(p_train_SoVITS.pid) - p_train_SoVITS=None - return "已终止SoVITS训练", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} - -p_train_GPT=None -def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1): - global p_train_GPT - if(p_train_GPT==None): - with open("GPT_SoVITS/configs/s1longer.yaml"if version=="v1"else "GPT_SoVITS/configs/s1longer-v2.yaml")as f: - data=f.read() - data=yaml.load(data, Loader=yaml.FullLoader) - s1_dir="%s/%s"%(exp_root,exp_name) - os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True) - if check_for_existance([s1_dir],is_train=True): - check_details([s1_dir],is_train=True) - if(is_half==False): - data["train"]["precision"]="32" - batch_size = max(1, batch_size // 2) - data["train"]["batch_size"]=batch_size - data["train"]["epochs"]=total_epoch - data["pretrained_s1"]=pretrained_s1 - data["train"]["save_every_n_epoch"]=save_every_epoch - data["train"]["if_save_every_weights"]=if_save_every_weights - data["train"]["if_save_latest"]=if_save_latest - data["train"]["if_dpo"]=if_dpo - data["train"]["half_weights_save_dir"]=GPT_weight_root[-int(version[-1])+2] - data["train"]["exp_name"]=exp_name - data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir - 
data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir - data["output_dir"]="%s/logs_s1"%s1_dir - # data["version"]=version - - os.environ["_CUDA_VISIBLE_DEVICES"]=fix_gpu_numbers(gpu_numbers.replace("-",",")) - os.environ["hz"]="25hz" - tmp_config_path="%s/tmp_s1.yaml"%tmp - with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False)) - # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir) - cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path) - yield "GPT训练开始:%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True} - print(cmd) - p_train_GPT = Popen(cmd, shell=True) - p_train_GPT.wait() - p_train_GPT=None - yield "GPT训练完成", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} - else: - yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务", {"__type__":"update","visible":False}, {"__type__":"update","visible":True} - -def close1Bb(): - global p_train_GPT - if(p_train_GPT!=None): - kill_process(p_train_GPT.pid) - p_train_GPT=None - return "已终止GPT训练", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} - -ps_slice=[] -def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts): - global ps_slice - inp = my_utils.clean_path(inp) - opt_root = my_utils.clean_path(opt_root) - check_for_existance([inp]) - if(os.path.exists(inp)==False): - yield "输入路径不存在", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} - return - if os.path.isfile(inp):n_parts=1 - elif os.path.isdir(inp):pass - else: - yield "输入路径存在但既不是文件也不是文件夹", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} - return - if (ps_slice == []): - for i_part in range(n_parts): - cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts) - print(cmd) - p = Popen(cmd, shell=True) - ps_slice.append(p) - yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} - for p in ps_slice: - p.wait() - ps_slice=[] - yield "切割结束", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__": "update", "value":opt_root}, {"__type__": "update", "value":opt_root}, {"__type__": "update", "value":opt_root} - else: - yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} - -def close_slice(): - global ps_slice - if (ps_slice != []): - for p_slice in ps_slice: - try: - kill_process(p_slice.pid) - except: - traceback.print_exc() - ps_slice=[] - return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - -ps1a=[] -def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): - global ps1a - inp_text = my_utils.clean_path(inp_text) - inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], 
is_dataset_processing=True) - if (ps1a == []): - opt_dir="%s/%s"%(exp_root,exp_name) - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "bert_pretrained_dir":bert_pretrained_dir, - } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) - for i_part in range(all_parts): - config.update( - { - "i_part": str(i_part), - "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), - "is_half": str(is_half) - } - ) - os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec - print(cmd) - p = Popen(cmd, shell=True) - ps1a.append(p) - yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1a: - p.wait() - opt = [] - for i_part in range(all_parts): - txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) - with open(txt_path, "r", encoding="utf8") as f: - opt += f.read().strip("\n").split("\n") - os.remove(txt_path) - path_text = "%s/2-name2text.txt" % opt_dir - with open(path_text, "w", encoding="utf8") as f: - f.write("\n".join(opt) + "\n") - ps1a=[] - if len("".join(opt)) > 0: - yield "文本进程成功", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - else: - yield "文本进程失败", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - else: - yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - -def close1a(): - global ps1a - if (ps1a != []): - for p1a in ps1a: - try: - kill_process(p1a.pid) - except: - traceback.print_exc() - ps1a=[] - return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - -ps1b=[] -def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): - global ps1b - inp_text = my_utils.clean_path(inp_text) - inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], is_dataset_processing=True) - if (ps1b == []): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":"%s/%s"%(exp_root,exp_name), - "cnhubert_base_dir":ssl_pretrained_dir, - "is_half": str(is_half) - } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) - for i_part in range(all_parts): - config.update( - { - "i_part": str(i_part), - "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), - } - ) - os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec - print(cmd) - p = Popen(cmd, shell=True) - ps1b.append(p) - yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1b: - p.wait() - ps1b=[] - yield "SSL提取进程结束", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} - else: - yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - -def close1b(): - global ps1b - if (ps1b != []): - for p1b in ps1b: - try: - kill_process(p1b.pid) - except: - traceback.print_exc() - ps1b=[] - return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - -ps1c=[] -def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): - global ps1c - inp_text = my_utils.clean_path(inp_text) - if check_for_existance([inp_text,''], is_dataset_processing=True): - 
check_details([inp_text,''], is_dataset_processing=True) - if (ps1c == []): - opt_dir="%s/%s"%(exp_root,exp_name) - config={ - "inp_text":inp_text, - "exp_name":exp_name, - "opt_dir":opt_dir, - "pretrained_s2G":pretrained_s2G_path, - "s2config_path":"GPT_SoVITS/configs/s2.json", - "is_half": str(is_half) - } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) - for i_part in range(all_parts): - config.update( - { - "i_part": str(i_part), - "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), - } - ) - os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec - print(cmd) - p = Popen(cmd, shell=True) - ps1c.append(p) - yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1c: - p.wait() - opt = ["item_name\tsemantic_audio"] - path_semantic = "%s/6-name2semantic.tsv" % opt_dir - for i_part in range(all_parts): - semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) - with open(semantic_path, "r", encoding="utf8") as f: - opt += f.read().strip("\n").split("\n") - os.remove(semantic_path) - with open(path_semantic, "w", encoding="utf8") as f: - f.write("\n".join(opt) + "\n") - ps1c=[] - yield "语义token提取进程结束", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} - else: - yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - -def close1c(): - global ps1c - if (ps1c != []): - for p1c in ps1c: - try: - kill_process(p1c.pid) - except: - traceback.print_exc() - ps1c=[] - return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} -#####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G -ps1abc=[] -def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path): - global ps1abc - inp_text = my_utils.clean_path(inp_text) - inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], is_dataset_processing=True) - if (ps1abc == []): - opt_dir="%s/%s"%(exp_root,exp_name) - try: - #############################1a - path_text="%s/2-name2text.txt" % opt_dir - if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "bert_pretrained_dir":bert_pretrained_dir, - "is_half": str(is_half) - } - gpu_names=gpu_numbers1a.split("-") - all_parts=len(gpu_names) - for i_part in range(all_parts): - config.update( - { - "i_part": str(i_part), - "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), - } - ) - os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec - print(cmd) - p = Popen(cmd, shell=True) - ps1abc.append(p) - yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() - - opt = [] - for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) - txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) - with open(txt_path, "r",encoding="utf8") as f: - opt += f.read().strip("\n").split("\n") 
- os.remove(txt_path) - with open(path_text, "w",encoding="utf8") as f: - f.write("\n".join(opt) + "\n") - assert len("".join(opt)) > 0, "1Aa-文本获取进程失败" - yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc=[] - #############################1b - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "cnhubert_base_dir":ssl_pretrained_dir, - } - gpu_names=gpu_numbers1Ba.split("-") - all_parts=len(gpu_names) - for i_part in range(all_parts): - config.update( - { - "i_part": str(i_part), - "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), - } - ) - os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec - print(cmd) - p = Popen(cmd, shell=True) - ps1abc.append(p) - yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() - yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc=[] - #############################1c - path_semantic = "%s/6-name2semantic.tsv" % opt_dir - if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)): - config={ - "inp_text":inp_text, - "exp_name":exp_name, - "opt_dir":opt_dir, - "pretrained_s2G":pretrained_s2G_path, - "s2config_path":"GPT_SoVITS/configs/s2.json", - } - gpu_names=gpu_numbers1c.split("-") - all_parts=len(gpu_names) - for i_part in range(all_parts): - config.update( - { - "i_part": str(i_part), - "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), - } - ) - os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec - print(cmd) - p = Popen(cmd, shell=True) - ps1abc.append(p) - yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() - - opt = ["item_name\tsemantic_audio"] - for i_part in range(all_parts): - semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) - with open(semantic_path, "r",encoding="utf8") as f: - opt += f.read().strip("\n").split("\n") - os.remove(semantic_path) - with open(path_semantic, "w",encoding="utf8") as f: - f.write("\n".join(opt) + "\n") - yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc = [] - yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - except: - traceback.print_exc() - close1abc() - yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - else: - yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - -def close1abc(): - global ps1abc - if (ps1abc != []): - for p1abc in ps1abc: - try: - kill_process(p1abc.pid) - except: - traceback.print_exc() - ps1abc=[] - return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - -def switch_version(version_): - os.environ['version']=version_ - global version - version = version_ - if pretrained_sovits_name[-int(version[-1])+2] !='' and pretrained_gpt_name[-int(version[-1])+2] !='':... 
- else: - gr.Warning(i18n(f'未下载{version.upper()}模型')) - return {'__type__':'update', 'value':pretrained_sovits_name[-int(version[-1])+2]}, {'__type__':'update', 'value':pretrained_sovits_name[-int(version[-1])+2].replace("s2G","s2D")}, {'__type__':'update', 'value':pretrained_gpt_name[-int(version[-1])+2]}, {'__type__':'update', 'value':pretrained_gpt_name[-int(version[-1])+2]}, {'__type__':'update', 'value':pretrained_sovits_name[-int(version[-1])+2]} - -if os.path.exists('GPT_SoVITS/text/G2PWModel'):... -else: - cmd = '"%s" GPT_SoVITS/download.py'%python_exec - p = Popen(cmd, shell=True) - p.wait() - -def sync(text): - return {'__type__':'update','value':text} -with gr.Blocks(title="GPT-SoVITS WebUI") as app: - gr.Markdown( - value= - i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") - ) - gr.Markdown( - value= - i18n("中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e") - ) - - with gr.Tabs(): - with gr.TabItem(i18n("0-前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 - gr.Markdown(value=i18n("0a-UVR5人声伴奏分离&去混响去延迟工具")) - with gr.Row(): - with gr.Column(scale=3): - with gr.Row(): - uvr5_info = gr.Textbox(label=i18n("UVR5进程输出信息")) - open_uvr5 = gr.Button(value=i18n("开启UVR5-WebUI"),variant="primary",visible=True) - close_uvr5 = gr.Button(value=i18n("关闭UVR5-WebUI"),variant="primary",visible=False) - gr.Markdown(value=i18n("0b-语音切分工具")) - with gr.Row(): - with gr.Column(scale=3): - with gr.Row(): - slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="") - slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt") - with gr.Row(): - threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34") - min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000") - min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300") - hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10") - max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500") - with gr.Row(): - _max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True) - alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True) - with gr.Row(): - n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True) - slicer_info = gr.Textbox(label=i18n("语音切割进程输出信息")) - open_slicer_button=gr.Button(i18n("开启语音切割"), variant="primary",visible=True) - close_slicer_button=gr.Button(i18n("终止语音切割"), variant="primary",visible=False) - gr.Markdown(value=i18n("0bb-语音降噪工具")) - with gr.Row(): - with gr.Column(scale=3): - with gr.Row(): - denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="") - denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt") - with gr.Row(): - denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息")) - open_denoise_button = gr.Button(i18n("开启语音降噪"), variant="primary",visible=True) - close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False) - gr.Markdown(value=i18n("0c-中文批量离线ASR工具")) - with gr.Row(): - with gr.Column(scale=3): - with gr.Row(): - asr_inp_dir = gr.Textbox( - label=i18n("输入文件夹路径"), - value="D:\\GPT-SoVITS\\raw\\xxx", - interactive=True, - ) - asr_opt_dir = gr.Textbox( - label = i18n("输出文件夹路径"), - value = "output/asr_opt", - interactive = True, - ) - with gr.Row(): - asr_model = gr.Dropdown( - label = i18n("ASR 模型"), - choices = list(asr_dict.keys()), - interactive = True, - value="达摩 ASR (中文)" - ) - asr_size = gr.Dropdown( - label = i18n("ASR 模型尺寸"), - choices = ["large"], - interactive = True, - value="large" - ) - asr_lang = gr.Dropdown( - label = i18n("ASR 语言设置"), - choices = ["zh","yue"], - interactive = True, - value="zh" - ) - asr_precision = gr.Dropdown( - label = i18n("数据类型精度"), - choices = ["float32"], - interactive = True, - value="float32" - ) - with gr.Row(): - asr_info = gr.Textbox(label=i18n("ASR进程输出信息")) - open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True) - close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False) - - def change_lang_choices(key): #根据选择的模型修改可选的语言 - # return gr.Dropdown(choices=asr_dict[key]['lang']) - return {"__type__": "update", "choices": 
asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]} - def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸 - # return gr.Dropdown(choices=asr_dict[key]['size']) - return {"__type__": "update", "choices": asr_dict[key]['size'],"value":asr_dict[key]['size'][-1]} - def change_precision_choices(key): #根据选择的模型修改可选的语言 - if key =="Faster Whisper (多语种)": - if default_batch_size <= 4: - precision = 'int8' - elif is_half: - precision = 'float16' - else: - precision = 'float32' - else: - precision = 'float32' - # return gr.Dropdown(choices=asr_dict[key]['precision']) - return {"__type__": "update", "choices": asr_dict[key]['precision'],"value":precision} - asr_model.change(change_lang_choices, [asr_model], [asr_lang]) - asr_model.change(change_size_choices, [asr_model], [asr_size]) - asr_model.change(change_precision_choices, [asr_model], [asr_precision]) - - - gr.Markdown(value=i18n("0d-语音文本校对标注工具")) - with gr.Row(): - with gr.Column(scale=3): - with gr.Row(): - path_list = gr.Textbox( - label=i18n(".list标注文件的路径"), - value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list", - interactive=True, - ) - label_info = gr.Textbox(label=i18n("打标工具进程输出信息")) - - open_label = gr.Button(value=i18n("开启打标WebUI"),variant="primary",visible=True) - close_label = gr.Button(value=i18n("关闭打标WebUI"),variant="primary",visible=False) - open_label.click(change_label, [path_list], [label_info,open_label,close_label]) - close_label.click(change_label, [path_list], [label_info,open_label,close_label]) - open_uvr5.click(change_uvr5, [], [uvr5_info,open_uvr5,close_uvr5]) - close_uvr5.click(change_uvr5, [], [uvr5_info,open_uvr5,close_uvr5]) - - with gr.TabItem(i18n("1-GPT-SoVITS-TTS")): - with gr.Row(): - with gr.Row(): - exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True) - gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False) - version_checkbox = gr.Radio(label=i18n("版本"),value=version,choices=['v1','v2']) - with gr.Row(): - pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value=pretrained_sovits_name[-int(version[-1])+2], interactive=True, lines=2, max_lines=3,scale=9) - pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value=pretrained_sovits_name[-int(version[-1])+2].replace("s2G","s2D"), interactive=True, lines=2, max_lines=3,scale=9) - pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value=pretrained_gpt_name[-int(version[-1])+2], interactive=True, lines=2, max_lines=3,scale=10) - with gr.TabItem(i18n("1A-训练集格式化工具")): - gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹")) - with gr.Row(): - with gr.Row(): - inp_text = gr.Textbox(label=i18n("*文本标注文件"),value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True,scale=10) - with gr.Row(): - inp_wav_dir = gr.Textbox( - label=i18n("*训练集音频文件目录"), - # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", - interactive=True, - placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。"), scale=10 - ) - gr.Markdown(value=i18n("1Aa-文本内容")) - with gr.Row(): - with gr.Row(): - gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) - with gr.Row(): - bert_pretrained_dir = gr.Textbox(label=i18n("预训练的中文BERT模型路径"),value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False,lines=2) - with gr.Row(): - button1a_open = gr.Button(i18n("开启文本获取"), variant="primary",visible=True) - button1a_close = gr.Button(i18n("终止文本获取进程"), variant="primary",visible=False) - with gr.Row(): - info1a=gr.Textbox(label=i18n("文本进程输出信息")) - 
gr.Markdown(value=i18n("1Ab-SSL自监督特征提取")) - with gr.Row(): - with gr.Row(): - gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) - with gr.Row(): - cnhubert_base_dir = gr.Textbox(label=i18n("预训练的SSL模型路径"),value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False,lines=2) - with gr.Row(): - button1b_open = gr.Button(i18n("开启SSL提取"), variant="primary",visible=True) - button1b_close = gr.Button(i18n("终止SSL提取进程"), variant="primary",visible=False) - with gr.Row(): - info1b=gr.Textbox(label=i18n("SSL进程输出信息")) - gr.Markdown(value=i18n("1Ac-语义token提取")) - with gr.Row(): - with gr.Row(): - gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) - with gr.Row(): - pretrained_s2G_ = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value=pretrained_sovits_name[-int(version[-1])+2], interactive=False,lines=2) - with gr.Row(): - button1c_open = gr.Button(i18n("开启语义token提取"), variant="primary",visible=True) - button1c_close = gr.Button(i18n("终止语义token提取进程"), variant="primary",visible=False) - with gr.Row(): - info1c=gr.Textbox(label=i18n("语义token提取进程输出信息")) - gr.Markdown(value=i18n("1Aabc-训练集格式化一键三连")) - with gr.Row(): - with gr.Row(): - button1abc_open = gr.Button(i18n("开启一键三连"), variant="primary",visible=True) - button1abc_close = gr.Button(i18n("终止一键三连"), variant="primary",visible=False) - with gr.Row(): - info1abc=gr.Textbox(label=i18n("一键三连进程输出信息")) - - pretrained_s2G.change(sync,[pretrained_s2G],[pretrained_s2G_]) - open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang, asr_precision], [asr_info,open_asr_button,close_asr_button,path_list,inp_text,inp_wav_dir]) - close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button]) - open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button,asr_inp_dir,denoise_input_dir,inp_wav_dir]) - close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button]) - open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button,asr_inp_dir,inp_wav_dir]) - close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button]) - - button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close]) - button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close]) - button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close]) - button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close]) - button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close]) - button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close]) - button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close]) - button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close]) - with gr.TabItem(i18n("1B-微调训练")): - gr.Markdown(value=i18n("1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。")) - with gr.Row(): - with gr.Column(): - with gr.Row(): - batch_size = 
gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True) - total_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True) - with gr.Row(): - text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True) - save_every_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True) - with gr.Column(): - with gr.Column(): - if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True) - if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) - with gr.Row(): - gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True) - with gr.Row(): - with gr.Row(): - button1Ba_open = gr.Button(i18n("开启SoVITS训练"), variant="primary",visible=True) - button1Ba_close = gr.Button(i18n("终止SoVITS训练"), variant="primary",visible=False) - with gr.Row(): - info1Ba=gr.Textbox(label=i18n("SoVITS训练进程输出信息")) - gr.Markdown(value=i18n("1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。")) - with gr.Row(): - with gr.Column(): - with gr.Row(): - batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True) - total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True) - with gr.Row(): - save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True) - if_dpo = gr.Checkbox(label=i18n("是否开启dpo训练选项(实验性)"), value=False, interactive=True, show_label=True) - with gr.Column(): - with gr.Column(): - if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True) - if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) - with gr.Row(): - gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True) - with gr.Row(): - with gr.Row(): - button1Bb_open = gr.Button(i18n("开启GPT训练"), variant="primary",visible=True) - button1Bb_close = gr.Button(i18n("终止GPT训练"), variant="primary",visible=False) - with gr.Row(): - info1Bb=gr.Textbox(label=i18n("GPT训练进程输出信息")) - button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close]) - button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close]) - button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close]) - button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close]) - with gr.TabItem(i18n("1C-推理")): - gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。")) - with gr.Row(): - with gr.Row(): - GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name[0],interactive=True) - SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name[0],interactive=True) - with gr.Row(): - 
gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True) - refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") - refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown]) - with gr.Row(): - with gr.Row(): - batched_infer_enabled = gr.Checkbox(label=i18n("启用并行推理版本(推理速度更快)"), value=False, interactive=True, show_label=True) - with gr.Row(): - open_tts = gr.Button(value=i18n("开启TTS推理WebUI"),variant='primary',visible=True) - close_tts = gr.Button(value=i18n("关闭TTS推理WebUI"),variant='primary',visible=False) - with gr.Row(): - tts_info = gr.Textbox(label=i18n("TTS推理WebUI进程输出信息")) - open_tts.click(change_tts_inference, [bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown, batched_infer_enabled], [tts_info,open_tts,close_tts]) - close_tts.click(change_tts_inference, [bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown, batched_infer_enabled], [tts_info,open_tts,close_tts]) - version_checkbox.change(switch_version,[version_checkbox],[pretrained_s2G,pretrained_s2D,pretrained_s1,GPT_dropdown,SoVITS_dropdown]) - with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("施工中,请静候佳音")) - app.queue().launch(#concurrency_count=511, max_size=1022 - server_name="0.0.0.0", - inbrowser=True, - share=is_share, - server_port=webui_port_main, - quiet=True, - ) From 38218e794d723288c2af6799f87d993bfce4d452 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Sat, 16 Nov 2024 02:57:56 -0800 Subject: [PATCH 03/13] update name --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 36ee3958..0cd8fece 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Working on turning this into a package. Right now, the API *does in fact* work Ideally, do this all inside of a venv for package isolation 1. Install by doing: ``` - pip install git+https://github.com/JarodMica/GPT-SoVITS.git + pip install git+https://github.com/JarodMica/GPT-SoVITS-Package.git ``` 2. Make sure torch is installed with CUDA enabled. Reccomend to run `pip uninstall torch` to uninstall torch, then reinstall with the following. I chose 2.4.0+cu121: ``` From 251160362f2d385ef2a3a89f24362bae342dcc9b Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 00:11:17 -0800 Subject: [PATCH 04/13] Add token streaming in batches to the TTS class. 
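
A minimal sketch of how the streaming generator introduced by this patch could be consumed, assuming the `TTS` and `TTS_Config` classes from `GPT_SoVITS.TTS_infer_pack.TTS` and the input keys documented in `run_generator` in the patch body below; the constructor usage mirrors `GPT_SoVITS/api_v2.py`, and the config path, reference audio, and texts are placeholders rather than values taken from the patch:

```python
# Sketch only: consumes the (sampling_rate, audio_fragment) tuples yielded by
# the new run_generator(). Paths and texts are placeholders, not from the patch.
import numpy as np
import soundfile as sf

from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config

config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml")  # placeholder config path
tts = TTS(config)

inputs = {
    "text": "Hello there, this is a streaming test.",   # placeholder text
    "text_lang": "en",
    "ref_audio_path": "ref.wav",                         # placeholder reference audio
    "prompt_text": "transcript of the reference audio",  # placeholder prompt
    "prompt_lang": "en",
    "return_fragment": True,
}

fragments = []
sample_rate = None
for sample_rate, fragment in tts.run_generator(inputs):
    # Each yield is a float32 chunk already cut at a zero crossing,
    # so fragments can be played back or concatenated as they arrive.
    fragments.append(fragment)

if fragments:
    sf.write("streamed_output.wav", np.concatenate(fragments), sample_rate)
```

The zero-crossing splicing added in `GPT_SoVITS/TTS_infer_pack/zero_crossing.py` (later in this patch) is what allows consecutive fragments to be joined this way without audible pops at chunk boundaries.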
--- GPT_SoVITS/TTS_infer_pack/TTS.py | 419 ++++++++++++++++++++- GPT_SoVITS/TTS_infer_pack/zero_crossing.py | 203 ++++++++++ GPT_SoVITS/api_v2.py | 2 +- infer_script.py | 272 +++++++++++++ 4 files changed, 893 insertions(+), 3 deletions(-) create mode 100644 GPT_SoVITS/TTS_infer_pack/zero_crossing.py create mode 100644 infer_script.py diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 809601b3..42105064 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -13,6 +13,10 @@ import torch import torch.nn.functional as F import traceback import yaml +import queue +import sounddevice as sd +import soundfile as sf +import threading from huggingface_hub import snapshot_download, hf_hub_download from importlib.resources import files @@ -875,9 +879,64 @@ class TTS: t_34 += t4 - t3 refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]] - + + # Split the semantic tokens into chunks + num_chunks = 10 # Number of chunks to split into + chunked_pred_semantic_list = [] # This will store the chunks for each sample + + pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)] + + for semantic_tokens in pred_semantic_list: + total_length = semantic_tokens.shape[0] + chunk_size = total_length // num_chunks + chunks = [] + for i in range(num_chunks): + overlap = 0 + # samples_per_token = 1280 + # sample_rate = 32000 + # start_idx = i * chunk_size - overlap + start_idx = 0 + # so each subsequent sample is overlapping by 5120 samples + if start_idx < 0: + start_idx = 0 + if i == num_chunks - 1: + # Make sure to include the remainder in the last chunk + end_idx = total_length + else: + end_idx = (i + 1) * chunk_size + chunk = semantic_tokens[start_idx:end_idx] + chunks.append(chunk) + chunked_pred_semantic_list.append(chunks) + + # Process chunks through VITS + batch_audio_chunks = [] # List to hold audio chunks for each sample - batch_audio_fragment = [] + for i, (chunks, phones) in enumerate(zip(chunked_pred_semantic_list, batch_phones)): + phones = phones.unsqueeze(0).to(self.configs.device) + audio_chunks = [] + for chunk in chunks: + # Prepare the chunk for VITS + chunk = chunk.unsqueeze(0).unsqueeze(0).to(self.configs.device) + # Process the chunk through VITS + audio_fragment = self.vits_model.decode( + chunk, phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :] + audio_chunks.append(audio_fragment.cpu().numpy()) + batch_audio_chunks.append(audio_chunks) + + output_dir = 'output_chunks' + os.makedirs(output_dir, exist_ok=True) + + for sample_idx, audio_chunks in enumerate(batch_audio_chunks): + for chunk_idx, audio_chunk in enumerate(audio_chunks): + # Convert audio_chunk to float32 + audio_chunk = audio_chunk.astype(np.float32) + # Create a filename for each chunk + filename = f'sample_{sample_idx}_chunk_{chunk_idx}.wav' + output_path = os.path.join(output_dir, filename) + # Save the audio chunk + sf.write(output_path, audio_chunk, self.configs.sampling_rate) + print(f'Saved audio chunk: {output_path}') # ## vits并行推理 method 1 # pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)] @@ -965,6 +1024,362 @@ class TTS: raise e finally: self.empty_cache() + + @torch.no_grad() + def run_generator(self, inputs:dict): + """ + Text to speech inference. 
+ + Args: + inputs (dict): + { + "text": "", # str.(required) text to be synthesized + "text_lang: "", # str.(required) language of the text to be synthesized + "ref_audio_path": "", # str.(required) reference audio path + "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion + "prompt_text": "", # str.(optional) prompt text for the reference audio + "prompt_lang": "", # str.(required) language of the prompt text for the reference audio + "top_k": 5, # int. top k sampling + "top_p": 1, # float. top p sampling + "temperature": 1, # float. temperature for sampling + "text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details. + "batch_size": 1, # int. batch size for inference + "batch_threshold": 0.75, # float. threshold for batch splitting. + "split_bucket: True, # bool. whether to split the batch into multiple buckets. + "return_fragment": False, # bool. step by step return the audio fragment. + "speed_factor":1.0, # float. control the speed of the synthesized audio. + "fragment_interval":0.3, # float. to control the interval of the audio fragment. + "seed": -1, # int. random seed for reproducibility. + "parallel_infer": True, # bool. whether to use parallel inference. + "repetition_penalty": 1.35 # float. repetition penalty for T2S model. + } + returns: + Tuple[int, np.ndarray]: sampling rate and audio data. + """ + ########## variables initialization ########### + self.stop_flag:bool = False + text:str = inputs.get("text", "") + text_lang:str = inputs.get("text_lang", "") + ref_audio_path:str = inputs.get("ref_audio_path", "") + aux_ref_audio_paths:list = inputs.get("aux_ref_audio_paths", []) + prompt_text:str = inputs.get("prompt_text", "") + prompt_lang:str = inputs.get("prompt_lang", "") + top_k:int = inputs.get("top_k", 5) + top_p:float = inputs.get("top_p", 1) + temperature:float = inputs.get("temperature", 1) + text_split_method:str = inputs.get("text_split_method", "cut0") + batch_size = inputs.get("batch_size", 1) + batch_threshold = inputs.get("batch_threshold", 0.75) + speed_factor = inputs.get("speed_factor", 1.0) + split_bucket = inputs.get("split_bucket", True) + return_fragment = inputs.get("return_fragment", False) + fragment_interval = inputs.get("fragment_interval", 0.3) + seed = inputs.get("seed", -1) + seed = -1 if seed in ["", None] else seed + actual_seed = set_seed(seed) + parallel_infer = inputs.get("parallel_infer", True) + repetition_penalty = inputs.get("repetition_penalty", 1.35) + + if parallel_infer: + print(i18n("并行推理模式已开启")) + self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_batch_infer + else: + print(i18n("并行推理模式已关闭")) + self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_naive_batched + + if return_fragment: + print(i18n("分段返回模式已开启")) + if split_bucket: + split_bucket = False + print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理")) + + if split_bucket and speed_factor==1.0: + print(i18n("分桶处理模式已开启")) + elif speed_factor!=1.0: + print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理")) + split_bucket = False + else: + print(i18n("分桶处理模式已关闭")) + + if fragment_interval<0.01: + fragment_interval = 0.01 + print(i18n("分段间隔过小,已自动设置为0.01")) + + no_prompt_text = False + if prompt_text in [None, ""]: + no_prompt_text = True + + assert text_lang in self.configs.languages + if not no_prompt_text: + assert prompt_lang in self.configs.languages + + if ref_audio_path in [None, ""] and \ + ((self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, 
[]])): + raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()") + + ###### setting reference audio and prompt text preprocessing ######## + t0 = ttime() + if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]): + if not os.path.exists(ref_audio_path): + raise ValueError(f"{ref_audio_path} not exists") + self.set_ref_audio(ref_audio_path) + + aux_ref_audio_paths = aux_ref_audio_paths if aux_ref_audio_paths is not None else [] + paths = set(aux_ref_audio_paths)&set(self.prompt_cache["aux_ref_audio_paths"]) + if not (len(list(paths)) == len(aux_ref_audio_paths) == len(self.prompt_cache["aux_ref_audio_paths"])): + self.prompt_cache["aux_ref_audio_paths"] = aux_ref_audio_paths + self.prompt_cache["refer_spec"] = [self.prompt_cache["refer_spec"][0]] + for path in aux_ref_audio_paths: + if path in [None, ""]: + continue + if not os.path.exists(path): + print(i18n("音频文件不存在,跳过:{}").format(path)) + continue + self.prompt_cache["refer_spec"].append(self._get_ref_spec(path)) + + if not no_prompt_text: + prompt_text = prompt_text.strip("\n") + if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_lang != "en" else "." + print(i18n("实际输入的参考文本:"), prompt_text) + if self.prompt_cache["prompt_text"] != prompt_text: + self.prompt_cache["prompt_text"] = prompt_text + self.prompt_cache["prompt_lang"] = prompt_lang + phones, bert_features, norm_text = \ + self.text_preprocessor.segment_and_extract_feature_for_text( + prompt_text, + prompt_lang, + self.configs.version) + self.prompt_cache["phones"] = phones + self.prompt_cache["bert_features"] = bert_features + self.prompt_cache["norm_text"] = norm_text + + + + + ###### text preprocessing ######## + t1 = ttime() + data:list = None + if not return_fragment: + data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version) + if len(data) == 0: + yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate), + dtype=np.int16) + return + + batch_index_list:list = None + data, batch_index_list = self.to_batch(data, + prompt_data=self.prompt_cache if not no_prompt_text else None, + batch_size=batch_size, + threshold=batch_threshold, + split_bucket=split_bucket, + device=self.configs.device, + precision=self.precision + ) + else: + print(i18n("############ 切分文本 ############")) + texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method) + data = [] + for i in range(len(texts)): + if i%batch_size == 0: + data.append([]) + data[-1].append(texts[i]) + + def make_batch(batch_texts): + batch_data = [] + print(i18n("############ 提取文本Bert特征 ############")) + for text in tqdm(batch_texts): + phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(text, text_lang, self.configs.version) + if phones is None: + continue + res={ + "phones": phones, + "bert_features": bert_features, + "norm_text": norm_text, + } + batch_data.append(res) + if len(batch_data) == 0: + return None + batch, _ = self.to_batch(batch_data, + prompt_data=self.prompt_cache if not no_prompt_text else None, + batch_size=batch_size, + threshold=batch_threshold, + split_bucket=False, + device=self.configs.device, + precision=self.precision + ) + return batch[0] + + t2 = ttime() + + try: + print("############ 推理 ############") + ###### inference ###### + t_34 = 0.0 + t_45 = 0.0 + audio = [] + for item in data: + t3 = ttime() + if return_fragment: + item = make_batch(item) + if item is None: + continue + + 
batch_phones:List[torch.LongTensor] = item["phones"] + # batch_phones:torch.LongTensor = item["phones"] + batch_phones_len:torch.LongTensor = item["phones_len"] + all_phoneme_ids:torch.LongTensor = item["all_phones"] + all_phoneme_lens:torch.LongTensor = item["all_phones_len"] + all_bert_features:torch.LongTensor = item["all_bert_features"] + norm_text:str = item["norm_text"] + max_len = item["max_len"] + + print(i18n("前端处理后的文本(每句):"), norm_text) + if no_prompt_text : + prompt = None + else: + prompt = self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device) + + refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]] + + generated_tokens_list = [] + start = ttime() + + from GPT_SoVITS.TTS_infer_pack.zero_crossing import find_zero_zone, find_matching_index + zc_index1 = 0 + zc_index2 = 0 + crossing_direction = 0 + first_chunk = True + last_chunk = False + search_length = 32000*5 + num_zeroes = 5 + cumulation_amount=50 + + # Use infer_panel_generator to generate tokens in batches + for generated_tokens in self.t2s_model.model.infer_panel_generator( + all_phoneme_ids[0].unsqueeze(0), + all_phoneme_lens[0].unsqueeze(0), + prompt[0].unsqueeze(0) if prompt is not None else None, + all_bert_features[0].unsqueeze(0), + cumulation_amount=cumulation_amount, + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=self.configs.hz * self.configs.max_sec, + max_len=max_len, + repetition_penalty=repetition_penalty, + ): + # Append the generated tokens + generated_tokens_list.append(generated_tokens) + total_tokens = sum([tokens.size(1) for tokens in generated_tokens_list]) + + tokens_to_process = torch.cat(generated_tokens_list, dim=1)[:, :total_tokens] # uses full context for decoding + + # Check if tokens_to_process contains the EOS token (1024) + contains_eos = (tokens_to_process == 1024).any() + + if contains_eos: + # Replace all instances of the EOS token (1024) with 0 + tokens_to_process = tokens_to_process.masked_fill(tokens_to_process == 1024, 0) + print("Replaced EOS token (1024) with 0 in tokens_to_process") + last_chunk = True + first_chunk = False + + # Prepare input for VITS model + _pred_semantic = tokens_to_process.unsqueeze(0) + phones = batch_phones[0].unsqueeze(0).to(self.configs.device) + + # Generate audio for the tokens + audio_output = self.vits_model.decode( + _pred_semantic, phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :] + + audio_output = audio_output[:].cpu().numpy() + # Convert audio_fragment to float32 and normalize + audio_output = audio_output.astype(np.float32) + max_val = np.abs(audio_output).max() + if max_val > 1.0: + audio_output /= max_val + + start_index = len(audio_output) - search_length + if start_index < 0: + search_length = len(audio_output) + print(f"search_length is too HIGH! 
Auto adjusted to {search_length} frames as the chunks are only {len(audio_output)} frames large") + start_index = 0 + center_index = zc_index2 # Start from previous zero crossing index and search outwards + max_offset = int(search_length // 2) # branches out in both ways + + if center_index < 0: + raise "Something wrong is going on here, center index issue, less than 0" + elif center_index >= len(audio_output): + raise "Something wrong is going on here, center index issue, greater than audio_output" + + if first_chunk: + + zc_index1, crossing_direction = find_zero_zone( + chunk=audio_output, + start_index=start_index, + search_length=search_length, + num_zeroes=num_zeroes + ) + audio_fragment = audio_output[:zc_index1] + yield self.configs.sampling_rate, audio_fragment + first_chunk = False + zc_index2 = zc_index1 + elif last_chunk: + zc_index1 = find_matching_index( + chunk=audio_output, + center_index=center_index, + max_offset=max_offset, + crossing_direction=crossing_direction + ) + audio_fragment = audio_output[zc_index1:] + yield self.configs.sampling_rate, audio_fragment + + else: + zc_index1 = find_matching_index( + chunk=audio_output, + center_index=center_index, + max_offset=max_offset, + crossing_direction=crossing_direction + ) + + zc_index2, crossing_direction = find_zero_zone( + chunk=audio_output, + start_index=start_index, + search_length=search_length, + num_zeroes=num_zeroes + ) + audio_fragment = audio_output[zc_index1:zc_index2] + yield self.configs.sampling_rate, audio_fragment + + end = ttime() + print(f"Time to speech: {end-start}") + + except Exception as e: + traceback.print_exc() + # 必须返回一个空音频, 否则会导致显存不释放。 + yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate), + dtype=np.int16) + # 重置模型, 否则会导致显存释放不完全。 + del self.t2s_model + del self.vits_model + self.t2s_model = None + self.vits_model = None + self.init_t2s_weights(self.configs.t2s_weights_path) + self.init_vits_weights(self.configs.vits_weights_path) + raise e + finally: + self.empty_cache() + + def empty_cache(self): + try: + gc.collect() # 触发gc的垃圾回收。避免内存一直增长。 + if "cuda" in str(self.configs.device): + torch.cuda.empty_cache() + elif str(self.configs.device) == "mps": + torch.mps.empty_cache() + except: + pass def empty_cache(self): try: diff --git a/GPT_SoVITS/TTS_infer_pack/zero_crossing.py b/GPT_SoVITS/TTS_infer_pack/zero_crossing.py new file mode 100644 index 00000000..542a6de9 --- /dev/null +++ b/GPT_SoVITS/TTS_infer_pack/zero_crossing.py @@ -0,0 +1,203 @@ +import numpy as np +import wave +import struct + +def read_wav_file(filename): + """ + Reads a WAV file and returns the sample rate and data as a numpy array. + """ + with wave.open(filename, 'rb') as wf: + sample_rate = wf.getframerate() + n_frames = wf.getnframes() + sample_width = wf.getsampwidth() + n_channels = wf.getnchannels() + + audio_data = wf.readframes(n_frames) + # Determine the format string for struct unpacking + fmt = "<" + {1:'b', 2:'h', 4:'i'}[sample_width] * n_frames * n_channels + audio_samples = struct.unpack(fmt, audio_data) + audio_array = np.array(audio_samples, dtype=int) + + # If stereo, reshape the array + if n_channels > 1: + audio_array = audio_array.reshape(-1, n_channels) + return sample_rate, audio_array, sample_width, n_channels + +def write_wav_file(filename, sample_rate, data, sample_width, n_channels): + """ + Writes numpy array data to a WAV file. 
+ """ + with wave.open(filename, 'wb') as wf: + wf.setnchannels(n_channels) + wf.setsampwidth(sample_width) + wf.setframerate(sample_rate) + # Flatten the array if it's multi-dimensional + if data.ndim > 1: + data = data.flatten() + # Pack the data into bytes + fmt = "<" + {1:'b', 2:'h', 4:'i'}[sample_width] * len(data) + byte_data = struct.pack(fmt, *data) + wf.writeframes(byte_data) + +def find_zero_zone(chunk, start_index, search_length, num_zeroes=11): + zone = chunk[start_index:start_index + search_length] + print(f"Zero-crossing search zone: Start={start_index}, Length={len(zone)}") + + zero_threshold = 1.0e-4 + # Check for y consecutive zeros + for idx in range(len(zone), -1 + num_zeroes, -1): + index_to_start = idx-num_zeroes + abs_zone = np.abs(zone[index_to_start:idx]) + if np.all(abs_zone < zero_threshold): + index_midpoint = index_to_start + int(num_zeroes // 2) + return (start_index + index_midpoint), None + + print("Falling back to zero crossing due to no zero zone found. You may hear more prominent pops and clicks in the audio. Try increasing search length or cumulative tokens.") + return find_zero_crossing(chunk, start_index, search_length) + +def find_zero_crossing(chunk, start_index, search_length): + # If the model is falling back on the this function, it might be a bad indicator that the search length is too low + + zone = chunk[start_index:start_index + search_length] + sign_changes = np.where(np.diff(np.sign(zone)) != 0)[0] + + if len(sign_changes) == 0: + raise ("No zero-crossings found in this zone. This should not be happening, debugging time.") + else: + zc_index = start_index + sign_changes[0] + 1 + print(f"Zero-crossing found at index {zc_index}") + # Determine the crossing direction in chunk1 + prev_value = chunk[zc_index - 1] + curr_value = chunk[zc_index] + crossing_direction = np.sign(curr_value) - np.sign(prev_value) + print(f"Crossing direction in chunk1: {np.sign(prev_value)} to {np.sign(curr_value)}") + return zc_index, crossing_direction + +def find_matching_index(chunk, center_index, max_offset, crossing_direction): + """ + Finds a zero-crossing in data that matches the specified crossing direction, + starting from center_index and searching outward. + """ + if crossing_direction == None: + return center_index # if zero zone + + # fall back for zero_crossing + data_length = len(chunk) + print(f"Center index in chunk2: {center_index}") + for offset in range(max_offset + 1): + # Check index bounds + idx_forward = center_index + offset + idx_backward = center_index - offset + found = False + + # Check forward direction + if idx_forward < data_length - 1: + prev_sign = np.sign(chunk[idx_forward]) + curr_sign = np.sign(chunk[idx_forward + 1]) + direction = curr_sign - prev_sign + if direction == crossing_direction: + print(f"Matching zero-crossing found at index {idx_forward + 1} (forward)") + return idx_forward + 1 + + # Check backward direction + if idx_backward > 0: + prev_sign = np.sign(chunk[idx_backward - 1]) + curr_sign = np.sign(chunk[idx_backward]) + direction = curr_sign - prev_sign + if direction == crossing_direction: + print(f"Matching zero-crossing found at index {idx_backward} (backward)") + return idx_backward + + print("No matching zero-crossings found in this zone.") + return None + +# legacy, just for history. delete me sometime +def splice_chunks(chunk1, chunk2, search_length, y): + """ + Splices two audio chunks at zero-crossing points. 
+ """ + # Define the zone to search in chunk1 + start_index1 = len(chunk1) - search_length + if start_index1 < 0: + start_index1 = 0 + search_length = len(chunk1) + print(f"Searching for zero-crossing in chunk1 from index {start_index1} to {len(chunk1)}") + # Find zero-crossing in chunk1 + zc_index1, crossing_direction = find_zero_crossing(chunk1, start_index1, search_length, y) + if zc_index1 is None: + print("No zero-crossing found in chunk1 within the specified zone.") + return None + + # Define the zone to search in chunk2 near the same index + # Since chunk2 overlaps with chunk1, we can assume that index positions correspond + # Adjusted search in chunk2 + # You can adjust this value if needed + center_index = zc_index1 # Assuming alignment between chunk1 and chunk2 + max_offset = search_length + + # Ensure center_index is within bounds + if center_index < 0: + center_index = 0 + elif center_index >= len(chunk2): + center_index = len(chunk2) - 1 + + print(f"Searching for matching zero-crossing in chunk2 around index {center_index} with max offset {max_offset}") + + zc_index2 = find_matching_zero_crossing(chunk2, center_index, max_offset, crossing_direction) + + if zc_index2 is None: + print("No matching zero-crossing found in chunk2.") + return None + + print(f"Zero-crossing in chunk1 at index {zc_index1}, chunk2 at index {zc_index2}") + # Splice the chunks + new_chunk = np.concatenate((chunk1[:zc_index1], chunk2[zc_index2:])) + print(f"Spliced chunk length: {len(new_chunk)}") + return new_chunk + +# legacy, just for history. delete me sometime +def process_audio_chunks(filenames, sample_rate, x, y, output_filename): + """ + Processes and splices a list of audio chunks. + """ + # Read the first chunk + sr, chunk_data, sample_width, n_channels = read_wav_file(filenames[0]) + if sr != sample_rate: + print(f"Sample rate mismatch in {filenames[0]}") + return + print(f"Processing {filenames[0]}") + # Initialize the combined audio with the first chunk + combined_audio = chunk_data + # Process remaining chunks + for filename in filenames[1:]: + sr, next_chunk_data, _, _ = read_wav_file(filename) + if sr != sample_rate: + print(f"Sample rate mismatch in {filename}") + return + print(f"Processing {filename}") + # Splice the current combined audio with the next chunk + new_combined = splice_chunks(combined_audio, next_chunk_data, x, y) + if new_combined is None: + print(f"Failed to splice chunks between {filename} and previous chunk.") + return + combined_audio = new_combined + # Write the final combined audio to output file + write_wav_file(output_filename, sample_rate, combined_audio, sample_width, n_channels) + print(f"Final audio saved to {output_filename}") + +# Main execution +if __name__ == "__main__": + # User-specified parameters + sample_rate = 32000 # Sample rate in Hz + x = 500 # Number of frames to search from the end of the chunk + y = 10 # Number of consecutive zeros to look for + output_filename = "combined_output.wav" + folder_with_chunks = "output_chunks" + import os + def absolute_file_paths(directory): + path = os.path.abspath(directory) + return [entry.path for entry in os.scandir(path) if entry.is_file()] + # List of input audio chunk filenames in sequential order + filenames = absolute_file_paths(folder_with_chunks) + # Process and splice the audio chunks + process_audio_chunks(filenames, sample_rate, x, y, output_filename) diff --git a/GPT_SoVITS/api_v2.py b/GPT_SoVITS/api_v2.py index 5dfbebec..84e6ffd7 100644 --- a/GPT_SoVITS/api_v2.py +++ b/GPT_SoVITS/api_v2.py @@ 
-358,7 +358,7 @@ async def tts_get_endpoint( top_p:float = 1, temperature:float = 1, text_split_method:str = "cut0", - batch_size:int = 1, + batch_size:int = 4, batch_threshold:float = 0.75, split_bucket:bool = True, speed_factor:float = 1.0, diff --git a/infer_script.py b/infer_script.py new file mode 100644 index 00000000..4bb38f8c --- /dev/null +++ b/infer_script.py @@ -0,0 +1,272 @@ +''' +This is just an example inference script to test batching with llama, mainly for my reference in the future. +''' + +import os +import sys +import numpy as np +import soundfile as sf +import threading +import queue +import sounddevice as sd +import time +import speech_recognition as sr + +# Ensure that GPT_SoVITS is in the Python path +now_dir = os.getcwd() +sys.path.append(now_dir) +sys.path.append(os.path.join(now_dir, 'GPT_SoVITS')) +os.environ['CUDA_LAUNCH_BLOCKING'] = '1' + + +from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config + +from llama_cpp import Llama +import sys + +# Initialize the Llama model +llm = Llama( + model_path="ggml-model-q8_0.gguf", + n_gpu_layers=-1, # Uncomment to use GPU acceleration + seed=1337, # Uncomment to set a specific seed + n_ctx=2048, # Uncomment to increase the context window + chat_format="llama-3", + verbose=False +) + +from time import time + +def generate_chat_completion_openai_v1_stream(messages): + start = time() + stream = llm.create_chat_completion_openai_v1( + messages=messages, + temperature=0.8, # Adjust temperature as needed + top_p=0.95, # Adjust top_p as needed + top_k=40, # Adjust top_k as needed + max_tokens=50, # Adjust the maximum number of tokens as needed + # stop=["\n"], # Adjust the stop sequence as needed + stream=True # Enable streaming + ) + end = time() + total = end - start + print(total) + for chunk in stream: + if chunk.choices[0].delta.content is not None: + yield chunk.choices[0].delta.content + +def audio_playback_thread(audio_queue, sample_rate): + """ + Audio playback thread that plays audio fragments from the queue. + """ + sd.default.samplerate = sample_rate + sd.default.channels = 1 + stream = sd.OutputStream(dtype='float32') + stream.start() + + try: + while True: + # Get the next audio fragment + audio_fragment = audio_queue.get() + try: + if audio_fragment is None: + # Sentinel value received, exit the loop + break + # Write the audio fragment to the stream + stream.write(audio_fragment) + finally: + # Mark the item as processed + audio_queue.task_done() + finally: + stream.stop() + stream.close() + +def main(): + + config_path = 'configs/tts_infer.yaml' + # GPT_model_path = 'pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt' + GPT_model_path = 'custom_trained.ckpt' + # SoVITS_model_path = 'pretrained_models/gsv-v2final-pretrained/s2G2333k.pth' + SoVITS_model_path = 'custom_trained.pth' + ref_audio_path = 'ref_audio.wav' + ref_text = 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか' + target_text = """hahahaha, well well, let me tell you about that! it was perhaps the most exquisite day of my life! Phew, I've never had one better! 
""" + output_path = 'output' + ref_language = 'ja' + target_language = 'ja' + + + # Ensure output directory exists + os.makedirs(output_path, exist_ok=True) + + # Initialize TTS configuration and pipeline + tts_config = TTS_Config(config_path) + tts_pipeline = TTS(tts_config) + + # Load model weights + tts_pipeline.init_t2s_weights(GPT_model_path) + tts_pipeline.init_vits_weights(SoVITS_model_path) + + # Prepare inputs for TTS + inputs = { + "text": target_text, + "text_lang": target_language.lower(), + "ref_audio_path": ref_audio_path, + "prompt_text": ref_text, + "prompt_lang": ref_language.lower(), + "top_k": 5, + "top_p": 1.0, + "temperature": 1.0, + "text_split_method": "cut0", + "batch_size": 1, + "batch_threshold": 0.75, + "split_bucket": True, + "speed_factor": 1.0, + "fragment_interval": 0.3, + "seed": 2855904637, + "return_fragment": True, + "parallel_infer": False, + "repetition_penalty": 1.35, + } + + # Run TTS inference + + system_message = '''You are a friendly AI named Vivy. + + HOW YOU SHOULD RESPOND: + - The responses should include only verbal responses, for example *laughs* should be replaced with haha + ''' + + # Initialize conversation history with system message + conversation_history = [ + {"role": "system", "content": f"{system_message}"} + ] + + # Create a queue for audio fragments + audio_queue = queue.Queue(maxsize=100) # Adjust maxsize based on your needs + + # Start the audio playback thread + playback_thread = threading.Thread( + target=audio_playback_thread, + args=(audio_queue, tts_pipeline.configs.sampling_rate) + ) + playback_thread.start() + + # Setup speech recognition + r = sr.Recognizer() + mic = sr.Microphone() + + try: + while True: + # Prompt for speech input instead of text input + while True: + print("\nPlease speak your message (say 'quit' to exit):") + with mic as source: + # Adjust for ambient noise to improve recognition accuracy + r.adjust_for_ambient_noise(source, duration=1.0) + print("Listening...") + audio_data = r.listen(source, timeout=None, phrase_time_limit=60) + try: + # Replace 'recognize_whisper' with your actual recognition method + # Ensure that the method is correctly implemented or available + user_input = r.recognize_whisper(audio_data=audio_data, model="base") + print("You said: " + user_input) + + # Check if the input is not empty or just whitespace + if user_input.strip() == "": + print("No speech detected. Please try again.") + continue # Continue listening + break # Valid input received, exit inner loop + except sr.UnknownValueError: + print("Sorry, I could not understand the audio. Please try again.") + continue # Continue listening + except sr.RequestError as e: + print(f"Could not request results from speech recognition service; {e}") + continue # Continue listening + + # Check if the user wants to quit + if user_input.lower() == "quit": + print("Exiting the application. 
Goodbye!") + sys.exit() + + # Append user message to conversation history + conversation_history.append({"role": "user", "content": user_input}) + + # Initialize variables to track character count and buffering + buffer = "" + char_count = 0 + waiting_for_punctuation = False + assistant_buffer = "" + + # Generate and print the chat completion with streaming + for token in generate_chat_completion_openai_v1_stream(conversation_history): + print(token, end="", flush=True) # Print each character as it's generated + buffer += token + assistant_buffer += token + char_count += len(token) + + if not waiting_for_punctuation: + if char_count >= 100: + waiting_for_punctuation = True # Start looking for punctuation + else: + if any(punct in token for punct in ['.', '!', '?']): + # Send the buffer to TTS + inputs["text"] = buffer + synthesis_result = tts_pipeline.run_generator(inputs) + # Consume the generator and put audio fragments into the queue + for sampling_rate, audio_fragment in synthesis_result: + audio_queue.put(audio_fragment) + #put sielnce into audio queue after tts sythesis generator has finished + silence_duration = 0.5 # in seconds + num_samples = int(sampling_rate * silence_duration) + silence = np.zeros(num_samples, dtype='float32') + audio_queue.put(silence) + + # Reset counters and buffer + char_count = 0 + buffer = "" + waiting_for_punctuation = False + + # Append assistant message to conversation history + conversation_history.append({"role": "assistant", "content": assistant_buffer}) + + # Handle any remaining text after the generator is done + if buffer.strip(): + inputs["text"] = buffer + synthesis_result = tts_pipeline.run_generator(inputs) + + # Consume the generator and put audio fragments into the queue + for sampling_rate, audio_fragment in synthesis_result: + audio_queue.put(audio_fragment) + #put sielnce into audio queue after tts sythesis generator has finished + silence_duration = 0.5 # in seconds + num_samples = int(sampling_rate * silence_duration) + silence = np.zeros(num_samples, dtype='float32') + audio_queue.put(silence) + + conversation_history.append({"role": "assistant", "content": buffer}) + buffer = "" + char_count = 0 + waiting_for_punctuation = False + finally: + # After all processing is done, send a sentinel to the audio queue and wait for threads to finish + audio_queue.put(None) + audio_queue.join() + playback_thread.join() + + + # text = input("GO:") + # inputs["text"] = text + # synthesis_result = tts_pipeline.run_generator(inputs) + # audio_data_list = list(synthesis_result) + # if audio_data_list: + # # Since return_fragment is False, we expect only one tuple in audio_data_list + # sampling_rate, audio_data = audio_data_list[0] + # output_wav_path = os.path.join(output_path, "output.wav") + # # Save the audio data to a WAV file + # sf.write(output_wav_path, audio_data, sampling_rate) + # print(f"Audio saved to {output_wav_path}") + # else: + # print("No audio data generated.") + +if __name__ == '__main__': + main() From 8292d7bfc2a70417f13e9476781460101c054557 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 00:24:23 -0800 Subject: [PATCH 05/13] Tidy things up --- GPT_SoVITS/TTS_infer_pack/TTS.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 42105064..b7e7b468 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -1079,6 +1079,9 @@ class TTS: actual_seed = set_seed(seed) 
parallel_infer = inputs.get("parallel_infer", True) repetition_penalty = inputs.get("repetition_penalty", 1.35) + search_length = inputs.get("search_length", 32000 * 5) + num_zeroes = inputs.get("num_zeroes", 5) + cumulation_amount = inputs.get("cumulation_amount", 50) if parallel_infer: print(i18n("并行推理模式已开启")) @@ -1250,9 +1253,6 @@ class TTS: crossing_direction = 0 first_chunk = True last_chunk = False - search_length = 32000*5 - num_zeroes = 5 - cumulation_amount=50 # Use infer_panel_generator to generate tokens in batches for generated_tokens in self.t2s_model.model.infer_panel_generator( @@ -1313,14 +1313,14 @@ class TTS: elif center_index >= len(audio_output): raise "Something wrong is going on here, center index issue, greater than audio_output" + # How this works: Sequentially progress through audio file in "chunks", starting from the previous zc (zero_crossing) index up and through to the last chunk. if first_chunk: - zc_index1, crossing_direction = find_zero_zone( - chunk=audio_output, - start_index=start_index, - search_length=search_length, - num_zeroes=num_zeroes - ) + chunk=audio_output, + start_index=start_index, + search_length=search_length, + num_zeroes=num_zeroes + ) audio_fragment = audio_output[:zc_index1] yield self.configs.sampling_rate, audio_fragment first_chunk = False @@ -1431,8 +1431,6 @@ class TTS: return sr, audio - - def speed_change(input_audio:np.ndarray, speed:float, sr:int): # 将 NumPy 数组转换为原始 PCM 流 From c5490bb2a26545fb05366672fed2890287dc1077 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 01:52:08 -0800 Subject: [PATCH 06/13] change imports --- GPT_SoVITS/AR/data/data_module.py | 4 ++-- GPT_SoVITS/AR/models/t2s_lightning_module.py | 6 +++--- .../AR/models/t2s_lightning_module_onnx.py | 6 +++--- GPT_SoVITS/AR/models/t2s_model.py | 14 +++++++------- GPT_SoVITS/AR/models/t2s_model_onnx.py | 10 +++++----- GPT_SoVITS/AR/modules/activation.py | 6 +++--- GPT_SoVITS/AR/modules/activation_onnx.py | 6 +++--- GPT_SoVITS/AR/modules/transformer.py | 4 ++-- GPT_SoVITS/AR/modules/transformer_onnx.py | 4 ++-- GPT_SoVITS/AR/text_processing/phonemizer.py | 2 +- GPT_SoVITS/TTS_infer_pack/TTS.py | 16 ++++++++-------- GPT_SoVITS/export_torch_script.py | 2 +- GPT_SoVITS/inference_gui.py | 2 +- GPT_SoVITS/inference_webui.py | 2 +- GPT_SoVITS/module/attentions.py | 2 +- GPT_SoVITS/module/attentions_onnx.py | 2 +- GPT_SoVITS/onnx_export.py | 2 +- GPT_SoVITS/s1_train.py | 8 ++++---- GPT_SoVITS/tools/i18n/locale/pt_BR.json | 2 +- GPT_SoVITS/tools/uvr5/bsroformer.py | 6 +++--- GPT_SoVITS/tools/uvr5/mdxnet.py | 6 +++--- api.py | 2 +- 22 files changed, 57 insertions(+), 57 deletions(-) diff --git a/GPT_SoVITS/AR/data/data_module.py b/GPT_SoVITS/AR/data/data_module.py index cb947959..9dd4addd 100644 --- a/GPT_SoVITS/AR/data/data_module.py +++ b/GPT_SoVITS/AR/data/data_module.py @@ -1,8 +1,8 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py # reference: https://github.com/lifeiteng/vall-e from pytorch_lightning import LightningDataModule -from AR.data.bucket_sampler import DistributedBucketSampler -from AR.data.dataset import Text2SemanticDataset +from GPT_SoVITS.AR.data.bucket_sampler import DistributedBucketSampler +from GPT_SoVITS.AR.data.dataset import Text2SemanticDataset from torch.utils.data import DataLoader diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py index 2dd3f392..c45e2a72 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module.py +++ 
b/GPT_SoVITS/AR/models/t2s_lightning_module.py @@ -8,9 +8,9 @@ from typing import Dict import torch from pytorch_lightning import LightningModule -from AR.models.t2s_model import Text2SemanticDecoder -from AR.modules.lr_schedulers import WarmupCosineLRSchedule -from AR.modules.optim import ScaledAdam +from GPT_SoVITS.AR.models.t2s_model import Text2SemanticDecoder +from GPT_SoVITS.AR.modules.lr_schedulers import WarmupCosineLRSchedule +from GPT_SoVITS.AR.modules.optim import ScaledAdam class Text2SemanticLightningModule(LightningModule): def __init__(self, config, output_dir, is_train=True): diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py index 487edb01..04db0713 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py @@ -8,9 +8,9 @@ from typing import Dict import torch from pytorch_lightning import LightningModule -from AR.models.t2s_model_onnx import Text2SemanticDecoder -from AR.modules.lr_schedulers import WarmupCosineLRSchedule -from AR.modules.optim import ScaledAdam +from GPT_SoVITS.AR.models.t2s_model_onnx import Text2SemanticDecoder +from GPT_SoVITS.AR.modules.lr_schedulers import WarmupCosineLRSchedule +from GPT_SoVITS.AR.modules.optim import ScaledAdam class Text2SemanticLightningModule(LightningModule): diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py index fb528914..541b68d9 100644 --- a/GPT_SoVITS/AR/models/t2s_model.py +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -5,8 +5,8 @@ from typing import List, Optional import torch from tqdm import tqdm -from AR.models.utils import make_pad_mask -from AR.models.utils import ( +from GPT_SoVITS.AR.models.utils import make_pad_mask +from GPT_SoVITS.AR.models.utils import ( topk_sampling, sample, logits_to_probs, @@ -15,11 +15,11 @@ from AR.models.utils import ( make_reject_y, get_batch_logps ) -from AR.modules.embedding import SinePositionalEmbedding -from AR.modules.embedding import TokenEmbedding -from AR.modules.transformer import LayerNorm -from AR.modules.transformer import TransformerEncoder -from AR.modules.transformer import TransformerEncoderLayer +from GPT_SoVITS.AR.modules.embedding import SinePositionalEmbedding +from GPT_SoVITS.AR.modules.embedding import TokenEmbedding +from GPT_SoVITS.AR.modules.transformer import LayerNorm +from GPT_SoVITS.AR.modules.transformer import TransformerEncoder +from GPT_SoVITS.AR.modules.transformer import TransformerEncoderLayer from torch import nn from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy diff --git a/GPT_SoVITS/AR/models/t2s_model_onnx.py b/GPT_SoVITS/AR/models/t2s_model_onnx.py index 7834297d..83c8cda4 100644 --- a/GPT_SoVITS/AR/models/t2s_model_onnx.py +++ b/GPT_SoVITS/AR/models/t2s_model_onnx.py @@ -3,11 +3,11 @@ import torch from tqdm import tqdm -from AR.modules.embedding_onnx import SinePositionalEmbedding -from AR.modules.embedding_onnx import TokenEmbedding -from AR.modules.transformer_onnx import LayerNorm -from AR.modules.transformer_onnx import TransformerEncoder -from AR.modules.transformer_onnx import TransformerEncoderLayer +from GPT_SoVITS.AR.modules.embedding_onnx import SinePositionalEmbedding +from GPT_SoVITS.AR.modules.embedding_onnx import TokenEmbedding +from GPT_SoVITS.AR.modules.transformer_onnx import LayerNorm +from GPT_SoVITS.AR.modules.transformer_onnx import TransformerEncoder +from GPT_SoVITS.AR.modules.transformer_onnx import TransformerEncoderLayer 
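These import rewrites make the modules resolve when GPT_SoVITS is installed as a package rather than run from the repository root. A minimal sketch of the difference, assuming an editable install of the package (illustrative only):

    # Old style: only resolves when the GPT_SoVITS source directory itself is on sys.path.
    # from AR.models.t2s_lightning_module import Text2SemanticLightningModule

    # New style: resolves from any working directory once the package is installed.
    from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
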
from torch import nn from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy diff --git a/GPT_SoVITS/AR/modules/activation.py b/GPT_SoVITS/AR/modules/activation.py index 5ca888b5..5e9b0293 100644 --- a/GPT_SoVITS/AR/modules/activation.py +++ b/GPT_SoVITS/AR/modules/activation.py @@ -12,7 +12,7 @@ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter from torch.nn import functional as F -from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched +from GPT_SoVITS.AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched F.multi_head_attention_forward = multi_head_attention_forward_patched @@ -152,14 +152,14 @@ class MultiheadAttention(Module): self.in_proj_linear = linear1_cls( embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs ) - self.in_proj_weight = self.in_proj_linear.weight + self.in_proj_weight = self.in_proj_lineGPT_SoVITS.AR.weight self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: - self.in_proj_bias = self.in_proj_linear.bias + self.in_proj_bias = self.in_proj_lineGPT_SoVITS.AR.bias else: self.register_parameter("in_proj_bias", None) diff --git a/GPT_SoVITS/AR/modules/activation_onnx.py b/GPT_SoVITS/AR/modules/activation_onnx.py index b54acd99..92d89ca4 100644 --- a/GPT_SoVITS/AR/modules/activation_onnx.py +++ b/GPT_SoVITS/AR/modules/activation_onnx.py @@ -12,7 +12,7 @@ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter from torch.nn import functional as F -from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched +from GPT_SoVITS.AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched class MultiheadAttention(Module): @@ -95,14 +95,14 @@ class MultiheadAttention(Module): self.in_proj_linear = linear1_cls( embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs ) - self.in_proj_weight = self.in_proj_linear.weight + self.in_proj_weight = self.in_proj_lineGPT_SoVITS.AR.weight self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: - self.in_proj_bias = self.in_proj_linear.bias + self.in_proj_bias = self.in_proj_lineGPT_SoVITS.AR.bias else: self.register_parameter("in_proj_bias", None) diff --git a/GPT_SoVITS/AR/modules/transformer.py b/GPT_SoVITS/AR/modules/transformer.py index 7921f48e..2ddff82d 100644 --- a/GPT_SoVITS/AR/modules/transformer.py +++ b/GPT_SoVITS/AR/modules/transformer.py @@ -10,8 +10,8 @@ from typing import Tuple from typing import Union import torch -from AR.modules.activation import MultiheadAttention -from AR.modules.scaling import BalancedDoubleSwish +from GPT_SoVITS.AR.modules.activation import MultiheadAttention +from GPT_SoVITS.AR.modules.scaling import BalancedDoubleSwish from torch import nn from torch import Tensor from torch.nn import functional as F diff --git a/GPT_SoVITS/AR/modules/transformer_onnx.py b/GPT_SoVITS/AR/modules/transformer_onnx.py index a3f68b43..fd9853a0 100644 --- a/GPT_SoVITS/AR/modules/transformer_onnx.py +++ b/GPT_SoVITS/AR/modules/transformer_onnx.py @@ -10,8 +10,8 @@ from typing import Tuple from typing import Union import torch -from AR.modules.activation_onnx import MultiheadAttention -from AR.modules.scaling import BalancedDoubleSwish +from GPT_SoVITS.AR.modules.activation_onnx import 
MultiheadAttention +from GPT_SoVITS.AR.modules.scaling import BalancedDoubleSwish from torch import nn from torch import Tensor from torch.nn import functional as F diff --git a/GPT_SoVITS/AR/text_processing/phonemizer.py b/GPT_SoVITS/AR/text_processing/phonemizer.py index 9c5f58fb..7a33ba9d 100644 --- a/GPT_SoVITS/AR/text_processing/phonemizer.py +++ b/GPT_SoVITS/AR/text_processing/phonemizer.py @@ -9,7 +9,7 @@ import regex from gruut import sentences from gruut.const import Sentence from gruut.const import Word -from AR.text_processing.symbols import SYMBOL_TO_ID +from GPT_SoVITS.AR.text_processing.symbols import SYMBOL_TO_ID class GruutPhonemizer: diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index b7e7b468..fe5f7055 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -25,15 +25,15 @@ from typing import Generator, List, Tuple, Union from tqdm import tqdm -from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from feature_extractor.cnhubert import CNHubert -from module.models import SynthesizerTrn -from module.mel_processing import spectrogram_torch -from tools.i18n.i18n import I18nAuto, scan_language_list -from tools.my_utils import load_audio +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule +from GPT_SoVITS.feature_extractor.cnhubert import CNHubert +from GPT_SoVITS.module.models import SynthesizerTrn +from GPT_SoVITS.module.mel_processing import spectrogram_torch +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools.my_utils import load_audio from transformers import AutoModelForMaskedLM, AutoTokenizer -from TTS_infer_pack.text_segmentation_method import splits -from TTS_infer_pack.TextPreprocessor import TextPreprocessor +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import splits +from GPT_SoVITS.TTS_infer_pack.TextPreprocessor import TextPreprocessor language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py index f7bef133..5be41e19 100644 --- a/GPT_SoVITS/export_torch_script.py +++ b/GPT_SoVITS/export_torch_script.py @@ -13,7 +13,7 @@ from torch.nn import functional as F from transformers import AutoModelForMaskedLM, AutoTokenizer from feature_extractor import cnhubert -from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule from module.models_onnx import SynthesizerTrn from inference_webui import get_phones_and_bert diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py index 2059155d..93ea0f5d 100644 --- a/GPT_SoVITS/inference_gui.py +++ b/GPT_SoVITS/inference_gui.py @@ -299,7 +299,7 @@ class GPTSoVITSGUI(QMainWindow): result = "Audio saved to " + output_wav_path - self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) + self.status_bGPT_SoVITS.AR.showMessage("合成完成!输出路径:" + output_wav_path, 5000) self.output_text.append("处理结果:\n" + result) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 5aff4ae5..69b30294 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -84,7 +84,7 @@ from feature_extractor import cnhubert cnhubert.cnhubert_base_path = cnhubert_base_path from module.models import SynthesizerTrn -from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from 
GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule from text import cleaned_text_to_sequence from text.cleaner import clean_text from time import time as ttime diff --git a/GPT_SoVITS/module/attentions.py b/GPT_SoVITS/module/attentions.py index a2e9e515..c136a07b 100644 --- a/GPT_SoVITS/module/attentions.py +++ b/GPT_SoVITS/module/attentions.py @@ -365,7 +365,7 @@ class MultiHeadAttention(nn.Module): def _attention_bias_proximal(self, length): """Bias for self-attention to encourage attention to close positions. Args: - length: an integer scalar. + length: an integer scalGPT_SoVITS.AR. Returns: a Tensor with shape [1, 1, length, length] """ diff --git a/GPT_SoVITS/module/attentions_onnx.py b/GPT_SoVITS/module/attentions_onnx.py index 097b1b9c..3cfbd55d 100644 --- a/GPT_SoVITS/module/attentions_onnx.py +++ b/GPT_SoVITS/module/attentions_onnx.py @@ -303,7 +303,7 @@ class MultiHeadAttention(nn.Module): def _attention_bias_proximal(self, length): """Bias for self-attention to encourage attention to close positions. Args: - length: an integer scalar. + length: an integer scalGPT_SoVITS.AR. Returns: a Tensor with shape [1, 1, length, length] """ diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index 43aac19a..3fdc6918 100644 --- a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -1,5 +1,5 @@ from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 -from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule +from GPT_SoVITS.AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule import torch import torchaudio from torch import nn diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py index 898ca549..bfd8fb22 100644 --- a/GPT_SoVITS/s1_train.py +++ b/GPT_SoVITS/s1_train.py @@ -14,14 +14,14 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger from pytorch_lightning.strategies import DDPStrategy -from AR.data.data_module import Text2SemanticDataModule -from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from AR.utils.io import load_yaml_config +from GPT_SoVITS.AR.data.data_module import Text2SemanticDataModule +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule +from GPT_SoVITS.AR.utils.io import load_yaml_config logging.getLogger("numba").setLevel(logging.WARNING) logging.getLogger("matplotlib").setLevel(logging.WARNING) torch.set_float32_matmul_precision("high") -from AR.utils import get_newest_ckpt +from GPT_SoVITS.AR.utils import get_newest_ckpt from collections import OrderedDict from time import time as ttime diff --git a/GPT_SoVITS/tools/i18n/locale/pt_BR.json b/GPT_SoVITS/tools/i18n/locale/pt_BR.json index 1a52de67..3e95d3a7 100644 --- a/GPT_SoVITS/tools/i18n/locale/pt_BR.json +++ b/GPT_SoVITS/tools/i18n/locale/pt_BR.json @@ -106,7 +106,7 @@ "开启一键三连": "Ativar um clique", "开启打标WebUI": "Abrir Labeling WebUI", "开启文本获取": "Ativar obtenção de texto", - "开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. Não preencher o texto de referência também equivale a ativar.", + "开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. 
Não preencher o texto de referência também equivale a ativGPT_SoVITS.AR.", "开启离线批量ASR": "Ativar ASR offline em lote", "开启语义token提取": "Ativar extração de token semântico", "开启语音切割": "Ativar corte de voz", diff --git a/GPT_SoVITS/tools/uvr5/bsroformer.py b/GPT_SoVITS/tools/uvr5/bsroformer.py index d1620326..3d0cd1da 100644 --- a/GPT_SoVITS/tools/uvr5/bsroformer.py +++ b/GPT_SoVITS/tools/uvr5/bsroformer.py @@ -62,7 +62,7 @@ class BsRoformer_Loader: length_init = mix.shape[-1] progress_bar = tqdm(total=length_init // step + 1) - progress_bar.set_description("Processing") + progress_bGPT_SoVITS.AR.set_description("Processing") # Do pad from the beginning and end to account floating window results better if length_init > 2 * border and (border > 0): @@ -102,7 +102,7 @@ class BsRoformer_Loader: batch_data.append(part) batch_locations.append((i, length)) i += step - progress_bar.update(1) + progress_bGPT_SoVITS.AR.update(1) if len(batch_data) >= batch_size or (i >= mix.shape[1]): arr = torch.stack(batch_data, dim=0) @@ -131,7 +131,7 @@ class BsRoformer_Loader: # Remove pad estimated_sources = estimated_sources[..., border:-border] - progress_bar.close() + progress_bGPT_SoVITS.AR.close() return {k: v for k, v in zip(['vocals', 'other'], estimated_sources)} diff --git a/GPT_SoVITS/tools/uvr5/mdxnet.py b/GPT_SoVITS/tools/uvr5/mdxnet.py index 372db25b..3587ff00 100644 --- a/GPT_SoVITS/tools/uvr5/mdxnet.py +++ b/GPT_SoVITS/tools/uvr5/mdxnet.py @@ -143,7 +143,7 @@ class Predictor: def demix_base(self, mixes, margin_size): chunked_sources = [] progress_bar = tqdm(total=len(mixes)) - progress_bar.set_description("Processing") + progress_bGPT_SoVITS.AR.set_description("Processing") for mix in mixes: cmix = mixes[mix] sources = [] @@ -188,12 +188,12 @@ class Predictor: end = None sources.append(tar_signal[:, start:end]) - progress_bar.update(1) + progress_bGPT_SoVITS.AR.update(1) chunked_sources.append(sources) _sources = np.concatenate(chunked_sources, axis=-1) # del self.model - progress_bar.close() + progress_bGPT_SoVITS.AR.close() return _sources def prediction(self, m, vocal_root, others_root, format): diff --git a/api.py b/api.py index c5f7024f..2ebc5310 100644 --- a/api.py +++ b/api.py @@ -163,7 +163,7 @@ import numpy as np from feature_extractor import cnhubert from io import BytesIO from module.models import SynthesizerTrn -from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule from text import cleaned_text_to_sequence from text.cleaner import clean_text from module.mel_processing import spectrogram_torch From 54bcce13d20849e9a727931b4b092c9376b66f7b Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 01:58:31 -0800 Subject: [PATCH 07/13] more import changes --- GPT_SoVITS/AR/data/dataset.py | 2 +- GPT_SoVITS/AR/models/t2s_model.py | 151 ++++++++++++++++++ GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 6 +- GPT_SoVITS/download.py | 2 +- GPT_SoVITS/export_torch_script.py | 4 +- GPT_SoVITS/inference_webui.py | 10 +- GPT_SoVITS/module/models.py | 18 +-- GPT_SoVITS/module/models_onnx.py | 16 +- GPT_SoVITS/module/modules.py | 6 +- GPT_SoVITS/module/mrte_model.py | 2 +- GPT_SoVITS/module/quantize.py | 2 +- GPT_SoVITS/onnx_export.py | 4 +- GPT_SoVITS/prepare_datasets/1-get-text.py | 2 +- GPT_SoVITS/prepare_datasets/3-get-semantic.py | 2 +- GPT_SoVITS/s2_train.py | 10 +- GPT_SoVITS/text/__init__.py | 8 +- GPT_SoVITS/text/cantonese.py | 6 +- GPT_SoVITS/text/chinese.py | 6 +- 
GPT_SoVITS/text/chinese2.py | 8 +- GPT_SoVITS/text/cleaner.py | 14 +- GPT_SoVITS/text/english.py | 4 +- GPT_SoVITS/text/g2pw/__init__.py | 2 +- GPT_SoVITS/text/japanese.py | 2 +- GPT_SoVITS/text/korean.py | 2 +- GPT_SoVITS/text/zh_normalization/__init__.py | 2 +- api.py | 10 +- 26 files changed, 226 insertions(+), 75 deletions(-) diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py index 9d2dfe8a..2bd85f9b 100644 --- a/GPT_SoVITS/AR/data/dataset.py +++ b/GPT_SoVITS/AR/data/dataset.py @@ -17,7 +17,7 @@ from transformers import AutoTokenizer version = os.environ.get('version',None) -from text import cleaned_text_to_sequence +from GPT_SoVITS.text import cleaned_text_to_sequence # from config import exp_dir diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py index 541b68d9..de541185 100644 --- a/GPT_SoVITS/AR/models/t2s_model.py +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -884,6 +884,157 @@ class Text2SemanticDecoder(nn.Module): return y[:, :-1], 0 return y[:, :-1], idx - 1 + def infer_panel_generator( + self, + x: torch.LongTensor, + x_lens: torch.LongTensor, + prompts: torch.LongTensor, + bert_feature: torch.LongTensor, + cumulation_amount: int, + top_k: int = -100, + top_p: int = 100, + early_stop_num: int = -1, + temperature: float = 1.0, + repetition_penalty: float = 1.35, + **kwargs + ): + """ + Generator method that yields generated tokens based on a specified cumulative amount. + + Args: + x (torch.LongTensor): Input phoneme IDs. + x_lens (torch.LongTensor): Lengths of the input sequences. + prompts (torch.LongTensor): Initial prompt tokens. + bert_feature (torch.LongTensor): BERT features corresponding to the input. + cumulation_amount (int): Number of tokens to generate before yielding. + top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (float): If set to < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + early_stop_num (int): Early stopping after generating a certain number of tokens. + temperature (float): The value used to module the next token probabilities. + repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty. + **kwargs: Additional keyword arguments. + + Yields: + torch.LongTensor: Generated tokens since the last yield. 
+ """ + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + + # AR Decoder + y = prompts + + x_len = x.shape[1] + x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device) + stop = False + + # Initialize cumulative token counter + tokens_since_last_yield = 0 + + # Initialize last yield index + if y is not None: + prefix_len = y.shape[1] + else: + prefix_len = 0 + + last_yield_idx = prefix_len + + k_cache = None + v_cache = None + + ################### first step ########################## + if y is not None: + y_emb = self.ar_audio_embedding(y) + y_len = y_emb.shape[1] + prefix_len = y.shape[1] + y_pos = self.ar_audio_position(y_emb) + xy_pos = torch.concat([x, y_pos], dim=1) + ref_free = False + else: + y_emb = None + y_len = 0 + prefix_len = 0 + y_pos = None + xy_pos = x + y = torch.zeros(x.shape[0], 0, dtype=torch.int64, device=x.device) + ref_free = True + + bsz = x.shape[0] + src_len = x_len + y_len + x_attn_mask_pad = F.pad( + x_attn_mask, + (0, y_len), # Extend x_attn_mask to include y tokens + value=True, + ) + y_attn_mask = F.pad( + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), diagonal=1), + (x_len, 0), + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + xy_attn_mask = xy_attn_mask.unsqueeze(0).expand(bsz * self.num_head, -1, -1) + xy_attn_mask = xy_attn_mask.view(bsz, self.num_head, src_len, src_len) + xy_attn_mask = xy_attn_mask.to(device=x.device, dtype=torch.bool) + + for idx in tqdm(range(1500)): + if xy_attn_mask is not None: + xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None) + else: + xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) + + logits = self.ar_predict_layer(xy_dec[:, -1]) + + if idx == 0: + xy_attn_mask = None + if idx < 11: # Ensure at least 10 tokens are generated before stopping + logits = logits[:, :-1] + + samples = sample( + logits, + y, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + temperature=temperature, + )[0] + + y = torch.concat([y, samples], dim=1) + tokens_since_last_yield += 1 + + if tokens_since_last_yield >= cumulation_amount: + # Yield back the generated tokens since last yield + generated_tokens = y[:, last_yield_idx:] + # print(generated_tokens) + yield generated_tokens + last_yield_idx = y.shape[1] + tokens_since_last_yield = 0 + + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + print("Using early stop num:", early_stop_num) + stop = True + + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + + if stop: + if y.shape[1] == 0: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + print("Bad zero prediction") + print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") + break + + # Update for next step + y_emb = self.ar_audio_embedding(y[:, -1:]) + y_len += 1 + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len - 1 + ].to(dtype=y_emb.dtype, device=y_emb.device) + + # After loop ends, yield any remaining tokens + if last_yield_idx < y.shape[1]: + generated_tokens = y[:, last_yield_idx:] + yield generated_tokens + def infer_panel( self, diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index b90bd929..beb31b93 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ 
b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -8,10 +8,10 @@ sys.path.append(now_dir) import re import torch import LangSegment -from text import chinese +from GPT_SoVITS.text import chinese from typing import Dict, List, Tuple -from text.cleaner import clean_text -from text import cleaned_text_to_sequence +from GPT_SoVITS.text.cleaner import clean_text +from GPT_SoVITS.text import cleaned_text_to_sequence from transformers import AutoModelForMaskedLM, AutoTokenizer from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method diff --git a/GPT_SoVITS/download.py b/GPT_SoVITS/download.py index e7c8c97b..e20fbfd7 100644 --- a/GPT_SoVITS/download.py +++ b/GPT_SoVITS/download.py @@ -1,5 +1,5 @@ import os, sys now_dir = os.getcwd() sys.path.insert(0, now_dir) -from text.g2pw import G2PWPinyin +from GPT_SoVITS.text.g2pw import G2PWPinyin g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",v_to_u=False, neutral_tone_with_five=True) \ No newline at end of file diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py index 5be41e19..9605590e 100644 --- a/GPT_SoVITS/export_torch_script.py +++ b/GPT_SoVITS/export_torch_script.py @@ -3,7 +3,7 @@ import argparse from typing import Optional from my_utils import load_audio -from text import cleaned_text_to_sequence +from GPT_SoVITS.text import cleaned_text_to_sequence import torch import torchaudio @@ -14,7 +14,7 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer from feature_extractor import cnhubert from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule -from module.models_onnx import SynthesizerTrn +from GPT_SoVITS.module.models_onnx import SynthesizerTrn from inference_webui import get_phones_and_bert diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 69b30294..8ca53c37 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -83,12 +83,12 @@ from feature_extractor import cnhubert cnhubert.cnhubert_base_path = cnhubert_base_path -from module.models import SynthesizerTrn +from GPT_SoVITS.module.models import SynthesizerTrn from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule -from text import cleaned_text_to_sequence -from text.cleaner import clean_text +from GPT_SoVITS.text import cleaned_text_to_sequence +from GPT_SoVITS.text.cleaner import clean_text from time import time as ttime -from module.mel_processing import spectrogram_torch +from GPT_SoVITS.module.mel_processing import spectrogram_torch from tools.my_utils import load_audio from tools.i18n.i18n import I18nAuto, scan_language_list @@ -303,7 +303,7 @@ def get_first(text): text = re.split(pattern, text)[0].strip() return text -from text import chinese +from GPT_SoVITS.text import chinese def get_phones_and_bert(text,language,version,final=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: language = language.replace("all_","") diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 968c4cbf..298db6f5 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -9,18 +9,18 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons -from module import modules -from module import attentions +from GPT_SoVITS.module import commons +from GPT_SoVITS.module import modules +from GPT_SoVITS.module import attentions from torch.nn 
import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from module.commons import init_weights, get_padding -from module.mrte_model import MRTE -from module.quantize import ResidualVectorQuantizer -# from text import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.module.commons import init_weights, get_padding +from GPT_SoVITS.module.mrte_model import MRTE +from GPT_SoVITS.module.quantize import ResidualVectorQuantizer +# from GPT_SoVITS.text import symbols +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 from torch.cuda.amp import autocast import contextlib diff --git a/GPT_SoVITS/module/models_onnx.py b/GPT_SoVITS/module/models_onnx.py index abe2a3c6..d39f50e4 100644 --- a/GPT_SoVITS/module/models_onnx.py +++ b/GPT_SoVITS/module/models_onnx.py @@ -5,17 +5,17 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons -from module import modules -from module import attentions_onnx as attentions +from GPT_SoVITS.module import commons +from GPT_SoVITS.module import modules +from GPT_SoVITS.module import attentions_onnx as attentions from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from module.commons import init_weights, get_padding -from module.quantize import ResidualVectorQuantizer -# from text import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.module.commons import init_weights, get_padding +from GPT_SoVITS.module.quantize import ResidualVectorQuantizer +# from GPT_SoVITS.text import symbols +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 from torch.cuda.amp import autocast diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index f4447455..03034042 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -7,9 +7,9 @@ from torch.nn import functional as F from torch.nn import Conv1d from torch.nn.utils import weight_norm, remove_weight_norm -from module import commons -from module.commons import init_weights, get_padding -from module.transforms import piecewise_rational_quadratic_transform +from GPT_SoVITS.module import commons +from GPT_SoVITS.module.commons import init_weights, get_padding +from GPT_SoVITS.module.transforms import piecewise_rational_quadratic_transform import torch.distributions as D diff --git a/GPT_SoVITS/module/mrte_model.py b/GPT_SoVITS/module/mrte_model.py index b0cd242c..68abdf9a 100644 --- a/GPT_SoVITS/module/mrte_model.py +++ b/GPT_SoVITS/module/mrte_model.py @@ -3,7 +3,7 @@ import torch from torch import nn from torch.nn.utils import remove_weight_norm, weight_norm -from module.attentions import MultiHeadAttention +from GPT_SoVITS.module.attentions import MultiHeadAttention class MRTE(nn.Module): diff --git a/GPT_SoVITS/module/quantize.py b/GPT_SoVITS/module/quantize.py index f9a5c632..434ca1ab 100644 --- a/GPT_SoVITS/module/quantize.py +++ b/GPT_SoVITS/module/quantize.py @@ -13,7 +13,7 @@ import typing as tp import torch from torch import nn -from module.core_vq import ResidualVectorQuantization +from GPT_SoVITS.module.core_vq import ResidualVectorQuantization @dataclass diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index 3fdc6918..092be008 100644 --- 
a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -1,4 +1,4 @@ -from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 +from GPT_SoVITS.module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 from GPT_SoVITS.AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule import torch import torchaudio @@ -8,7 +8,7 @@ from feature_extractor import cnhubert cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" cnhubert.cnhubert_base_path = cnhubert_base_path ssl_model = cnhubert.get_model() -from text import cleaned_text_to_sequence +from GPT_SoVITS.text import cleaned_text_to_sequence import soundfile from tools.my_utils import load_audio import os diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py index bdeacc7b..ef569488 100644 --- a/GPT_SoVITS/prepare_datasets/1-get-text.py +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -18,7 +18,7 @@ import sys, numpy as np, traceback, pdb import os.path from glob import glob from tqdm import tqdm -from text.cleaner import clean_text +from GPT_SoVITS.text.cleaner import clean_text from transformers import AutoModelForMaskedLM, AutoTokenizer import numpy as np from tools.my_utils import clean_path diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index a29a6629..365b3a97 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -23,7 +23,7 @@ import torch.multiprocessing as mp from glob import glob from tqdm import tqdm import logging, librosa, utils -from module.models import SynthesizerTrn +from GPT_SoVITS.module.models import SynthesizerTrn from tools.my_utils import clean_path logging.getLogger("numba").setLevel(logging.WARNING) # from config import pretrained_s2G diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py index f5de615f..2b83df9b 100644 --- a/GPT_SoVITS/s2_train.py +++ b/GPT_SoVITS/s2_train.py @@ -18,19 +18,19 @@ logging.getLogger("matplotlib").setLevel(logging.INFO) logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) from random import randint -from module import commons +from GPT_SoVITS.module import commons -from module.data_utils import ( +from GPT_SoVITS.module.data_utils import ( TextAudioSpeakerLoader, TextAudioSpeakerCollate, DistributedBucketSampler, ) -from module.models import ( +from GPT_SoVITS.module.models import ( SynthesizerTrn, MultiPeriodDiscriminator, ) -from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from GPT_SoVITS.module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss +from GPT_SoVITS.module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch from process_ckpt import savee torch.backends.cudnn.benchmark = False diff --git a/GPT_SoVITS/text/__init__.py b/GPT_SoVITS/text/__init__.py index 2791d7ab..ba4d53d7 100644 --- a/GPT_SoVITS/text/__init__.py +++ b/GPT_SoVITS/text/__init__.py @@ -1,11 +1,11 @@ import os # if os.environ.get("version","v1")=="v1": -# from text.symbols import symbols +# from GPT_SoVITS.text.symbols import symbols # else: -# from text.symbols2 import symbols +# from GPT_SoVITS.text.symbols2 import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as 
symbols_v2 _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} diff --git a/GPT_SoVITS/text/cantonese.py b/GPT_SoVITS/text/cantonese.py index 83860814..b43efcf6 100644 --- a/GPT_SoVITS/text/cantonese.py +++ b/GPT_SoVITS/text/cantonese.py @@ -5,8 +5,8 @@ import re import cn2an from pyjyutping import jyutping -from text.symbols import punctuation -from text.zh_normalization.text_normlization import TextNormalizer +from GPT_SoVITS.text.symbols import punctuation +from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer normalizer = lambda x: cn2an.transform(x, "an2cn") @@ -182,7 +182,7 @@ def get_jyutping(text): def get_bert_feature(text, word2ph): - from text import chinese_bert + from GPT_SoVITS.text import chinese_bert return chinese_bert.get_bert_feature(text, word2ph) diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py index 2255c6e1..3e50a39a 100644 --- a/GPT_SoVITS/text/chinese.py +++ b/GPT_SoVITS/text/chinese.py @@ -5,9 +5,9 @@ import re import cn2an from pypinyin import lazy_pinyin, Style -from text.symbols import punctuation -from text.tone_sandhi import ToneSandhi -from text.zh_normalization.text_normlization import TextNormalizer +from GPT_SoVITS.text.symbols import punctuation +from GPT_SoVITS.text.tone_sandhi import ToneSandhi +from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer normalizer = lambda x: cn2an.transform(x, "an2cn") diff --git a/GPT_SoVITS/text/chinese2.py b/GPT_SoVITS/text/chinese2.py index f716b410..84e9263b 100644 --- a/GPT_SoVITS/text/chinese2.py +++ b/GPT_SoVITS/text/chinese2.py @@ -6,9 +6,9 @@ import cn2an from pypinyin import lazy_pinyin, Style from pypinyin.contrib.tone_convert import to_normal, to_finals_tone3, to_initials, to_finals -from text.symbols import punctuation -from text.tone_sandhi import ToneSandhi -from text.zh_normalization.text_normlization import TextNormalizer +from GPT_SoVITS.text.symbols import punctuation +from GPT_SoVITS.text.tone_sandhi import ToneSandhi +from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer normalizer = lambda x: cn2an.transform(x, "an2cn") @@ -25,7 +25,7 @@ import jieba_fast.posseg as psg is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False if is_g2pw: print("当前使用g2pw进行拼音推理") - from text.g2pw import G2PWPinyin, correct_pronunciation + from GPT_SoVITS.text.g2pw import G2PWPinyin, correct_pronunciation parent_directory = os.path.dirname(current_file_path) g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True) diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index 98535f27..f774c30a 100644 --- a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -1,14 +1,14 @@ -from text import cleaned_text_to_sequence +from GPT_SoVITS.text import cleaned_text_to_sequence import os # if os.environ.get("version","v1")=="v1": -# from text import chinese -# from text.symbols import symbols +# from GPT_SoVITS.text import chinese +# from GPT_SoVITS.text.symbols import symbols # else: -# from text import chinese2 as chinese -# from text.symbols2 import symbols +# from GPT_SoVITS.text import chinese2 as chinese +# from GPT_SoVITS.text.symbols2 import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 
+from GPT_SoVITS.text import symbols2 as symbols_v2 special = [ # ("%", "zh", "SP"), diff --git a/GPT_SoVITS/text/english.py b/GPT_SoVITS/text/english.py index bd5612e0..3f6a71fa 100644 --- a/GPT_SoVITS/text/english.py +++ b/GPT_SoVITS/text/english.py @@ -4,9 +4,9 @@ import re import wordsegment from g2p_en import G2p -from text.symbols import punctuation +from GPT_SoVITS.text.symbols import punctuation -from text.symbols2 import symbols +from GPT_SoVITS.text.symbols2 import symbols import unicodedata from builtins import str as unicode diff --git a/GPT_SoVITS/text/g2pw/__init__.py b/GPT_SoVITS/text/g2pw/__init__.py index d7126755..3436e58b 100644 --- a/GPT_SoVITS/text/g2pw/__init__.py +++ b/GPT_SoVITS/text/g2pw/__init__.py @@ -1 +1 @@ -from text.g2pw.g2pw import * \ No newline at end of file +from GPT_SoVITS.text.g2pw.g2pw import * \ No newline at end of file diff --git a/GPT_SoVITS/text/japanese.py b/GPT_SoVITS/text/japanese.py index 440062a1..6ef157ac 100644 --- a/GPT_SoVITS/text/japanese.py +++ b/GPT_SoVITS/text/japanese.py @@ -31,7 +31,7 @@ except Exception as e: pass -from text.symbols import punctuation +from GPT_SoVITS.text.symbols import punctuation # Regular expression matching Japanese without punctuation marks: _japanese_characters = re.compile( r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" diff --git a/GPT_SoVITS/text/korean.py b/GPT_SoVITS/text/korean.py index 8f281369..21340d88 100644 --- a/GPT_SoVITS/text/korean.py +++ b/GPT_SoVITS/text/korean.py @@ -5,7 +5,7 @@ from jamo import h2j, j2hcj import ko_pron from g2pk2 import G2p -from text.symbols2 import symbols +from GPT_SoVITS.text.symbols2 import symbols # This is a list of Korean classifiers preceded by pure Korean numerals. _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' diff --git a/GPT_SoVITS/text/zh_normalization/__init__.py b/GPT_SoVITS/text/zh_normalization/__init__.py index 46b367a6..ad201c81 100644 --- a/GPT_SoVITS/text/zh_normalization/__init__.py +++ b/GPT_SoVITS/text/zh_normalization/__init__.py @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from text.zh_normalization.text_normlization import * +from GPT_SoVITS.text.zh_normalization.text_normlization import * diff --git a/api.py b/api.py index 2ebc5310..69c3ecc3 100644 --- a/api.py +++ b/api.py @@ -162,11 +162,11 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer import numpy as np from feature_extractor import cnhubert from io import BytesIO -from module.models import SynthesizerTrn +from GPT_SoVITS.module.models import SynthesizerTrn from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule -from text import cleaned_text_to_sequence -from text.cleaner import clean_text -from module.mel_processing import spectrogram_torch +from GPT_SoVITS.text import cleaned_text_to_sequence +from GPT_SoVITS.text.cleaner import clean_text +from GPT_SoVITS.module.mel_processing import spectrogram_torch from tools.my_utils import load_audio import config as global_config import logging @@ -312,7 +312,7 @@ def get_bert_inf(phones, word2ph, norm_text, language): return bert -from text import chinese +from GPT_SoVITS.text import chinese def get_phones_and_bert(text,language,version,final=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: language = language.replace("all_","") From 91c0466294b9cd9dd6f7d328cfe5a88f85d3712a Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 02:02:11 -0800 Subject: [PATCH 08/13] fix --- GPT_SoVITS/feature_extractor/cnhubert.py | 2 +- GPT_SoVITS/module/attentions.py | 4 ++-- GPT_SoVITS/module/attentions_onnx.py | 2 +- GPT_SoVITS/module/data_utils.py | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/GPT_SoVITS/feature_extractor/cnhubert.py b/GPT_SoVITS/feature_extractor/cnhubert.py index 013e462b..9548eb71 100644 --- a/GPT_SoVITS/feature_extractor/cnhubert.py +++ b/GPT_SoVITS/feature_extractor/cnhubert.py @@ -16,7 +16,7 @@ from transformers import ( HubertModel, ) -import utils +import GPT_SoVITS.utils import torch.nn as nn cnhubert_base_path = None diff --git a/GPT_SoVITS/module/attentions.py b/GPT_SoVITS/module/attentions.py index c136a07b..8adbd91e 100644 --- a/GPT_SoVITS/module/attentions.py +++ b/GPT_SoVITS/module/attentions.py @@ -3,8 +3,8 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons -from module.modules import LayerNorm +from GPT_SoVITS.module import commons +from GPT_SoVITS.module.modules import LayerNorm class Encoder(nn.Module): diff --git a/GPT_SoVITS/module/attentions_onnx.py b/GPT_SoVITS/module/attentions_onnx.py index 3cfbd55d..3bdc4419 100644 --- a/GPT_SoVITS/module/attentions_onnx.py +++ b/GPT_SoVITS/module/attentions_onnx.py @@ -3,7 +3,7 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons +from GPT_SoVITS.module import commons from typing import Optional diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index bfa29e87..2a6d6653 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -8,16 +8,16 @@ import torch import torch.utils.data from tqdm import tqdm -from module import commons -from module.mel_processing import spectrogram_torch -from text import cleaned_text_to_sequence -from utils import load_wav_to_torch, load_filepaths_and_text +from GPT_SoVITS.module import commons +from GPT_SoVITS.module.mel_processing import spectrogram_torch +from GPT_SoVITS.text import cleaned_text_to_sequence +from GPT_SoVITS.utils import load_wav_to_torch, load_filepaths_and_text import torch.nn.functional 
as F from functools import lru_cache import requests from scipy.io import wavfile from io import BytesIO -from tools.my_utils import load_audio +from GPT_SoVITS.tools.my_utils import load_audio version = os.environ.get('version',None) # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): From 55badf45c56bed6e0491f080faf4d4fc277968c1 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 02:03:19 -0800 Subject: [PATCH 09/13] fix --- GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 2 +- GPT_SoVITS/api_v2.py | 2 +- GPT_SoVITS/inference_cli.py | 2 +- GPT_SoVITS/inference_gui.py | 2 +- GPT_SoVITS/inference_webui.py | 4 ++-- GPT_SoVITS/inference_webui_fast.py | 2 +- GPT_SoVITS/onnx_export.py | 2 +- GPT_SoVITS/prepare_datasets/1-get-text.py | 2 +- GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py | 2 +- GPT_SoVITS/prepare_datasets/3-get-semantic.py | 2 +- GPT_SoVITS/process_ckpt.py | 2 +- GPT_SoVITS/tools/asr/fasterwhisper_asr.py | 4 ++-- GPT_SoVITS/tools/my_utils.py | 2 +- GPT_SoVITS/tools/slice_audio.py | 2 +- GPT_SoVITS/tools/uvr5/webui.py | 4 ++-- api.py | 2 +- 16 files changed, 19 insertions(+), 19 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index beb31b93..0f1ab7a8 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -15,7 +15,7 @@ from GPT_SoVITS.text import cleaned_text_to_sequence from transformers import AutoModelForMaskedLM, AutoTokenizer from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method -from tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools..i18n.i18n import I18nAuto, scan_language_list language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/api_v2.py b/GPT_SoVITS/api_v2.py index 84e6ffd7..a22386a7 100644 --- a/GPT_SoVITS/api_v2.py +++ b/GPT_SoVITS/api_v2.py @@ -117,7 +117,7 @@ import uvicorn from importlib.resources import files from io import BytesIO -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools..i18n.i18n import I18nAuto from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names from fastapi.responses import StreamingResponse diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py index bd987aaf..251ce791 100644 --- a/GPT_SoVITS/inference_cli.py +++ b/GPT_SoVITS/inference_cli.py @@ -2,7 +2,7 @@ import argparse import os import soundfile as sf -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools..i18n.i18n import I18nAuto from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav i18n = I18nAuto() diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py index 93ea0f5d..c02120d4 100644 --- a/GPT_SoVITS/inference_gui.py +++ b/GPT_SoVITS/inference_gui.py @@ -5,7 +5,7 @@ from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushB from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox import soundfile as sf -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools..i18n.i18n import I18nAuto i18n = I18nAuto() from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav diff --git 
a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 8ca53c37..946ea59a 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -89,8 +89,8 @@ from GPT_SoVITS.text import cleaned_text_to_sequence from GPT_SoVITS.text.cleaner import clean_text from time import time as ttime from GPT_SoVITS.module.mel_processing import spectrogram_torch -from tools.my_utils import load_audio -from tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools..my_utils import load_audio +from GPT_SoVITS.tools..i18n.i18n import I18nAuto, scan_language_list language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index dcc2bcf9..9ea80455 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -46,7 +46,7 @@ version=os.environ.get("version","v2") import gradio as gr from TTS_infer_pack.TTS import TTS, TTS_Config from TTS_infer_pack.text_segmentation_method import get_method -from tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools..i18n.i18n import I18nAuto, scan_language_list language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index 092be008..80954710 100644 --- a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -10,7 +10,7 @@ cnhubert.cnhubert_base_path = cnhubert_base_path ssl_model = cnhubert.get_model() from GPT_SoVITS.text import cleaned_text_to_sequence import soundfile -from tools.my_utils import load_audio +from GPT_SoVITS.tools..my_utils import load_audio import os import json diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py index ef569488..38cab467 100644 --- a/GPT_SoVITS/prepare_datasets/1-get-text.py +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -21,7 +21,7 @@ from tqdm import tqdm from GPT_SoVITS.text.cleaner import clean_text from transformers import AutoModelForMaskedLM, AutoTokenizer import numpy as np -from tools.my_utils import clean_path +from GPT_SoVITS.tools..my_utils import clean_path # inp_text=sys.argv[1] # inp_wav_dir=sys.argv[2] diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 27b61f27..32fe8045 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -19,7 +19,7 @@ from scipy.io import wavfile import librosa now_dir = os.getcwd() sys.path.append(now_dir) -from tools.my_utils import load_audio,clean_path +from GPT_SoVITS.tools..my_utils import load_audio,clean_path # from config import cnhubert_base_path # cnhubert.cnhubert_base_path=cnhubert_base_path diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index 365b3a97..602877cd 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -24,7 +24,7 @@ from glob import glob from tqdm import tqdm import logging, librosa, utils from GPT_SoVITS.module.models import SynthesizerTrn -from tools.my_utils import clean_path +from GPT_SoVITS.tools..my_utils import clean_path logging.getLogger("numba").setLevel(logging.WARNING) # from config import pretrained_s2G diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index 
3a436f10..0fd4783e 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -3,7 +3,7 @@ from collections import OrderedDict from time import time as ttime import shutil,os import torch -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools..i18n.i18n import I18nAuto i18n = I18nAuto() diff --git a/GPT_SoVITS/tools/asr/fasterwhisper_asr.py b/GPT_SoVITS/tools/asr/fasterwhisper_asr.py index d46cbbd7..ba124dc2 100644 --- a/GPT_SoVITS/tools/asr/fasterwhisper_asr.py +++ b/GPT_SoVITS/tools/asr/fasterwhisper_asr.py @@ -9,7 +9,7 @@ import torch from faster_whisper import WhisperModel from tqdm import tqdm -from tools.asr.config import check_fw_local_models +from GPT_SoVITS.tools..asr.config import check_fw_local_models language_code_list = [ "af", "am", "ar", "as", "az", @@ -69,7 +69,7 @@ def execute_asr(input_folder, output_folder, model_size, language, precision): if info.language == "zh": print("检测为中文文本, 转 FunASR 处理") if("only_asr" not in globals()): - from tools.asr.funasr_asr import only_asr #如果用英文就不需要导入下载模型 + from GPT_SoVITS.tools..asr.funasr_asr import only_asr #如果用英文就不需要导入下载模型 text = only_asr(file_path, language=info.language.lower()) if text == '': diff --git a/GPT_SoVITS/tools/my_utils.py b/GPT_SoVITS/tools/my_utils.py index c1469234..60bdd0fb 100644 --- a/GPT_SoVITS/tools/my_utils.py +++ b/GPT_SoVITS/tools/my_utils.py @@ -2,7 +2,7 @@ import platform,os,traceback import ffmpeg import numpy as np import gradio as gr -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools..i18n.i18n import I18nAuto import pandas as pd i18n = I18nAuto(language=os.environ.get('language','Auto')) diff --git a/GPT_SoVITS/tools/slice_audio.py b/GPT_SoVITS/tools/slice_audio.py index b9912ae0..2708a0d0 100644 --- a/GPT_SoVITS/tools/slice_audio.py +++ b/GPT_SoVITS/tools/slice_audio.py @@ -3,7 +3,7 @@ import traceback from scipy.io import wavfile # parent_directory = os.path.dirname(os.path.abspath(__file__)) # sys.path.append(parent_directory) -from tools.my_utils import load_audio +from GPT_SoVITS.tools..my_utils import load_audio from slicer2 import Slicer def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): diff --git a/GPT_SoVITS/tools/uvr5/webui.py b/GPT_SoVITS/tools/uvr5/webui.py index dc6dd024..abf46b60 100644 --- a/GPT_SoVITS/tools/uvr5/webui.py +++ b/GPT_SoVITS/tools/uvr5/webui.py @@ -1,8 +1,8 @@ import os import traceback,gradio as gr import logging -from tools.i18n.i18n import I18nAuto -from tools.my_utils import clean_path +from GPT_SoVITS.tools..i18n.i18n import I18nAuto +from GPT_SoVITS.tools..my_utils import clean_path i18n = I18nAuto() logger = logging.getLogger(__name__) diff --git a/api.py b/api.py index 69c3ecc3..3965ab27 100644 --- a/api.py +++ b/api.py @@ -167,7 +167,7 @@ from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModu from GPT_SoVITS.text import cleaned_text_to_sequence from GPT_SoVITS.text.cleaner import clean_text from GPT_SoVITS.module.mel_processing import spectrogram_torch -from tools.my_utils import load_audio +from GPT_SoVITS.tools..my_utils import load_audio import config as global_config import logging import subprocess From 4beb007f50839d414ce167777dc24a11c3ff91b8 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 02:04:53 -0800 Subject: [PATCH 10/13] ..fix --- GPT_SoVITS/TTS_infer_pack/TTS.py | 4 ++-- GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 2 +- GPT_SoVITS/api_v2.py | 2 +- GPT_SoVITS/inference_cli.py | 2 +- GPT_SoVITS/inference_gui.py | 2 +- 
GPT_SoVITS/inference_webui.py | 4 ++-- GPT_SoVITS/inference_webui_fast.py | 2 +- GPT_SoVITS/module/data_utils.py | 2 +- GPT_SoVITS/onnx_export.py | 2 +- GPT_SoVITS/prepare_datasets/1-get-text.py | 2 +- GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py | 2 +- GPT_SoVITS/prepare_datasets/3-get-semantic.py | 2 +- GPT_SoVITS/process_ckpt.py | 2 +- GPT_SoVITS/tools/asr/fasterwhisper_asr.py | 4 ++-- GPT_SoVITS/tools/my_utils.py | 2 +- GPT_SoVITS/tools/slice_audio.py | 2 +- GPT_SoVITS/tools/uvr5/webui.py | 4 ++-- api.py | 2 +- 18 files changed, 22 insertions(+), 22 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index fe5f7055..f047a8b0 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -29,8 +29,8 @@ from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModu from GPT_SoVITS.feature_extractor.cnhubert import CNHubert from GPT_SoVITS.module.models import SynthesizerTrn from GPT_SoVITS.module.mel_processing import spectrogram_torch -from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list -from GPT_SoVITS.tools.my_utils import load_audio +from GPT_SoVITS.toolsi18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.toolsmy_utils import load_audio from transformers import AutoModelForMaskedLM, AutoTokenizer from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import splits from GPT_SoVITS.TTS_infer_pack.TextPreprocessor import TextPreprocessor diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 0f1ab7a8..30f6565a 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -15,7 +15,7 @@ from GPT_SoVITS.text import cleaned_text_to_sequence from transformers import AutoModelForMaskedLM, AutoTokenizer from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method -from GPT_SoVITS.tools..i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/api_v2.py b/GPT_SoVITS/api_v2.py index a22386a7..445f6b6b 100644 --- a/GPT_SoVITS/api_v2.py +++ b/GPT_SoVITS/api_v2.py @@ -117,7 +117,7 @@ import uvicorn from importlib.resources import files from io import BytesIO -from GPT_SoVITS.tools..i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names from fastapi.responses import StreamingResponse diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py index 251ce791..ca87a3f7 100644 --- a/GPT_SoVITS/inference_cli.py +++ b/GPT_SoVITS/inference_cli.py @@ -2,7 +2,7 @@ import argparse import os import soundfile as sf -from GPT_SoVITS.tools..i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav i18n = I18nAuto() diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py index c02120d4..18ade117 100644 --- a/GPT_SoVITS/inference_gui.py +++ b/GPT_SoVITS/inference_gui.py @@ -5,7 +5,7 @@ from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushB from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, 
QFileDialog, QStatusBar, QComboBox import soundfile as sf -from GPT_SoVITS.tools..i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto i18n = I18nAuto() from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 946ea59a..7ec6c2ce 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -89,8 +89,8 @@ from GPT_SoVITS.text import cleaned_text_to_sequence from GPT_SoVITS.text.cleaner import clean_text from time import time as ttime from GPT_SoVITS.module.mel_processing import spectrogram_torch -from GPT_SoVITS.tools..my_utils import load_audio -from GPT_SoVITS.tools..i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools.my_utils import load_audio +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 9ea80455..072b2f40 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -46,7 +46,7 @@ version=os.environ.get("version","v2") import gradio as gr from TTS_infer_pack.TTS import TTS, TTS_Config from TTS_infer_pack.text_segmentation_method import get_method -from GPT_SoVITS.tools..i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index 2a6d6653..de19b5ed 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -17,7 +17,7 @@ from functools import lru_cache import requests from scipy.io import wavfile from io import BytesIO -from GPT_SoVITS.tools.my_utils import load_audio +from GPT_SoVITS.toolsmy_utils import load_audio version = os.environ.get('version',None) # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index 80954710..596ea97a 100644 --- a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -10,7 +10,7 @@ cnhubert.cnhubert_base_path = cnhubert_base_path ssl_model = cnhubert.get_model() from GPT_SoVITS.text import cleaned_text_to_sequence import soundfile -from GPT_SoVITS.tools..my_utils import load_audio +from GPT_SoVITS.tools.my_utils import load_audio import os import json diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py index 38cab467..cbc9ca84 100644 --- a/GPT_SoVITS/prepare_datasets/1-get-text.py +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -21,7 +21,7 @@ from tqdm import tqdm from GPT_SoVITS.text.cleaner import clean_text from transformers import AutoModelForMaskedLM, AutoTokenizer import numpy as np -from GPT_SoVITS.tools..my_utils import clean_path +from GPT_SoVITS.tools.my_utils import clean_path # inp_text=sys.argv[1] # inp_wav_dir=sys.argv[2] diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 32fe8045..c5673ffe 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -19,7 +19,7 @@ from 
scipy.io import wavfile import librosa now_dir = os.getcwd() sys.path.append(now_dir) -from GPT_SoVITS.tools..my_utils import load_audio,clean_path +from GPT_SoVITS.tools.my_utils import load_audio,clean_path # from config import cnhubert_base_path # cnhubert.cnhubert_base_path=cnhubert_base_path diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index 602877cd..3e2e144a 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -24,7 +24,7 @@ from glob import glob from tqdm import tqdm import logging, librosa, utils from GPT_SoVITS.module.models import SynthesizerTrn -from GPT_SoVITS.tools..my_utils import clean_path +from GPT_SoVITS.tools.my_utils import clean_path logging.getLogger("numba").setLevel(logging.WARNING) # from config import pretrained_s2G diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index 0fd4783e..c76b5639 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -3,7 +3,7 @@ from collections import OrderedDict from time import time as ttime import shutil,os import torch -from GPT_SoVITS.tools..i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto i18n = I18nAuto() diff --git a/GPT_SoVITS/tools/asr/fasterwhisper_asr.py b/GPT_SoVITS/tools/asr/fasterwhisper_asr.py index ba124dc2..e2689b31 100644 --- a/GPT_SoVITS/tools/asr/fasterwhisper_asr.py +++ b/GPT_SoVITS/tools/asr/fasterwhisper_asr.py @@ -9,7 +9,7 @@ import torch from faster_whisper import WhisperModel from tqdm import tqdm -from GPT_SoVITS.tools..asr.config import check_fw_local_models +from GPT_SoVITS.tools.asr.config import check_fw_local_models language_code_list = [ "af", "am", "ar", "as", "az", @@ -69,7 +69,7 @@ def execute_asr(input_folder, output_folder, model_size, language, precision): if info.language == "zh": print("检测为中文文本, 转 FunASR 处理") if("only_asr" not in globals()): - from GPT_SoVITS.tools..asr.funasr_asr import only_asr #如果用英文就不需要导入下载模型 + from GPT_SoVITS.tools.asr.funasr_asr import only_asr #如果用英文就不需要导入下载模型 text = only_asr(file_path, language=info.language.lower()) if text == '': diff --git a/GPT_SoVITS/tools/my_utils.py b/GPT_SoVITS/tools/my_utils.py index 60bdd0fb..9222dcad 100644 --- a/GPT_SoVITS/tools/my_utils.py +++ b/GPT_SoVITS/tools/my_utils.py @@ -2,7 +2,7 @@ import platform,os,traceback import ffmpeg import numpy as np import gradio as gr -from GPT_SoVITS.tools..i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto import pandas as pd i18n = I18nAuto(language=os.environ.get('language','Auto')) diff --git a/GPT_SoVITS/tools/slice_audio.py b/GPT_SoVITS/tools/slice_audio.py index 2708a0d0..14b2c37f 100644 --- a/GPT_SoVITS/tools/slice_audio.py +++ b/GPT_SoVITS/tools/slice_audio.py @@ -3,7 +3,7 @@ import traceback from scipy.io import wavfile # parent_directory = os.path.dirname(os.path.abspath(__file__)) # sys.path.append(parent_directory) -from GPT_SoVITS.tools..my_utils import load_audio +from GPT_SoVITS.tools.my_utils import load_audio from slicer2 import Slicer def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): diff --git a/GPT_SoVITS/tools/uvr5/webui.py b/GPT_SoVITS/tools/uvr5/webui.py index abf46b60..96c55793 100644 --- a/GPT_SoVITS/tools/uvr5/webui.py +++ b/GPT_SoVITS/tools/uvr5/webui.py @@ -1,8 +1,8 @@ import os import traceback,gradio as gr import logging -from GPT_SoVITS.tools..i18n.i18n import I18nAuto -from GPT_SoVITS.tools..my_utils 
import clean_path +from GPT_SoVITS.tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools.my_utils import clean_path i18n = I18nAuto() logger = logging.getLogger(__name__) diff --git a/api.py b/api.py index 3965ab27..d0db7179 100644 --- a/api.py +++ b/api.py @@ -167,7 +167,7 @@ from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModu from GPT_SoVITS.text import cleaned_text_to_sequence from GPT_SoVITS.text.cleaner import clean_text from GPT_SoVITS.module.mel_processing import spectrogram_torch -from GPT_SoVITS.tools..my_utils import load_audio +from GPT_SoVITS.tools.my_utils import load_audio import config as global_config import logging import subprocess From 63a412bda55f5a58356c5193e32eb339d7ca4b48 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 02:07:09 -0800 Subject: [PATCH 11/13] . --- GPT_SoVITS/TTS_infer_pack/TTS.py | 4 ++-- GPT_SoVITS/module/data_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index f047a8b0..fe5f7055 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -29,8 +29,8 @@ from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModu from GPT_SoVITS.feature_extractor.cnhubert import CNHubert from GPT_SoVITS.module.models import SynthesizerTrn from GPT_SoVITS.module.mel_processing import spectrogram_torch -from GPT_SoVITS.toolsi18n.i18n import I18nAuto, scan_language_list -from GPT_SoVITS.toolsmy_utils import load_audio +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools.my_utils import load_audio from transformers import AutoModelForMaskedLM, AutoTokenizer from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import splits from GPT_SoVITS.TTS_infer_pack.TextPreprocessor import TextPreprocessor diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index de19b5ed..2a6d6653 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -17,7 +17,7 @@ from functools import lru_cache import requests from scipy.io import wavfile from io import BytesIO -from GPT_SoVITS.toolsmy_utils import load_audio +from GPT_SoVITS.tools.my_utils import load_audio version = os.environ.get('version',None) # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): From 894d724b3675bacd8ff004895ee49c844e2b8740 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 02:08:38 -0800 Subject: [PATCH 12/13] . 
--- GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 2 +- GPT_SoVITS/inference_webui_fast.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 30f6565a..81f22ec4 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -13,7 +13,7 @@ from typing import Dict, List, Tuple from GPT_SoVITS.text.cleaner import clean_text from GPT_SoVITS.text import cleaned_text_to_sequence from transformers import AutoModelForMaskedLM, AutoTokenizer -from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 072b2f40..84966a80 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -44,8 +44,8 @@ bert_path = os.environ.get("bert_path", None) version=os.environ.get("version","v2") import gradio as gr -from TTS_infer_pack.TTS import TTS, TTS_Config -from TTS_infer_pack.text_segmentation_method import get_method +from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list language=os.environ.get("language","Auto") From 60ddc7a4a43e2d677e82e484b18adac2d47dfc17 Mon Sep 17 00:00:00 2001 From: Jarod Mica Date: Mon, 23 Dec 2024 04:57:56 -0800 Subject: [PATCH 13/13] . --- GPT_SoVITS/s2_train.py | 38 +++++++++++++++++++------------------- GPT_SoVITS/text/cleaner.py | 4 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py index 2b83df9b..9e166598 100644 --- a/GPT_SoVITS/s2_train.py +++ b/GPT_SoVITS/s2_train.py @@ -1,7 +1,7 @@ import warnings warnings.filterwarnings("ignore") -import utils, os -hps = utils.get_hparams(stage=2) +import GPT_SoVITS.utils, os +hps = GPT_SoVITS.utils.get_hparams(stage=2) os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") import torch from torch.nn import functional as F @@ -67,7 +67,7 @@ def main(): def run(rank, n_gpus, hps): global global_step if rank == 0: - logger = utils.get_logger(hps.data.exp_dir) + logger = GPT_SoVITS.utils.get_logger(hps.data.exp_dir) logger.info(hps) # utils.check_git_hash(hps.s2_ckpt_dir) writer = SummaryWriter(log_dir=hps.s2_ckpt_dir) @@ -192,16 +192,16 @@ def run(rank, n_gpus, hps): net_d = net_d.to(device) try: # 如果能加载自动resume - _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "D_*.pth"), + _, _, _, epoch_str = GPT_SoVITS.utils.load_checkpoint( + GPT_SoVITS.utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "D_*.pth"), net_d, optim_d, ) # D多半加载没事 if rank == 0: logger.info("loaded D") # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) - _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "G_*.pth"), + _, _, _, epoch_str = GPT_SoVITS.utils.load_checkpoint( + GPT_SoVITS.utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "G_*.pth"), net_g, optim_g, ) @@ -427,20 +427,20 @@ def train_and_evaluate( # 
scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy( + "slice/mel_org": GPT_SoVITS.utils.plot_spectrogram_to_numpy( y_mel[0].data.cpu().numpy() ), - "slice/mel_gen": utils.plot_spectrogram_to_numpy( + "slice/mel_gen": GPT_SoVITS.utils.plot_spectrogram_to_numpy( y_hat_mel[0].data.cpu().numpy() ), - "all/mel": utils.plot_spectrogram_to_numpy( + "all/mel": GPT_SoVITS.utils.plot_spectrogram_to_numpy( mel[0].data.cpu().numpy() ), - "all/stats_ssl": utils.plot_spectrogram_to_numpy( + "all/stats_ssl": GPT_SoVITS.utils.plot_spectrogram_to_numpy( stats_ssl[0].data.cpu().numpy() ), } - utils.summarize( + GPT_SoVITS.utils.summarize( writer=writer, global_step=global_step, images=image_dict, @@ -449,7 +449,7 @@ def train_and_evaluate( global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: if hps.train.if_save_latest == 0: - utils.save_checkpoint( + GPT_SoVITS.utils.save_checkpoint( net_g, optim_g, hps.train.learning_rate, @@ -458,7 +458,7 @@ def train_and_evaluate( "%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(global_step) ), ) - utils.save_checkpoint( + GPT_SoVITS.utils.save_checkpoint( net_d, optim_d, hps.train.learning_rate, @@ -468,7 +468,7 @@ def train_and_evaluate( ), ) else: - utils.save_checkpoint( + GPT_SoVITS.utils.save_checkpoint( net_g, optim_g, hps.train.learning_rate, @@ -477,7 +477,7 @@ def train_and_evaluate( "%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(233333333333) ), ) - utils.save_checkpoint( + GPT_SoVITS.utils.save_checkpoint( net_d, optim_d, hps.train.learning_rate, @@ -565,7 +565,7 @@ def evaluate(hps, generator, eval_loader, writer_eval): ) image_dict.update( { - f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy( + f"gen/mel_{batch_idx}_{test}": GPT_SoVITS.utils.plot_spectrogram_to_numpy( y_hat_mel[0].cpu().numpy() ) } @@ -575,7 +575,7 @@ def evaluate(hps, generator, eval_loader, writer_eval): ) image_dict.update( { - f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy( + f"gt/mel_{batch_idx}": GPT_SoVITS.utils.plot_spectrogram_to_numpy( mel[0].cpu().numpy() ) } @@ -587,7 +587,7 @@ def evaluate(hps, generator, eval_loader, writer_eval): # f"gen/audio_{batch_idx}_style_pred": y_hat[0, :, :] # }) - utils.summarize( + GPT_SoVITS.utils.summarize( writer=writer_eval, global_step=global_step, images=image_dict, diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index f774c30a..d32de36b 100644 --- a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -33,7 +33,7 @@ def clean_text(text, language, version=None): for special_s, special_l, target_symbol in special: if special_s in text and language == special_l: return clean_special(text, language, special_s, target_symbol, version) - language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]]) + language_module = __import__("GPT_SoVITS.text."+language_module_map[language],fromlist=[language_module_map[language]]) if hasattr(language_module,"text_normalize"): norm_text = language_module.text_normalize(text) else: @@ -67,7 +67,7 @@ def clean_special(text, language, special_s, target_symbol, version=None): 特殊静音段sp符号处理 """ text = text.replace(special_s, ",") - language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]]) + language_module = 
__import__("GPT_SoVITS.text."+language_module_map[language],fromlist=[language_module_map[language]]) norm_text = language_module.text_normalize(text) phones = language_module.g2p(norm_text) new_ph = []