From 00f417ea068af55ccc6e586052ff99f31fd643cb Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Wed, 7 Aug 2024 18:49:55 +0800
Subject: [PATCH 1/6] Update models.py
---
GPT_SoVITS/module/models.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py
index 6bfee085..968c4cbf 100644
--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@@ -1,3 +1,5 @@
+import warnings
+warnings.filterwarnings("ignore")
import copy
import math
import os
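Patch 1 silences every warning process-wide before the rest of `models.py` is imported. For context, `warnings.filterwarnings` also accepts narrower filters; a minimal sketch of scoping the suppression (the `module="torch"` filter is illustrative, not from the patch):

```python
import warnings

# Blanket suppression, as in the patch: hides every warning,
# including ones that may signal real problems.
warnings.filterwarnings("ignore")

# Narrower alternatives: silence one category, optionally only for
# warnings raised from a given module (matched as a regex).
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="torch")
```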
From 893b45246b47f029edae1dc963f524758603354a Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Thu, 8 Aug 2024 18:05:05 +0800
Subject: [PATCH 2/6] Update onnx_api.py
---
GPT_SoVITS/text/g2pw/onnx_api.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py
index 374c9a4e..32fc2c01 100644
--- a/GPT_SoVITS/text/g2pw/onnx_api.py
+++ b/GPT_SoVITS/text/g2pw/onnx_api.py
@@ -86,10 +86,10 @@ class G2PWOnnxConverter:
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
sess_options.intra_op_num_threads = 2
- self.session_g2pW = onnxruntime.InferenceSession(
- os.path.join(uncompress_path, 'g2pW.onnx'),
- sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
- # sess_options=sess_options)
+        try:
+            self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'), sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+        except Exception:
+            self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'), sess_options=sess_options, providers=['CPUExecutionProvider'])
self.config = load_config(
config_path=os.path.join(uncompress_path, 'config.py'),
use_default=True)
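Patch 2 makes G2PW session creation degrade gracefully: if building the CUDA-backed `InferenceSession` raises (no GPU, or an onnxruntime build without the CUDA provider), it retries on CPU. An equivalent, more explicit variant checks the available providers up front; a sketch under the same assumptions (the helper name is ours, not from the repo):

```python
import onnxruntime

def make_g2pw_session(model_path: str, sess_options=None):
    # Ask the installed onnxruntime build which providers it actually
    # exposes, instead of catching the failure after the fact.
    available = onnxruntime.get_available_providers()
    providers = ["CPUExecutionProvider"]
    if "CUDAExecutionProvider" in available:
        providers.insert(0, "CUDAExecutionProvider")
    return onnxruntime.InferenceSession(
        model_path, sess_options=sess_options, providers=providers)
```

The patch's try/except remains the more defensive choice: a provider can be listed as available yet still fail to initialize, for example under a broken CUDA driver.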
From 2310bcde5378930a1472570be0f54d766616b04b Mon Sep 17 00:00:00 2001
From: KamioRinn <63162909+KamioRinn@users.noreply.github.com>
Date: Sat, 10 Aug 2024 12:28:53 +0800
Subject: [PATCH 3/6] Optimize short sentence (#1430)
---
GPT_SoVITS/inference_webui.py | 7 +++++--
GPT_SoVITS/text/cleaner.py | 2 +-
api.py | 5 ++++-
3 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 727b9f7b..878f8d85 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -299,7 +299,7 @@ def get_first(text):
return text
from text import chinese
-def get_phones_and_bert(text,language,version):
+def get_phones_and_bert(text,language,version,final=False):
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
language = language.replace("all_","")
if language == "en":
@@ -366,6 +366,9 @@ def get_phones_and_bert(text,language,version):
phones = sum(phones_list, [])
norm_text = ''.join(norm_text_list)
+ if not final and len(phones) < 6:
+ return get_phones_and_bert("." + text,language,version,final=True)
+
return phones,bert.to(dtype),norm_text
@@ -408,7 +411,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
print(i18n("实际输入的参考文本:"), prompt_text)
text = text.strip("\n")
- if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
+ # if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
print(i18n("实际输入的目标文本:"), text)
zero_wav = np.zeros(
diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py
index 1091a342..298e4d28 100644
--- a/GPT_SoVITS/text/cleaner.py
+++ b/GPT_SoVITS/text/cleaner.py
@@ -45,7 +45,7 @@ def clean_text(text, language, version=None):
elif language == "en":
phones = language_module.g2p(norm_text)
if len(phones) < 4:
- phones = [','] * (4 - len(phones)) + phones
+ phones = [','] + phones
word2ph = None
else:
phones = language_module.g2p(norm_text)
diff --git a/api.py b/api.py
index e510ab95..3b173948 100644
--- a/api.py
+++ b/api.py
@@ -275,7 +275,7 @@ def get_bert_inf(phones, word2ph, norm_text, language):
return bert
from text import chinese
-def get_phones_and_bert(text,language,version):
+def get_phones_and_bert(text,language,version,final=False):
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
language = language.replace("all_","")
if language == "en":
@@ -340,6 +340,9 @@ def get_phones_and_bert(text,language,version):
phones = sum(phones_list, [])
norm_text = ''.join(norm_text_list)
+ if not final and len(phones) < 6:
+ return get_phones_and_bert("." + text,language,version,final=True)
+
return phones,bert.to(torch.float16 if is_half == True else torch.float32),norm_text
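Patch 3 replaces the old hard prefix rule in `get_tts_wav` with a retry-once guard inside `get_phones_and_bert`: when the whole input yields fewer than six phonemes, a period is prepended and the function calls itself exactly once more, with `final=True` blocking further recursion. The pattern in isolation, with a toy stand-in for the real text front end:

```python
def g2p(text: str) -> list[str]:
    # Toy stand-in for the real grapheme-to-phoneme front end:
    # one pseudo-phoneme per non-space character.
    return [ch for ch in text if not ch.isspace()]

def phonemize(text: str, final: bool = False) -> list[str]:
    phones = g2p(text)
    # Very short inputs synthesize poorly, so pad with a leading "."
    # and retry once; final=True caps the recursion depth at two.
    if not final and len(phones) < 6:
        return phonemize("." + text, final=True)
    return phones

print(phonemize("hi"))  # ['.', 'h', 'i'] after one padded retry
```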
From 62831dfcc7ec236cc767c78add8bb950f5444313 Mon Sep 17 00:00:00 2001
From: 蓝梦实 <36986837+SapphireLab@users.noreply.github.com>
Date: Mon, 12 Aug 2024 10:42:52 +0800
Subject: [PATCH 4/6] fix_onlyasr (#1433)
---
tools/asr/fasterwhisper_asr.py | 7 +--
tools/asr/funasr_asr.py | 100 +++++++++++++++++++--------------
2 files changed, 60 insertions(+), 47 deletions(-)
diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py
index da8eadfb..d46cbbd7 100644
--- a/tools/asr/fasterwhisper_asr.py
+++ b/tools/asr/fasterwhisper_asr.py
@@ -68,10 +68,9 @@ def execute_asr(input_folder, output_folder, model_size, language, precision):
if info.language == "zh":
print("检测为中文文本, 转 FunASR 处理")
- if("only_asr"not in globals()):
- from tools.asr.funasr_asr import \
- only_asr # #如果用英文就不需要导入下载模型
- text = only_asr(file_path)
+ if("only_asr" not in globals()):
+            from tools.asr.funasr_asr import only_asr  # if the audio is English there is no need to import or download the FunASR model
+ text = only_asr(file_path, language=info.language.lower())
if text == '':
for segment in segments:
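The hunk above also changes how `only_asr` arrives: instead of importing it at module load, `execute_asr` imports it when the first Chinese file appears and relies on the `globals()` check to skip the import afterwards. A compact sketch of that lazy, import-once idiom (function and module names follow the patch; the body is simplified):

```python
def transcribe_zh(file_path: str) -> str:
    # Import (and therefore download) the FunASR pipeline only when a
    # Chinese file actually shows up; English-only runs never pay for it.
    if "only_asr" not in globals():
        global only_asr
        from tools.asr.funasr_asr import only_asr
    return only_asr(file_path, language="zh")
```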
diff --git a/tools/asr/funasr_asr.py b/tools/asr/funasr_asr.py
index 11209ada..fe520e24 100644
--- a/tools/asr/funasr_asr.py
+++ b/tools/asr/funasr_asr.py
@@ -3,30 +3,72 @@
import argparse
import os
import traceback
-from tqdm import tqdm
+
# from funasr.utils import version_checker
# version_checker.check_for_update = lambda: None
from funasr import AutoModel
+from tqdm import tqdm
+funasr_models = {}  # cache loaded models so each language is loaded only once
-def only_asr(input_file):
+def only_asr(input_file, language):
try:
+ model = create_model(language)
text = model.generate(input=input_file)[0]["text"]
except:
text = ''
print(traceback.format_exc())
return text
+def create_model(language="zh"):
+ path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
+ path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
+ path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+ path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+ vad_model_revision = punc_model_revision = "v2.0.4"
+
+ if language == "zh":
+ path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
+ path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+ model_revision = "v2.0.4"
+ elif language == "yue":
+ path_asr = 'tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online'
+ path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
+ model_revision = "master"
+ path_vad = path_punc = None
+ vad_model_revision = punc_model_revision = None
+        ### Note: Cantonese recognition with VAD may occasionally raise shape-mismatch errors, but it works without VAD; without VAD, punctuation can only be added separately in a later pass, and the punctuation model handles Cantonese poorly anyway...
+ else:
+        raise ValueError("FunASR does not support this language: " + language)
+
+ if language in funasr_models:
+ return funasr_models[language]
+ else:
+ model = AutoModel(
+ model = path_asr,
+ model_revision = model_revision,
+ vad_model = path_vad,
+ vad_model_revision = vad_model_revision,
+ punc_model = path_punc,
+ punc_model_revision = punc_model_revision,
+ )
+        print(f"FunASR model loaded: {language.upper()}")
+
+ funasr_models[language] = model
+ return model
+
def execute_asr(input_folder, output_folder, model_size, language):
input_file_names = os.listdir(input_folder)
input_file_names.sort()
output = []
output_file_name = os.path.basename(input_folder)
+
+ model = create_model(language)
for file_name in tqdm(input_file_names):
try:
- print(file_name)
+ print("\n" + file_name)
file_path = os.path.join(input_folder, file_name)
text = model.generate(input=file_path)[0]["text"]
output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
@@ -42,47 +84,19 @@ def execute_asr(input_folder, output_folder, model_size, language):
print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
return output_file_path
-
-parser = argparse.ArgumentParser()
-parser.add_argument("-i", "--input_folder", type=str, required=True,
- help="Path to the folder containing WAV files.")
-parser.add_argument("-o", "--output_folder", type=str, required=True,
- help="Output folder to store transcriptions.")
-parser.add_argument("-s", "--model_size", type=str, default='large',
- help="Model Size of FunASR is Large")
-parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh','yue','auto'],
- help="Language of the audio files.")
-parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
- help="fp16 or fp32")#还没接入
-
-cmd = parser.parse_args()
-
-path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
-path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
-path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-vad_model_revision=punc_model_revision="v2.0.4"
-
-if(cmd.language=="zh"):
- path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
- path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
- model_revision="v2.0.4"
-else:
- path_asr = 'tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online'
- path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
- model_revision="master"
- path_vad=path_punc=vad_model_revision=punc_model_revision=None###友情提示:粤语带VAD识别可能会有少量shape不对报错的,但是不带VAD可以.不带vad只能分阶段单独加标点。不过标点模型对粤语效果真的不行…
-
-model = AutoModel(
- model=path_asr,
- model_revision=model_revision,
- vad_model=path_vad,
- vad_model_revision=vad_model_revision,
- punc_model=path_punc,
- punc_model_revision=punc_model_revision,
-)
-
if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--input_folder", type=str, required=True,
+ help="Path to the folder containing WAV files.")
+ parser.add_argument("-o", "--output_folder", type=str, required=True,
+ help="Output folder to store transcriptions.")
+ parser.add_argument("-s", "--model_size", type=str, default='large',
+ help="Model Size of FunASR is Large")
+ parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh','yue','auto'],
+ help="Language of the audio files.")
+ parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
+                        help="fp16 or fp32")  # not wired up yet
+ cmd = parser.parse_args()
execute_asr(
input_folder = cmd.input_folder,
output_folder = cmd.output_folder,
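The rewritten `funasr_asr.py` turns model construction into a memoized factory: `funasr_models` maps a language code to an already-built `AutoModel`, so `only_asr` and `execute_asr` can both call `create_model` freely. The caching skeleton on its own, with the expensive constructor stubbed out (the stub is ours):

```python
funasr_models = {}  # language code -> loaded model, one per process

def load_model_for(language: str):
    # Stub for the real AutoModel(model=..., vad_model=..., ...) call.
    return f"model<{language}>"

def create_model(language: str = "zh"):
    if language not in ("zh", "yue"):
        raise ValueError("FunASR does not support this language: " + language)
    # Build each language's model at most once; later calls hit the cache.
    if language not in funasr_models:
        funasr_models[language] = load_model_for(language)
    return funasr_models[language]

m1 = create_model("zh")
m2 = create_model("zh")
assert m1 is m2  # the second call returned the cached instance
```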
From d552c971bfe25e85fe0da6d8cf661d1051cdee6c Mon Sep 17 00:00:00 2001
From: AkitoLiu <39857739+Akito-UzukiP@users.noreply.github.com>
Date: Mon, 12 Aug 2024 10:43:36 +0800
Subject: [PATCH 5/6] add modifiable japanese dict (#1443)
---
GPT_SoVITS/text/ja_userdic/userdict.csv | 1 +
GPT_SoVITS/text/japanese.py | 24 +++++++++++++++++++++++-
2 files changed, 24 insertions(+), 1 deletion(-)
create mode 100644 GPT_SoVITS/text/ja_userdic/userdict.csv
diff --git a/GPT_SoVITS/text/ja_userdic/userdict.csv b/GPT_SoVITS/text/ja_userdic/userdict.csv
new file mode 100644
index 00000000..b23e0d63
--- /dev/null
+++ b/GPT_SoVITS/text/ja_userdic/userdict.csv
@@ -0,0 +1 @@
+主殿,*,*,-32767,名詞,固有名詞,一般,*,*,*,アルジドノ,アルジドノ,アルジドノ,3/5,*
\ No newline at end of file
diff --git a/GPT_SoVITS/text/japanese.py b/GPT_SoVITS/text/japanese.py
index 4c10720e..e9fe7c1a 100644
--- a/GPT_SoVITS/text/japanese.py
+++ b/GPT_SoVITS/text/japanese.py
@@ -1,8 +1,30 @@
# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
import re
-import sys
import pyopenjtalk
+import os
+import hashlib
+current_file_path = os.path.dirname(__file__)
+def get_hash(fp: str) -> str:
+ hash_md5 = hashlib.md5()
+ with open(fp, "rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_md5.update(chunk)
+ return hash_md5.hexdigest()
+
+USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
+USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
+USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
+# If the compiled user dictionary does not exist, build it; if it does, check the MD5 and rebuild when it differs
+if os.path.exists(USERDIC_CSV_PATH):
+    if not os.path.exists(USERDIC_BIN_PATH) or not os.path.exists(USERDIC_HASH_PATH) or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r", encoding="utf-8").read():
+ pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
+ with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f:
+ f.write(get_hash(USERDIC_CSV_PATH))
+
+if os.path.exists(USERDIC_BIN_PATH):
+ pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
+
from text.symbols import punctuation
# Regular expression matching Japanese without punctuation marks:
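Patch 5 keeps the compiled `user.dict` in sync with `userdict.csv` by stamping the CSV's MD5 next to the binary and recompiling whenever the digest changes. The same stamp-file pattern as a generic helper (the `compile_fn` callback stands in for `pyopenjtalk.mecab_dict_index`; the helper itself is our sketch):

```python
import hashlib
import os

def md5_of(path: str) -> str:
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            h.update(chunk)
    return h.hexdigest()

def rebuild_if_stale(src: str, out: str, stamp: str, compile_fn) -> None:
    # Recompile src -> out when out is missing, the stamp is missing,
    # or the recorded digest no longer matches the current source.
    digest = md5_of(src)
    recorded = None
    if os.path.exists(stamp):
        with open(stamp, "r", encoding="utf-8") as f:
            recorded = f.read()
    if not os.path.exists(out) or recorded != digest:
        compile_fn(src, out)
        with open(stamp, "w", encoding="utf-8") as f:
            f.write(digest)
```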
From c60e796452650f15d1b8b45cc428226597397286 Mon Sep 17 00:00:00 2001
From: Lion-Wu <130235128+Lion-Wu@users.noreply.github.com>
Date: Mon, 12 Aug 2024 10:47:02 +0800
Subject: [PATCH 6/6] Update README (#1423)
---
README.md | 48 +++++++------------
docs/cn/README.md | 62 ++++++++++--------------
docs/ja/README.md | 119 ++++++++++++++++++++++++++++++++++++++-------
docs/ko/README.md | 120 +++++++++++++++++++++++++++++++++++++++-------
docs/tr/README.md | 116 +++++++++++++++++++++++++++++++++++++-------
5 files changed, 348 insertions(+), 117 deletions(-)
diff --git a/README.md b/README.md
index 98fc0d0a..91f15703 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
## Installation
-For users in the China region, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online.
+Users in China can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online.
### Tested Environments
@@ -53,7 +53,7 @@ _Note: numba==0.56.4 requires py<3.11_
If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
-Users in the China region can [download the package](https://www.icloud.com.cn/iclouddrive/030K8WjGJ9xMXhpzJVIMEWPzQ#GPT-SoVITS-beta0706fix1) by clicking the link and then selecting "Download a copy." (Log out if you encounter errors while downloading.)
+**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
### Linux
@@ -141,31 +141,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
## Pretrained Models
-Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
+**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
-Download G2PW models from [G2PWModel-v2-onnx.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS\text`.(Chinese TTS Only)
+1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
-For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
+2. Download G2PW models from [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip), unzip it, rename the folder to `G2PWModel`, and place it in `GPT_SoVITS/text`. (Chinese TTS only)
-Users in the China region can download these two models by entering the links below and clicking "Download a copy" (Log out if you encounter errors while downloading.)
+3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
-- [GPT-SoVITS Models](https://www.icloud.com/iclouddrive/044boFMiOHHt22SNr-c-tirbA#pretrained_models)
+4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
-- [UVR5 Weights](https://www.icloud.com.cn/iclouddrive/0bekRKDiJXboFhbfm3lM2fVbA#UVR5_Weights)
-
-- [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS\text`.
-
-For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
-
-Or Download FunASR Model from [FunASR Model](https://www.icloud.com/iclouddrive/0b52_7SQWYr75kHkPoPXgpeQA#models), unzip and replace `tools/asr/models`.(Log out if you encounter errors while downloading.)
-
-For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have the similar effect with smaller disk footprint.
-
-Users in the China region can download this model by entering the links below
-
-- [Faster Whisper Large V3](https://www.icloud.com/iclouddrive/00bUEp9_mcjMq_dhHu_vrAFDQ#faster-whisper-large-v3) (Click "Download a copy", log out if you encounter errors while downloading.)
-
-- [Faster Whisper Large V3](https://hf-mirror.com/Systran/faster-whisper-large-v3) (HuggingFace mirror site)
+5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may offer a similar effect with a smaller disk footprint.
## Dataset Format
@@ -249,25 +235,25 @@ then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
New Features:
- 1.Support Korean and Cantonese
+1. Support Korean and Cantonese
- 2.An optimized text frontend
+2. An optimized text frontend
- 3.Pre-trained model extended from 2k hours to 5k hours
+3. Pre-trained model extended from 2k hours to 5k hours
- 4.Improved synthesis quality for low-quality reference audio
+4. Improved synthesis quality for low-quality reference audio
- [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7) )
+ [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7) )
Use v2 from v1 environment:
- 1.pip install -r requirements.txt to update some packages
+1. Run `pip install -r requirements.txt` to update some packages.
- 2.clone the latest codes from github
+2. Clone the latest code from GitHub.
- 3.download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into GPT_SoVITS\pretrained_models\gsv-v2final-pretrained
+3. Download v2 pretrained models from [Hugging Face](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`.
- Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS\text`.
+   Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (download the G2PW models, unzip and rename the folder to `G2PWModel`, then place it in `GPT_SoVITS/text`).
## Todo List
diff --git a/docs/cn/README.md b/docs/cn/README.md
index ea67f9dc..8c86b48a 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -38,7 +38,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
## 安装
-中国地区用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验。
+中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验。
### 测试通过的环境
@@ -53,7 +53,7 @@ _注: numba==0.56.4 需要 python<3.11_
如果你是 Windows 用户(已在 win>=10 上测试),可以下载[下载整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。
-中国地区用户可以通过点击链接并选择“下载副本”[下载整合包](https://www.icloud.com.cn/iclouddrive/030K8WjGJ9xMXhpzJVIMEWPzQ#GPT-SoVITS-beta0706fix1)。(如果下载时遇到错误,请退出登录)
+**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO)。**
### Linux
@@ -141,31 +141,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
## 预训练模型
-从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将它们放置在 `GPT_SoVITS\pretrained_models` 中。
+**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX)。**
-从 [G2PWModel-v2-onnx.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载G2PW模型,并将它们解压重命名为`G2PWModel` 后放置在 `GPT_SoVITS\text` 中。(仅限中文TTS)
+1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。
-对于 UVR5(人声/伴奏分离和混响移除,附加),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将它们放置在 `tools/uvr5/uvr5_weights` 中。
+2. 从 [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文TTS)
-中国地区用户可以进入以下链接并点击“下载副本”下载以上两个模型(如果下载时遇到错误,请退出登录):
+3. 对于 UVR5(人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。
-- [GPT-SoVITS Models](https://www.icloud.com/iclouddrive/044boFMiOHHt22SNr-c-tirbA#pretrained_models)
-
-- [UVR5 Weights](https://www.icloud.com.cn/iclouddrive/0bekRKDiJXboFhbfm3lM2fVbA#UVR5_Weights)
-
-- [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载G2PW模型,并将它们解压重命名为 `G2PWModel` 后放置在 `GPT_SoVITS\text` 中)
-
-对于中文自动语音识别(附加),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 中。
-
-或者从[FunASR模型链接](https://www.icloud.com/iclouddrive/0b52_7SQWYr75kHkPoPXgpeQA#models)下载模型,并将它们解压后替换 `tools/asr/models` 。(点击“下载副本”,如果下载时遇到错误,请退出登录)
-
-对于英语与日语自动语音识别(附加),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将它们放置在 `tools/asr/models` 中。 此外,[其他模型](https://huggingface.co/Systran)可能具有类似效果,但占用更小的磁盘空间。
-
-中国地区用户可以通过以下链接下载:
-- [Faster Whisper Large V3](https://www.icloud.com/iclouddrive/00bUEp9_mcjMq_dhHu_vrAFDQ#faster-whisper-large-v3)(点击“下载副本”,如果下载时遇到错误,请退出登录)
-
-- [Faster Whisper Large V3](https://hf-mirror.com/Systran/faster-whisper-large-v3)(Hugging Face镜像站)
+4. 对于中文 ASR(额外功能),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
+5. 对于英语或日语 ASR(额外功能),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将其放置在 `tools/asr/models` 目录中。此外,[其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间。
## 数据集格式
@@ -249,44 +235,44 @@ python webui.py
新特性:
- 1.支持韩语及粤语
+1. 支持韩语及粤语
- 2.更好的文本前端
+2. 更好的文本前端
- 3.底模由2k小时扩展至5k小时
+3. 底模由2k小时扩展至5k小时
- 4.对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
+4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
- 详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+ 详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
从v1环境迁移至v2
- 1.需要pip安装requirements.txt更新环境
+1. 需要pip安装requirements.txt更新环境
- 2.需要克隆github上的最新代码
+2. 需要克隆github上的最新代码
- 3.需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到GPT_SoVITS\pretrained_models\gsv-v2final-pretrained下
+3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到GPT_SoVITS/pretrained_models/gsv-v2final-pretrained下
- 中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS\text`目录下
+ 中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
## 待办事项清单
-- [ ] **高优先级:**
+- [x] **高优先级:**
- [x] 日语和英语的本地化。
- [x] 用户指南。
- [x] 日语和英语数据集微调训练。
- [ ] **功能:**
- - [ ] 零样本声音转换(5 秒)/ 少样本声音转换(1 分钟)。
- - [ ] TTS 语速控制。
- - [ ] 增强的 TTS 情感控制。
+ - [x] 零样本声音转换(5 秒)/ 少样本声音转换(1 分钟)。
+ - [x] TTS 语速控制。
+ - [ ] ~~增强的 TTS 情感控制。~~
- [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布。
- - [ ] 改进英语和日语文本前端。
+ - [x] 改进英语和日语文本前端。
- [ ] 开发体积小和更大的 TTS 模型。
- [x] Colab 脚本。
- [ ] 扩展训练数据集(从 2k 小时到 10k 小时)。
- - [ ] 更好的 sovits 基础模型(增强的音频质量)。
+ - [x] 更好的 sovits 基础模型(增强的音频质量)。
- [ ] 模型混合。
## (附加)命令行运行方式
@@ -350,6 +336,8 @@ python ./tools/asr/fasterwhisper_asr.py -i -o