diff --git a/Dockerfile b/Dockerfile
index 74e282c..80cd9f3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -34,9 +34,6 @@ RUN if [ "$IMAGE_TYPE" != "elite" ]; then \
fi
-# Copy the rest of the application
-COPY . /workspace
-
# Copy the rest of the application
COPY . /workspace
diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py
index 1a2ffef..54b9278 100644
--- a/GPT_SoVITS/AR/data/dataset.py
+++ b/GPT_SoVITS/AR/data/dataset.py
@@ -64,7 +64,7 @@ class Text2SemanticDataset(Dataset):
# get dict
self.path2 = phoneme_path # "%s/2-name2text.txt"%exp_dir#phoneme_path
self.path3 = "%s/3-bert" % (
- os.path.basename(phoneme_path)
+ os.path.dirname(phoneme_path)
) # "%s/3-bert"%exp_dir#bert_dir
self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
assert os.path.exists(self.path2)
diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py
index 0059033..58a21ee 100644
--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@@ -907,7 +907,7 @@ class SynthesizerTrn(nn.Module):
ge = self.ref_enc(y * y_mask, y_mask)
with autocast(enabled=False):
- maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext
+ maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
with maybe_no_grad:
if self.freeze_quantizer:
self.ssl_proj.eval()
diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py
index b241382..e01a63b 100644
--- a/GPT_SoVITS/prepare_datasets/1-get-text.py
+++ b/GPT_SoVITS/prepare_datasets/1-get-text.py
@@ -117,9 +117,12 @@ if os.path.exists(txt_path) == False:
try:
wav_name, spk_name, language, text = line.split("|")
# todo.append([name,text,"zh"])
- todo.append(
- [wav_name, text, language_v1_to_language_v2.get(language, language)]
- )
+ if language in language_v1_to_language_v2.keys():
+ todo.append(
+ [wav_name, text, language_v1_to_language_v2.get(language, language)]
+ )
+ else:
+ print(f"\033[33m[Waring] The {language = } of {wav_name} is not supported for training.\033[0m")
except:
print(line, traceback.format_exc())
diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
index 9a2f73c..61c933a 100644
--- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
@@ -82,7 +82,7 @@ def name2go(wav_name,wav_path):
tensor_wav16 = tensor_wav16.to(device)
ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215])
if np.isnan(ssl.detach().numpy()).sum()!= 0:
- nan_fails.append(wav_name)
+ nan_fails.append((wav_name,wav_path))
print("nan filtered:%s"%wav_name)
return
wavfile.write(
@@ -90,7 +90,7 @@ def name2go(wav_name,wav_path):
32000,
tmp_audio32.astype("int16"),
)
- my_save(ssl,hubert_path )
+ my_save(ssl,hubert_path)
with open(inp_text,"r",encoding="utf8")as f:
lines=f.read().strip("\n").split("\n")
@@ -113,8 +113,8 @@ for line in lines[int(i_part)::int(all_parts)]:
if(len(nan_fails)>0 and is_half==True):
is_half=False
model=model.float()
- for wav_name in nan_fails:
+ for wav in nan_fails:
try:
- name2go(wav_name)
+ name2go(wav[0],wav[1])
except:
print(wav_name,traceback.format_exc())
diff --git a/GPT_SoVITS/text/english.py b/GPT_SoVITS/text/english.py
index 68ce789..30fafb5 100644
--- a/GPT_SoVITS/text/english.py
+++ b/GPT_SoVITS/text/english.py
@@ -320,7 +320,7 @@ class en_G2p(G2p):
# 尝试分离所有格
if re.match(r"^([a-z]+)('s)$", word):
- phones = self.qryword(word[:-2])
+ phones = self.qryword(word[:-2])[:]
# P T K F TH HH 无声辅音结尾 's 发 ['S']
if phones[-1] in ['P', 'T', 'K', 'F', 'TH', 'HH']:
phones.extend(['S'])
@@ -359,4 +359,4 @@ def g2p(text):
if __name__ == "__main__":
print(g2p("hello"))
print(g2p(text_normalize("e.g. I used openai's AI tool to draw a picture.")))
- print(g2p(text_normalize("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")))
\ No newline at end of file
+ print(g2p(text_normalize("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")))
diff --git a/README.md b/README.md
index 1122516..d8f67e2 100644
--- a/README.md
+++ b/README.md
@@ -12,8 +12,7 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
[](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)
[](https://discord.gg/dnrgs5GHfG)
-
-[**English**](./README.md) | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md)
+**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
@@ -52,11 +51,11 @@ _Note: numba==0.56.4 requires py<3.11_
### Windows
-If you are a Windows user (tested with win>=10), you can directly download the [pre-packaged distribution](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
+If you are a Windows user (tested with win>=10), you can download [the 0206fix3 package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta-fast-inference-branch.7z?download=true) or [the 0217fix2 package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta0217fix2.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
-Users in China region can download [the 0217 package](https://www.icloud.com.cn/iclouddrive/061bfkcVJcBfsMfLF5R2XKdTQ#GPT-SoVITS-beta0217) or [the 0306fix2 package](https://www.icloud.com.cn/iclouddrive/09aaTLf96aa92dbLe0fPNM5CQ#GPT-SoVITS-beta0306fix2) by clicking the links and then selecting "Download a copy."
+Users in China can download [the 0206fix3 package](https://www.icloud.com.cn/iclouddrive/075NNKIRC2zqnWn-9rhD63WGA#GPT-SoVITS-beta0206fix3) or [the 0217fix2 package](https://www.icloud.com.cn/iclouddrive/091QHaIbZMDZYQg7IX3g2kCqg#GPT-SoVITS-beta0217fix2) by clicking the links and then selecting "Download a copy." (Log out if you encounter errors while downloading.)
-_Note: The 0306fix2 version doubles the inference speed and fixes all issues with the no reference text mode._
+_Note: The inference speed of version 0206 is faster, while the inference quality of the new 0217 version is better. You can choose according to your needs._
### Linux
@@ -198,7 +197,7 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
- [ ] better sovits base model (enhanced audio quality)
- [ ] model mix
-## (Optional) If you need, here will provide the command line operation mode
+## (Additional) Method for running from the command line
Use the command line to open the WebUI for UVR5
```
python tools/uvr5/webui.py ""
@@ -233,7 +232,7 @@ A custom list save path is enabled
Special thanks to the following projects and contributors:
-### Theoretical
+### Theoretical Research
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md
index 625e478..36c1db4 100644
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@@ -147,10 +147,33 @@
5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573
+### 202403/202404/202405更新
+
+2个重点
+
+1-修复sovits训练未冻结vq的问题(可能造成效果下降)
+
+2-增加一个快速推理分支
+
+以下都是小修补
+
+1-修复无参考文本模式问题
+
+2-优化中英文文本前端
+
+3-api格式优化
+
+4-cmd格式问题修复
+
+5-训练数据处理阶段不支持的语言提示报错
+
+6-nan自动转fp32阶段的hubert提取bug修复
todolist:
1-中文多音字推理优化(有没有人来测试的,欢迎把测试结果写在pr评论区里) https://github.com/RVC-Boss/GPT-SoVITS/pull/488
-
+(v2底模训练已经合了,下个版本发布就要合了)
+
+2-正在尝试解决低音质参考音频导致音质较差的问题,v2再试试如果能解决就发了,节点暂定高考后吧
diff --git a/docs/cn/README.md b/docs/cn/README.md
index 2c48cbc..dcca243 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -10,8 +10,9 @@
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)
+[](https://discord.gg/dnrgs5GHfG)
-[**English**](../../README.md) | [**中文简体**](./README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md)
+[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
@@ -50,11 +51,11 @@ _注: numba==0.56.4 需要 python<3.11_
### Windows
-如果你是 Windows 用户(已在 win>=10 上测试),可以直接下载[预打包文件](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。
+如果你是 Windows 用户(已在 win>=10 上测试),可以下载[0206fix3 整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta-fast-inference-branch.7z?download=true)或[0217fix2 整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta0217fix2.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。
-中国地区用户可以通过点击链接并选择“下载副本”来下载[0217版本包](https://www.icloud.com.cn/iclouddrive/061bfkcVJcBfsMfLF5R2XKdTQ#GPT-SoVITS-beta0217)或[0306fix2版本包](https://www.icloud.com.cn/iclouddrive/09aaTLf96aa92dbLe0fPNM5CQ#GPT-SoVITS-beta0306fix2)。
+中国地区用户可以通过点击链接并选择“下载副本”来下载[0206fix3 整合包](https://www.icloud.com.cn/iclouddrive/075NNKIRC2zqnWn-9rhD63WGA#GPT-SoVITS-beta0206fix3)或[0217fix2 整合包](https://www.icloud.com.cn/iclouddrive/091QHaIbZMDZYQg7IX3g2kCqg#GPT-SoVITS-beta0217fix2)。(如果下载时遇到错误,请退出登录)
-_注:0306fix2版本推理速度翻倍,节约生命。修复了无参考文本模式的所有问题。_
+_注:0206版本的推理速度更快,0217新版的推理效果更好,可按需选择_
### Linux
@@ -148,7 +149,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
对于中文自动语音识别(附加),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 中。
-对于英语与日语自动语音识别(附加),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将它们放置在 `tools/asr/models` 中。 此外,[其他模型](https://huggingface.co/Systran)可能具有类似效果,但占用更小的磁盘空间。
+对于英语与日语自动语音识别(附加),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将它们放置在 `tools/asr/models` 中。 此外,[其他模型](https://huggingface.co/Systran)可能具有类似效果,但占用更小的磁盘空间。
中国地区用户可以通过以下链接下载:
- [Faster Whisper Large V3](https://www.icloud.com/iclouddrive/0c4pQxFs7oWyVU1iMTq2DbmLA#faster-whisper-large-v3)(点击“下载副本”)
@@ -184,7 +185,7 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
- [ ] 用户指南。
- [x] 日语和英语数据集微调训练。
-- [ ] **Features:**
+- [ ] **功能:**
- [ ] 零样本声音转换(5 秒)/ 少样本声音转换(1 分钟)。
- [ ] TTS 语速控制。
- [ ] 增强的 TTS 情感控制。
@@ -196,7 +197,7 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
- [ ] 更好的 sovits 基础模型(增强的音频质量)。
- [ ] 模型混合。
-## (可选)命令行的操作方式
+## (附加)命令行运行方式
使用命令行打开UVR5的WebUI
````
python tools/uvr5/webui.py ""
@@ -226,24 +227,33 @@ python tools/asr/funasr_asr.py -i -o