diff --git a/Docker/damo.sha256 b/Docker/damo.sha256
new file mode 100644
index 00000000..6e9804da
--- /dev/null
+++ b/Docker/damo.sha256
@@ -0,0 +1,3 @@
+5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb
+b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb
+a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb
\ No newline at end of file
diff --git a/Docker/download.sh b/Docker/download.sh
new file mode 100644
index 00000000..447e018e
--- /dev/null
+++ b/Docker/download.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+set -Eeuo pipefail
+
+echo "Downloading models..."
+
+aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue
+
+echo "Checking SHA256..."
+
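+# Verify every downloaded file against its expected hash; any mismatch makes the script (and therefore the Docker build) fail.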
+parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c"
diff --git a/Docker/links.sha256 b/Docker/links.sha256
new file mode 100644
index 00000000..cda6dc15
--- /dev/null
+++ b/Docker/links.sha256
@@ -0,0 +1,12 @@
+b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth
+020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth
+24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
+e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
+39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth
+45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth
+5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
+8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
+01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
+56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
+233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
\ No newline at end of file
diff --git a/Docker/links.txt b/Docker/links.txt
new file mode 100644
index 00000000..e6603db0
--- /dev/null
+++ b/Docker/links.txt
@@ -0,0 +1,34 @@
+# GPT-SoVITS models
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt
+ out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth
+ out=GPT_SoVITS/pretrained_models/s2D488k.pth
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth
+ out=GPT_SoVITS/pretrained_models/s2G488k.pth
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json
+ out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json
+ out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin
+ out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json
+ out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin
+ out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json
+ out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json
+# UVR5
+https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
+ out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth
+https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
+ out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth
+https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
+ out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
+https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
+ out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
+https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
+ out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
+https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
+ out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
+https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
+ out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..cbf92cb5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,53 @@
+# Base CUDA image
+FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
+
+LABEL maintainer="breakstring@hotmail.com"
+LABEL version="dev-20240123.03"
+LABEL description="Docker image for GPT-SoVITS"
+
+
+# Install 3rd party apps
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \
+ rm -rf /var/lib/apt/lists/* && \
+ git lfs install
+
+# Copy application
+WORKDIR /workspace
+COPY . /workspace
+
+# Download models
+RUN chmod +x /workspace/Docker/download.sh && /workspace/Docker/download.sh
+
+# Packages should be installed from requirements.txt, but due to issues with funasr and modelscope the dependencies are installed manually below for now
+RUN pip install --no-cache-dir torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm cn2an pypinyin pyopenjtalk g2p_en chardet transformers jieba psutil PyYAML
+# modelscope and funasr are pinned to specific versions here; the damo_asr models are left for them to download themselves later
+RUN pip install --no-cache-dir modelscope~=1.10.0 torchaudio sentencepiece funasr~=0.8.7
+
+# Disabled for now; let the container download these itself
+# Clone damo_asr
+#WORKDIR /workspace/tools/damo_asr/models
+#RUN git clone --depth 1 https://www.modelscope.cn/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch && \
+# (cd speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch && git lfs pull)
+#RUN git clone --depth 1 https://www.modelscope.cn/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch.git speech_fsmn_vad_zh-cn-16k-common-pytorch && \
+# (cd speech_fsmn_vad_zh-cn-16k-common-pytorch && git lfs pull)
+#RUN git clone --depth 1 https://www.modelscope.cn/iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git punc_ct-transformer_zh-cn-common-vocab272727-pytorch && \
+# (cd punc_ct-transformer_zh-cn-common-vocab272727-pytorch && git lfs pull)
+
+#RUN parallel --will-cite -a /workspace/Docker/damo.sha256 "echo -n {} | sha256sum -c"
+
+#WORKDIR /workspace
+
+EXPOSE 9870
+EXPOSE 9871
+EXPOSE 9872
+EXPOSE 9873
+EXPOSE 9874
+
+VOLUME /workspace/output
+VOLUME /workspace/logs
+VOLUME /workspace/SoVITS_weights
+
+CMD ["python", "webui.py"]
\ No newline at end of file
diff --git a/GPT_SoVITS/AR/data/bucket_sampler.py b/GPT_SoVITS/AR/data/bucket_sampler.py
index 7d752db5..647491f7 100644
--- a/GPT_SoVITS/AR/data/bucket_sampler.py
+++ b/GPT_SoVITS/AR/data/bucket_sampler.py
@@ -41,12 +41,13 @@ class DistributedBucketSampler(Sampler[T_co]):
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
- num_replicas = dist.get_world_size()
+ num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
- rank = dist.get_rank()
- torch.cuda.set_device(rank)
+ rank = dist.get_rank() if torch.cuda.is_available() else 0
+ if torch.cuda.is_available():
+ torch.cuda.set_device(rank)
if rank >= num_replicas or rank < 0:
raise ValueError(
"Invalid rank {}, rank should be in the interval"
diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py
new file mode 100644
index 00000000..bb9e30b9
--- /dev/null
+++ b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py
@@ -0,0 +1,106 @@
+# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py
+import os, sys
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from typing import Dict
+
+import torch
+from pytorch_lightning import LightningModule
+from AR.models.t2s_model_onnx import Text2SemanticDecoder
+from AR.modules.lr_schedulers import WarmupCosineLRSchedule
+from AR.modules.optim import ScaledAdam
+
+
+class Text2SemanticLightningModule(LightningModule):
+ def __init__(self, config, output_dir, is_train=True):
+ super().__init__()
+ self.config = config
+ self.top_k = 3
+ self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
+ pretrained_s1 = config.get("pretrained_s1")
+ if pretrained_s1 and is_train:
+ # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
+ print(
+ self.load_state_dict(
+ torch.load(pretrained_s1, map_location="cpu")["weight"]
+ )
+ )
+ if is_train:
+ self.automatic_optimization = False
+ self.save_hyperparameters()
+ self.eval_dir = output_dir / "eval"
+ self.eval_dir.mkdir(parents=True, exist_ok=True)
+
+ def training_step(self, batch: Dict, batch_idx: int):
+ opt = self.optimizers()
+ scheduler = self.lr_schedulers()
+ loss, acc = self.model.forward(
+ batch["phoneme_ids"],
+ batch["phoneme_ids_len"],
+ batch["semantic_ids"],
+ batch["semantic_ids_len"],
+ batch["bert_feature"],
+ )
+ self.manual_backward(loss)
+ if batch_idx > 0 and batch_idx % 4 == 0:
+ opt.step()
+ opt.zero_grad()
+ scheduler.step()
+
+ self.log(
+ "total_loss",
+ loss,
+ on_step=True,
+ on_epoch=True,
+ prog_bar=True,
+ sync_dist=True,
+ )
+ self.log(
+ "lr",
+ scheduler.get_last_lr()[0],
+ on_epoch=True,
+ prog_bar=True,
+ sync_dist=True,
+ )
+ self.log(
+ f"top_{self.top_k}_acc",
+ acc,
+ on_step=True,
+ on_epoch=True,
+ prog_bar=True,
+ sync_dist=True,
+ )
+
+ def validation_step(self, batch: Dict, batch_idx: int):
+ return
+
+ def configure_optimizers(self):
+ model_parameters = self.model.parameters()
+ parameters_names = []
+ parameters_names.append(
+ [name_param_pair[0] for name_param_pair in self.model.named_parameters()]
+ )
+ lm_opt = ScaledAdam(
+ model_parameters,
+ lr=0.01,
+ betas=(0.9, 0.95),
+ clipping_scale=2.0,
+ parameters_names=parameters_names,
+ show_dominant_parameters=False,
+ clipping_update_period=1000,
+ )
+
+ return {
+ "optimizer": lm_opt,
+ "lr_scheduler": {
+ "scheduler": WarmupCosineLRSchedule(
+ lm_opt,
+ init_lr=self.config["optimizer"]["lr_init"],
+ peak_lr=self.config["optimizer"]["lr"],
+ end_lr=self.config["optimizer"]["lr_end"],
+ warmup_steps=self.config["optimizer"]["warmup_steps"],
+ total_steps=self.config["optimizer"]["decay_steps"],
+ )
+ },
+ }
diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py
index 9f8330b1..083dc099 100644
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@@ -302,6 +302,8 @@ class Text2SemanticDecoder(nn.Module):
xy_dec[:, -1]
) ##不用改,如果用了cache的默认就是只有一帧,取最后一帧一样的
# samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
+        if idx == 0:  # the first step must not sample EOS, otherwise there is nothing to decode
+            logits = logits[:, :-1]  # drop the probability of the EOS token (index 1024)
samples = sample(
logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35
)[0].unsqueeze(0)
diff --git a/GPT_SoVITS/AR/models/t2s_model_onnx.py b/GPT_SoVITS/AR/models/t2s_model_onnx.py
new file mode 100644
index 00000000..263b9337
--- /dev/null
+++ b/GPT_SoVITS/AR/models/t2s_model_onnx.py
@@ -0,0 +1,337 @@
+# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_model.py
+import torch
+from tqdm import tqdm
+
+from AR.modules.embedding_onnx import SinePositionalEmbedding
+from AR.modules.embedding_onnx import TokenEmbedding
+from AR.modules.transformer_onnx import LayerNorm
+from AR.modules.transformer_onnx import TransformerEncoder
+from AR.modules.transformer_onnx import TransformerEncoderLayer
+from torch import nn
+from torch.nn import functional as F
+from torchmetrics.classification import MulticlassAccuracy
+
+default_config = {
+ "embedding_dim": 512,
+ "hidden_dim": 512,
+ "num_head": 8,
+ "num_layers": 12,
+ "num_codebook": 8,
+ "p_dropout": 0.0,
+ "vocab_size": 1024 + 1,
+ "phoneme_vocab_size": 512,
+ "EOS": 1024,
+}
+
+inf_tensor_value = torch.FloatTensor([-float("Inf")]).float()
+
+def logits_to_probs(
+ logits,
+ previous_tokens = None,
+ temperature: float = 1.0,
+ top_k = None,
+ top_p = None,
+ repetition_penalty: float = 1.0,
+):
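+    # Apply repetition penalty, top-p (nucleus) filtering, temperature scaling and top-k truncation before the final softmax.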
+ previous_tokens = previous_tokens.squeeze()
+ if previous_tokens is not None and repetition_penalty != 1.0:
+ previous_tokens = previous_tokens.long()
+ score = torch.gather(logits, dim=0, index=previous_tokens)
+ score = torch.where(
+ score < 0, score * repetition_penalty, score / repetition_penalty
+ )
+ logits.scatter_(dim=0, index=previous_tokens, src=score)
+
+ if top_p is not None and top_p < 1.0:
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ cum_probs = torch.cumsum(
+ torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1
+ )
+ sorted_indices_to_remove = cum_probs > top_p
+ sorted_indices_to_remove[0] = False # keep at least one option
+ indices_to_remove = sorted_indices_to_remove.scatter(
+ dim=0, index=sorted_indices, src=sorted_indices_to_remove
+ )
+ logits = logits.masked_fill(indices_to_remove, -float("Inf"))
+
+ logits = logits / max(temperature, 1e-5)
+
+ if top_k is not None:
+ v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+ pivot = v.select(-1, -1).unsqueeze(-1)
+ logits = torch.where(logits < pivot, inf_tensor_value, logits)
+
+ probs = torch.nn.functional.softmax(logits, dim=-1)
+ return probs
+
+
+def multinomial_sample_one_no_sync(
+ probs_sort
+): # Does multinomial sampling without a cuda synchronization
+ q = torch.randn_like(probs_sort)
+ return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
+
+
+def sample(
+ logits,
+ previous_tokens,
+ **sampling_kwargs,
+):
+ probs = logits_to_probs(
+ logits=logits, previous_tokens=previous_tokens, **sampling_kwargs
+ )
+ idx_next = multinomial_sample_one_no_sync(probs)
+ return idx_next, probs
+
+
+class OnnxEncoder(nn.Module):
+ def __init__(self, ar_text_embedding, bert_proj, ar_text_position):
+ super().__init__()
+ self.ar_text_embedding = ar_text_embedding
+ self.bert_proj = bert_proj
+ self.ar_text_position = ar_text_position
+
+ def forward(self, x, bert_feature):
+ x = self.ar_text_embedding(x)
+ x = x + self.bert_proj(bert_feature.transpose(1, 2))
+ return self.ar_text_position(x)
+
+
+class T2SFirstStageDecoder(nn.Module):
+ def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric,
+ top_k, early_stop_num, num_layers):
+ super().__init__()
+ self.ar_audio_embedding = ar_audio_embedding
+ self.ar_audio_position = ar_audio_position
+ self.h = h
+ self.ar_predict_layer = ar_predict_layer
+ self.loss_fct = loss_fct
+ self.ar_accuracy_metric = ar_accuracy_metric
+ self.top_k = top_k
+ self.early_stop_num = early_stop_num
+ self.num_layers = num_layers
+
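+    # First decoding step: consumes the encoded text plus the semantic prompt, builds the padded attention mask and initialises the per-layer K/V caches.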
+ def forward(self, x, prompt):
+ y = prompt
+ x_example = x[:,:,0] * 0.0
+ #N, 1, 512
+ cache = {
+ "all_stage": self.num_layers,
+ "k": None,
+ "v": None,
+ "y_emb": None,
+ "first_infer": 1,
+ "stage": 0,
+ }
+
+ y_emb = self.ar_audio_embedding(y)
+
+ cache["y_emb"] = y_emb
+ y_pos = self.ar_audio_position(y_emb)
+
+ xy_pos = torch.concat([x, y_pos], dim=1)
+
+ y_example = y_pos[:,:,0] * 0.0
+ x_attn_mask = torch.matmul(x_example.transpose(0, 1) , x_example).bool()
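+        # Build the causal mask over y with cumsum arithmetic instead of torch.triu: entry (i, j) is masked when j > i.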
+ y_attn_mask = torch.ones_like(torch.matmul(y_example.transpose(0, 1), y_example), dtype=torch.int64)
+ y_attn_mask = torch.cumsum(y_attn_mask, dim=1) - torch.cumsum(
+ torch.ones_like(y_example.transpose(0, 1), dtype=torch.int64), dim=0
+ )
+ y_attn_mask = y_attn_mask > 0
+
+ x_y_pad = torch.matmul(x_example.transpose(0, 1), y_example).bool()
+ y_x_pad = torch.matmul(y_example.transpose(0, 1), x_example).bool()
+ x_attn_mask_pad = torch.cat([x_attn_mask, torch.ones_like(x_y_pad)], dim=1)
+ y_attn_mask = torch.cat([y_x_pad, y_attn_mask], dim=1)
+ xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)
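+        # Zero-initialised per-layer K/V caches of shape (num_layers, x_len + y_len, 1, 512), sized from the attention mask so the sequence length stays dynamic.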
+ cache["k"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\
+ .unsqueeze(1).repeat(self.num_layers, 1, 1, 1)
+ cache["v"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\
+ .unsqueeze(1).repeat(self.num_layers, 1, 1, 1)
+
+ xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
+ logits = self.ar_predict_layer(xy_dec[:, -1])
+ samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
+
+ y = torch.concat([y, samples], dim=1)
+
+ return y, cache["k"], cache["v"], cache["y_emb"], x_example
+
+
+class T2SStageDecoder(nn.Module):
+ def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric,
+ top_k, early_stop_num, num_layers):
+ super().__init__()
+ self.ar_audio_embedding = ar_audio_embedding
+ self.ar_audio_position = ar_audio_position
+ self.h = h
+ self.ar_predict_layer = ar_predict_layer
+ self.loss_fct = loss_fct
+ self.ar_accuracy_metric = ar_accuracy_metric
+ self.top_k = top_k
+ self.early_stop_num = early_stop_num
+ self.num_layers = num_layers
+
+ def forward(self, y, k, v, y_emb, x_example):
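+        # k and v arrive from the previous step; pad one extra time slot that the patched attention will fill with the new token's key/value.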
+ cache = {
+ "all_stage": self.num_layers,
+ "k": torch.nn.functional.pad(k, (0, 0, 0, 0, 0, 1)),
+ "v": torch.nn.functional.pad(v, (0, 0, 0, 0, 0, 1)),
+ "y_emb": y_emb,
+ "first_infer": 0,
+ "stage": 0,
+ }
+
+ y_emb = torch.cat(
+ [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1
+ )
+ cache["y_emb"] = y_emb
+ y_pos = self.ar_audio_position(y_emb)
+
+ xy_pos = y_pos[:, -1:]
+
+ y_example = y_pos[:,:,0] * 0.0
+
+ xy_attn_mask = torch.cat([x_example, y_example], dim=1)
+ xy_attn_mask = torch.zeros_like(xy_attn_mask, dtype=torch.bool)
+
+ xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
+ logits = self.ar_predict_layer(xy_dec[:, -1])
+ samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
+
+ y = torch.concat([y, samples], dim=1)
+
+ return y, cache["k"], cache["v"], cache["y_emb"], logits, samples
+
+
+class Text2SemanticDecoder(nn.Module):
+ def __init__(self, config, norm_first=False, top_k=3):
+ super(Text2SemanticDecoder, self).__init__()
+ self.model_dim = config["model"]["hidden_dim"]
+ self.embedding_dim = config["model"]["embedding_dim"]
+ self.num_head = config["model"]["head"]
+ self.num_layers = config["model"]["n_layer"]
+ self.norm_first = norm_first
+ self.vocab_size = config["model"]["vocab_size"]
+ self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"]
+ self.p_dropout = float(config["model"]["dropout"])
+ self.EOS = config["model"]["EOS"]
+ self.norm_first = norm_first
+ assert self.EOS == self.vocab_size - 1
+ self.bert_proj = nn.Linear(1024, self.embedding_dim)
+ self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size, self.p_dropout)
+ self.ar_text_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True)
+ self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size, self.p_dropout)
+ self.ar_audio_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True)
+ self.h = TransformerEncoder(
+ TransformerEncoderLayer(
+ d_model=self.model_dim,
+ nhead=self.num_head,
+ dim_feedforward=self.model_dim * 4,
+ dropout=0.1,
+ batch_first=True,
+ norm_first=norm_first,
+ ),
+ num_layers=self.num_layers,
+ norm=LayerNorm(self.model_dim) if norm_first else None,
+ )
+ self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False)
+ self.loss_fct = nn.CrossEntropyLoss(reduction="sum")
+ self.ar_accuracy_metric = MulticlassAccuracy(
+ self.vocab_size,
+ top_k=top_k,
+ average="micro",
+ multidim_average="global",
+ ignore_index=self.EOS,
+ )
+ self.top_k = torch.LongTensor([1])
+ self.early_stop_num = torch.LongTensor([-1])
+
+ def init_onnx(self):
+ self.onnx_encoder = OnnxEncoder(self.ar_text_embedding, self.bert_proj, self.ar_text_position)
+ self.first_stage_decoder = T2SFirstStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h,
+ self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num,
+ self.num_layers)
+ self.stage_decoder = T2SStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h,
+ self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num,
+ self.num_layers)
+
+ def forward(self, x, prompts, bert_feature):
+ early_stop_num = self.early_stop_num
+ prefix_len = prompts.shape[1]
+
+ x = self.onnx_encoder(x, bert_feature)
+        y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts)
+
+ stop = False
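+        # Autoregressive loop: one stage-decoder step per token until EOS is sampled or early_stop_num new tokens have been produced.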
+ for idx in range(1, 1500):
+            enco = self.stage_decoder(y, k, v, y_emb, x_example)
+            y, k, v, y_emb, logits, samples = enco
+ if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
+ stop = True
+ if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
+ stop = True
+ if stop:
+ break
+ y[0, -1] = 0
+ return y, idx
+
+ def infer(self, x, prompts, bert_feature):
+ top_k = self.top_k
+ early_stop_num = self.early_stop_num
+
+ x = self.onnx_encoder(x, bert_feature)
+
+ y = prompts
+ prefix_len = y.shape[1]
+ x_len = x.shape[1]
+ x_example = x[:,:,0] * 0.0
+ x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example)
+ x_attn_mask = torch.zeros_like(x_attn_mask, dtype=torch.bool)
+
+ stop = False
+ cache = {
+ "all_stage": self.num_layers,
+ "k": [None] * self.num_layers,
+ "v": [None] * self.num_layers,
+ "y_emb": None,
+ "first_infer": 1,
+ "stage": 0,
+ }
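+        # Single-graph decode path (not split into the exported first/stage decoders); it follows the same cache protocol as the patched attention.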
+ for idx in range(1500):
+ if cache["first_infer"] == 1:
+ y_emb = self.ar_audio_embedding(y)
+ else:
+ y_emb = torch.cat(
+ [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1
+ )
+ cache["y_emb"] = y_emb
+ y_pos = self.ar_audio_position(y_emb)
+ if cache["first_infer"] == 1:
+ xy_pos = torch.concat([x, y_pos], dim=1)
+ else:
+ xy_pos = y_pos[:, -1:]
+ y_len = y_pos.shape[1]
+ if cache["first_infer"] == 1:
+ x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True)
+ y_attn_mask = F.pad(
+ torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
+ (x_len, 0), value=False
+ )
+ xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)
+ else:
+ xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool)
+ xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
+ logits = self.ar_predict_layer(xy_dec[:, -1])
+ samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
+ if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
+ stop = True
+ if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
+ stop = True
+ if stop:
+ if prompts.shape[1] == y.shape[1]:
+ y = torch.concat([y, torch.zeros_like(samples)], dim=1)
+ break
+ y = torch.concat([y, samples], dim=1)
+ cache["first_infer"] = 0
+ return y, idx
\ No newline at end of file
diff --git a/GPT_SoVITS/AR/modules/activation_onnx.py b/GPT_SoVITS/AR/modules/activation_onnx.py
new file mode 100644
index 00000000..b54acd99
--- /dev/null
+++ b/GPT_SoVITS/AR/modules/activation_onnx.py
@@ -0,0 +1,178 @@
+# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py
+from typing import Optional
+from typing import Tuple
+import torch
+from torch import Tensor
+from torch.nn import Linear
+from torch.nn import Module
+from torch.nn.init import constant_
+from torch.nn.init import xavier_normal_
+from torch.nn.init import xavier_uniform_
+from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
+from torch.nn.parameter import Parameter
+
+from torch.nn import functional as F
+from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched
+
+
+class MultiheadAttention(Module):
+ __constants__ = ["batch_first"]
+ bias_k: Optional[torch.Tensor]
+ bias_v: Optional[torch.Tensor]
+
+ def __init__(
+ self,
+ embed_dim,
+ num_heads,
+ dropout=0.0,
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ kdim=None,
+ vdim=None,
+ batch_first=False,
+ linear1_cls=Linear,
+ linear2_cls=Linear,
+ device=None,
+ dtype=None,
+ ) -> None:
+ factory_kwargs = {"device": device, "dtype": dtype}
+ super(MultiheadAttention, self).__init__()
+ self.embed_dim = embed_dim
+ self.kdim = kdim if kdim is not None else embed_dim
+ self.vdim = vdim if vdim is not None else embed_dim
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.batch_first = batch_first
+ self.head_dim = embed_dim // num_heads
+ assert (
+ self.head_dim * num_heads == self.embed_dim
+ ), "embed_dim must be divisible by num_heads"
+
+ if add_bias_kv:
+ self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
+ self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
+ else:
+ self.bias_k = self.bias_v = None
+
+ if linear1_cls == Linear:
+ if not self._qkv_same_embed_dim:
+ self.q_proj_weight = Parameter(
+ torch.empty((embed_dim, embed_dim), **factory_kwargs)
+ )
+ self.k_proj_weight = Parameter(
+ torch.empty((embed_dim, self.kdim), **factory_kwargs)
+ )
+ self.v_proj_weight = Parameter(
+ torch.empty((embed_dim, self.vdim), **factory_kwargs)
+ )
+ self.register_parameter("in_proj_weight", None)
+ else:
+ self.in_proj_weight = Parameter(
+ torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
+ )
+ self.register_parameter("q_proj_weight", None)
+ self.register_parameter("k_proj_weight", None)
+ self.register_parameter("v_proj_weight", None)
+
+ if bias:
+ self.in_proj_bias = Parameter(
+ torch.empty(3 * embed_dim, **factory_kwargs)
+ )
+ else:
+ self.register_parameter("in_proj_bias", None)
+ self.out_proj = NonDynamicallyQuantizableLinear(
+ embed_dim, embed_dim, bias=bias, **factory_kwargs
+ )
+
+ self._reset_parameters()
+ else:
+ if not self._qkv_same_embed_dim:
+ raise NotImplementedError
+ else:
+ self.in_proj_linear = linear1_cls(
+ embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs
+ )
+ self.in_proj_weight = self.in_proj_linear.weight
+
+ self.register_parameter("q_proj_weight", None)
+ self.register_parameter("k_proj_weight", None)
+ self.register_parameter("v_proj_weight", None)
+
+ if bias:
+ self.in_proj_bias = self.in_proj_linear.bias
+ else:
+ self.register_parameter("in_proj_bias", None)
+
+ self.out_proj = linear2_cls(
+ embed_dim, embed_dim, bias=bias, **factory_kwargs
+ )
+
+ if self.bias_k is not None:
+ xavier_normal_(self.bias_k)
+ if self.bias_v is not None:
+ xavier_normal_(self.bias_v)
+
+ self.add_zero_attn = add_zero_attn
+
+ def _reset_parameters(self):
+ if self._qkv_same_embed_dim:
+ xavier_uniform_(self.in_proj_weight)
+ else:
+ xavier_uniform_(self.q_proj_weight)
+ xavier_uniform_(self.k_proj_weight)
+ xavier_uniform_(self.v_proj_weight)
+
+ if self.in_proj_bias is not None:
+ constant_(self.in_proj_bias, 0.0)
+ constant_(self.out_proj.bias, 0.0)
+
+ if self.bias_k is not None:
+ xavier_normal_(self.bias_k)
+ if self.bias_v is not None:
+ xavier_normal_(self.bias_v)
+
+ def __setstate__(self, state):
+ # Support loading old MultiheadAttention checkpoints generated by v1.1.0
+ if "_qkv_same_embed_dim" not in state:
+ state["_qkv_same_embed_dim"] = True
+
+ super(MultiheadAttention, self).__setstate__(state)
+
+ def forward(
+ self,
+ query: Tensor,
+ key: Tensor,
+ value: Tensor,
+ key_padding_mask: Optional[Tensor] = None,
+ need_weights: bool = True,
+ attn_mask: Optional[Tensor] = None,
+ average_attn_weights: bool = True,
+ cache=None,
+ ) -> Tuple[Tensor, Optional[Tensor]]:
+ any_nested = query.is_nested or key.is_nested or value.is_nested
+ query = key = value = query.transpose(1, 0)
+ attn_output = multi_head_attention_forward_patched(
+ query,
+ key,
+ value,
+ self.embed_dim,
+ self.num_heads,
+ self.in_proj_weight,
+ self.in_proj_bias,
+ self.bias_k,
+ self.bias_v,
+ self.add_zero_attn,
+ self.dropout,
+ self.out_proj.weight,
+ self.out_proj.bias,
+ training=self.training,
+ key_padding_mask=key_padding_mask,
+ need_weights=need_weights,
+ attn_mask=attn_mask,
+ average_attn_weights=average_attn_weights,
+ cache=cache,
+ )
+ return attn_output.transpose(1, 0)
diff --git a/GPT_SoVITS/AR/modules/embedding_onnx.py b/GPT_SoVITS/AR/modules/embedding_onnx.py
new file mode 100644
index 00000000..b93405b4
--- /dev/null
+++ b/GPT_SoVITS/AR/modules/embedding_onnx.py
@@ -0,0 +1,63 @@
+# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
+import math
+
+import torch
+from torch import nn
+
+
+class TokenEmbedding(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int,
+ vocab_size: int,
+ dropout: float = 0.0,
+ ):
+ super().__init__()
+
+ self.vocab_size = vocab_size
+ self.embedding_dim = embedding_dim
+
+ self.dropout = torch.nn.Dropout(p=dropout)
+ self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
+
+ @property
+ def weight(self) -> torch.Tensor:
+ return self.word_embeddings.weight
+
+ def embedding(self, index: int) -> torch.Tensor:
+ return self.word_embeddings.weight[index : index + 1]
+
+ def forward(self, x: torch.Tensor):
+ x = self.word_embeddings(x)
+ x = self.dropout(x)
+ return x
+
+
+class SinePositionalEmbedding(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int,
+ dropout: float = 0.0,
+ scale: bool = False,
+ alpha: bool = False,
+ ):
+ super().__init__()
+ self.embedding_dim = embedding_dim
+ self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
+ self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
+ self.dropout = torch.nn.Dropout(p=dropout)
+ self.reverse = False
+ self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))
+
+ def extend_pe(self, x):
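+        # Positions are derived from the input itself (cumsum of ones over time), so no fixed-size buffer is registered and the length stays dynamic.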
+ position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1)
+ scpe = (position * self.div_term).unsqueeze(0)
+ pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
+ pe = pe.contiguous().view(1, -1, self.embedding_dim)
+ return pe
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ pe = self.extend_pe(x)
+ output = x.unsqueeze(-1) if x.ndim == 2 else x
+ output = output * self.x_scale + self.alpha * pe
+ return self.dropout(output)
diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py
new file mode 100644
index 00000000..14bdb550
--- /dev/null
+++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py
@@ -0,0 +1,92 @@
+from torch.nn.functional import *
+from torch.nn.functional import (
+ _mha_shape_check,
+ _canonical_mask,
+ _none_or_dtype,
+ _in_projection_packed,
+)
+
+def multi_head_attention_forward_patched(
+ query,
+ key,
+ value,
+ embed_dim_to_check: int,
+ num_heads: int,
+ in_proj_weight,
+ in_proj_bias: Optional[Tensor],
+ bias_k: Optional[Tensor],
+ bias_v: Optional[Tensor],
+ add_zero_attn: bool,
+ dropout_p: float,
+ out_proj_weight: Tensor,
+ out_proj_bias: Optional[Tensor],
+ training: bool = True,
+ key_padding_mask: Optional[Tensor] = None,
+ need_weights: bool = True,
+ attn_mask: Optional[Tensor] = None,
+ use_separate_proj_weight: bool = False,
+ q_proj_weight: Optional[Tensor] = None,
+ k_proj_weight: Optional[Tensor] = None,
+ v_proj_weight: Optional[Tensor] = None,
+ static_k: Optional[Tensor] = None,
+ static_v: Optional[Tensor] = None,
+ average_attn_weights: bool = True,
+ is_causal: bool = False,
+ cache=None,
+) -> Tuple[Tensor, Optional[Tensor]]:
+
+ # set up shape vars
+ _, _, embed_dim = query.shape
+ attn_mask = _canonical_mask(
+ mask=attn_mask,
+ mask_name="attn_mask",
+ other_type=None,
+ other_name="",
+ target_type=query.dtype,
+ check_other=False,
+ )
+ head_dim = embed_dim // num_heads
+
+ proj_qkv = linear(query, in_proj_weight, in_proj_bias)
+ proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
+ q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2]
+
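+    # Per-layer KV cache: the first pass stores k/v directly; later passes overwrite the pre-padded last slot with the new token's k/v, then cycle the stage index to the next layer.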
+ if cache["first_infer"] == 1:
+ cache["k"][cache["stage"]] = k
+ cache["v"][cache["stage"]] = v
+ else:
+ cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
+ cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
+ k = cache["k"][cache["stage"]]
+ v = cache["v"][cache["stage"]]
+ cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
+
+ attn_mask = _canonical_mask(
+ mask=attn_mask,
+ mask_name="attn_mask",
+ other_type=None,
+ other_name="",
+ target_type=q.dtype,
+ check_other=False,
+ )
+ attn_mask = attn_mask.unsqueeze(0)
+
+ q = q.view(-1, num_heads, head_dim).transpose(0, 1)
+ k = k.view(-1, num_heads, head_dim).transpose(0, 1)
+ v = v.view(-1, num_heads, head_dim).transpose(0, 1)
+
+ dropout_p = 0.0
+ attn_mask = attn_mask.unsqueeze(0)
+ q = q.view(num_heads, -1, head_dim).unsqueeze(0)
+ k = k.view(num_heads, -1, head_dim).unsqueeze(0)
+ v = v.view(num_heads, -1, head_dim).unsqueeze(0)
+ attn_output = scaled_dot_product_attention(
+ q, k, v, attn_mask, dropout_p, is_causal
+ )
+ attn_output = (
+ attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
+ )
+ attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+ attn_output = attn_output.view(-1, 1, attn_output.size(1))
+
+ return attn_output
diff --git a/GPT_SoVITS/AR/modules/transformer_onnx.py b/GPT_SoVITS/AR/modules/transformer_onnx.py
new file mode 100644
index 00000000..a3f68b43
--- /dev/null
+++ b/GPT_SoVITS/AR/modules/transformer_onnx.py
@@ -0,0 +1,292 @@
+# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py
+import copy
+import numbers
+from functools import partial
+from typing import Any
+from typing import Callable
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import torch
+from AR.modules.activation_onnx import MultiheadAttention
+from AR.modules.scaling import BalancedDoubleSwish
+from torch import nn
+from torch import Tensor
+from torch.nn import functional as F
+
+_shape_t = Union[int, List[int], torch.Size]
+
+
+class LayerNorm(nn.Module):
+ __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
+ normalized_shape: Tuple[int, ...]
+ eps: float
+ elementwise_affine: bool
+
+ def __init__(
+ self,
+ normalized_shape: _shape_t,
+ eps: float = 1e-5,
+ elementwise_affine: bool = True,
+ device=None,
+ dtype=None,
+ ) -> None:
+ factory_kwargs = {"device": device, "dtype": dtype}
+ super(LayerNorm, self).__init__()
+ if isinstance(normalized_shape, numbers.Integral):
+ # mypy error: incompatible types in assignment
+ normalized_shape = (normalized_shape,) # type: ignore[assignment]
+ self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type]
+ self.eps = eps
+ self.elementwise_affine = elementwise_affine
+ if self.elementwise_affine:
+ self.weight = nn.Parameter(
+ torch.empty(self.normalized_shape, **factory_kwargs)
+ )
+ self.bias = nn.Parameter(
+ torch.empty(self.normalized_shape, **factory_kwargs)
+ )
+ else:
+ self.register_parameter("weight", None)
+ self.register_parameter("bias", None)
+
+ self.reset_parameters()
+
+ def reset_parameters(self) -> None:
+ if self.elementwise_affine:
+ nn.init.ones_(self.weight)
+ nn.init.zeros_(self.bias)
+
+ def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
+ if isinstance(input, tuple):
+ input, embedding = input
+ return (
+ F.layer_norm(
+ input,
+ self.normalized_shape,
+ self.weight,
+ self.bias,
+ self.eps,
+ ),
+ embedding,
+ )
+
+ assert embedding is None
+ return F.layer_norm(
+ input, self.normalized_shape, self.weight, self.bias, self.eps
+ )
+
+ def extra_repr(self) -> str:
+ return (
+ "{normalized_shape}, eps={eps}, "
+ "elementwise_affine={elementwise_affine}".format(**self.__dict__)
+ )
+
+
+class IdentityNorm(nn.Module):
+ def __init__(
+ self,
+ d_model: int,
+ eps: float = 1e-5,
+ device=None,
+ dtype=None,
+ ) -> None:
+ super(IdentityNorm, self).__init__()
+
+ def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
+ if isinstance(input, tuple):
+ return input
+
+ assert embedding is None
+ return input
+
+
+class TransformerEncoder(nn.Module):
+ r"""TransformerEncoder is a stack of N encoder layers. Users can build the
+ BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.
+
+ Args:
+ encoder_layer: an instance of the TransformerEncoderLayer() class (required).
+ num_layers: the number of sub-encoder-layers in the encoder (required).
+ norm: the layer normalization component (optional).
+ enable_nested_tensor: if True, input will automatically convert to nested tensor
+ (and convert back on output). This will improve the overall performance of
+ TransformerEncoder when padding rate is high. Default: ``True`` (enabled).
+
+ Examples::
+ >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
+ >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6)
+ >>> src = torch.rand(10, 32, 512)
+ >>> out = transformer_encoder(src)
+ """
+ __constants__ = ["norm"]
+
+ def __init__(self, encoder_layer, num_layers, norm=None):
+ super(TransformerEncoder, self).__init__()
+ self.layers = _get_clones(encoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+
+ def forward(
+ self,
+ src: Tensor,
+ mask: Optional[Tensor] = None,
+ src_key_padding_mask: Optional[Tensor] = None,
+ return_layer_states: bool = False,
+ cache=None,
+ ) -> Tensor:
+ output = src
+ for mod in self.layers:
+ output = mod(
+ output,
+ src_mask=mask,
+ src_key_padding_mask=src_key_padding_mask,
+ cache=cache,
+ )
+
+ if self.norm is not None:
+ output = self.norm(output)
+
+ return output
+
+
+class TransformerEncoderLayer(nn.Module):
+ __constants__ = ["batch_first", "norm_first"]
+ def __init__(
+ self,
+ d_model: int,
+ nhead: int,
+ dim_feedforward: int = 2048,
+ dropout: float = 0.1,
+ activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
+ batch_first: bool = False,
+ norm_first: bool = False,
+ device=None,
+ dtype=None,
+ linear1_self_attention_cls: nn.Module = nn.Linear,
+ linear2_self_attention_cls: nn.Module = nn.Linear,
+ linear1_feedforward_cls: nn.Module = nn.Linear,
+ linear2_feedforward_cls: nn.Module = nn.Linear,
+ layer_norm_cls: nn.Module = LayerNorm,
+ layer_norm_eps: float = 1e-5,
+ adaptive_layer_norm=False,
+ ) -> None:
+ factory_kwargs = {"device": device, "dtype": dtype}
+ super(TransformerEncoderLayer, self).__init__()
+ self.self_attn = MultiheadAttention(
+ d_model, # 512 16
+ nhead,
+ dropout=dropout,
+ batch_first=batch_first,
+ linear1_cls=linear1_self_attention_cls,
+ linear2_cls=linear2_self_attention_cls,
+ **factory_kwargs,
+ )
+ self.linear1 = linear1_feedforward_cls(
+ d_model, dim_feedforward, **factory_kwargs
+ )
+ self.dropout = nn.Dropout(dropout)
+ self.linear2 = linear2_feedforward_cls(
+ dim_feedforward, d_model, **factory_kwargs
+ )
+ self.norm_first = norm_first
+ self.dropout1 = nn.Dropout(dropout)
+ self.dropout2 = nn.Dropout(dropout)
+ if isinstance(activation, str):
+ activation = _get_activation_fn(activation)
+ elif isinstance(activation, partial):
+ activation = activation(d_model)
+ elif activation == BalancedDoubleSwish:
+ activation = BalancedDoubleSwish(d_model)
+ self.activation = activation
+
+ norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
+ if layer_norm_cls == IdentityNorm:
+ norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
+ else:
+ norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
+
+ if adaptive_layer_norm:
+ self.norm1 = AdaptiveLayerNorm(d_model, norm1)
+ self.norm2 = AdaptiveLayerNorm(d_model, norm2)
+ else:
+ self.norm1 = norm1
+ self.norm2 = norm2
+
+ def __setstate__(self, state):
+ super(TransformerEncoderLayer, self).__setstate__(state)
+ if not hasattr(self, "activation"):
+ self.activation = F.relu
+
+ def forward(
+ self,
+ src: Tensor,
+ src_mask: Optional[Tensor] = None,
+ src_key_padding_mask: Optional[Tensor] = None,
+ cache=None,
+ ) -> Tensor:
+ x = src
+ stage_embedding = None
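+        # The cache dict is threaded into self-attention so each layer reads and updates its own K/V slot.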
+ x = self.norm1(
+ x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache),
+ stage_embedding,
+ )
+ x = self.norm2(x + self._ff_block(x), stage_embedding)
+
+ return x
+
+ def _sa_block(
+ self,
+ x: Tensor,
+ attn_mask: Optional[Tensor],
+ key_padding_mask: Optional[Tensor],
+ cache=None,
+ ) -> Tensor:
+ x = self.self_attn(
+ x,
+ x,
+ x,
+ attn_mask=attn_mask,
+ key_padding_mask=key_padding_mask,
+ need_weights=False,
+ cache=cache,
+ )
+ return self.dropout1(x)
+
+ def _ff_block(self, x: Tensor) -> Tensor:
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
+ return self.dropout2(x)
+
+
+class AdaptiveLayerNorm(nn.Module):
+ r"""Adaptive Layer Normalization"""
+
+ def __init__(self, d_model, norm) -> None:
+ super(AdaptiveLayerNorm, self).__init__()
+ self.project_layer = nn.Linear(d_model, 2 * d_model)
+ self.norm = norm
+ self.d_model = d_model
+ self.eps = self.norm.eps
+
+ def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor:
+ if isinstance(input, tuple):
+ input, embedding = input
+ weight, bias = torch.split(
+ self.project_layer(embedding),
+ split_size_or_sections=self.d_model,
+ dim=-1,
+ )
+ return (weight * self.norm(input) + bias, embedding)
+
+ weight, bias = torch.split(
+ self.project_layer(embedding),
+ split_size_or_sections=self.d_model,
+ dim=-1,
+ )
+ return weight * self.norm(input) + bias
+
+
+def _get_clones(module, N):
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index dbc7eb3d..bb571833 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -1,4 +1,10 @@
-import os
+import os,re,logging
+logging.getLogger("markdown_it").setLevel(logging.ERROR)
+logging.getLogger("urllib3").setLevel(logging.ERROR)
+logging.getLogger("httpcore").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)
+logging.getLogger("asyncio").setLevel(logging.ERROR)
+import pdb
gpt_path = os.environ.get(
"gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
@@ -31,8 +37,18 @@ from text.cleaner import clean_text
from time import time as ttime
from module.mel_processing import spectrogram_torch
from my_utils import load_audio
+from tools.i18n.i18n import I18nAuto
+i18n = I18nAuto()
+
+os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。
+
+if torch.cuda.is_available():
+ device = "cuda"
+elif torch.backends.mps.is_available():
+ device = "mps"
+else:
+ device = "cpu"
-device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
if is_half == True:
@@ -40,8 +56,6 @@ if is_half == True:
else:
bert_model = bert_model.to(device)
-
-# bert_model=bert_model.to(device)
def get_bert_feature(text, word2ph):
with torch.no_grad():
inputs = tokenizer(text, return_tensors="pt")
@@ -55,15 +69,8 @@ def get_bert_feature(text, word2ph):
repeat_feature = res[i].repeat(word2ph[i], 1)
phone_level_feature.append(repeat_feature)
phone_level_feature = torch.cat(phone_level_feature, dim=0)
- # if(is_half==True):phone_level_feature=phone_level_feature.half()
return phone_level_feature.T
-
-n_semantic = 1024
-
-dict_s2=torch.load(sovits_path,map_location="cpu")
-hps=dict_s2["config"]
-
class DictToAttrRecursive(dict):
def __init__(self, input_dict):
super().__init__(input_dict)
@@ -92,40 +99,48 @@ class DictToAttrRecursive(dict):
raise AttributeError(f"Attribute {item} not found")
-hps = DictToAttrRecursive(hps)
-
-hps.model.semantic_frame_rate = "25hz"
-dict_s1 = torch.load(gpt_path, map_location="cpu")
-config = dict_s1["config"]
ssl_model = cnhubert.get_model()
if is_half == True:
ssl_model = ssl_model.half().to(device)
else:
ssl_model = ssl_model.to(device)
-vq_model = SynthesizerTrn(
- hps.data.filter_length // 2 + 1,
- hps.train.segment_size // hps.data.hop_length,
- n_speakers=hps.data.n_speakers,
- **hps.model
-)
-if is_half == True:
- vq_model = vq_model.half().to(device)
-else:
- vq_model = vq_model.to(device)
-vq_model.eval()
-print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
-hz = 50
-max_sec = config["data"]["max_sec"]
-t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False)
-t2s_model.load_state_dict(dict_s1["weight"])
-if is_half == True:
- t2s_model = t2s_model.half()
-t2s_model = t2s_model.to(device)
-t2s_model.eval()
-total = sum([param.nelement() for param in t2s_model.parameters()])
-print("Number of parameter: %.2fM" % (total / 1e6))
+def change_sovits_weights(sovits_path):
+ global vq_model,hps
+ dict_s2=torch.load(sovits_path,map_location="cpu")
+ hps=dict_s2["config"]
+ hps = DictToAttrRecursive(hps)
+ hps.model.semantic_frame_rate = "25hz"
+ vq_model = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ n_speakers=hps.data.n_speakers,
+ **hps.model
+ )
+ del vq_model.enc_q
+ if is_half == True:
+ vq_model = vq_model.half().to(device)
+ else:
+ vq_model = vq_model.to(device)
+ vq_model.eval()
+ print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
+change_sovits_weights(sovits_path)
+def change_gpt_weights(gpt_path):
+ global hz,max_sec,t2s_model,config
+ hz = 50
+ dict_s1 = torch.load(gpt_path, map_location="cpu")
+ config = dict_s1["config"]
+ max_sec = config["data"]["max_sec"]
+ t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
+ t2s_model.load_state_dict(dict_s1["weight"])
+ if is_half == True:
+ t2s_model = t2s_model.half()
+ t2s_model = t2s_model.to(device)
+ t2s_model.eval()
+ total = sum([param.nelement() for param in t2s_model.parameters()])
+ print("Number of parameter: %.2fM" % (total / 1e6))
+change_gpt_weights(gpt_path)
def get_spepc(hps, filename):
audio = load_audio(filename, int(hps.data.sampling_rate))
@@ -143,7 +158,11 @@ def get_spepc(hps, filename):
return spec
-dict_language = {"中文": "zh", "英文": "en", "日文": "ja"}
+dict_language={
+ i18n("中文"):"zh",
+ i18n("英文"):"en",
+ i18n("日文"):"ja"
+}
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
@@ -179,19 +198,22 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
phones1 = cleaned_text_to_sequence(phones1)
texts = text.split("\n")
audio_opt = []
+
+ if prompt_language == "zh":
+ bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
+ else:
+ bert1 = torch.zeros(
+ (1024, len(phones1)),
+ dtype=torch.float16 if is_half == True else torch.float32,
+ ).to(device)
+
for text in texts:
# 解决输入目标文本的空行导致报错的问题
if (len(text.strip()) == 0):
continue
phones2, word2ph2, norm_text2 = clean_text(text, text_language)
phones2 = cleaned_text_to_sequence(phones2)
- if prompt_language == "zh":
- bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
- else:
- bert1 = torch.zeros(
- (1024, len(phones1)),
- dtype=torch.float16 if is_half == True else torch.float32,
- ).to(device)
+
if text_language == "zh":
bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
else:
@@ -319,50 +341,83 @@ def cut3(inp):
inp = inp.strip("\n")
return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
+def custom_sort_key(s):
+    # Use a regular expression to split the string into numeric and non-numeric parts
+    parts = re.split(r'(\d+)', s)
+    # Convert the numeric parts to integers and leave the rest unchanged
+    parts = [int(part) if part.isdigit() else part for part in parts]
+ return parts
+
+def change_choices():
+ SoVITS_names, GPT_names = get_weights_names()
+ return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
+
+pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
+pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+SoVITS_weight_root="SoVITS_weights"
+GPT_weight_root="GPT_weights"
+os.makedirs(SoVITS_weight_root,exist_ok=True)
+os.makedirs(GPT_weight_root,exist_ok=True)
+def get_weights_names():
+ SoVITS_names = [pretrained_sovits_name]
+ for name in os.listdir(SoVITS_weight_root):
+ if name.endswith(".pth"):SoVITS_names.append("%s/%s"%(SoVITS_weight_root,name))
+ GPT_names = [pretrained_gpt_name]
+ for name in os.listdir(GPT_weight_root):
+ if name.endswith(".ckpt"): GPT_names.append("%s/%s"%(GPT_weight_root,name))
+ return SoVITS_names,GPT_names
+SoVITS_names,GPT_names = get_weights_names()
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gr.Markdown(
- value="本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE."
+ value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
)
- # with gr.Tabs():
- # with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
with gr.Group():
- gr.Markdown(value="*请上传并填写参考信息")
+ gr.Markdown(value=i18n("模型切换"))
with gr.Row():
- inp_ref = gr.Audio(label="请上传参考音频", type="filepath")
- prompt_text = gr.Textbox(label="参考音频的文本", value="")
+ GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path,interactive=True)
+ SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path,interactive=True)
+ refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+ refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
+ SoVITS_dropdown.change(change_sovits_weights,[SoVITS_dropdown],[])
+ GPT_dropdown.change(change_gpt_weights,[GPT_dropdown],[])
+ gr.Markdown(value=i18n("*请上传并填写参考信息"))
+ with gr.Row():
+ inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath")
+ prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
prompt_language = gr.Dropdown(
- label="参考音频的语种", choices=["中文", "英文", "日文"], value="中文"
+ label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文")
)
- gr.Markdown(value="*请填写需要合成的目标文本")
+ gr.Markdown(value=i18n("*请填写需要合成的目标文本"))
with gr.Row():
- text = gr.Textbox(label="需要合成的文本", value="")
+ text = gr.Textbox(label=i18n("需要合成的文本"), value="")
text_language = gr.Dropdown(
- label="需要合成的语种", choices=["中文", "英文", "日文"], value="中文"
+ label=i18n("需要合成的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文")
)
- inference_button = gr.Button("合成语音", variant="primary")
- output = gr.Audio(label="输出的语音")
+ inference_button = gr.Button(i18n("合成语音"), variant="primary")
+ output = gr.Audio(label=i18n("输出的语音"))
inference_button.click(
get_tts_wav,
[inp_ref, prompt_text, prompt_language, text, text_language],
[output],
)
- gr.Markdown(value="文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")
+ gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
with gr.Row():
- text_inp = gr.Textbox(label="需要合成的切分前文本", value="")
- button1 = gr.Button("凑五句一切", variant="primary")
- button2 = gr.Button("凑50字一切", variant="primary")
- button3 = gr.Button("按中文句号。切", variant="primary")
- text_opt = gr.Textbox(label="切分后文本", value="")
+ text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"),value="")
+ button1 = gr.Button(i18n("凑五句一切"), variant="primary")
+ button2 = gr.Button(i18n("凑50字一切"), variant="primary")
+ button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
+ text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
button1.click(cut1, [text_inp], [text_opt])
button2.click(cut2, [text_inp], [text_opt])
button3.click(cut3, [text_inp], [text_opt])
- gr.Markdown(value="后续将支持混合语种编码文本输入。")
+ gr.Markdown(value=i18n("后续将支持混合语种编码文本输入。"))
app.queue(concurrency_count=511, max_size=1022).launch(
server_name="0.0.0.0",
inbrowser=True,
+ share=is_share,
server_port=infer_ttswebui,
quiet=True,
)
diff --git a/GPT_SoVITS/module/attentions_onnx.py b/GPT_SoVITS/module/attentions_onnx.py
new file mode 100644
index 00000000..df0ae824
--- /dev/null
+++ b/GPT_SoVITS/module/attentions_onnx.py
@@ -0,0 +1,365 @@
+import logging
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from module import commons
+from module.modules import LayerNorm
+
+
+class LayerNorm(nn.Module):
+ def __init__(self, channels, eps=1e-5):
+ super().__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = nn.Parameter(torch.ones(channels))
+ self.beta = nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ x = x.transpose(1, -1)
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+ return x.transpose(1, -1)
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+class Encoder(nn.Module):
+ def __init__(
+ self,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size=1,
+ p_dropout=0.0,
+ window_size=4,
+ isflow=True,
+ **kwargs
+ ):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+ # if isflow:
+ # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
+ # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
+ # self.cond_layer = weight_norm(cond_layer, name='weight')
+ # self.gin_channels = 256
+ self.cond_layer_idx = self.n_layers
+ if "gin_channels" in kwargs:
+ self.gin_channels = kwargs["gin_channels"]
+ if self.gin_channels != 0:
+ self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
+ # vits2 says 3rd block, so idx is 2 by default
+ self.cond_layer_idx = (
+ kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
+ )
+ logging.debug(self.gin_channels, self.cond_layer_idx)
+ assert (
+ self.cond_layer_idx < self.n_layers
+ ), "cond_layer_idx should be less than n_layers"
+ self.drop = nn.Dropout(p_dropout)
+ self.attn_layers = nn.ModuleList()
+ self.norm_layers_1 = nn.ModuleList()
+ self.ffn_layers = nn.ModuleList()
+ self.norm_layers_2 = nn.ModuleList()
+ for i in range(self.n_layers):
+ self.attn_layers.append(
+ MultiHeadAttention(
+ hidden_channels,
+ hidden_channels,
+ n_heads,
+ p_dropout=p_dropout,
+ window_size=window_size,
+ )
+ )
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
+ self.ffn_layers.append(
+ FFN(
+ hidden_channels,
+ hidden_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=p_dropout,
+ )
+ )
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+ def forward(self, x, x_mask, g=None):
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ x = x * x_mask
+ for i in range(self.n_layers):
+ if i == self.cond_layer_idx and g is not None:
+ g = self.spk_emb_linear(g.transpose(1, 2))
+ g = g.transpose(1, 2)
+ x = x + g
+ x = x * x_mask
+ y = self.attn_layers[i](x, x, attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_1[i](x + y)
+
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
+
+
+class MultiHeadAttention(nn.Module):
+ def __init__(
+ self,
+ channels,
+ out_channels,
+ n_heads,
+ p_dropout=0.0,
+ window_size=None,
+ heads_share=True,
+ block_length=None,
+ proximal_bias=False,
+ proximal_init=False,
+ ):
+ super().__init__()
+ assert channels % n_heads == 0
+
+ self.channels = channels
+ self.out_channels = out_channels
+ self.n_heads = n_heads
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+ self.heads_share = heads_share
+ self.block_length = block_length
+ self.proximal_bias = proximal_bias
+ self.proximal_init = proximal_init
+ self.attn = None
+
+ self.k_channels = channels // n_heads
+ self.conv_q = nn.Conv1d(channels, channels, 1)
+ self.conv_k = nn.Conv1d(channels, channels, 1)
+ self.conv_v = nn.Conv1d(channels, channels, 1)
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
+ self.drop = nn.Dropout(p_dropout)
+
+ if window_size is not None:
+ n_heads_rel = 1 if heads_share else n_heads
+ rel_stddev = self.k_channels**-0.5
+ self.emb_rel_k = nn.Parameter(
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+ * rel_stddev
+ )
+ self.emb_rel_v = nn.Parameter(
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+ * rel_stddev
+ )
+
+ nn.init.xavier_uniform_(self.conv_q.weight)
+ nn.init.xavier_uniform_(self.conv_k.weight)
+ nn.init.xavier_uniform_(self.conv_v.weight)
+ if proximal_init:
+ with torch.no_grad():
+ self.conv_k.weight.copy_(self.conv_q.weight)
+ self.conv_k.bias.copy_(self.conv_q.bias)
+
+ def forward(self, x, c, attn_mask=None):
+ q = self.conv_q(x)
+ k = self.conv_k(c)
+ v = self.conv_v(c)
+
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+ x = self.conv_o(x)
+ return x
+
+ def attention(self, query, key, value, mask=None):
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
+ b, d, t_s, _ = (*key.size(), query.size(2))
+ query = query.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
+ key = key.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
+ value = value.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
+
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+ if self.window_size is not None:
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+ rel_logits = self._matmul_with_relative_keys(
+ query / math.sqrt(self.k_channels), key_relative_embeddings
+ )
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
+ scores = scores + scores_local
+ if mask is not None:
+ scores = scores.masked_fill(mask == 0, -1e4)
+ if self.block_length is not None:
+ block_mask = (
+ torch.ones_like(scores)
+ .triu(-self.block_length)
+ .tril(self.block_length)
+ )
+ scores = scores.masked_fill(block_mask == 0, -1e4)
+ p_attn = F.softmax(scores, dim=-1)
+ p_attn = self.drop(p_attn)
+ output = torch.matmul(p_attn, value)
+ if self.window_size is not None:
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
+ value_relative_embeddings = self._get_relative_embeddings(
+ self.emb_rel_v, t_s
+ )
+ output = output + self._matmul_with_relative_values(
+ relative_weights, value_relative_embeddings
+ )
+ output = (
+ output.transpose(2, 3).contiguous().view(b, d, -1)
+ )
+ return output, p_attn
+
+ def _matmul_with_relative_values(self, x, y):
+ """
+ x: [b, h, l, m]
+ y: [h or 1, m, d]
+ ret: [b, h, l, d]
+ """
+ ret = torch.matmul(x, y.unsqueeze(0))
+ return ret
+
+ def _matmul_with_relative_keys(self, x, y):
+ """
+ x: [b, h, l, d]
+ y: [h or 1, m, d]
+ ret: [b, h, l, m]
+ """
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+ return ret
+
+ def _get_relative_embeddings(self, relative_embeddings, length):
+ max_relative_position = 2 * self.window_size + 1
+ # Pad first before slice to avoid using cond ops.
+ pad_length = max(length - (self.window_size + 1), 0)
+ slice_start_position = max((self.window_size + 1) - length, 0)
+ slice_end_position = slice_start_position + 2 * length - 1
+ if pad_length > 0:
+ padded_relative_embeddings = F.pad(
+ relative_embeddings,
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+ )
+ else:
+ padded_relative_embeddings = relative_embeddings
+ used_relative_embeddings = padded_relative_embeddings[
+ :, slice_start_position:slice_end_position
+ ]
+ return used_relative_embeddings
+
+ def _relative_position_to_absolute_position(self, x):
+ """
+ x: [b, h, l, 2*l-1]
+ ret: [b, h, l, l]
+ """
+ batch, heads, length, _ = x.size()
+ # Concat columns of pad to shift from relative to absolute indexing.
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
+ x_flat = x.view([batch, heads, length * 2 * length])
+ x_flat = F.pad(
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+ )
+
+ # Reshape and slice out the padded elements.
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
+ :, :, :length, length - 1 :
+ ]
+ return x_final
+
+ def _absolute_position_to_relative_position(self, x):
+ """
+ x: [b, h, l, l]
+ ret: [b, h, l, 2*l-1]
+ """
+ batch, heads, length, _ = x.size()
+        # pad along the column dimension
+ x = F.pad(
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+ )
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+ # add 0's in the beginning that will skew the elements after reshape
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+ return x_final
+
+ def _attention_bias_proximal(self, length):
+ """Bias for self-attention to encourage attention to close positions.
+ Args:
+ length: an integer scalar.
+ Returns:
+ a Tensor with shape [1, 1, length, length]
+ """
+ r = torch.arange(length, dtype=torch.float32)
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=0.0,
+ activation=None,
+ causal=False,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.activation = activation
+ self.causal = causal
+
+ if causal:
+ self.padding = self._causal_padding
+ else:
+ self.padding = self._same_padding
+
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+ self.drop = nn.Dropout(p_dropout)
+
+ def forward(self, x, x_mask):
+ x = self.conv_1(self.padding(x * x_mask))
+ if self.activation == "gelu":
+ x = x * torch.sigmoid(1.702 * x)
+ else:
+ x = torch.relu(x)
+ x = self.drop(x)
+ x = self.conv_2(self.padding(x * x_mask))
+ return x * x_mask
+
+ def _causal_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = self.kernel_size - 1
+ pad_r = 0
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, commons.convert_pad_shape(padding))
+ return x
+
+ def _same_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = (self.kernel_size - 1) // 2
+ pad_r = self.kernel_size // 2
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, commons.convert_pad_shape(padding))
+ return x
diff --git a/GPT_SoVITS/module/models_onnx.py b/GPT_SoVITS/module/models_onnx.py
new file mode 100644
index 00000000..35fd291f
--- /dev/null
+++ b/GPT_SoVITS/module/models_onnx.py
@@ -0,0 +1,920 @@
+import copy
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from module import commons
+from module import modules
+from module import attentions_onnx as attentions
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from module.commons import init_weights, get_padding
+from module.mrte_model import MRTE
+from module.quantize import ResidualVectorQuantizer
+from text import symbols
+from torch.cuda.amp import autocast
+
+
+class StochasticDurationPredictor(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout,
+ n_flows=4,
+ gin_channels=0,
+ ):
+ super().__init__()
+        filter_channels = in_channels  # it needs to be removed in a future version.
+ self.in_channels = in_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.log_flow = modules.Log()
+ self.flows = nn.ModuleList()
+ self.flows.append(modules.ElementwiseAffine(2))
+ for i in range(n_flows):
+ self.flows.append(
+ modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
+ )
+ self.flows.append(modules.Flip())
+
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
+ self.post_convs = modules.DDSConv(
+ filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
+ )
+ self.post_flows = nn.ModuleList()
+ self.post_flows.append(modules.ElementwiseAffine(2))
+ for i in range(4):
+ self.post_flows.append(
+ modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
+ )
+ self.post_flows.append(modules.Flip())
+
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
+ self.convs = modules.DDSConv(
+ filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
+ )
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
+
+ def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
+ x = torch.detach(x)
+ x = self.pre(x)
+ if g is not None:
+ g = torch.detach(g)
+ x = x + self.cond(g)
+ x = self.convs(x, x_mask)
+ x = self.proj(x) * x_mask
+
+ if not reverse:
+ flows = self.flows
+ assert w is not None
+
+ logdet_tot_q = 0
+ h_w = self.post_pre(w)
+ h_w = self.post_convs(h_w, x_mask)
+ h_w = self.post_proj(h_w) * x_mask
+ e_q = (
+ torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
+ * x_mask
+ )
+ z_q = e_q
+ for flow in self.post_flows:
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
+ logdet_tot_q += logdet_q
+ z_u, z1 = torch.split(z_q, [1, 1], 1)
+ u = torch.sigmoid(z_u) * x_mask
+ z0 = (w - u) * x_mask
+ logdet_tot_q += torch.sum(
+ (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
+ )
+ logq = (
+ torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
+ - logdet_tot_q
+ )
+
+ logdet_tot = 0
+ z0, logdet = self.log_flow(z0, x_mask)
+ logdet_tot += logdet
+ z = torch.cat([z0, z1], 1)
+ for flow in flows:
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
+ logdet_tot = logdet_tot + logdet
+ nll = (
+ torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
+ - logdet_tot
+ )
+ return nll + logq # [b]
+ else:
+ flows = list(reversed(self.flows))
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
+ z = (
+ torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
+ * noise_scale
+ )
+ for flow in flows:
+ z = flow(z, x_mask, g=x, reverse=reverse)
+ z0, z1 = torch.split(z, [1, 1], 1)
+ logw = z0
+ return logw
+
+
+class DurationPredictor(nn.Module):
+ def __init__(
+ self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
+ ):
+ super().__init__()
+
+ self.in_channels = in_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.gin_channels = gin_channels
+
+ self.drop = nn.Dropout(p_dropout)
+ self.conv_1 = nn.Conv1d(
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
+ )
+ self.norm_1 = modules.LayerNorm(filter_channels)
+ self.conv_2 = nn.Conv1d(
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
+ )
+ self.norm_2 = modules.LayerNorm(filter_channels)
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
+
+ def forward(self, x, x_mask, g=None):
+ x = torch.detach(x)
+ if g is not None:
+ g = torch.detach(g)
+ x = x + self.cond(g)
+ x = self.conv_1(x * x_mask)
+ x = torch.relu(x)
+ x = self.norm_1(x)
+ x = self.drop(x)
+ x = self.conv_2(x * x_mask)
+ x = torch.relu(x)
+ x = self.norm_2(x)
+ x = self.drop(x)
+ x = self.proj(x * x_mask)
+ return x * x_mask
+
+
+class TextEncoder(nn.Module):
+ def __init__(
+ self,
+ out_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ latent_channels=192,
+ ):
+ super().__init__()
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.latent_channels = latent_channels
+
+ self.ssl_proj = nn.Conv1d(768, hidden_channels, 1)
+
+ self.encoder_ssl = attentions.Encoder(
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers // 2,
+ kernel_size,
+ p_dropout,
+ )
+
+ self.encoder_text = attentions.Encoder(
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+ )
+ self.text_embedding = nn.Embedding(len(symbols), hidden_channels)
+
+ self.mrte = MRTE()
+
+ self.encoder2 = attentions.Encoder(
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers // 2,
+ kernel_size,
+ p_dropout,
+ )
+
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, y, text, ge):
+ y_mask = torch.ones_like(y[:1,:1,:])
+
+ y = self.ssl_proj(y * y_mask) * y_mask
+ y = self.encoder_ssl(y * y_mask, y_mask)
+
+ text_mask = torch.ones_like(text).to(y.dtype).unsqueeze(0)
+
+ text = self.text_embedding(text).transpose(1, 2)
+ text = self.encoder_text(text * text_mask, text_mask)
+ y = self.mrte(y, y_mask, text, text_mask, ge)
+
+ y = self.encoder2(y * y_mask, y_mask)
+
+ stats = self.proj(y) * y_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ return y, m, logs, y_mask
+
+ def extract_latent(self, x):
+ x = self.ssl_proj(x)
+ quantized, codes, commit_loss, quantized_list = self.quantizer(x)
+ return codes.transpose(0, 1)
+
+ def decode_latent(self, codes, y_mask, refer, refer_mask, ge):
+ quantized = self.quantizer.decode(codes)
+
+ y = self.vq_proj(quantized) * y_mask
+ y = self.encoder_ssl(y * y_mask, y_mask)
+
+ y = self.mrte(y, y_mask, refer, refer_mask, ge)
+
+ y = self.encoder2(y * y_mask, y_mask)
+
+ stats = self.proj(y) * y_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ return y, m, logs, y_mask, quantized
+
+
+class ResidualCouplingBlock(nn.Module):
+ def __init__(
+ self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ n_flows=4,
+ gin_channels=0,
+ ):
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.flows = nn.ModuleList()
+ for i in range(n_flows):
+ self.flows.append(
+ modules.ResidualCouplingLayer(
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=gin_channels,
+ mean_only=True,
+ )
+ )
+ self.flows.append(modules.Flip())
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ else:
+ for flow in reversed(self.flows):
+ x = flow(x, x_mask, g=g, reverse=reverse)
+ return x
+
+
+class PosteriorEncoder(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = modules.WN(
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=gin_channels,
+ )
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths, g=None):
+        if g is not None:
+ g = g.detach()
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
+ x.dtype
+ )
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+ stats = self.proj(x) * x_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+ return z, m, logs, x_mask
+
+
+class WNEncoder(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = modules.WN(
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=gin_channels,
+ )
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+ self.norm = modules.LayerNorm(out_channels)
+
+ def forward(self, x, x_lengths, g=None):
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
+ x.dtype
+ )
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+ out = self.proj(x) * x_mask
+ out = self.norm(out)
+ return out
+
+
+class Generator(torch.nn.Module):
+ def __init__(
+ self,
+ initial_channel,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=0,
+ ):
+ super(Generator, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.conv_pre = Conv1d(
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
+ )
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(
+ weight_norm(
+ ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ k,
+ u,
+ padding=(k - u) // 2,
+ )
+ )
+ )
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel // (2 ** (i + 1))
+ for j, (k, d) in enumerate(
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
+ ):
+ self.resblocks.append(resblock(ch, k, d))
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ def forward(self, x, g=None):
+ x = self.conv_pre(x)
+ if g is not None:
+ x = x + self.cond(g)
+
+ for i in range(self.num_upsamples):
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ x = self.ups[i](x)
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i * self.num_kernels + j](x)
+ else:
+ xs += self.resblocks[i * self.num_kernels + j](x)
+ x = xs / self.num_kernels
+ x = F.leaky_relu(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+
+ return x
+
+ def remove_weight_norm(self):
+ print("Removing weight norm...")
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+
+
+class DiscriminatorP(torch.nn.Module):
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+ super(DiscriminatorP, self).__init__()
+ self.period = period
+ self.use_spectral_norm = use_spectral_norm
+        norm_f = spectral_norm if use_spectral_norm else weight_norm
+ self.convs = nn.ModuleList(
+ [
+ norm_f(
+ Conv2d(
+ 1,
+ 32,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ norm_f(
+ Conv2d(
+ 32,
+ 128,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ norm_f(
+ Conv2d(
+ 128,
+ 512,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ norm_f(
+ Conv2d(
+ 512,
+ 1024,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ norm_f(
+ Conv2d(
+ 1024,
+ 1024,
+ (kernel_size, 1),
+ 1,
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ ]
+ )
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+ def forward(self, x):
+ fmap = []
+
+ # 1d to 2d
+ b, c, t = x.shape
+ if t % self.period != 0: # pad first
+ n_pad = self.period - (t % self.period)
+ x = F.pad(x, (0, n_pad), "reflect")
+ t = t + n_pad
+ x = x.view(b, c, t // self.period, self.period)
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class DiscriminatorS(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(DiscriminatorS, self).__init__()
+        norm_f = spectral_norm if use_spectral_norm else weight_norm
+ self.convs = nn.ModuleList(
+ [
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+ ]
+ )
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+ def forward(self, x):
+ fmap = []
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(MultiPeriodDiscriminator, self).__init__()
+ periods = [2, 3, 5, 7, 11]
+
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+ discs = discs + [
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+ ]
+ self.discriminators = nn.ModuleList(discs)
+
+ def forward(self, y, y_hat):
+ y_d_rs = []
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ y_d_rs.append(y_d_r)
+ y_d_gs.append(y_d_g)
+ fmap_rs.append(fmap_r)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class ReferenceEncoder(nn.Module):
+ """
+ inputs --- [N, Ty/r, n_mels*r] mels
+ outputs --- [N, ref_enc_gru_size]
+ """
+
+ def __init__(self, spec_channels, gin_channels=0):
+ super().__init__()
+ self.spec_channels = spec_channels
+ ref_enc_filters = [32, 32, 64, 64, 128, 128]
+ K = len(ref_enc_filters)
+ filters = [1] + ref_enc_filters
+ convs = [
+ weight_norm(
+ nn.Conv2d(
+ in_channels=filters[i],
+ out_channels=filters[i + 1],
+ kernel_size=(3, 3),
+ stride=(2, 2),
+ padding=(1, 1),
+ )
+ )
+ for i in range(K)
+ ]
+ self.convs = nn.ModuleList(convs)
+ # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)])
+
+ out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
+ self.gru = nn.GRU(
+ input_size=ref_enc_filters[-1] * out_channels,
+ hidden_size=256 // 2,
+ batch_first=True,
+ )
+ self.proj = nn.Linear(128, gin_channels)
+
+ def forward(self, inputs):
+ N = inputs.size(0)
+ out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
+ for conv in self.convs:
+ out = conv(out)
+ # out = wn(out)
+ out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
+
+ out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
+ T = out.size(1)
+ N = out.size(0)
+ out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
+
+ self.gru.flatten_parameters()
+ memory, out = self.gru(out) # out --- [1, N, 128]
+
+ return self.proj(out.squeeze(0)).unsqueeze(-1)
+
+ def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
+ for i in range(n_convs):
+ L = (L - kernel_size + 2 * pad) // stride + 1
+ return L
+
+
+class Quantizer_module(torch.nn.Module):
+ def __init__(self, n_e, e_dim):
+ super(Quantizer_module, self).__init__()
+ self.embedding = nn.Embedding(n_e, e_dim)
+ self.embedding.weight.data.uniform_(-1.0 / n_e, 1.0 / n_e)
+
+ def forward(self, x):
+ d = (
+ torch.sum(x**2, 1, keepdim=True)
+ + torch.sum(self.embedding.weight**2, 1)
+ - 2 * torch.matmul(x, self.embedding.weight.T)
+ )
+ min_indicies = torch.argmin(d, 1)
+ z_q = self.embedding(min_indicies)
+ return z_q, min_indicies
+
+
+class Quantizer(torch.nn.Module):
+ def __init__(self, embed_dim=512, n_code_groups=4, n_codes=160):
+ super(Quantizer, self).__init__()
+ assert embed_dim % n_code_groups == 0
+ self.quantizer_modules = nn.ModuleList(
+ [
+ Quantizer_module(n_codes, embed_dim // n_code_groups)
+ for _ in range(n_code_groups)
+ ]
+ )
+ self.n_code_groups = n_code_groups
+ self.embed_dim = embed_dim
+
+ def forward(self, xin):
+ # B, C, T
+ B, C, T = xin.shape
+ xin = xin.transpose(1, 2)
+ x = xin.reshape(-1, self.embed_dim)
+ x = torch.split(x, self.embed_dim // self.n_code_groups, dim=-1)
+ min_indicies = []
+ z_q = []
+ for _x, m in zip(x, self.quantizer_modules):
+ _z_q, _min_indicies = m(_x)
+ z_q.append(_z_q)
+ min_indicies.append(_min_indicies) # B * T,
+ z_q = torch.cat(z_q, -1).reshape(xin.shape)
+ loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean(
+ (z_q - xin.detach()) ** 2
+ )
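+        # straight-through estimator: gradients flow to xin while the forward pass keeps the quantized values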
+ z_q = xin + (z_q - xin).detach()
+ z_q = z_q.transpose(1, 2)
+ codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups)
+ return z_q, loss, codes.transpose(1, 2)
+
+ def embed(self, x):
+ # idx: N, 4, T
+ x = x.transpose(1, 2)
+ x = torch.split(x, 1, 2)
+ ret = []
+ for q, embed in zip(x, self.quantizer_modules):
+ q = embed.embedding(q.squeeze(-1))
+ ret.append(q)
+ ret = torch.cat(ret, -1)
+ return ret.transpose(1, 2) # N, C, T
+
+
+class CodePredictor(nn.Module):
+ def __init__(
+ self,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ n_q=8,
+ dims=1024,
+ ssl_dim=768,
+ ):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+
+ self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1)
+ self.ref_enc = modules.MelStyleEncoder(
+ ssl_dim, style_vector_dim=hidden_channels
+ )
+
+ self.encoder = attentions.Encoder(
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+ )
+
+ self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1)
+ self.n_q = n_q
+ self.dims = dims
+
+ def forward(self, x, x_mask, refer, codes, infer=False):
+ x = x.detach()
+ x = self.vq_proj(x * x_mask) * x_mask
+ g = self.ref_enc(refer, x_mask)
+ x = x + g
+ x = self.encoder(x * x_mask, x_mask)
+ x = self.out_proj(x * x_mask) * x_mask
+ logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(
+ 2, 3
+ )
+ target = codes[1:].transpose(0, 1)
+ if not infer:
+ logits = logits.reshape(-1, self.dims)
+ target = target.reshape(-1)
+ loss = torch.nn.functional.cross_entropy(logits, target)
+ return loss
+ else:
+ _, top10_preds = torch.topk(logits, 10, dim=-1)
+ correct_top10 = torch.any(top10_preds == target.unsqueeze(-1), dim=-1)
+            top10_acc = 100 * torch.mean(correct_top10.float()).detach().cpu().item()
+
+            print("Top-10 Accuracy:", top10_acc, "%")
+
+ pred_codes = torch.argmax(logits, dim=-1)
+ acc = 100 * torch.mean((pred_codes == target).float()).detach().cpu().item()
+ print("Top-1 Accuracy:", acc, "%")
+
+ return pred_codes.transpose(0, 1)
+
+
+class SynthesizerTrn(nn.Module):
+ """
+ Synthesizer for Training
+ """
+
+ def __init__(
+ self,
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ n_speakers=0,
+ gin_channels=0,
+ use_sdp=True,
+ semantic_frame_rate=None,
+ freeze_quantizer=None,
+ **kwargs
+ ):
+ super().__init__()
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.n_speakers = n_speakers
+ self.gin_channels = gin_channels
+
+ self.use_sdp = use_sdp
+ self.enc_p = TextEncoder(
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ )
+ self.dec = Generator(
+ inter_channels,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=gin_channels,
+ )
+ self.enc_q = PosteriorEncoder(
+ spec_channels,
+ inter_channels,
+ hidden_channels,
+ 5,
+ 1,
+ 16,
+ gin_channels=gin_channels,
+ )
+ self.flow = ResidualCouplingBlock(
+ inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
+ )
+
+ self.ref_enc = modules.MelStyleEncoder(
+ spec_channels, style_vector_dim=gin_channels
+ )
+
+ ssl_dim = 768
+ self.ssl_dim = ssl_dim
+ assert semantic_frame_rate in ["25hz", "50hz"]
+ self.semantic_frame_rate = semantic_frame_rate
+ if semantic_frame_rate == "25hz":
+ self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
+ else:
+ self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
+
+ self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
+ if freeze_quantizer:
+ self.ssl_proj.requires_grad_(False)
+ self.quantizer.requires_grad_(False)
+ # self.enc_p.text_embedding.requires_grad_(False)
+ # self.enc_p.encoder_text.requires_grad_(False)
+ # self.enc_p.mrte.requires_grad_(False)
+
+ def forward(self, codes, text, refer):
+ refer_mask = torch.ones_like(refer[:1,:1,:])
+ ge = self.ref_enc(refer * refer_mask, refer_mask)
+
+ y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device)
+ text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
+
+ quantized = self.quantizer.decode(codes)
+ if self.semantic_frame_rate == "25hz":
+ dquantized = torch.cat([quantized, quantized]).permute(1, 2, 0)
+ quantized = dquantized.contiguous().view(1, self.ssl_dim, -1)
+
+ x, m_p, logs_p, y_mask = self.enc_p(
+ quantized, text, ge
+ )
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p)
+
+ z = self.flow(z_p, y_mask, g=ge, reverse=True)
+
+ o = self.dec((z * y_mask)[:, :, :], g=ge)
+ return o
+
+ def extract_latent(self, x):
+ ssl = self.ssl_proj(x)
+ quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
+ return codes.transpose(0, 1)
\ No newline at end of file
diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py
new file mode 100644
index 00000000..f08679f9
--- /dev/null
+++ b/GPT_SoVITS/onnx_export.py
@@ -0,0 +1,314 @@
+from module.models_onnx import SynthesizerTrn, symbols
+from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule
+import torch
+import torchaudio
+from torch import nn
+from feature_extractor import cnhubert
+cnhubert_base_path = "pretrained_models/chinese-hubert-base"
+cnhubert.cnhubert_base_path=cnhubert_base_path
+ssl_model = cnhubert.get_model()
+from text import cleaned_text_to_sequence
+import soundfile
+from my_utils import load_audio
+import os
+import json
+
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+ hann_window = torch.hann_window(win_size).to(
+ dtype=y.dtype, device=y.device
+ )
+ y = torch.nn.functional.pad(
+ y.unsqueeze(1),
+ (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
+ mode="reflect",
+ )
+ y = y.squeeze(1)
+ spec = torch.stft(
+ y,
+ n_fft,
+ hop_length=hop_size,
+ win_length=win_size,
+ window=hann_window,
+ center=center,
+ pad_mode="reflect",
+ normalized=False,
+ onesided=True,
+ return_complex=False,
+ )
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+ return spec
+
+
+class DictToAttrRecursive(dict):
+ def __init__(self, input_dict):
+ super().__init__(input_dict)
+ for key, value in input_dict.items():
+ if isinstance(value, dict):
+ value = DictToAttrRecursive(value)
+ self[key] = value
+ setattr(self, key, value)
+
+ def __getattr__(self, item):
+ try:
+ return self[item]
+ except KeyError:
+ raise AttributeError(f"Attribute {item} not found")
+
+ def __setattr__(self, key, value):
+ if isinstance(value, dict):
+ value = DictToAttrRecursive(value)
+ super(DictToAttrRecursive, self).__setitem__(key, value)
+ super().__setattr__(key, value)
+
+ def __delattr__(self, item):
+ try:
+ del self[item]
+ except KeyError:
+ raise AttributeError(f"Attribute {item} not found")
+
+
+class T2SEncoder(nn.Module):
+ def __init__(self, t2s, vits):
+ super().__init__()
+ self.encoder = t2s.onnx_encoder
+ self.vits = vits
+
+ def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content):
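+        # shapes (see the notes in T2SModel.forward below): ref_seq/text_seq [1, N], ref_bert/text_bert [N, 1024], ssl_content [1, 768, N]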
+ codes = self.vits.extract_latent(ssl_content)
+ prompt_semantic = codes[0, 0]
+ bert = torch.cat([ref_bert.transpose(0, 1), text_bert.transpose(0, 1)], 1)
+ all_phoneme_ids = torch.cat([ref_seq, text_seq], 1)
+ bert = bert.unsqueeze(0)
+ prompt = prompt_semantic.unsqueeze(0)
+ return self.encoder(all_phoneme_ids, bert), prompt
+
+
+class T2SModel(nn.Module):
+ def __init__(self, t2s_path, vits_model):
+ super().__init__()
+ dict_s1 = torch.load(t2s_path, map_location="cpu")
+ self.config = dict_s1["config"]
+ self.t2s_model = Text2SemanticLightningModule(self.config, "ojbk", is_train=False)
+ self.t2s_model.load_state_dict(dict_s1["weight"])
+ self.t2s_model.eval()
+ self.vits_model = vits_model.vq_model
+ self.hz = 50
+ self.max_sec = self.config["data"]["max_sec"]
+ self.t2s_model.model.top_k = torch.LongTensor([self.config["inference"]["top_k"]])
+ self.t2s_model.model.early_stop_num = torch.LongTensor([self.hz * self.max_sec])
+ self.t2s_model = self.t2s_model.model
+ self.t2s_model.init_onnx()
+ self.onnx_encoder = T2SEncoder(self.t2s_model, self.vits_model)
+ self.first_stage_decoder = self.t2s_model.first_stage_decoder
+ self.stage_decoder = self.t2s_model.stage_decoder
+ #self.t2s_model = torch.jit.script(self.t2s_model)
+
+ def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content):
+ early_stop_num = self.t2s_model.early_stop_num
+
+ #[1,N] [1,N] [N, 1024] [N, 1024] [1, 768, N]
+ x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content)
+
+ prefix_len = prompts.shape[1]
+
+ #[1,N,512] [1,N]
+ y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts)
+
+ stop = False
+ for idx in range(1, 1500):
+ #[1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N]
+ enco = self.stage_decoder(y, k, v, y_emb, x_example)
+ y, k, v, y_emb, logits, samples = enco
+ if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
+ stop = True
+ if torch.argmax(logits, dim=-1)[0] == self.t2s_model.EOS or samples[0, 0] == self.t2s_model.EOS:
+ stop = True
+ if stop:
+ break
+ y[0, -1] = 0
+
+ return y[:, -idx:].unsqueeze(0)
+
+ def export(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name, dynamo=False):
+ #self.onnx_encoder = torch.jit.script(self.onnx_encoder)
+ if dynamo:
+ export_options = torch.onnx.ExportOptions(dynamic_shapes=True)
+ onnx_encoder_export_output = torch.onnx.dynamo_export(
+ self.onnx_encoder,
+ (ref_seq, text_seq, ref_bert, text_bert, ssl_content),
+ export_options=export_options
+ )
+ onnx_encoder_export_output.save(f"onnx/{project_name}/{project_name}_t2s_encoder.onnx")
+ return
+ torch.onnx.export(
+ self.onnx_encoder,
+ (ref_seq, text_seq, ref_bert, text_bert, ssl_content),
+ f"onnx/{project_name}/{project_name}_t2s_encoder.onnx",
+ input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"],
+ output_names=["x", "prompts"],
+ dynamic_axes={
+ "ref_seq": [1],
+ "text_seq": [1],
+ "ref_bert": [0],
+ "text_bert": [0],
+ "ssl_content": [2],
+ },
+ opset_version=16
+ )
+ x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content)
+ torch.onnx.export(
+ self.first_stage_decoder,
+ (x, prompts),
+ f"onnx/{project_name}/{project_name}_t2s_fsdec.onnx",
+ input_names=["x", "prompts"],
+ output_names=["y", "k", "v", "y_emb", "x_example"],
+ dynamic_axes={
+ "x": [1],
+ "prompts": [1],
+ },
+ verbose=True,
+ opset_version=16
+ )
+ y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts)
+
+ torch.onnx.export(
+ self.stage_decoder,
+ (y, k, v, y_emb, x_example),
+ f"onnx/{project_name}/{project_name}_t2s_sdec.onnx",
+ input_names=["iy", "ik", "iv", "iy_emb", "ix_example"],
+ output_names=["y", "k", "v", "y_emb", "logits", "samples"],
+ dynamic_axes={
+ "iy": [1],
+ "ik": [1],
+ "iv": [1],
+ "iy_emb": [1],
+ "ix_example": [1],
+ },
+ verbose=True,
+ opset_version=16
+ )
+
+
+class VitsModel(nn.Module):
+ def __init__(self, vits_path):
+ super().__init__()
+ dict_s2 = torch.load(vits_path,map_location="cpu")
+ self.hps = dict_s2["config"]
+ self.hps = DictToAttrRecursive(self.hps)
+ self.hps.model.semantic_frame_rate = "25hz"
+ self.vq_model = SynthesizerTrn(
+ self.hps.data.filter_length // 2 + 1,
+ self.hps.train.segment_size // self.hps.data.hop_length,
+ n_speakers=self.hps.data.n_speakers,
+ **self.hps.model
+ )
+ self.vq_model.eval()
+ self.vq_model.load_state_dict(dict_s2["weight"], strict=False)
+
+ def forward(self, text_seq, pred_semantic, ref_audio):
+ refer = spectrogram_torch(
+ ref_audio,
+ self.hps.data.filter_length,
+ self.hps.data.sampling_rate,
+ self.hps.data.hop_length,
+ self.hps.data.win_length,
+ center=False
+ )
+ return self.vq_model(pred_semantic, text_seq, refer)[0, 0]
+
+
+class GptSoVits(nn.Module):
+ def __init__(self, vits, t2s):
+ super().__init__()
+ self.vits = vits
+ self.t2s = t2s
+
+ def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content):
+ pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content)
+ return self.vits(text_seq, pred_semantic, ref_audio)
+
+ def export(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, project_name):
+ self.t2s.export(ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name)
+ pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content)
+ torch.onnx.export(
+ self.vits,
+ (text_seq, pred_semantic, ref_audio),
+ f"onnx/{project_name}/{project_name}_vits.onnx",
+ input_names=["text_seq", "pred_semantic", "ref_audio"],
+ output_names=["audio"],
+ dynamic_axes={
+ "text_seq": [1],
+ "pred_semantic": [2],
+ "ref_audio": [1],
+ },
+ opset_version=17
+ )
+
+
+class SSLModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.ssl = ssl_model
+
+ def forward(self, ref_audio_16k):
+ return self.ssl.model(ref_audio_16k)["last_hidden_state"].transpose(1, 2)
+
+
+def export(vits_path, gpt_path, project_name):
+ vits = VitsModel(vits_path)
+ gpt = T2SModel(gpt_path, vits)
+ gpt_sovits = GptSoVits(vits, gpt)
+ ssl = SSLModel()
+ ref_seq = torch.LongTensor([cleaned_text_to_sequence(["n", "i2", "h", "ao3", ",", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"])])
+ text_seq = torch.LongTensor([cleaned_text_to_sequence(["w", "o3", "sh", "i4", "b", "ai2", "y", "e4"])])
+ ref_bert = torch.randn((ref_seq.shape[1], 1024)).float()
+ text_bert = torch.randn((text_seq.shape[1], 1024)).float()
+ ref_audio = torch.randn((1, 48000 * 5)).float()
+ # ref_audio = torch.tensor([load_audio("rec.wav", 48000)]).float()
+ ref_audio_16k = torchaudio.functional.resample(ref_audio,48000,16000).float()
+ ref_audio_sr = torchaudio.functional.resample(ref_audio,48000,vits.hps.data.sampling_rate).float()
+
+    os.makedirs(f"onnx/{project_name}", exist_ok=True)
+
+ ssl_content = ssl(ref_audio_16k).float()
+
+ a = gpt_sovits(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content).detach().cpu().numpy()
+
+ # soundfile.write("out.wav", a, vits.hps.data.sampling_rate)
+
+ gpt_sovits.export(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content, project_name)
+
+ MoeVSConf = {
+ "Folder" : f"{project_name}",
+ "Name" : f"{project_name}",
+ "Type" : "GPT-SoVits",
+ "Rate" : vits.hps.data.sampling_rate,
+ "NumLayers": gpt.t2s_model.num_layers,
+ "EmbeddingDim": gpt.t2s_model.embedding_dim,
+ "Dict": "BasicDict",
+ "BertPath": "chinese-roberta-wwm-ext-large",
+ "Symbol": symbols,
+ "AddBlank": False
+ }
+
+    with open(f"onnx/{project_name}.json", "w") as MoeVsConfFile:
+        json.dump(MoeVSConf, MoeVsConfFile, indent=4)
+
+
+if __name__ == "__main__":
+    os.makedirs("onnx", exist_ok=True)
+
+ gpt_path = "pt_model/koharu-e20.ckpt"
+ vits_path = "pt_model/koharu_e20_s4960.pth"
+ exp_path = "koharu"
+ export(vits_path, gpt_path, exp_path)
+
+ # soundfile.write("out.wav", a, vits.hps.data.sampling_rate)
\ No newline at end of file
diff --git a/GPT_SoVITS/prepare_datasets/0-pipeline.py b/GPT_SoVITS/prepare_datasets/0-pipeline.py
deleted file mode 100644
index 4979ed26..00000000
--- a/GPT_SoVITS/prepare_datasets/0-pipeline.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import os, torch, sys
-from subprocess import Popen
-
-now_dir = os.getcwd()
-sys.path.append(now_dir)
-from config import (
- text_path,
- wav_dir,
- n_card,
- exp_name,
- n_parts,
- exp_dir,
-)
-
-os.makedirs("%s/logs_s1" % exp_dir, exist_ok=True)
-os.makedirs("%s/logs_s2" % exp_dir, exist_ok=True)
-##############step1
-ps = []
-for i_part in range(n_parts):
- cmd = "python prepare/1-get-text.py %s %s %s %s %s %s" % (
- text_path,
- wav_dir,
- exp_name,
- i_part,
- n_parts,
- i_part % n_card,
- )
- print(cmd)
- p = Popen(cmd, shell=True)
- ps.append(p)
-for p in ps:
- p.wait()
-
-opt = []
-for i_part in range(n_parts):
- txt_path = "%s/2-name2text-%s.txt" % (exp_dir, i_part)
- with open(txt_path, "r") as f:
- opt += f.read().strip("\n").split("\n")
- os.remove(txt_path)
-with open("%s/2-name2text.txt" % exp_dir, "w") as f:
- f.write("\n".join(opt) + "\n")
-
-############step2
-ps = []
-for i_part in range(n_parts):
- cmd = "python prepare/2-get-hubert-wav32k.py %s %s %s %s %s %s" % (
- text_path,
- wav_dir,
- exp_name,
- i_part,
- n_parts,
- i_part % n_card,
- )
- print(cmd)
- p = Popen(cmd, shell=True)
- ps.append(p)
-for p in ps:
- p.wait()
-#############step3
-ps = []
-for i_part in range(n_parts):
- cmd = "python prepare/3-get-semantic.py %s %s %s %s %s" % (
- text_path,
- exp_name,
- i_part,
- n_parts,
- i_part % n_card,
- )
- print(cmd)
- p = Popen(cmd, shell=True)
- ps.append(p)
-for p in ps:
- p.wait()
-opt = ["item_name semantic_audio"]
-for i_part in range(n_parts):
- semantic_path = "%s/6-name2semantic-%s.tsv" % (exp_dir, i_part)
- with open(semantic_path, "r") as f:
- opt += f.read().strip("\n").split("\n")
- os.remove(semantic_path)
-with open("%s/6-name2semantic.tsv" % exp_dir, "w") as f:
- f.write("\n".join(opt) + "\n")
diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py
index 85796931..88c9d858 100644
--- a/GPT_SoVITS/prepare_datasets/1-get-text.py
+++ b/GPT_SoVITS/prepare_datasets/1-get-text.py
@@ -41,12 +41,18 @@ def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
shutil.move(tmp_path, "%s/%s" % (dir, name))
+
txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
if os.path.exists(txt_path) == False:
bert_dir = "%s/3-bert" % (opt_dir)
os.makedirs(opt_dir, exist_ok=True)
os.makedirs(bert_dir, exist_ok=True)
- device = "cuda:0"
+ if torch.cuda.is_available():
+ device = "cuda:0"
+ elif torch.backends.mps.is_available():
+ device = "mps"
+ else:
+ device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
if is_half == True:
diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
index a5075ff4..26c71b74 100644
--- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
@@ -47,12 +47,20 @@ os.makedirs(wav32dir,exist_ok=True)
maxx=0.95
alpha=0.5
-device="cuda:0"
+if torch.cuda.is_available():
+ device = "cuda:0"
+elif torch.backends.mps.is_available():
+ device = "mps"
+else:
+ device = "cpu"
model=cnhubert.get_model()
+# is_half=False
if(is_half==True):
model=model.half().to(device)
else:
model = model.to(device)
+
+nan_fails=[]
def name2go(wav_name):
hubert_path="%s/%s.pt"%(hubert_dir,wav_name)
if(os.path.exists(hubert_path)):return
@@ -60,25 +68,28 @@ def name2go(wav_name):
tmp_audio = load_audio(wav_path, 32000)
tmp_max = np.abs(tmp_audio).max()
if tmp_max > 2.2:
- print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
+        print("%s-%s-filtered" % (wav_name, tmp_max))
return
tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio
+ tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio
tmp_audio = librosa.resample(
- tmp_audio32, orig_sr=32000, target_sr=16000
- )
+ tmp_audio32b, orig_sr=32000, target_sr=16000
+    )  # not a resampling issue
tensor_wav16 = torch.from_numpy(tmp_audio)
if (is_half == True):
tensor_wav16=tensor_wav16.half().to(device)
else:
tensor_wav16 = tensor_wav16.to(device)
ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215])
- if np.isnan(ssl.detach().numpy()).sum()!= 0:return
+ if np.isnan(ssl.detach().numpy()).sum()!= 0:
+ nan_fails.append(wav_name)
+ print("nan filtered:%s"%wav_name)
+ return
wavfile.write(
"%s/%s"%(wav32dir,wav_name),
32000,
tmp_audio32.astype("int16"),
)
- # torch.save(ssl,hubert_path )
my_save(ssl,hubert_path )
with open(inp_text,"r",encoding="utf8")as f:
@@ -92,3 +103,12 @@ for line in lines[int(i_part)::int(all_parts)]:
name2go(wav_name)
except:
print(line,traceback.format_exc())
+
+if(len(nan_fails)>0 and is_half==True):
+ is_half=False
+ model=model.float()
+ for wav_name in nan_fails:
+ try:
+ name2go(wav_name)
+ except:
+ print(wav_name,traceback.format_exc())
diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py
index 7cee6e4d..9ab56a48 100644
--- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py
+++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py
@@ -38,7 +38,12 @@ semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
if os.path.exists(semantic_path) == False:
os.makedirs(opt_dir, exist_ok=True)
- device = "cuda:0"
+ if torch.cuda.is_available():
+ device = "cuda"
+ elif torch.backends.mps.is_available():
+ device = "mps"
+ else:
+ device = "cpu"
hps = utils.get_hparams_from_file(s2config_path)
vq_model = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py
index 4a770062..30c167e5 100644
--- a/GPT_SoVITS/s1_train.py
+++ b/GPT_SoVITS/s1_train.py
@@ -116,9 +116,9 @@ def main(args):
devices=-1,
benchmark=False,
fast_dev_run=False,
- strategy=DDPStrategy(
+        strategy="auto" if torch.backends.mps.is_available() else DDPStrategy(
process_group_backend="nccl" if platform.system() != "Windows" else "gloo"
- ),
+        ),  # MPS does not support multi-node training
precision=config["train"]["precision"],
logger=logger,
num_sanity_val_steps=0,
diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py
index d2ec262f..e6b64f6b 100644
--- a/GPT_SoVITS/s2_train.py
+++ b/GPT_SoVITS/s2_train.py
@@ -44,9 +44,12 @@ global_step = 0
def main():
"""Assume Single Node Multi GPUs Training Only"""
- assert torch.cuda.is_available(), "CPU training is not allowed."
+ assert torch.cuda.is_available() or torch.backends.mps.is_available(), "Only GPU training is allowed."
- n_gpus = torch.cuda.device_count()
+ if torch.backends.mps.is_available():
+ n_gpus = 1
+ else:
+ n_gpus = torch.cuda.device_count()
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
@@ -70,13 +73,14 @@ def run(rank, n_gpus, hps):
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
dist.init_process_group(
- backend="gloo" if os.name == "nt" else "nccl",
+        backend="gloo" if os.name == "nt" or torch.backends.mps.is_available() else "nccl",
init_method="env://",
world_size=n_gpus,
rank=rank,
)
torch.manual_seed(hps.train.seed)
- torch.cuda.set_device(rank)
+ if torch.cuda.is_available():
+ torch.cuda.set_device(rank)
train_dataset = TextAudioSpeakerLoader(hps.data) ########
train_sampler = DistributedBucketSampler(
@@ -128,9 +132,14 @@ def run(rank, n_gpus, hps):
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model,
- ).cuda(rank)
+ ).cuda(rank) if torch.cuda.is_available() else SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ n_speakers=hps.data.n_speakers,
+ **hps.model,
+ ).to("mps")
- net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
+ net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to("mps")
for name, param in net_g.named_parameters():
if not param.requires_grad:
print(name, "not requires_grad")
@@ -174,8 +183,12 @@ def run(rank, n_gpus, hps):
betas=hps.train.betas,
eps=hps.train.eps,
)
- net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
- net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
+ if torch.cuda.is_available():
+ net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
+ net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
+ else:
+ net_g = net_g.to("mps")
+ net_d = net_d.to("mps")
try: # 如果能加载自动resume
_, _, _, epoch_str = utils.load_checkpoint(
@@ -205,6 +218,9 @@ def run(rank, n_gpus, hps):
net_g.module.load_state_dict(
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
strict=False,
+ ) if torch.cuda.is_available() else net_g.load_state_dict(
+ torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
+ strict=False,
)
) ##测试不加载优化器
if hps.train.pretrained_s2D != "":
@@ -213,6 +229,8 @@ def run(rank, n_gpus, hps):
print(
net_d.module.load_state_dict(
torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"]
+ ) if torch.cuda.is_available() else net_d.load_state_dict(
+ torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"]
)
)
@@ -288,18 +306,26 @@ def train_and_evaluate(
text,
text_lengths,
) in tqdm(enumerate(train_loader)):
- spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(
- rank, non_blocking=True
- )
- y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(
- rank, non_blocking=True
- )
- ssl = ssl.cuda(rank, non_blocking=True)
- ssl.requires_grad = False
- # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
- text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda(
- rank, non_blocking=True
- )
+ if torch.cuda.is_available():
+ spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(
+ rank, non_blocking=True
+ )
+ y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(
+ rank, non_blocking=True
+ )
+ ssl = ssl.cuda(rank, non_blocking=True)
+ ssl.requires_grad = False
+ # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
+ text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda(
+ rank, non_blocking=True
+ )
+ else:
+ spec, spec_lengths = spec.to("mps"), spec_lengths.to("mps")
+ y, y_lengths = y.to("mps"), y_lengths.to("mps")
+ ssl = ssl.to("mps")
+ ssl.requires_grad = False
+ # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
+ text, text_lengths = text.to("mps"), text_lengths.to("mps")
with autocast(enabled=hps.train.fp16_run):
(
@@ -500,13 +526,21 @@ def evaluate(hps, generator, eval_loader, writer_eval):
text_lengths,
) in enumerate(eval_loader):
print(111)
- spec, spec_lengths = spec.cuda(), spec_lengths.cuda()
- y, y_lengths = y.cuda(), y_lengths.cuda()
- ssl = ssl.cuda()
- text, text_lengths = text.cuda(), text_lengths.cuda()
+ if torch.cuda.is_available():
+ spec, spec_lengths = spec.cuda(), spec_lengths.cuda()
+ y, y_lengths = y.cuda(), y_lengths.cuda()
+ ssl = ssl.cuda()
+ text, text_lengths = text.cuda(), text_lengths.cuda()
+ else:
+ spec, spec_lengths = spec.to("mps"), spec_lengths.to("mps")
+ y, y_lengths = y.to("mps"), y_lengths.to("mps")
+ ssl = ssl.to("mps")
+ text, text_lengths = text.to("mps"), text_lengths.to("mps")
for test in [0, 1]:
y_hat, mask, *_ = generator.module.infer(
ssl, spec, spec_lengths, text, text_lengths, test=test
+ ) if torch.cuda.is_available() else generator.infer(
+ ssl, spec, spec_lengths, text, text_lengths, test=test
)
y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length
diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py
index 64c8818f..de3ef011 100644
--- a/GPT_SoVITS/text/chinese.py
+++ b/GPT_SoVITS/text/chinese.py
@@ -18,7 +18,7 @@ pinyin_to_symbol_map = {
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}
-import jieba.posseg as psg
+import jieba_fast.posseg as psg
rep_map = {
diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py
index f987a3f4..eafb179e 100644
--- a/GPT_SoVITS/text/tone_sandhi.py
+++ b/GPT_SoVITS/text/tone_sandhi.py
@@ -14,7 +14,7 @@
from typing import List
from typing import Tuple
-import jieba
+import jieba_fast as jieba
from pypinyin import lazy_pinyin
from pypinyin import Style
diff --git a/GPT_SoVITS/utils.py b/GPT_SoVITS/utils.py
index e1a66ea1..0ce03b33 100644
--- a/GPT_SoVITS/utils.py
+++ b/GPT_SoVITS/utils.py
@@ -18,7 +18,7 @@ logging.getLogger("matplotlib").setLevel(logging.ERROR)
MATPLOTLIB_FLAG = False
-logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging
@@ -310,13 +310,13 @@ def check_git_hash(model_dir):
def get_logger(model_dir, filename="train.log"):
global logger
logger = logging.getLogger(os.path.basename(model_dir))
- logger.setLevel(logging.WARNING)
+ logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
h = logging.FileHandler(os.path.join(model_dir, filename))
- h.setLevel(logging.WARNING)
+ h.setLevel(logging.DEBUG)
h.setFormatter(formatter)
logger.addHandler(h)
return logger
diff --git a/README.md b/README.md
index 4b95297d..166602f3 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,8 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+Users in the China region can use AutoDL Cloud Docker to experience the full functionality online: https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official
+
## Features:
1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion.
@@ -41,9 +43,24 @@ If you are a Windows user (tested with win>=10) you can install directly via the
- Python 3.9, PyTorch 2.0.1, CUDA 11
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
+- Python 3.9, PyTorch 2.3.0.dev20240122, macOS 14.3 (Apple Silicon, MPS)
_Note: numba==0.56.4 require py<3.11_
+### For Mac Users
+If you are a Mac user, please install using the following commands:
+#### Create Environment
+```bash
+conda create -n GPTSoVits python=3.9
+conda activate GPTSoVits
+```
+#### Install Requirements
+```bash
+pip install -r requirements.txt
+pip uninstall torch torchaudio
+pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+```
+_Note: For preprocessing with UVR5, it is recommended to [download the original project GUI](https://github.com/Anjok07/ultimatevocalremovergui) and select GPU for operation. Additionally, there may be memory leak issues when running inference on a Mac; restarting the inference WebUI releases the memory._
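+
+To quickly confirm that the MPS backend is visible to PyTorch after installation (a minimal check, assuming the nightly build above is installed in the active environment), you can run:
+
+```python
+import torch
+
+# Prints True on Apple Silicon with a compatible PyTorch build;
+# if it prints False, the scripts in this repo fall back to CPU.
+print(torch.backends.mps.is_available())
+```
+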
### Quick Install with Conda
```bash
@@ -52,25 +69,13 @@ conda activate GPTSoVits
bash install.sh
```
### Install Manually
-#### Make sure you have the distutils for python3.9 installed
-
-```bash
-sudo apt-get install python3.9-distutils
-```
#### Pip Packages
```bash
-pip install torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm cn2an pypinyin pyopenjtalk g2p_en chardet
+pip install -r requirements.txt
```
-#### Additional Requirements
-
-If you need Chinese ASR (supported by FunASR), install:
-
-```bash
-pip install modelscope torchaudio sentencepiece funasr
-```
#### FFmpeg
@@ -107,6 +112,31 @@ For Chinese ASR (additionally), download models from [Damo ASR Model](https://mo
For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
+### Using Docker
+
+#### docker-compose.yaml configuration
+
+1. Environment Variables:
+    - is_half: Controls whether half precision (fp16) or full precision (fp32) is used. If the content under the `4-cnhubert`/`5-wav32k` directories is not generated correctly during the "SSL extracting" step, this setting is usually the cause. Set it to True or False according to your actual situation.
+
+2. Volumes Configuration,The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content.
+3. shm_size: The default available memory for Docker Desktop on Windows is too small, which can cause abnormal operations. Adjust according to your own situation.
+4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances.
+
+
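+For reference, `is_half` reaches the application through `config.py` (see the `config.py` hunk later in this patch), which reads it from the container environment. A rough Python sketch of that lookup, shown only to illustrate that the value should be the literal string `True` or `False`:
+
+```python
+import os
+
+# Sketch: config.py in this patch evaluates the environment variable itself;
+# the point here is that it arrives as a string, so pass "True" or "False".
+is_half = os.environ.get("is_half", "True") == "True"
+print("half precision enabled:", is_half)
+```
+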
+#### Running with docker compose
+```bash
+docker compose -f "docker-compose.yaml" up -d
+```
+
+#### Running with docker command
+
+As with docker compose above, adjust the parameters to match your setup, then run the following command:
+```bash
+docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9870:9870 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:dev-20240123.03
+```
+
+
## Dataset Format
The TTS annotation .list file format:
diff --git a/api.py b/api.py
index 376b0bcf..60ed9fff 100644
--- a/api.py
+++ b/api.py
@@ -1,3 +1,107 @@
+"""
+# api.py usage
+
+` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
+
+## Command-line arguments:
+
+`-s` - `SoVITS model path, can be set in config.py`
+`-g` - `GPT model path, can be set in config.py`
+
+Used when a request does not include reference audio:
+`-dr` - `default reference audio path`
+`-dt` - `default reference audio text`
+`-dl` - `default reference audio language, "中文","英文","日文","zh","en","ja"`
+
+`-d` - `inference device, "cuda","cpu","mps"`
+`-a` - `bind address, default "127.0.0.1"`
+`-p` - `bind port, default 9880, can be set in config.py`
+`-fp` - `override config.py and use full precision`
+`-hp` - `override config.py and use half precision`
+
+`-hb` - `cnhubert path`
+`-b` - `bert path`
+
+## Endpoints:
+
+### Inference
+
+endpoint: `/`
+
+Use the reference audio specified by the command-line arguments:
+GET:
+ `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+POST:
+```json
+{
+ "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+ "text_language": "zh"
+}
+```
+
+Manually specify the reference audio for this request:
+GET:
+ `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+POST:
+```json
+{
+ "refer_wav_path": "123.wav",
+ "prompt_text": "一二三。",
+ "prompt_language": "zh",
+ "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+ "text_language": "zh"
+}
+```
+
+RESP:
+Success: returns the wav audio stream directly, HTTP code 200
+Failure: returns JSON containing the error message, HTTP code 400
+
+
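+For example, a minimal Python client (a sketch only; it assumes the `requests` package and a server running on the default address/port) could call the inference endpoint like this:
+
+```python
+import requests
+
+# Synthesize speech using the default reference audio configured at startup.
+resp = requests.post(
+    "http://127.0.0.1:9880",
+    json={"text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", "text_language": "zh"},
+)
+if resp.status_code == 200:
+    with open("output.wav", "wb") as f:
+        f.write(resp.content)  # wav audio stream
+else:
+    print(resp.json())  # error details
+```
+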
+### Change the default reference audio
+
+endpoint: `/change_refer`
+
+The keys are the same as for the inference endpoint.
+
+GET:
+ `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
+POST:
+```json
+{
+ "refer_wav_path": "123.wav",
+ "prompt_text": "一二三。",
+ "prompt_language": "zh"
+}
+```
+
+RESP:
+Success: JSON, HTTP code 200
+Failure: JSON, HTTP code 400
+
+
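+A minimal sketch of the same call with `requests` (assuming a server on the default address/port):
+
+```python
+import requests
+
+# Point the running server at a new default reference audio.
+resp = requests.post(
+    "http://127.0.0.1:9880/change_refer",
+    json={
+        "refer_wav_path": "123.wav",
+        "prompt_text": "一二三。",
+        "prompt_language": "zh",
+    },
+)
+print(resp.json())  # {"code": 0, "message": "Success"} on success
+```
+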
+### Command control
+
+endpoint: `/control`
+
+command:
+"restart": restart the service
+"exit": stop the service
+
+GET:
+ `http://127.0.0.1:9880/control?command=restart`
+POST:
+```json
+{
+ "command": "restart"
+}
+```
+
+RESP: none
+
+"""
+
+
import argparse
import os
import signal
@@ -7,7 +111,7 @@ import torch
import librosa
import soundfile as sf
from fastapi import FastAPI, Request, HTTPException
-from fastapi.responses import StreamingResponse
+from fastapi.responses import StreamingResponse, JSONResponse
import uvicorn
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np
@@ -30,14 +134,13 @@ parser = argparse.ArgumentParser(description="GPT-SoVITS api")
parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径")
parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径")
-parser.add_argument("-dr", "--default_refer_path", type=str, default="",
- help="默认参考音频路径, 请求缺少参考音频时调用")
+parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="默认参考音频路径")
parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
-parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
-parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
+parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu / mps")
parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
+parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
# bool值的用法为 `python ./api.py -fp ...`
@@ -51,10 +154,18 @@ args = parser.parse_args()
sovits_path = args.sovits_path
gpt_path = args.gpt_path
-default_refer_path = args.default_refer_path
-default_refer_text = args.default_refer_text
-default_refer_language = args.default_refer_language
-has_preset = False
+
+class DefaultRefer:
+    def __init__(self, path, text, language):
+        self.path = path
+        self.text = text
+        self.language = language
+
+ def is_ready(self) -> bool:
+ return is_full(self.path, self.text, self.language)
+
+
+default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
device = args.device
port = args.port
@@ -68,15 +179,13 @@ if gpt_path == "":
print(f"[WARN] 未指定GPT模型路径, fallback后当前值: {gpt_path}")
# 指定默认参考音频, 调用方 未提供/未给全 参考音频参数时使用
-if default_refer_path == "" or default_refer_text == "" or default_refer_language == "":
- default_refer_path, default_refer_text, default_refer_language = "", "", ""
+if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
+ default_refer.path, default_refer.text, default_refer.language = "", "", ""
print("[INFO] 未指定默认参考音频")
- has_preset = False
else:
- print(f"[INFO] 默认参考音频路径: {default_refer_path}")
- print(f"[INFO] 默认参考音频文本: {default_refer_text}")
- print(f"[INFO] 默认参考音频语种: {default_refer_language}")
- has_preset = True
+ print(f"[INFO] 默认参考音频路径: {default_refer.path}")
+ print(f"[INFO] 默认参考音频文本: {default_refer.text}")
+ print(f"[INFO] 默认参考音频语种: {default_refer.language}")
is_half = g_config.is_half
if args.full_precision:
@@ -100,6 +209,20 @@ else:
bert_model = bert_model.to(device)
+def is_empty(*items): # returns False if any item is non-empty
+ for item in items:
+ if item is not None and item != "":
+ return False
+ return True
+
+
+def is_full(*items): # returns False if any item is empty
+ for item in items:
+ if item is None or item == "":
+ return False
+ return True
+
+
def get_bert_feature(text, word2ph):
with torch.no_grad():
inputs = tokenizer(text, return_tensors="pt")
@@ -155,7 +278,7 @@ vq_model.eval()
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
hz = 50
max_sec = config['data']['max_sec']
-t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False)
+t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
t2s_model.load_state_dict(dict_s1["weight"])
if is_half:
t2s_model = t2s_model.half()
@@ -192,13 +315,18 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
t0 = ttime()
prompt_text = prompt_text.strip("\n")
prompt_language, text = prompt_language, text.strip("\n")
+ zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
with torch.no_grad():
- wav16k, sr = librosa.load(ref_wav_path, sr=16000) # 派蒙
+ wav16k, sr = librosa.load(ref_wav_path, sr=16000)
wav16k = torch.from_numpy(wav16k)
+ zero_wav_torch = torch.from_numpy(zero_wav)
if (is_half == True):
wav16k = wav16k.half().to(device)
+ zero_wav_torch = zero_wav_torch.half().to(device)
else:
wav16k = wav16k.to(device)
+ zero_wav_torch = zero_wav_torch.to(device)
+ wav16k = torch.cat([wav16k, zero_wav_torch])
ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float()
codes = vq_model.extract_latent(ssl_content)
prompt_semantic = codes[0, 0]
@@ -209,7 +337,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
phones1 = cleaned_text_to_sequence(phones1)
texts = text.split("\n")
audio_opt = []
- zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
+
for text in texts:
phones2, word2ph2, norm_text2 = clean_text(text, text_language)
phones2 = cleaned_text_to_sequence(phones2)
@@ -259,25 +387,46 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
-def handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language):
- if command == "/restart":
+def handle_control(command):
+ if command == "restart":
os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
- elif command == "/exit":
+ elif command == "exit":
os.kill(os.getpid(), signal.SIGTERM)
exit(0)
+
+def handle_change(path, text, language):
+ if is_empty(path, text, language):
+ return JSONResponse({"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400)
+
+    if path is not None and path != "":
+        default_refer.path = path
+    if text is not None and text != "":
+        default_refer.text = text
+    if language is not None and language != "":
+        default_refer.language = language
+
+ print(f"[INFO] 当前默认参考音频路径: {default_refer.path}")
+ print(f"[INFO] 当前默认参考音频文本: {default_refer.text}")
+ print(f"[INFO] 当前默认参考音频语种: {default_refer.language}")
+ print(f"[INFO] is_ready: {default_refer.is_ready()}")
+
+ return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
+
+
+def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
if (
refer_wav_path == "" or refer_wav_path is None
or prompt_text == "" or prompt_text is None
or prompt_language == "" or prompt_language is None
):
refer_wav_path, prompt_text, prompt_language = (
- default_refer_path,
- default_refer_text,
- default_refer_language,
+ default_refer.path,
+ default_refer.text,
+ default_refer.language,
)
- if not has_preset:
- raise HTTPException(status_code=400, detail="未指定参考音频且接口无预设")
+ if not default_refer.is_ready():
+ return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
with torch.no_grad():
gen = get_tts_wav(
@@ -290,17 +439,47 @@ def handle(command, refer_wav_path, prompt_text, prompt_language, text, text_lan
wav.seek(0)
torch.cuda.empty_cache()
+    if torch.backends.mps.is_available():
+        torch.mps.empty_cache()
return StreamingResponse(wav, media_type="audio/wav")
app = FastAPI()
+@app.post("/control")
+async def control(request: Request):
+ json_post_raw = await request.json()
+ return handle_control(json_post_raw.get("command"))
+
+
+@app.get("/control")
+async def control(command: str = None):
+ return handle_control(command)
+
+
+@app.post("/change_refer")
+async def change_refer(request: Request):
+ json_post_raw = await request.json()
+ return handle_change(
+ json_post_raw.get("refer_wav_path"),
+ json_post_raw.get("prompt_text"),
+ json_post_raw.get("prompt_language")
+ )
+
+
+@app.get("/change_refer")
+async def change_refer(
+ refer_wav_path: str = None,
+ prompt_text: str = None,
+ prompt_language: str = None
+):
+ return handle_change(refer_wav_path, prompt_text, prompt_language)
+
+
@app.post("/")
async def tts_endpoint(request: Request):
json_post_raw = await request.json()
return handle(
- json_post_raw.get("command"),
json_post_raw.get("refer_wav_path"),
json_post_raw.get("prompt_text"),
json_post_raw.get("prompt_language"),
@@ -311,14 +490,13 @@ async def tts_endpoint(request: Request):
@app.get("/")
async def tts_endpoint(
- command: str = None,
refer_wav_path: str = None,
prompt_text: str = None,
prompt_language: str = None,
text: str = None,
text_language: str = None,
):
- return handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language)
+ return handle(refer_wav_path, prompt_text, prompt_language, text, text_language)
if __name__ == "__main__":
diff --git a/config.py b/config.py
index ec846b3c..897f53c1 100644
--- a/config.py
+++ b/config.py
@@ -1,10 +1,11 @@
-import sys
+import sys, os
+import torch
# 推理用的指定模型
sovits_path = ""
gpt_path = ""
-is_half = True
+is_half = eval(os.environ.get("is_half","True"))
is_share=False
cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
@@ -14,7 +15,12 @@ pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=
exp_root = "logs"
python_exec = sys.executable or "python"
-infer_device = "cuda"
+if torch.cuda.is_available():
+ infer_device = "cuda"
+elif torch.backends.mps.is_available():
+ infer_device = "mps"
+else:
+ infer_device = "cpu"
webui_port_main = 9874
webui_port_uvr5 = 9873
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 00000000..ed6f82a5
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,31 @@
+version: '3.8'
+
+services:
+ gpt-sovits:
+ image: breakstring/gpt-sovits:dev-20240123.03
+ container_name: gpt-sovits-container
+ environment:
+ - is_half=False
+ volumes:
+ - ./output:/workspace/output
+ - ./logs:/workspace/logs
+ - ./SoVITS_weights:/workspace/SoVITS_weights
+ - ./reference:/workspace/reference
+ working_dir: /workspace
+ ports:
+ - "9870:9870"
+ - "9871:9871"
+ - "9872:9872"
+ - "9873:9873"
+ - "9874:9874"
+ shm_size: 16G
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: "all"
+ capabilities: [gpu]
+ stdin_open: true
+ tty: true
+ restart: unless-stopped
diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md
new file mode 100644
index 00000000..93fc8be7
--- /dev/null
+++ b/docs/cn/Changelog_CN.md
@@ -0,0 +1,31 @@
+### 20240121更新
+
+1-config添加is_share,诸如colab等场景可以将此改为True,来使得webui映射到公网
+
+2-WebUI添加英文系统英文翻译适配
+
+3-cmd-asr自动判断是否已自带damo模型,如不在默认目录上将从modelscope自带下载
+
+4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等)
+
+5-清理TEMP文件夹缓存音频等文件
+
+6-大幅削弱合成音频包含参考音频结尾的问题
+
+### 20240122更新
+
+1-修复过短输出文件返回重复参考音频的问题。
+
+2-经测试,英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符)。
+
+3-音频路径检查。如果尝试读取输入错的路径报错路径不存在,而非ffmpeg错误。
+
+### 20240123更新
+
+1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题
+
+2-支持推理界面快速切换模型
+
+3-优化模型文件排序逻辑
+
+4-中文分词使用jieba_fast代替jieba
diff --git a/docs/cn/README.md b/docs/cn/README.md
index 5993b081..445bf92b 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -1,7 +1,7 @@