Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git (synced 2025-10-06 22:50:00 +08:00)

commit 329e7002f7 (parent a70e1ad30c)

feat: update gpt_sovits

Signed-off-by: Guoxian Dai <guoxian@peta-mini1.lan>

.gitignore (vendored) | 4
@@ -13,6 +13,6 @@ SoVITS_weights
 GPT_weights_v2
 SoVITS_weights_v2
 TEMP
-weight.json
+#weight.json
 ffmpeg*
 ffprobe*
@@ -8,9 +8,9 @@ from typing import Dict

 import torch
 from pytorch_lightning import LightningModule
-from AR.models.t2s_model import Text2SemanticDecoder
-from AR.modules.lr_schedulers import WarmupCosineLRSchedule
-from AR.modules.optim import ScaledAdam
+from GPT_SoVITS.AR.models.t2s_model import Text2SemanticDecoder
+from GPT_SoVITS.AR.modules.lr_schedulers import WarmupCosineLRSchedule
+from GPT_SoVITS.AR.modules.optim import ScaledAdam

 class Text2SemanticLightningModule(LightningModule):
     def __init__(self, config, output_dir, is_train=True):
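Note: the recurring change in this commit replaces top-level imports such as from AR.models... with package-qualified ones (from GPT_SoVITS.AR.models...). That assumes the directory containing the GPT_SoVITS package is importable. A minimal sketch of how a caller outside the repo root could satisfy that assumption; the repository location used here is a placeholder, not part of the commit:

    import os
    import sys

    # Placeholder location of the cloned repository; adjust to your checkout.
    REPO_ROOT = os.path.expanduser("~/GPT-SoVITS")

    # Make the GPT_SoVITS package importable regardless of the current directory.
    if REPO_ROOT not in sys.path:
        sys.path.insert(0, REPO_ROOT)

    # With package-qualified imports, this resolves the same submodule that the
    # hunk above now references.
    from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule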
@@ -5,8 +5,8 @@ from typing import List, Optional
 import torch
 from tqdm import tqdm

-from AR.models.utils import make_pad_mask
-from AR.models.utils import (
+from GPT_SoVITS.AR.models.utils import make_pad_mask
+from GPT_SoVITS.AR.models.utils import (
     topk_sampling,
     sample,
     logits_to_probs,
@@ -15,11 +15,11 @@ from AR.models.utils import (
     make_reject_y,
     get_batch_logps
 )
-from AR.modules.embedding import SinePositionalEmbedding
-from AR.modules.embedding import TokenEmbedding
-from AR.modules.transformer import LayerNorm
-from AR.modules.transformer import TransformerEncoder
-from AR.modules.transformer import TransformerEncoderLayer
+from GPT_SoVITS.AR.modules.embedding import SinePositionalEmbedding
+from GPT_SoVITS.AR.modules.embedding import TokenEmbedding
+from GPT_SoVITS.AR.modules.transformer import LayerNorm
+from GPT_SoVITS.AR.modules.transformer import TransformerEncoder
+from GPT_SoVITS.AR.modules.transformer import TransformerEncoderLayer
 from torch import nn
 from torch.nn import functional as F
 from torchmetrics.classification import MulticlassAccuracy
@@ -12,7 +12,7 @@ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
 from torch.nn.parameter import Parameter

 from torch.nn import functional as F
-from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched
+from GPT_SoVITS.AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched

 F.multi_head_attention_forward = multi_head_attention_forward_patched

@@ -10,8 +10,8 @@ from typing import Tuple
 from typing import Union

 import torch
-from AR.modules.activation import MultiheadAttention
-from AR.modules.scaling import BalancedDoubleSwish
+from GPT_SoVITS.AR.modules.activation import MultiheadAttention
+from GPT_SoVITS.AR.modules.scaling import BalancedDoubleSwish
 from torch import nn
 from torch import Tensor
 from torch.nn import functional as F
@@ -21,8 +21,8 @@ from feature_extractor.cnhubert import CNHubert
 from module.models import SynthesizerTrn
 import librosa
 from time import time as ttime
-from tools.i18n.i18n import I18nAuto, scan_language_list
-from tools.my_utils import load_audio
+from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list
+from GPT_SoVITS.tools.my_utils import load_audio
 from module.mel_processing import spectrogram_torch
 from TTS_infer_pack.text_segmentation_method import splits
 from TTS_infer_pack.TextPreprocessor import TextPreprocessor
@@ -15,7 +15,7 @@ from text import cleaned_text_to_sequence
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method

-from tools.i18n.i18n import I18nAuto, scan_language_list
+from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list

 language=os.environ.get("language","Auto")
 language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@@ -16,7 +16,7 @@ from transformers import (
     HubertModel,
 )

-import utils
+import GPT_SoVITS.utils
 import torch.nn as nn

 cnhubert_base_path = None
@@ -102,9 +102,10 @@ def get_content(hmodel, wav_16k_tensor):


 if __name__ == "__main__":
+    from GPT_SoVITS.utils import load_wav_to_torch_and_resample
     model = get_model()
     src_path = "/Users/Shared/原音频2.wav"
-    wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
+    wav_16k_tensor = load_wav_to_torch_and_resample(src_path, 16000)
     model = model
     wav_16k_tensor = wav_16k_tensor
     feats = get_content(model, wav_16k_tensor)
@@ -2,9 +2,15 @@ import argparse
 import os
 import soundfile as sf

-from tools.i18n.i18n import I18nAuto
+import GPT_SoVITS
+import sys
+
+
+from GPT_SoVITS.tools.i18n.i18n import I18nAuto
 from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav

+
+
 i18n = I18nAuto()

 def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path):
@@ -18,6 +24,7 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path,

     # Change model weights
     change_gpt_weights(gpt_path=GPT_model_path)
+
     change_sovits_weights(sovits_path=SoVITS_model_path)

     # Synthesize audio
@@ -29,6 +36,8 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path,

     result_list = list(synthesis_result)

+    if not os.path.isdir(output_path):
+        os.makedirs(output_path, exist_ok=True)
     if result_list:
         last_sampling_rate, last_audio_data = result_list[-1]
         output_wav_path = os.path.join(output_path, "output.wav")
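Note: the added guard creates the output directory before writing output.wav. Since os.makedirs with exist_ok=True is already a no-op for an existing directory, the os.path.isdir check is not strictly needed; a minimal equivalent sketch, with the path as a placeholder:

    import os

    output_path = "output"  # placeholder; the CLI receives this as an argument

    # Safe whether or not the directory already exists.
    os.makedirs(output_path, exist_ok=True)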
@@ -5,7 +5,7 @@ from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushB
 from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox
 import soundfile as sf

-from tools.i18n.i18n import I18nAuto
+from GPT_SoVITS.tools.i18n.i18n import I18nAuto
 i18n = I18nAuto()

 from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav
@@ -27,8 +27,8 @@ try:
 except:...

 version=os.environ.get("version","v2")
-pretrained_sovits_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", "GPT_SoVITS/pretrained_models/s2G488k.pth"]
-pretrained_gpt_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"]
+pretrained_sovits_name=["pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", "pretrained_models/s2G488k.pth"]
+pretrained_gpt_name=["pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"]

 _ =[[],[]]
 for i in range(2):
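Note: dropping the leading GPT_SoVITS/ makes these default checkpoint paths relative to the GPT_SoVITS directory rather than the repository root. A hedged sketch of resolving such package-relative defaults explicitly; the helper below is illustrative and not part of the commit:

    import os

    # Directory containing this file, assumed to be the GPT_SoVITS package directory.
    PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))

    def resolve_pretrained(rel_path):
        """Illustrative helper: turn 'pretrained_models/s2G488k.pth' into an absolute path."""
        return rel_path if os.path.isabs(rel_path) else os.path.join(PACKAGE_DIR, rel_path)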
@@ -38,14 +38,19 @@ for i in range(2):
         _[-1].append(pretrained_sovits_name[i])
 pretrained_gpt_name,pretrained_sovits_name = _


-if os.path.exists(f"./weight.json"):
+weight_json_file_path = os.path.join(
+    os.path.dirname(__file__),
+    "weight.json"
+)
+print("weight_json_file_path: ", weight_json_file_path)
+if os.path.exists(weight_json_file_path):
     pass
 else:
-    with open(f"./weight.json", 'w', encoding="utf-8") as file:json.dump({'GPT':{},'SoVITS':{}},file)
+    with open(weight_json_file_path, 'w', encoding="utf-8") as file:json.dump({'GPT':{},'SoVITS':{}},file)

-with open(f"./weight.json", 'r', encoding="utf-8") as file:
+with open(weight_json_file_path, 'r', encoding="utf-8") as file:
     weight_data = file.read()
     weight_data=json.loads(weight_data)
     gpt_path = os.environ.get(
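Note: with this change weight.json is located next to the module file instead of the process's working directory, so the same file is found no matter where the WebUI is launched from. A standalone sketch of the pattern, with the surrounding logic simplified:

    import json
    import os

    # Resolve weight.json relative to this file, not to os.getcwd().
    weight_json_file_path = os.path.join(os.path.dirname(__file__), "weight.json")

    # Create an empty structure on first run.
    if not os.path.exists(weight_json_file_path):
        with open(weight_json_file_path, "w", encoding="utf-8") as f:
            json.dump({"GPT": {}, "SoVITS": {}}, f)

    with open(weight_json_file_path, "r", encoding="utf-8") as f:
        weight_data = json.load(f)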
@@ -62,11 +67,12 @@ with open(f"./weight.json", 'r', encoding="utf-8") as file:
 # )
 # sovits_path = os.environ.get("sovits_path", pretrained_sovits_name)
 cnhubert_base_path = os.environ.get(
-    "cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base"
+    "cnhubert_base_path", "pretrained_models/chinese-hubert-base"
 )
 bert_path = os.environ.get(
-    "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+    "bert_path", "pretrained_models/chinese-roberta-wwm-ext-large"
 )

 infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
 infer_ttswebui = int(infer_ttswebui)
 is_share = os.environ.get("is_share", "False")
@@ -79,18 +85,18 @@ import gradio as gr
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 import numpy as np
 import librosa
-from feature_extractor import cnhubert
+from GPT_SoVITS.feature_extractor import cnhubert

 cnhubert.cnhubert_base_path = cnhubert_base_path

-from module.models import SynthesizerTrn
-from AR.models.t2s_lightning_module import Text2SemanticLightningModule
-from text import cleaned_text_to_sequence
-from text.cleaner import clean_text
+from GPT_SoVITS.module.models import SynthesizerTrn
+from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
+from GPT_SoVITS.text import cleaned_text_to_sequence
+from GPT_SoVITS.text.cleaner import clean_text
 from time import time as ttime
-from module.mel_processing import spectrogram_torch
-from tools.my_utils import load_audio
-from tools.i18n.i18n import I18nAuto, scan_language_list
+from GPT_SoVITS.module.mel_processing import spectrogram_torch
+from GPT_SoVITS.tools.my_utils import load_audio
+from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list

 language=os.environ.get("language","Auto")
 language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@@ -126,6 +132,7 @@ dict_language_v2 = {
 }
 dict_language = dict_language_v1 if version =='v1' else dict_language_v2

+
 tokenizer = AutoTokenizer.from_pretrained(bert_path)
 bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
 if is_half == True:
@@ -187,6 +194,9 @@ else:

 def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
     global vq_model, hps, version, dict_language
+    print("sovits_path: ", sovits_path)
+    print("os.path.isfile(sovits_path) = ", os.path.isfile(sovits_path))
+
     dict_s2 = torch.load(sovits_path, map_location="cpu")
     hps = dict_s2["config"]
     hps = DictToAttrRecursive(hps)
@@ -212,11 +222,11 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
     vq_model.eval()
     print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
     dict_language = dict_language_v1 if version =='v1' else dict_language_v2
-    with open("./weight.json")as f:
+    with open(weight_json_file_path)as f:
         data=f.read()
         data=json.loads(data)
         data["SoVITS"][version]=sovits_path
-    with open("./weight.json","w")as f:f.write(json.dumps(data))
+    with open(weight_json_file_path,"w")as f:f.write(json.dumps(data))
     if prompt_language is not None and text_language is not None:
         if prompt_language in list(dict_language.keys()):
             prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language}
@@ -249,11 +259,11 @@ def change_gpt_weights(gpt_path):
     t2s_model.eval()
     total = sum([param.nelement() for param in t2s_model.parameters()])
     print("Number of parameter: %.2fM" % (total / 1e6))
-    with open("./weight.json")as f:
+    with open(weight_json_file_path)as f:
         data=f.read()
         data=json.loads(data)
         data["GPT"][version]=gpt_path
-    with open("./weight.json","w")as f:f.write(json.dumps(data))
+    with open(weight_json_file_path,"w")as f:f.write(json.dumps(data))


 change_gpt_weights(gpt_path)
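Note: change_sovits_weights and change_gpt_weights repeat the same read-modify-write of weight.json. A hedged sketch of how that could be factored out; the helper name is hypothetical and not part of the commit:

    import json
    import os

    weight_json_file_path = os.path.join(os.path.dirname(__file__), "weight.json")

    def record_weight(kind, version, checkpoint_path):
        """Hypothetical helper: remember the last-used checkpoint for 'GPT' or 'SoVITS'."""
        with open(weight_json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        data[kind][version] = checkpoint_path
        with open(weight_json_file_path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    # e.g. record_weight("GPT", version, gpt_path) inside change_gpt_weights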
@@ -303,7 +313,7 @@ def get_first(text):
     text = re.split(pattern, text)[0].strip()
     return text

-from text import chinese
+from GPT_SoVITS.text import chinese
 def get_phones_and_bert(text,language,version,final=False):
     if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
         language = language.replace("all_","")
@@ -46,7 +46,7 @@ version=os.environ.get("version","v2")
 import gradio as gr
 from TTS_infer_pack.TTS import TTS, TTS_Config
 from TTS_infer_pack.text_segmentation_method import get_method
-from tools.i18n.i18n import I18nAuto, scan_language_list
+from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list

 language=os.environ.get("language","Auto")
 language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@@ -3,8 +3,8 @@ import torch
 from torch import nn
 from torch.nn import functional as F

-from module import commons
-from module.modules import LayerNorm
+from GPT_SoVITS.module import commons
+from GPT_SoVITS.module.modules import LayerNorm


 class Encoder(nn.Module):
@@ -17,7 +17,7 @@ from functools import lru_cache
 import requests
 from scipy.io import wavfile
 from io import BytesIO
-from tools.my_utils import load_audio
+from GPT_SoVITS.tools.my_utils import load_audio
 version = os.environ.get('version',None)
 # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
 class TextAudioSpeakerLoader(torch.utils.data.Dataset):
@@ -9,18 +9,18 @@ import torch
 from torch import nn
 from torch.nn import functional as F

-from module import commons
-from module import modules
-from module import attentions
+from GPT_SoVITS.module import commons
+from GPT_SoVITS.module import modules
+from GPT_SoVITS.module import attentions

 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from module.commons import init_weights, get_padding
-from module.mrte_model import MRTE
-from module.quantize import ResidualVectorQuantizer
+from GPT_SoVITS.module.commons import init_weights, get_padding
+from GPT_SoVITS.module.mrte_model import MRTE
+from GPT_SoVITS.module.quantize import ResidualVectorQuantizer
 # from text import symbols
-from text import symbols as symbols_v1
-from text import symbols2 as symbols_v2
+from GPT_SoVITS.text import symbols as symbols_v1
+from GPT_SoVITS.text import symbols2 as symbols_v2
 from torch.cuda.amp import autocast
 import contextlib

@@ -7,9 +7,9 @@ from torch.nn import functional as F
 from torch.nn import Conv1d
 from torch.nn.utils import weight_norm, remove_weight_norm

-from module import commons
-from module.commons import init_weights, get_padding
-from module.transforms import piecewise_rational_quadratic_transform
+from GPT_SoVITS.module import commons
+from GPT_SoVITS.module.commons import init_weights, get_padding
+from GPT_SoVITS.module.transforms import piecewise_rational_quadratic_transform
 import torch.distributions as D


@@ -3,7 +3,7 @@
 import torch
 from torch import nn
 from torch.nn.utils import remove_weight_norm, weight_norm
-from module.attentions import MultiHeadAttention
+from GPT_SoVITS.module.attentions import MultiHeadAttention


 class MRTE(nn.Module):
@@ -13,7 +13,7 @@ import typing as tp
 import torch
 from torch import nn

-from module.core_vq import ResidualVectorQuantization
+from GPT_SoVITS.module.core_vq import ResidualVectorQuantization


 @dataclass
@@ -10,7 +10,7 @@ cnhubert.cnhubert_base_path = cnhubert_base_path
 ssl_model = cnhubert.get_model()
 from text import cleaned_text_to_sequence
 import soundfile
-from tools.my_utils import load_audio
+from GPT_SoVITS.tools.my_utils import load_audio
 import os
 import json

@@ -21,7 +21,7 @@ from tqdm import tqdm
 from text.cleaner import clean_text
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 import numpy as np
-from tools.my_utils import clean_path
+from GPT_SoVITS.tools.my_utils import clean_path

 # inp_text=sys.argv[1]
 # inp_wav_dir=sys.argv[2]
@@ -19,7 +19,7 @@ from scipy.io import wavfile
 import librosa
 now_dir = os.getcwd()
 sys.path.append(now_dir)
-from tools.my_utils import load_audio,clean_path
+from GPT_SoVITS.tools.my_utils import load_audio,clean_path

 # from config import cnhubert_base_path
 # cnhubert.cnhubert_base_path=cnhubert_base_path
@@ -24,7 +24,7 @@ from glob import glob
 from tqdm import tqdm
 import logging, librosa, utils
 from module.models import SynthesizerTrn
-from tools.my_utils import clean_path
+from GPT_SoVITS.tools.my_utils import clean_path
 logging.getLogger("numba").setLevel(logging.WARNING)
 # from config import pretrained_s2G

@@ -3,7 +3,7 @@ from collections import OrderedDict
 from time import time as ttime
 import shutil,os
 import torch
-from tools.i18n.i18n import I18nAuto
+from GPT_SoVITS.tools.i18n.i18n import I18nAuto

 i18n = I18nAuto()

@@ -4,8 +4,8 @@ import os
 # else:
 # from text.symbols2 import symbols

-from text import symbols as symbols_v1
-from text import symbols2 as symbols_v2
+from GPT_SoVITS.text import symbols as symbols_v1
+from GPT_SoVITS.text import symbols2 as symbols_v2

 _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
 _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
@@ -5,9 +5,9 @@ import re
 import cn2an
 from pypinyin import lazy_pinyin, Style

-from text.symbols import punctuation
-from text.tone_sandhi import ToneSandhi
-from text.zh_normalization.text_normlization import TextNormalizer
+from GPT_SoVITS.text.symbols import punctuation
+from GPT_SoVITS.text.tone_sandhi import ToneSandhi
+from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer

 normalizer = lambda x: cn2an.transform(x, "an2cn")

@@ -6,9 +6,9 @@ import cn2an
 from pypinyin import lazy_pinyin, Style
 from pypinyin.contrib.tone_convert import to_normal, to_finals_tone3, to_initials, to_finals

-from text.symbols import punctuation
-from text.tone_sandhi import ToneSandhi
-from text.zh_normalization.text_normlization import TextNormalizer
+from GPT_SoVITS.text.symbols import punctuation
+from GPT_SoVITS.text.tone_sandhi import ToneSandhi
+from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer

 normalizer = lambda x: cn2an.transform(x, "an2cn")

@@ -25,9 +25,9 @@ import jieba_fast.posseg as psg
 is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False
 if is_g2pw:
     print("当前使用g2pw进行拼音推理")
-    from text.g2pw import G2PWPinyin, correct_pronunciation
+    from GPT_SoVITS.text.g2pw import G2PWPinyin, correct_pronunciation
     parent_directory = os.path.dirname(current_file_path)
-    g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True)
+    g2pw = G2PWPinyin(model_dir=f"{current_file_path}/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True)

 rep_map = {
     ":": ",",
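Note: the G2PW model directory is now built from current_file_path, so the model is looked up next to the text module rather than via a repo-root-relative literal. A minimal sketch, assuming current_file_path holds the directory of the module as the surrounding code suggests:

    import os

    # Assumed to mirror the module's current_file_path (the directory of the file).
    current_file_path = os.path.dirname(os.path.abspath(__file__))

    # G2PWModel now lives next to the code, independent of the working directory.
    g2pw_model_dir = os.path.join(current_file_path, "G2PWModel")
    print(g2pw_model_dir)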
@@ -1,4 +1,4 @@
-from text import cleaned_text_to_sequence
+from GPT_SoVITS.text import cleaned_text_to_sequence
 import os
 # if os.environ.get("version","v1")=="v1":
 # from text import chinese
@@ -7,8 +7,8 @@ import os
 # from text import chinese2 as chinese
 # from text.symbols2 import symbols

-from text import symbols as symbols_v1
-from text import symbols2 as symbols_v2
+from GPT_SoVITS.text import symbols as symbols_v1
+from GPT_SoVITS.text import symbols2 as symbols_v2

 special = [
     # ("%", "zh", "SP"),
@@ -33,7 +33,7 @@ def clean_text(text, language, version=None):
     for special_s, special_l, target_symbol in special:
         if special_s in text and language == special_l:
             return clean_special(text, language, special_s, target_symbol, version)
-    language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]])
+    language_module = __import__("GPT_SoVITS.text."+language_module_map[language],fromlist=[language_module_map[language]])
     if hasattr(language_module,"text_normalize"):
         norm_text = language_module.text_normalize(text)
     else:
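Note: the __import__ call with fromlist is what lets clean_text load the per-language module at runtime; prefixing the dotted name with GPT_SoVITS keeps that lookup working under the package-qualified layout. A small sketch of the same idea with importlib, which returns the leaf module directly; the mapping entries are examples only:

    import importlib

    # Example entries in the style of language_module_map; not an exhaustive list.
    language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english"}

    def load_language_module(language):
        # Unlike __import__, import_module returns the leaf module, so no fromlist is needed.
        return importlib.import_module("GPT_SoVITS.text." + language_module_map[language])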
@@ -1 +1 @@
-from text.g2pw.g2pw import *
+from GPT_SoVITS.text.g2pw.g2pw import *
@@ -11,4 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from text.zh_normalization.text_normlization import *
+from GPT_SoVITS.text.zh_normalization.text_normlization import *
GPT_SoVITS/weight.json (new file) | 1
@@ -0,0 +1 @@
+{"GPT": {"v2": "pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"}, "SoVITS": {"v2": "pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", "v1": "pretrained_models/doria_female/doria_e8_s496.pth"}}