mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-08-19 00:09:48 +08:00
more import changes
This commit is contained in:
parent
c5490bb2a2
commit
54bcce13d2
@ -17,7 +17,7 @@ from transformers import AutoTokenizer
|
|||||||
|
|
||||||
version = os.environ.get('version',None)
|
version = os.environ.get('version',None)
|
||||||
|
|
||||||
from text import cleaned_text_to_sequence
|
from GPT_SoVITS.text import cleaned_text_to_sequence
|
||||||
|
|
||||||
# from config import exp_dir
|
# from config import exp_dir
|
||||||
|
|
||||||
|
@ -884,6 +884,157 @@ class Text2SemanticDecoder(nn.Module):
|
|||||||
return y[:, :-1], 0
|
return y[:, :-1], 0
|
||||||
return y[:, :-1], idx - 1
|
return y[:, :-1], idx - 1
|
||||||
|
|
||||||
|
def infer_panel_generator(
|
||||||
|
self,
|
||||||
|
x: torch.LongTensor,
|
||||||
|
x_lens: torch.LongTensor,
|
||||||
|
prompts: torch.LongTensor,
|
||||||
|
bert_feature: torch.LongTensor,
|
||||||
|
cumulation_amount: int,
|
||||||
|
top_k: int = -100,
|
||||||
|
top_p: int = 100,
|
||||||
|
early_stop_num: int = -1,
|
||||||
|
temperature: float = 1.0,
|
||||||
|
repetition_penalty: float = 1.35,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Generator method that yields generated tokens based on a specified cumulative amount.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (torch.LongTensor): Input phoneme IDs.
|
||||||
|
x_lens (torch.LongTensor): Lengths of the input sequences.
|
||||||
|
prompts (torch.LongTensor): Initial prompt tokens.
|
||||||
|
bert_feature (torch.LongTensor): BERT features corresponding to the input.
|
||||||
|
cumulation_amount (int): Number of tokens to generate before yielding.
|
||||||
|
top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering.
|
||||||
|
top_p (float): If set to < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
|
||||||
|
early_stop_num (int): Early stopping after generating a certain number of tokens.
|
||||||
|
temperature (float): The value used to module the next token probabilities.
|
||||||
|
repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
|
||||||
|
**kwargs: Additional keyword arguments.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
torch.LongTensor: Generated tokens since the last yield.
|
||||||
|
"""
|
||||||
|
x = self.ar_text_embedding(x)
|
||||||
|
x = x + self.bert_proj(bert_feature.transpose(1, 2))
|
||||||
|
x = self.ar_text_position(x)
|
||||||
|
|
||||||
|
# AR Decoder
|
||||||
|
y = prompts
|
||||||
|
|
||||||
|
x_len = x.shape[1]
|
||||||
|
x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device)
|
||||||
|
stop = False
|
||||||
|
|
||||||
|
# Initialize cumulative token counter
|
||||||
|
tokens_since_last_yield = 0
|
||||||
|
|
||||||
|
# Initialize last yield index
|
||||||
|
if y is not None:
|
||||||
|
prefix_len = y.shape[1]
|
||||||
|
else:
|
||||||
|
prefix_len = 0
|
||||||
|
|
||||||
|
last_yield_idx = prefix_len
|
||||||
|
|
||||||
|
k_cache = None
|
||||||
|
v_cache = None
|
||||||
|
|
||||||
|
################### first step ##########################
|
||||||
|
if y is not None:
|
||||||
|
y_emb = self.ar_audio_embedding(y)
|
||||||
|
y_len = y_emb.shape[1]
|
||||||
|
prefix_len = y.shape[1]
|
||||||
|
y_pos = self.ar_audio_position(y_emb)
|
||||||
|
xy_pos = torch.concat([x, y_pos], dim=1)
|
||||||
|
ref_free = False
|
||||||
|
else:
|
||||||
|
y_emb = None
|
||||||
|
y_len = 0
|
||||||
|
prefix_len = 0
|
||||||
|
y_pos = None
|
||||||
|
xy_pos = x
|
||||||
|
y = torch.zeros(x.shape[0], 0, dtype=torch.int64, device=x.device)
|
||||||
|
ref_free = True
|
||||||
|
|
||||||
|
bsz = x.shape[0]
|
||||||
|
src_len = x_len + y_len
|
||||||
|
x_attn_mask_pad = F.pad(
|
||||||
|
x_attn_mask,
|
||||||
|
(0, y_len), # Extend x_attn_mask to include y tokens
|
||||||
|
value=True,
|
||||||
|
)
|
||||||
|
y_attn_mask = F.pad(
|
||||||
|
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), diagonal=1),
|
||||||
|
(x_len, 0),
|
||||||
|
value=False,
|
||||||
|
)
|
||||||
|
xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)
|
||||||
|
xy_attn_mask = xy_attn_mask.unsqueeze(0).expand(bsz * self.num_head, -1, -1)
|
||||||
|
xy_attn_mask = xy_attn_mask.view(bsz, self.num_head, src_len, src_len)
|
||||||
|
xy_attn_mask = xy_attn_mask.to(device=x.device, dtype=torch.bool)
|
||||||
|
|
||||||
|
for idx in tqdm(range(1500)):
|
||||||
|
if xy_attn_mask is not None:
|
||||||
|
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None)
|
||||||
|
else:
|
||||||
|
xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache)
|
||||||
|
|
||||||
|
logits = self.ar_predict_layer(xy_dec[:, -1])
|
||||||
|
|
||||||
|
if idx == 0:
|
||||||
|
xy_attn_mask = None
|
||||||
|
if idx < 11: # Ensure at least 10 tokens are generated before stopping
|
||||||
|
logits = logits[:, :-1]
|
||||||
|
|
||||||
|
samples = sample(
|
||||||
|
logits,
|
||||||
|
y,
|
||||||
|
top_k=top_k,
|
||||||
|
top_p=top_p,
|
||||||
|
repetition_penalty=repetition_penalty,
|
||||||
|
temperature=temperature,
|
||||||
|
)[0]
|
||||||
|
|
||||||
|
y = torch.concat([y, samples], dim=1)
|
||||||
|
tokens_since_last_yield += 1
|
||||||
|
|
||||||
|
if tokens_since_last_yield >= cumulation_amount:
|
||||||
|
# Yield back the generated tokens since last yield
|
||||||
|
generated_tokens = y[:, last_yield_idx:]
|
||||||
|
# print(generated_tokens)
|
||||||
|
yield generated_tokens
|
||||||
|
last_yield_idx = y.shape[1]
|
||||||
|
tokens_since_last_yield = 0
|
||||||
|
|
||||||
|
if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
|
||||||
|
print("Using early stop num:", early_stop_num)
|
||||||
|
stop = True
|
||||||
|
|
||||||
|
if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
|
||||||
|
stop = True
|
||||||
|
|
||||||
|
if stop:
|
||||||
|
if y.shape[1] == 0:
|
||||||
|
y = torch.concat([y, torch.zeros_like(samples)], dim=1)
|
||||||
|
print("Bad zero prediction")
|
||||||
|
print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
|
||||||
|
break
|
||||||
|
|
||||||
|
# Update for next step
|
||||||
|
y_emb = self.ar_audio_embedding(y[:, -1:])
|
||||||
|
y_len += 1
|
||||||
|
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[
|
||||||
|
:, y_len - 1
|
||||||
|
].to(dtype=y_emb.dtype, device=y_emb.device)
|
||||||
|
|
||||||
|
# After loop ends, yield any remaining tokens
|
||||||
|
if last_yield_idx < y.shape[1]:
|
||||||
|
generated_tokens = y[:, last_yield_idx:]
|
||||||
|
yield generated_tokens
|
||||||
|
|
||||||
|
|
||||||
def infer_panel(
|
def infer_panel(
|
||||||
self,
|
self,
|
||||||
|
@ -8,10 +8,10 @@ sys.path.append(now_dir)
|
|||||||
import re
|
import re
|
||||||
import torch
|
import torch
|
||||||
import LangSegment
|
import LangSegment
|
||||||
from text import chinese
|
from GPT_SoVITS.text import chinese
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
from text.cleaner import clean_text
|
from GPT_SoVITS.text.cleaner import clean_text
|
||||||
from text import cleaned_text_to_sequence
|
from GPT_SoVITS.text import cleaned_text_to_sequence
|
||||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||||
from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method
|
from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import os, sys
|
import os, sys
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.insert(0, now_dir)
|
sys.path.insert(0, now_dir)
|
||||||
from text.g2pw import G2PWPinyin
|
from GPT_SoVITS.text.g2pw import G2PWPinyin
|
||||||
g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",v_to_u=False, neutral_tone_with_five=True)
|
g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",v_to_u=False, neutral_tone_with_five=True)
|
@ -3,7 +3,7 @@
|
|||||||
import argparse
|
import argparse
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from my_utils import load_audio
|
from my_utils import load_audio
|
||||||
from text import cleaned_text_to_sequence
|
from GPT_SoVITS.text import cleaned_text_to_sequence
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
|
|
||||||
@ -14,7 +14,7 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|||||||
from feature_extractor import cnhubert
|
from feature_extractor import cnhubert
|
||||||
|
|
||||||
from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
||||||
from module.models_onnx import SynthesizerTrn
|
from GPT_SoVITS.module.models_onnx import SynthesizerTrn
|
||||||
|
|
||||||
from inference_webui import get_phones_and_bert
|
from inference_webui import get_phones_and_bert
|
||||||
|
|
||||||
|
@ -83,12 +83,12 @@ from feature_extractor import cnhubert
|
|||||||
|
|
||||||
cnhubert.cnhubert_base_path = cnhubert_base_path
|
cnhubert.cnhubert_base_path = cnhubert_base_path
|
||||||
|
|
||||||
from module.models import SynthesizerTrn
|
from GPT_SoVITS.module.models import SynthesizerTrn
|
||||||
from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
||||||
from text import cleaned_text_to_sequence
|
from GPT_SoVITS.text import cleaned_text_to_sequence
|
||||||
from text.cleaner import clean_text
|
from GPT_SoVITS.text.cleaner import clean_text
|
||||||
from time import time as ttime
|
from time import time as ttime
|
||||||
from module.mel_processing import spectrogram_torch
|
from GPT_SoVITS.module.mel_processing import spectrogram_torch
|
||||||
from tools.my_utils import load_audio
|
from tools.my_utils import load_audio
|
||||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
from tools.i18n.i18n import I18nAuto, scan_language_list
|
||||||
|
|
||||||
@ -303,7 +303,7 @@ def get_first(text):
|
|||||||
text = re.split(pattern, text)[0].strip()
|
text = re.split(pattern, text)[0].strip()
|
||||||
return text
|
return text
|
||||||
|
|
||||||
from text import chinese
|
from GPT_SoVITS.text import chinese
|
||||||
def get_phones_and_bert(text,language,version,final=False):
|
def get_phones_and_bert(text,language,version,final=False):
|
||||||
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
|
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
|
||||||
language = language.replace("all_","")
|
language = language.replace("all_","")
|
||||||
|
@ -9,18 +9,18 @@ import torch
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
from module import commons
|
from GPT_SoVITS.module import commons
|
||||||
from module import modules
|
from GPT_SoVITS.module import modules
|
||||||
from module import attentions
|
from GPT_SoVITS.module import attentions
|
||||||
|
|
||||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||||
from module.commons import init_weights, get_padding
|
from GPT_SoVITS.module.commons import init_weights, get_padding
|
||||||
from module.mrte_model import MRTE
|
from GPT_SoVITS.module.mrte_model import MRTE
|
||||||
from module.quantize import ResidualVectorQuantizer
|
from GPT_SoVITS.module.quantize import ResidualVectorQuantizer
|
||||||
# from text import symbols
|
# from GPT_SoVITS.text import symbols
|
||||||
from text import symbols as symbols_v1
|
from GPT_SoVITS.text import symbols as symbols_v1
|
||||||
from text import symbols2 as symbols_v2
|
from GPT_SoVITS.text import symbols2 as symbols_v2
|
||||||
from torch.cuda.amp import autocast
|
from torch.cuda.amp import autocast
|
||||||
import contextlib
|
import contextlib
|
||||||
|
|
||||||
|
@ -5,17 +5,17 @@ import torch
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
from module import commons
|
from GPT_SoVITS.module import commons
|
||||||
from module import modules
|
from GPT_SoVITS.module import modules
|
||||||
from module import attentions_onnx as attentions
|
from GPT_SoVITS.module import attentions_onnx as attentions
|
||||||
|
|
||||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||||
from module.commons import init_weights, get_padding
|
from GPT_SoVITS.module.commons import init_weights, get_padding
|
||||||
from module.quantize import ResidualVectorQuantizer
|
from GPT_SoVITS.module.quantize import ResidualVectorQuantizer
|
||||||
# from text import symbols
|
# from GPT_SoVITS.text import symbols
|
||||||
from text import symbols as symbols_v1
|
from GPT_SoVITS.text import symbols as symbols_v1
|
||||||
from text import symbols2 as symbols_v2
|
from GPT_SoVITS.text import symbols2 as symbols_v2
|
||||||
from torch.cuda.amp import autocast
|
from torch.cuda.amp import autocast
|
||||||
|
|
||||||
|
|
||||||
|
@ -7,9 +7,9 @@ from torch.nn import functional as F
|
|||||||
from torch.nn import Conv1d
|
from torch.nn import Conv1d
|
||||||
from torch.nn.utils import weight_norm, remove_weight_norm
|
from torch.nn.utils import weight_norm, remove_weight_norm
|
||||||
|
|
||||||
from module import commons
|
from GPT_SoVITS.module import commons
|
||||||
from module.commons import init_weights, get_padding
|
from GPT_SoVITS.module.commons import init_weights, get_padding
|
||||||
from module.transforms import piecewise_rational_quadratic_transform
|
from GPT_SoVITS.module.transforms import piecewise_rational_quadratic_transform
|
||||||
import torch.distributions as D
|
import torch.distributions as D
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn.utils import remove_weight_norm, weight_norm
|
from torch.nn.utils import remove_weight_norm, weight_norm
|
||||||
from module.attentions import MultiHeadAttention
|
from GPT_SoVITS.module.attentions import MultiHeadAttention
|
||||||
|
|
||||||
|
|
||||||
class MRTE(nn.Module):
|
class MRTE(nn.Module):
|
||||||
|
@ -13,7 +13,7 @@ import typing as tp
|
|||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from module.core_vq import ResidualVectorQuantization
|
from GPT_SoVITS.module.core_vq import ResidualVectorQuantization
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2
|
from GPT_SoVITS.module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2
|
||||||
from GPT_SoVITS.AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule
|
from GPT_SoVITS.AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
@ -8,7 +8,7 @@ from feature_extractor import cnhubert
|
|||||||
cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
|
cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
|
||||||
cnhubert.cnhubert_base_path = cnhubert_base_path
|
cnhubert.cnhubert_base_path = cnhubert_base_path
|
||||||
ssl_model = cnhubert.get_model()
|
ssl_model = cnhubert.get_model()
|
||||||
from text import cleaned_text_to_sequence
|
from GPT_SoVITS.text import cleaned_text_to_sequence
|
||||||
import soundfile
|
import soundfile
|
||||||
from tools.my_utils import load_audio
|
from tools.my_utils import load_audio
|
||||||
import os
|
import os
|
||||||
|
@ -18,7 +18,7 @@ import sys, numpy as np, traceback, pdb
|
|||||||
import os.path
|
import os.path
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from text.cleaner import clean_text
|
from GPT_SoVITS.text.cleaner import clean_text
|
||||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tools.my_utils import clean_path
|
from tools.my_utils import clean_path
|
||||||
|
@ -23,7 +23,7 @@ import torch.multiprocessing as mp
|
|||||||
from glob import glob
|
from glob import glob
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import logging, librosa, utils
|
import logging, librosa, utils
|
||||||
from module.models import SynthesizerTrn
|
from GPT_SoVITS.module.models import SynthesizerTrn
|
||||||
from tools.my_utils import clean_path
|
from tools.my_utils import clean_path
|
||||||
logging.getLogger("numba").setLevel(logging.WARNING)
|
logging.getLogger("numba").setLevel(logging.WARNING)
|
||||||
# from config import pretrained_s2G
|
# from config import pretrained_s2G
|
||||||
|
@ -18,19 +18,19 @@ logging.getLogger("matplotlib").setLevel(logging.INFO)
|
|||||||
logging.getLogger("h5py").setLevel(logging.INFO)
|
logging.getLogger("h5py").setLevel(logging.INFO)
|
||||||
logging.getLogger("numba").setLevel(logging.INFO)
|
logging.getLogger("numba").setLevel(logging.INFO)
|
||||||
from random import randint
|
from random import randint
|
||||||
from module import commons
|
from GPT_SoVITS.module import commons
|
||||||
|
|
||||||
from module.data_utils import (
|
from GPT_SoVITS.module.data_utils import (
|
||||||
TextAudioSpeakerLoader,
|
TextAudioSpeakerLoader,
|
||||||
TextAudioSpeakerCollate,
|
TextAudioSpeakerCollate,
|
||||||
DistributedBucketSampler,
|
DistributedBucketSampler,
|
||||||
)
|
)
|
||||||
from module.models import (
|
from GPT_SoVITS.module.models import (
|
||||||
SynthesizerTrn,
|
SynthesizerTrn,
|
||||||
MultiPeriodDiscriminator,
|
MultiPeriodDiscriminator,
|
||||||
)
|
)
|
||||||
from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss
|
from GPT_SoVITS.module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss
|
||||||
from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
|
from GPT_SoVITS.module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
|
||||||
from process_ckpt import savee
|
from process_ckpt import savee
|
||||||
|
|
||||||
torch.backends.cudnn.benchmark = False
|
torch.backends.cudnn.benchmark = False
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
import os
|
import os
|
||||||
# if os.environ.get("version","v1")=="v1":
|
# if os.environ.get("version","v1")=="v1":
|
||||||
# from text.symbols import symbols
|
# from GPT_SoVITS.text.symbols import symbols
|
||||||
# else:
|
# else:
|
||||||
# from text.symbols2 import symbols
|
# from GPT_SoVITS.text.symbols2 import symbols
|
||||||
|
|
||||||
from text import symbols as symbols_v1
|
from GPT_SoVITS.text import symbols as symbols_v1
|
||||||
from text import symbols2 as symbols_v2
|
from GPT_SoVITS.text import symbols2 as symbols_v2
|
||||||
|
|
||||||
_symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
|
_symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
|
||||||
_symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
|
_symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
|
||||||
|
@ -5,8 +5,8 @@ import re
|
|||||||
import cn2an
|
import cn2an
|
||||||
|
|
||||||
from pyjyutping import jyutping
|
from pyjyutping import jyutping
|
||||||
from text.symbols import punctuation
|
from GPT_SoVITS.text.symbols import punctuation
|
||||||
from text.zh_normalization.text_normlization import TextNormalizer
|
from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer
|
||||||
|
|
||||||
normalizer = lambda x: cn2an.transform(x, "an2cn")
|
normalizer = lambda x: cn2an.transform(x, "an2cn")
|
||||||
|
|
||||||
@ -182,7 +182,7 @@ def get_jyutping(text):
|
|||||||
|
|
||||||
|
|
||||||
def get_bert_feature(text, word2ph):
|
def get_bert_feature(text, word2ph):
|
||||||
from text import chinese_bert
|
from GPT_SoVITS.text import chinese_bert
|
||||||
|
|
||||||
return chinese_bert.get_bert_feature(text, word2ph)
|
return chinese_bert.get_bert_feature(text, word2ph)
|
||||||
|
|
||||||
|
@ -5,9 +5,9 @@ import re
|
|||||||
import cn2an
|
import cn2an
|
||||||
from pypinyin import lazy_pinyin, Style
|
from pypinyin import lazy_pinyin, Style
|
||||||
|
|
||||||
from text.symbols import punctuation
|
from GPT_SoVITS.text.symbols import punctuation
|
||||||
from text.tone_sandhi import ToneSandhi
|
from GPT_SoVITS.text.tone_sandhi import ToneSandhi
|
||||||
from text.zh_normalization.text_normlization import TextNormalizer
|
from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer
|
||||||
|
|
||||||
normalizer = lambda x: cn2an.transform(x, "an2cn")
|
normalizer = lambda x: cn2an.transform(x, "an2cn")
|
||||||
|
|
||||||
|
@ -6,9 +6,9 @@ import cn2an
|
|||||||
from pypinyin import lazy_pinyin, Style
|
from pypinyin import lazy_pinyin, Style
|
||||||
from pypinyin.contrib.tone_convert import to_normal, to_finals_tone3, to_initials, to_finals
|
from pypinyin.contrib.tone_convert import to_normal, to_finals_tone3, to_initials, to_finals
|
||||||
|
|
||||||
from text.symbols import punctuation
|
from GPT_SoVITS.text.symbols import punctuation
|
||||||
from text.tone_sandhi import ToneSandhi
|
from GPT_SoVITS.text.tone_sandhi import ToneSandhi
|
||||||
from text.zh_normalization.text_normlization import TextNormalizer
|
from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer
|
||||||
|
|
||||||
normalizer = lambda x: cn2an.transform(x, "an2cn")
|
normalizer = lambda x: cn2an.transform(x, "an2cn")
|
||||||
|
|
||||||
@ -25,7 +25,7 @@ import jieba_fast.posseg as psg
|
|||||||
is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False
|
is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False
|
||||||
if is_g2pw:
|
if is_g2pw:
|
||||||
print("当前使用g2pw进行拼音推理")
|
print("当前使用g2pw进行拼音推理")
|
||||||
from text.g2pw import G2PWPinyin, correct_pronunciation
|
from GPT_SoVITS.text.g2pw import G2PWPinyin, correct_pronunciation
|
||||||
parent_directory = os.path.dirname(current_file_path)
|
parent_directory = os.path.dirname(current_file_path)
|
||||||
g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True)
|
g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True)
|
||||||
|
|
||||||
|
@ -1,14 +1,14 @@
|
|||||||
from text import cleaned_text_to_sequence
|
from GPT_SoVITS.text import cleaned_text_to_sequence
|
||||||
import os
|
import os
|
||||||
# if os.environ.get("version","v1")=="v1":
|
# if os.environ.get("version","v1")=="v1":
|
||||||
# from text import chinese
|
# from GPT_SoVITS.text import chinese
|
||||||
# from text.symbols import symbols
|
# from GPT_SoVITS.text.symbols import symbols
|
||||||
# else:
|
# else:
|
||||||
# from text import chinese2 as chinese
|
# from GPT_SoVITS.text import chinese2 as chinese
|
||||||
# from text.symbols2 import symbols
|
# from GPT_SoVITS.text.symbols2 import symbols
|
||||||
|
|
||||||
from text import symbols as symbols_v1
|
from GPT_SoVITS.text import symbols as symbols_v1
|
||||||
from text import symbols2 as symbols_v2
|
from GPT_SoVITS.text import symbols2 as symbols_v2
|
||||||
|
|
||||||
special = [
|
special = [
|
||||||
# ("%", "zh", "SP"),
|
# ("%", "zh", "SP"),
|
||||||
|
@ -4,9 +4,9 @@ import re
|
|||||||
import wordsegment
|
import wordsegment
|
||||||
from g2p_en import G2p
|
from g2p_en import G2p
|
||||||
|
|
||||||
from text.symbols import punctuation
|
from GPT_SoVITS.text.symbols import punctuation
|
||||||
|
|
||||||
from text.symbols2 import symbols
|
from GPT_SoVITS.text.symbols2 import symbols
|
||||||
|
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from builtins import str as unicode
|
from builtins import str as unicode
|
||||||
|
@ -1 +1 @@
|
|||||||
from text.g2pw.g2pw import *
|
from GPT_SoVITS.text.g2pw.g2pw import *
|
@ -31,7 +31,7 @@ except Exception as e:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
from text.symbols import punctuation
|
from GPT_SoVITS.text.symbols import punctuation
|
||||||
# Regular expression matching Japanese without punctuation marks:
|
# Regular expression matching Japanese without punctuation marks:
|
||||||
_japanese_characters = re.compile(
|
_japanese_characters = re.compile(
|
||||||
r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
|
r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
|
||||||
|
@ -5,7 +5,7 @@ from jamo import h2j, j2hcj
|
|||||||
import ko_pron
|
import ko_pron
|
||||||
from g2pk2 import G2p
|
from g2pk2 import G2p
|
||||||
|
|
||||||
from text.symbols2 import symbols
|
from GPT_SoVITS.text.symbols2 import symbols
|
||||||
|
|
||||||
# This is a list of Korean classifiers preceded by pure Korean numerals.
|
# This is a list of Korean classifiers preceded by pure Korean numerals.
|
||||||
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
|
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
|
||||||
|
@ -11,4 +11,4 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
from text.zh_normalization.text_normlization import *
|
from GPT_SoVITS.text.zh_normalization.text_normlization import *
|
||||||
|
10
api.py
10
api.py
@ -162,11 +162,11 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from feature_extractor import cnhubert
|
from feature_extractor import cnhubert
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from module.models import SynthesizerTrn
|
from GPT_SoVITS.module.models import SynthesizerTrn
|
||||||
from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
||||||
from text import cleaned_text_to_sequence
|
from GPT_SoVITS.text import cleaned_text_to_sequence
|
||||||
from text.cleaner import clean_text
|
from GPT_SoVITS.text.cleaner import clean_text
|
||||||
from module.mel_processing import spectrogram_torch
|
from GPT_SoVITS.module.mel_processing import spectrogram_torch
|
||||||
from tools.my_utils import load_audio
|
from tools.my_utils import load_audio
|
||||||
import config as global_config
|
import config as global_config
|
||||||
import logging
|
import logging
|
||||||
@ -312,7 +312,7 @@ def get_bert_inf(phones, word2ph, norm_text, language):
|
|||||||
|
|
||||||
return bert
|
return bert
|
||||||
|
|
||||||
from text import chinese
|
from GPT_SoVITS.text import chinese
|
||||||
def get_phones_and_bert(text,language,version,final=False):
|
def get_phones_and_bert(text,language,version,final=False):
|
||||||
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
|
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
|
||||||
language = language.replace("all_","")
|
language = language.replace("all_","")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user