diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index 4eca27be..00000000 --- a/.dockerignore +++ /dev/null @@ -1,8 +0,0 @@ -docs -logs -output -reference -SoVITS_weights -GPT_weights -TEMP -.git diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 80cd9f3a..00000000 --- a/Dockerfile +++ /dev/null @@ -1,42 +0,0 @@ -# Base CUDA image -FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04 - -LABEL maintainer="breakstring@hotmail.com" -LABEL version="dev-20240209" -LABEL description="Docker image for GPT-SoVITS" - - -# Install 3rd party apps -ENV DEBIAN_FRONTEND=noninteractive -ENV TZ=Etc/UTC -RUN apt-get update && \ - apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \ - git lfs install && \ - rm -rf /var/lib/apt/lists/* - -# Copy only requirements.txt initially to leverage Docker cache -WORKDIR /workspace -COPY requirements.txt /workspace/ -RUN pip install --no-cache-dir -r requirements.txt - -# Define a build-time argument for image type -ARG IMAGE_TYPE=full - -# Conditional logic based on the IMAGE_TYPE argument -# Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite" -COPY ./Docker /workspace/Docker -# elite 类型的镜像里面不包含额外的模型 -RUN if [ "$IMAGE_TYPE" != "elite" ]; then \ - chmod +x /workspace/Docker/download.sh && \ - /workspace/Docker/download.sh && \ - python /workspace/Docker/download.py && \ - python -m nltk.downloader averaged_perceptron_tagger cmudict; \ - fi - - -# Copy the rest of the application -COPY . /workspace - -EXPOSE 9871 9872 9873 9874 9880 - -CMD ["python", "webui.py"] diff --git a/GPT_SoVITS/AR/data/data_module.py b/GPT_SoVITS/AR/data/data_module.py index f360503b..1cd913ec 100644 --- a/GPT_SoVITS/AR/data/data_module.py +++ b/GPT_SoVITS/AR/data/data_module.py @@ -1,11 +1,10 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py # reference: https://github.com/lifeiteng/vall-e from pytorch_lightning import LightningDataModule +from GPT_SoVITS.AR.data.bucket_sampler import DistributedBucketSampler +from GPT_SoVITS.AR.data.dataset import Text2SemanticDataset from torch.utils.data import DataLoader -from AR.data.bucket_sampler import DistributedBucketSampler -from AR.data.dataset import Text2SemanticDataset - class Text2SemanticDataModule(LightningDataModule): def __init__( diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py index 402483d9..6beb4082 100644 --- a/GPT_SoVITS/AR/data/dataset.py +++ b/GPT_SoVITS/AR/data/dataset.py @@ -13,7 +13,7 @@ from torch.utils.data import DataLoader, Dataset version = os.environ.get("version", None) -from text import cleaned_text_to_sequence +from GPT_SoVITS.text import cleaned_text_to_sequence # from config import exp_dir diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py index 0696c35f..e785a7f1 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py @@ -9,10 +9,9 @@ from typing import Dict import torch from pytorch_lightning import LightningModule - -from AR.models.t2s_model import Text2SemanticDecoder -from AR.modules.lr_schedulers import WarmupCosineLRSchedule -from AR.modules.optim import ScaledAdam +from GPT_SoVITS.AR.models.t2s_model import Text2SemanticDecoder +from GPT_SoVITS.AR.modules.lr_schedulers import WarmupCosineLRSchedule +from GPT_SoVITS.AR.modules.optim import ScaledAdam class Text2SemanticLightningModule(LightningModule): diff 
--git a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py index b0ab59c4..b0d8c4b9 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py @@ -9,10 +9,9 @@ from typing import Dict import torch from pytorch_lightning import LightningModule - -from AR.models.t2s_model_onnx import Text2SemanticDecoder -from AR.modules.lr_schedulers import WarmupCosineLRSchedule -from AR.modules.optim import ScaledAdam +from GPT_SoVITS.AR.models.t2s_model_onnx import Text2SemanticDecoder +from GPT_SoVITS.AR.modules.lr_schedulers import WarmupCosineLRSchedule +from GPT_SoVITS.AR.modules.optim import ScaledAdam class Text2SemanticLightningModule(LightningModule): diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py index 4725b7a3..6c2bf9ec 100644 --- a/GPT_SoVITS/AR/models/t2s_model.py +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -9,7 +9,7 @@ from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy from tqdm import tqdm -from AR.models.utils import ( +from GPT_SoVITS.AR.models.utils import ( dpo_loss, get_batch_logps, make_pad_mask, @@ -18,8 +18,8 @@ from AR.models.utils import ( sample, topk_sampling, ) -from AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding -from AR.modules.transformer import LayerNorm, TransformerEncoder, TransformerEncoderLayer +from GPT_SoVITS.AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding +from GPT_SoVITS.AR.modules.transformer import LayerNorm, TransformerEncoder, TransformerEncoderLayer default_config = { "embedding_dim": 512, @@ -933,3 +933,140 @@ class Text2SemanticDecoder(nn.Module): return self.infer_panel_naive( x, x_lens, prompts, bert_feature, top_k, top_p, early_stop_num, temperature, repetition_penalty, **kwargs ) + + def infer_panel_generator( + self, + x: torch.LongTensor, + x_lens: torch.LongTensor, + prompts: torch.LongTensor, + bert_feature: torch.LongTensor, + cumulation_amount: int, + top_k: int = -100, + top_p: int = 100, + early_stop_num: int = -1, + temperature: float = 1.0, + repetition_penalty: float = 1.35, + **kwargs, + ): + """ + Generator method that yields generated tokens based on a specified cumulative amount. + + Args: + x (torch.LongTensor): Input phoneme IDs. + x_lens (torch.LongTensor): Lengths of the input sequences. + prompts (torch.LongTensor): Initial prompt tokens. + bert_feature (torch.LongTensor): BERT features corresponding to the input. + cumulation_amount (int): Number of tokens to generate before yielding. + top_k (int): Top-k sampling. + top_p (int): Top-p sampling. + early_stop_num (int): Early stopping number. + temperature (float): Sampling temperature. + repetition_penalty (float): Repetition penalty. + Yields: + torch.LongTensor: Generated tokens since the last yield. 
+ """ + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + + # AR Decoder + y = prompts + + x_len = x.shape[1] + x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device) + stop = False + + # Initialize cumulative token counter + tokens_since_last_yield = 0 + # Initialize last yield index + prefix_len = y.shape[1] if y is not None else 0 + last_yield_idx = prefix_len + + k_cache = None + v_cache = None + + ################### first step ########################## + if y is not None and y.shape[1] > 0: + y_emb = self.ar_audio_embedding(y) + y_len = y_emb.shape[1] + y_pos = self.ar_audio_position(y_emb) + xy_pos = torch.concat([x, y_pos], dim=1) + ref_free = False + else: + y_emb = None + y_len = 0 + xy_pos = x + y = torch.zeros(x.shape[0], 0, dtype=torch.int64, device=x.device) + ref_free = True + + bsz = x.shape[0] + src_len = x_len + y_len + x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True) + y_attn_mask = F.pad( + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), diagonal=1), + (x_len, 0), + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + xy_attn_mask = xy_attn_mask.unsqueeze(0).expand(bsz * self.num_head, -1, -1) + xy_attn_mask = xy_attn_mask.view(bsz, self.num_head, src_len, src_len).to(device=x.device, dtype=torch.bool) + + for idx in tqdm(range(1500)): + if xy_attn_mask is not None: + xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None) + else: + xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) + + logits = self.ar_predict_layer(xy_dec[:, -1]) + + if idx == 0: + xy_attn_mask = None + if idx < 11: # Ensure at least 10 tokens are generated before stopping + logits = logits[:, :-1] + + samples = sample( + logits, + y, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + temperature=temperature, + )[0] + + y = torch.concat([y, samples], dim=1) + tokens_since_last_yield += 1 + + if tokens_since_last_yield >= cumulation_amount: + generated_tokens = y[:, last_yield_idx:] + yield generated_tokens + last_yield_idx = y.shape[1] + tokens_since_last_yield = 0 + + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + print("Using early stop num:", early_stop_num) + stop = True + + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + + if stop: + if y.shape[1] == 0: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + print("Bad zero prediction") + print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") + break + + # Update for next step + y_emb = self.ar_audio_embedding(y[:, -1:]) + y_len += 1 + xy_pos = ( + y_emb * self.ar_audio_position.x_scale + + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len - 1].to( + dtype=y_emb.dtype, device=y_emb.device + ) + ) + + # After loop ends, yield any remaining tokens + if last_yield_idx < y.shape[1]: + generated_tokens = y[:, last_yield_idx:] + yield generated_tokens diff --git a/GPT_SoVITS/AR/models/t2s_model_onnx.py b/GPT_SoVITS/AR/models/t2s_model_onnx.py index 4f7b50a3..5eff4ae0 100644 --- a/GPT_SoVITS/AR/models/t2s_model_onnx.py +++ b/GPT_SoVITS/AR/models/t2s_model_onnx.py @@ -1,12 +1,17 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py # reference: https://github.com/lifeiteng/vall-e import torch +from tqdm import tqdm + +from 
GPT_SoVITS.AR.modules.embedding_onnx import SinePositionalEmbedding +from GPT_SoVITS.AR.modules.embedding_onnx import TokenEmbedding +from GPT_SoVITS.AR.modules.transformer_onnx import LayerNorm +from GPT_SoVITS.AR.modules.transformer_onnx import TransformerEncoder +from GPT_SoVITS.AR.modules.transformer_onnx import TransformerEncoderLayer from torch import nn from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy -from AR.modules.embedding_onnx import SinePositionalEmbedding, TokenEmbedding -from AR.modules.transformer_onnx import LayerNorm, TransformerEncoder, TransformerEncoderLayer default_config = { "embedding_dim": 512, diff --git a/GPT_SoVITS/AR/modules/activation.py b/GPT_SoVITS/AR/modules/activation.py index 936f9c3f..f05d5e5f 100644 --- a/GPT_SoVITS/AR/modules/activation.py +++ b/GPT_SoVITS/AR/modules/activation.py @@ -9,7 +9,8 @@ from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter -from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched +from torch.nn import functional as F +from GPT_SoVITS.AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched F.multi_head_attention_forward = multi_head_attention_forward_patched @@ -152,14 +153,14 @@ class MultiheadAttention(Module): bias=bias, **factory_kwargs, ) - self.in_proj_weight = self.in_proj_linear.weight + self.in_proj_weight = self.in_proj_linear.weight self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: - self.in_proj_bias = self.in_proj_linear.bias + self.in_proj_bias = self.in_proj_linear.bias else: self.register_parameter("in_proj_bias", None) diff --git a/GPT_SoVITS/AR/modules/activation_onnx.py b/GPT_SoVITS/AR/modules/activation_onnx.py index c14ce40c..04bd41c1 100644 --- a/GPT_SoVITS/AR/modules/activation_onnx.py +++ b/GPT_SoVITS/AR/modules/activation_onnx.py @@ -8,7 +8,8 @@ from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter -from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched +from torch.nn import functional as F +from GPT_SoVITS.AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched class MultiheadAttention(Module): @@ -102,14 +103,14 @@ class MultiheadAttention(Module): bias=bias, **factory_kwargs, ) - self.in_proj_weight = self.in_proj_linear.weight + self.in_proj_weight = self.in_proj_linear.weight self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: - self.in_proj_bias = self.in_proj_linear.bias + self.in_proj_bias = self.in_proj_linear.bias else: self.register_parameter("in_proj_bias", None) diff --git a/GPT_SoVITS/AR/modules/transformer.py b/GPT_SoVITS/AR/modules/transformer.py index 1bf21cdb..5446fd3e 100644 --- a/GPT_SoVITS/AR/modules/transformer.py +++ b/GPT_SoVITS/AR/modules/transformer.py @@ -10,8 +10,8 @@ from typing import Tuple from typing import Union import torch -from AR.modules.activation import MultiheadAttention -from AR.modules.scaling import BalancedDoubleSwish +from GPT_SoVITS.AR.modules.activation import MultiheadAttention +from GPT_SoVITS.AR.modules.scaling import 
BalancedDoubleSwish from torch import nn from torch import Tensor from torch.nn import functional as F diff --git a/GPT_SoVITS/AR/modules/transformer_onnx.py b/GPT_SoVITS/AR/modules/transformer_onnx.py index fa170254..ed12595b 100644 --- a/GPT_SoVITS/AR/modules/transformer_onnx.py +++ b/GPT_SoVITS/AR/modules/transformer_onnx.py @@ -10,8 +10,8 @@ from typing import Tuple from typing import Union import torch -from AR.modules.activation_onnx import MultiheadAttention -from AR.modules.scaling import BalancedDoubleSwish +from GPT_SoVITS.AR.modules.activation_onnx import MultiheadAttention +from GPT_SoVITS.AR.modules.scaling import BalancedDoubleSwish from torch import nn from torch import Tensor from torch.nn import functional as F diff --git a/GPT_SoVITS/AR/text_processing/phonemizer.py b/GPT_SoVITS/AR/text_processing/phonemizer.py index 1003040e..808896a6 100644 --- a/GPT_SoVITS/AR/text_processing/phonemizer.py +++ b/GPT_SoVITS/AR/text_processing/phonemizer.py @@ -9,7 +9,7 @@ import regex from gruut import sentences from gruut.const import Sentence from gruut.const import Word -from AR.text_processing.symbols import SYMBOL_TO_ID +from GPT_SoVITS.AR.text_processing.symbols import SYMBOL_TO_ID class GruutPhonemizer: diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index d20daee3..201cb045 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -18,27 +18,31 @@ from typing import List, Tuple, Union import ffmpeg import librosa import numpy as np +import random import torch import torch.nn.functional as F +import traceback import yaml -from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from BigVGAN.bigvgan import BigVGAN -from feature_extractor.cnhubert import CNHubert -from module.mel_processing import mel_spectrogram_torch, spectrogram_torch -from module.models import SynthesizerTrn, SynthesizerTrnV3, Generator +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule +from GPT_SoVITS.BigVGAN.bigvgan import BigVGAN +from GPT_SoVITS.feature_extractor.cnhubert import CNHubert +from GPT_SoVITS.module.mel_processing import mel_spectrogram_torch, spectrogram_torch +from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3, Generator from peft import LoraConfig, get_peft_model -from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new +from GPT_SoVITS.process_ckpt import get_sovits_version_from_path_fast, load_sovits_new from transformers import AutoModelForMaskedLM, AutoTokenizer +from huggingface_hub import snapshot_download -from tools.audio_sr import AP_BWE -from tools.i18n.i18n import I18nAuto, scan_language_list -from tools.my_utils import load_audio -from TTS_infer_pack.text_segmentation_method import splits -from TTS_infer_pack.TextPreprocessor import TextPreprocessor +from GPT_SoVITS.tools.audio_sr import AP_BWE +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools.my_utils import load_audio +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import splits +from GPT_SoVITS.TTS_infer_pack.TextPreprocessor import TextPreprocessor language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) +LIBRARY_NAME = "GPT_SoVITS" spec_min = -12 @@ -149,28 +153,28 @@ class NO_PROMPT_ERROR(Exception): # configs/tts_infer.yaml """ custom: - bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large - 
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt - vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth + t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt + vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth version: v2 v1: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt - vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth + t2s_weights_path: pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt + vits_weights_path: pretrained_models/s2G488k.pth version: v1 v2: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cpu is_half: false - t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt - vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth + t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt + vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth version: v2 v3: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large @@ -323,8 +327,10 @@ class TTS_Config: if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)): self.cnhuhbert_base_path = self.default_configs[version]["cnhuhbert_base_path"] print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}") + + repo_name="lj1995/GPT-SoVITS" + snapshot_download(repo_id=repo_name, local_dir=os.path.dirname(self.bert_base_path)) self.update_configs() - self.max_sec = None self.hz: int = 50 self.semantic_frame_rate: str = "25hz" @@ -1294,7 +1300,17 @@ class TTS: raise e finally: self.empty_cache() - + + def empty_cache(self): + try: + gc.collect() # 触发gc的垃圾回收。避免内存一直增长。 + if "cuda" in str(self.configs.device): + torch.cuda.empty_cache() + elif str(self.configs.device) == "mps": + torch.mps.empty_cache() + except: + pass + def empty_cache(self): try: gc.collect() # 触发gc的垃圾回收。避免内存一直增长。 @@ -1558,3 +1574,160 @@ class TTS: audio_fragments[i + 1] = f2_ return torch.cat(audio_fragments, 0) + + @torch.no_grad() + def run_generator(self, inputs: dict): + """ + Streaming inference using infer_panel_generator and zero-cross splitting for v1-v4. + Yields tuples of (sampling_rate, np.ndarray audio fragment). 
+ """ + # Initialize parameters + self.stop_flag = False + text = inputs.get("text", "") + text_lang = inputs.get("text_lang", "") + ref_audio_path = inputs.get("ref_audio_path", "") + aux_ref_audio_paths = inputs.get("aux_ref_audio_paths", []) + prompt_text = inputs.get("prompt_text", "") + prompt_lang = inputs.get("prompt_lang", "") + top_k = inputs.get("top_k", 5) + top_p = inputs.get("top_p", 1) + temperature = inputs.get("temperature", 1) + text_split_method = inputs.get("text_split_method", "cut0") + batch_threshold = inputs.get("batch_threshold", 0.75) + speed_factor = inputs.get("speed_factor", 1.0) + seed = inputs.get("seed", -1) + seed = -1 if seed in [None, ""] else seed + set_seed(seed) + repetition_penalty = inputs.get("repetition_penalty", 1.35) + sample_steps = inputs.get("sample_steps", 8) + super_sampling = inputs.get("super_sampling", False) + search_length = inputs.get("search_length", 32000 * 5) + num_zeroes = inputs.get("num_zeroes", 5) + cumulation_amount = inputs.get("cumulation_amount", 50) + # Prepare reference audio + if ref_audio_path and ref_audio_path != self.prompt_cache["ref_audio_path"]: + if not os.path.exists(ref_audio_path): + raise ValueError(f"{ref_audio_path} not exists") + self.set_ref_audio(ref_audio_path) + # Auxiliary refs + self.prompt_cache["aux_ref_audio_paths"] = aux_ref_audio_paths or [] + self.prompt_cache["refer_spec"] = [self.prompt_cache["refer_spec"][0]] + for p in aux_ref_audio_paths or []: + if p and os.path.exists(p): + self.prompt_cache["refer_spec"].append(self._get_ref_spec(p)) + # Prompt text handling + no_prompt = prompt_text in [None, ""] + if not no_prompt: + prompt_text = prompt_text.strip("\n") + if prompt_text and prompt_text[-1] not in splits: + prompt_text += "。" if prompt_lang != "en" else "." 
+ phones_p, bert_p, norm_p = self.text_preprocessor.segment_and_extract_feature_for_text( + prompt_text, prompt_lang, self.configs.version + ) + self.prompt_cache.update({ + "prompt_text": prompt_text, + "prompt_lang": prompt_lang, + "phones": phones_p, + "bert_features": bert_p, + "norm_text": norm_p, + }) + # Text to semantic preprocessing + data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version) + if not data: + sr = self.vocoder_configs["sr"] if self.configs.use_vocoder else self.configs.sampling_rate + yield sr, np.zeros(1, dtype=np.int16) + return + # Single-batch conversion + batches, _ = self.to_batch( + data, + prompt_data=None if no_prompt else self.prompt_cache, + batch_size=1, + threshold=batch_threshold, + split_bucket=False, + device=self.configs.device, + precision=self.precision, + ) + item = batches[0] + phones = item["phones"][0] + all_ids = item["all_phones"][0] + all_lens = item["all_phones_len"][0] + all_bert = item["all_bert_features"][0] + max_len = item["max_len"] + # Prepare semantic prompt + if not no_prompt: + prompt_sem = self.prompt_cache["prompt_semantic"].unsqueeze(0).to(self.configs.device) + else: + prompt_sem = None + # Reference spectrograms + refer_spec = [s.to(dtype=self.precision, device=self.configs.device) for s in self.prompt_cache["refer_spec"]] + # Streaming via generator + from GPT_SoVITS.TTS_infer_pack.zero_crossing import find_zero_zone, find_matching_index + zc_idx1 = zc_idx2 = crossing_dir = 0 + first = True + last = False + gen_list = [] + for gen_tokens in self.t2s_model.model.infer_panel_generator( + all_ids.unsqueeze(0).to(self.configs.device), + all_lens.unsqueeze(0).to(self.configs.device), + prompt_sem, + all_bert.unsqueeze(0).to(self.configs.device), + cumulation_amount=cumulation_amount, + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=self.configs.hz * self.configs.max_sec, + max_len=max_len, + repetition_penalty=repetition_penalty, + ): + gen_list.append(gen_tokens) + total = sum([t.size(1) for t in gen_list]) + toks = torch.cat(gen_list, dim=1)[:, :total] + eos = self.t2s_model.model.EOS + has_eos = (toks == eos).any() + if has_eos: + toks = toks.masked_fill(toks == eos, 0) + last = True + first = False + # Decode to waveform + pred = toks.unsqueeze(0) + phone_t = phones.unsqueeze(0).to(self.configs.device) + if not self.configs.use_vocoder: + w = self.vits_model.decode(pred, phone_t, refer_spec, speed=speed_factor).detach()[0,0,:] + else: + w = self.using_vocoder_synthesis(pred, phone_t, speed=speed_factor, sample_steps=sample_steps) + w = w.cpu().numpy().astype(np.float32) + mv = np.abs(w).max() + if mv > 1.0: + w /= mv + # Zero-cross splitting + start = len(w) - search_length + if start < 0: + search_length = len(w) + start = 0 + center = zc_idx2 + off = int(search_length // 2) + sr = self.vocoder_configs["sr"] if self.configs.use_vocoder else self.configs.sampling_rate + if first: + zc_idx1, crossing_dir = find_zero_zone(w, start, search_length, num_zeroes) + frag = w[:zc_idx1] + print(len(frag)) + frag_int16 = (frag * np.iinfo(np.int16).max).astype(np.int16) + yield sr, frag_int16 + first = False + zc_idx2 = zc_idx1 + elif last: + zc1 = find_matching_index(w, center, off, crossing_dir) + frag = w[zc1:] + print(len(frag)) + frag_int16 = (frag * np.iinfo(np.int16).max).astype(np.int16) + yield sr, frag_int16 + zc_idx2 = zc_idx1 + else: + zc1 = find_matching_index(w, center, off, crossing_dir) + zc_idx1, crossing_dir = find_zero_zone(w, start, search_length, 
num_zeroes) + frag = w[zc1:zc_idx1] + print(len(frag)) + frag_int16 = (frag * np.iinfo(np.int16).max).astype(np.int16) + yield sr, frag_int16 + zc_idx2 = zc_idx1 + self.empty_cache() diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 426929f8..9c587de7 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -9,15 +9,15 @@ sys.path.append(now_dir) import re import torch -from text.LangSegmenter import LangSegmenter -from text import chinese +from GPT_SoVITS.text.LangSegmenter import LangSegmenter +from GPT_SoVITS.text import chinese from typing import Dict, List, Tuple -from text.cleaner import clean_text -from text import cleaned_text_to_sequence +from GPT_SoVITS.text.cleaner import clean_text +from GPT_SoVITS.text import cleaned_text_to_sequence from transformers import AutoModelForMaskedLM, AutoTokenizer -from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method -from tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/TTS_infer_pack/zero_crossing.py b/GPT_SoVITS/TTS_infer_pack/zero_crossing.py new file mode 100644 index 00000000..542a6de9 --- /dev/null +++ b/GPT_SoVITS/TTS_infer_pack/zero_crossing.py @@ -0,0 +1,203 @@ +import numpy as np +import wave +import struct + +def read_wav_file(filename): + """ + Reads a WAV file and returns the sample rate and data as a numpy array. + """ + with wave.open(filename, 'rb') as wf: + sample_rate = wf.getframerate() + n_frames = wf.getnframes() + sample_width = wf.getsampwidth() + n_channels = wf.getnchannels() + + audio_data = wf.readframes(n_frames) + # Determine the format string for struct unpacking + fmt = "<" + {1:'b', 2:'h', 4:'i'}[sample_width] * n_frames * n_channels + audio_samples = struct.unpack(fmt, audio_data) + audio_array = np.array(audio_samples, dtype=int) + + # If stereo, reshape the array + if n_channels > 1: + audio_array = audio_array.reshape(-1, n_channels) + return sample_rate, audio_array, sample_width, n_channels + +def write_wav_file(filename, sample_rate, data, sample_width, n_channels): + """ + Writes numpy array data to a WAV file. + """ + with wave.open(filename, 'wb') as wf: + wf.setnchannels(n_channels) + wf.setsampwidth(sample_width) + wf.setframerate(sample_rate) + # Flatten the array if it's multi-dimensional + if data.ndim > 1: + data = data.flatten() + # Pack the data into bytes + fmt = "<" + {1:'b', 2:'h', 4:'i'}[sample_width] * len(data) + byte_data = struct.pack(fmt, *data) + wf.writeframes(byte_data) + +def find_zero_zone(chunk, start_index, search_length, num_zeroes=11): + zone = chunk[start_index:start_index + search_length] + print(f"Zero-crossing search zone: Start={start_index}, Length={len(zone)}") + + zero_threshold = 1.0e-4 + # Check for y consecutive zeros + for idx in range(len(zone), -1 + num_zeroes, -1): + index_to_start = idx-num_zeroes + abs_zone = np.abs(zone[index_to_start:idx]) + if np.all(abs_zone < zero_threshold): + index_midpoint = index_to_start + int(num_zeroes // 2) + return (start_index + index_midpoint), None + + print("Falling back to zero crossing due to no zero zone found. 
You may hear more prominent pops and clicks in the audio. Try increasing search length or cumulative tokens.") + return find_zero_crossing(chunk, start_index, search_length) + +def find_zero_crossing(chunk, start_index, search_length): + # Falling back to this function may indicate that the search length is too low + + zone = chunk[start_index:start_index + search_length] + sign_changes = np.where(np.diff(np.sign(zone)) != 0)[0] + + if len(sign_changes) == 0: + raise ValueError("No zero-crossings found in this zone. This should not be happening, debugging time.") + else: + zc_index = start_index + sign_changes[0] + 1 + print(f"Zero-crossing found at index {zc_index}") + # Determine the crossing direction in chunk1 + prev_value = chunk[zc_index - 1] + curr_value = chunk[zc_index] + crossing_direction = np.sign(curr_value) - np.sign(prev_value) + print(f"Crossing direction in chunk1: {np.sign(prev_value)} to {np.sign(curr_value)}") + return zc_index, crossing_direction + +def find_matching_index(chunk, center_index, max_offset, crossing_direction): + """ + Finds a zero-crossing in data that matches the specified crossing direction, + starting from center_index and searching outward. + """ + if crossing_direction is None: + return center_index # if zero zone + + # fall back for zero_crossing + data_length = len(chunk) + print(f"Center index in chunk2: {center_index}") + for offset in range(max_offset + 1): + # Check index bounds + idx_forward = center_index + offset + idx_backward = center_index - offset + found = False + + # Check forward direction + if idx_forward < data_length - 1: + prev_sign = np.sign(chunk[idx_forward]) + curr_sign = np.sign(chunk[idx_forward + 1]) + direction = curr_sign - prev_sign + if direction == crossing_direction: + print(f"Matching zero-crossing found at index {idx_forward + 1} (forward)") + return idx_forward + 1 + + # Check backward direction + if idx_backward > 0: + prev_sign = np.sign(chunk[idx_backward - 1]) + curr_sign = np.sign(chunk[idx_backward]) + direction = curr_sign - prev_sign + if direction == crossing_direction: + print(f"Matching zero-crossing found at index {idx_backward} (backward)") + return idx_backward + + print("No matching zero-crossings found in this zone.") + return None + +# legacy, just for history. delete me sometime +def splice_chunks(chunk1, chunk2, search_length, y): + """ + Splices two audio chunks at zero-crossing points. 
+ """ + # Define the zone to search in chunk1 + start_index1 = len(chunk1) - search_length + if start_index1 < 0: + start_index1 = 0 + search_length = len(chunk1) + print(f"Searching for zero-crossing in chunk1 from index {start_index1} to {len(chunk1)}") + # Find zero-crossing in chunk1 + zc_index1, crossing_direction = find_zero_crossing(chunk1, start_index1, search_length, y) + if zc_index1 is None: + print("No zero-crossing found in chunk1 within the specified zone.") + return None + + # Define the zone to search in chunk2 near the same index + # Since chunk2 overlaps with chunk1, we can assume that index positions correspond + # Adjusted search in chunk2 + # You can adjust this value if needed + center_index = zc_index1 # Assuming alignment between chunk1 and chunk2 + max_offset = search_length + + # Ensure center_index is within bounds + if center_index < 0: + center_index = 0 + elif center_index >= len(chunk2): + center_index = len(chunk2) - 1 + + print(f"Searching for matching zero-crossing in chunk2 around index {center_index} with max offset {max_offset}") + + zc_index2 = find_matching_zero_crossing(chunk2, center_index, max_offset, crossing_direction) + + if zc_index2 is None: + print("No matching zero-crossing found in chunk2.") + return None + + print(f"Zero-crossing in chunk1 at index {zc_index1}, chunk2 at index {zc_index2}") + # Splice the chunks + new_chunk = np.concatenate((chunk1[:zc_index1], chunk2[zc_index2:])) + print(f"Spliced chunk length: {len(new_chunk)}") + return new_chunk + +# legacy, just for history. delete me sometime +def process_audio_chunks(filenames, sample_rate, x, y, output_filename): + """ + Processes and splices a list of audio chunks. + """ + # Read the first chunk + sr, chunk_data, sample_width, n_channels = read_wav_file(filenames[0]) + if sr != sample_rate: + print(f"Sample rate mismatch in {filenames[0]}") + return + print(f"Processing {filenames[0]}") + # Initialize the combined audio with the first chunk + combined_audio = chunk_data + # Process remaining chunks + for filename in filenames[1:]: + sr, next_chunk_data, _, _ = read_wav_file(filename) + if sr != sample_rate: + print(f"Sample rate mismatch in {filename}") + return + print(f"Processing {filename}") + # Splice the current combined audio with the next chunk + new_combined = splice_chunks(combined_audio, next_chunk_data, x, y) + if new_combined is None: + print(f"Failed to splice chunks between {filename} and previous chunk.") + return + combined_audio = new_combined + # Write the final combined audio to output file + write_wav_file(output_filename, sample_rate, combined_audio, sample_width, n_channels) + print(f"Final audio saved to {output_filename}") + +# Main execution +if __name__ == "__main__": + # User-specified parameters + sample_rate = 32000 # Sample rate in Hz + x = 500 # Number of frames to search from the end of the chunk + y = 10 # Number of consecutive zeros to look for + output_filename = "combined_output.wav" + folder_with_chunks = "output_chunks" + import os + def absolute_file_paths(directory): + path = os.path.abspath(directory) + return [entry.path for entry in os.scandir(path) if entry.is_file()] + # List of input audio chunk filenames in sequential order + filenames = absolute_file_paths(folder_with_chunks) + # Process and splice the audio chunks + process_audio_chunks(filenames, sample_rate, x, y, output_filename) diff --git a/api_v2.py b/GPT_SoVITS/api_v2.py similarity index 98% rename from api_v2.py rename to GPT_SoVITS/api_v2.py index 87082074..50e99ec5 
100644 --- a/api_v2.py +++ b/GPT_SoVITS/api_v2.py @@ -116,8 +116,10 @@ import soundfile as sf from fastapi import FastAPI, Response from fastapi.responses import StreamingResponse, JSONResponse import uvicorn + +from importlib.resources import files from io import BytesIO -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names from pydantic import BaseModel @@ -127,7 +129,7 @@ i18n = I18nAuto() cut_method_names = get_cut_method_names() parser = argparse.ArgumentParser(description="GPT-SoVITS api") -parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径") +parser.add_argument("-c", "--tts_config", type=str, default=None, help="tts_infer路径") parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1") parser.add_argument("-p", "--port", type=int, default="9880", help="default: 9880") args = parser.parse_args() @@ -138,7 +140,7 @@ host = args.bind_addr argv = sys.argv if config_path in [None, ""]: - config_path = "GPT-SoVITS/configs/tts_infer.yaml" + config_path = str(files("GPT_SoVITS").joinpath("configs/tts_infer.yaml")) tts_config = TTS_Config(config_path) print(tts_config) @@ -434,7 +436,7 @@ async def tts_get_endpoint( @APP.post("/tts") async def tts_post_endpoint(request: TTS_Request): - req = request.dict() + req = request.model_dump() return await tts_handle(req) diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml index 20c41a20..c32c76a6 100644 --- a/GPT_SoVITS/configs/tts_infer.yaml +++ b/GPT_SoVITS/configs/tts_infer.yaml @@ -3,7 +3,7 @@ custom: cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cuda is_half: true - t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt + t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt version: v2 vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth v1: diff --git a/GPT_SoVITS/download.py b/GPT_SoVITS/download.py index fc4ead63..94061cea 100644 --- a/GPT_SoVITS/download.py +++ b/GPT_SoVITS/download.py @@ -3,11 +3,5 @@ import sys now_dir = os.getcwd() sys.path.insert(0, now_dir) -from text.g2pw import G2PWPinyin - -g2pw = G2PWPinyin( - model_dir="GPT_SoVITS/text/G2PWModel", - model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - v_to_u=False, - neutral_tone_with_five=True, -) +from GPT_SoVITS.text.g2pw import G2PWPinyin +g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel", model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", v_to_u=False, neutral_tone_with_five=True) diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py index 69817a37..a47bf9a7 100644 --- a/GPT_SoVITS/export_torch_script.py +++ b/GPT_SoVITS/export_torch_script.py @@ -12,8 +12,8 @@ from torch.nn import functional as F from transformers import AutoModelForMaskedLM, AutoTokenizer from feature_extractor import cnhubert -from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from module.models_onnx import SynthesizerTrn +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule 
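The api_v2.py change above stops hard-coding a repository-relative config path; when --tts_config is omitted it now resolves the YAML that ships inside the installed package. A small sketch of that lookup, using only the stdlib call added in the diff and assuming GPT_SoVITS is importable:

from importlib.resources import files

# Resolves to .../site-packages/GPT_SoVITS/configs/tts_infer.yaml when the project is
# installed as a package, or to the source tree when run from a checkout.
default_config = str(files("GPT_SoVITS").joinpath("configs/tts_infer.yaml"))
print(default_config)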
+from GPT_SoVITS.module.models_onnx import SynthesizerTrn from inference_webui import get_phones_and_bert diff --git a/GPT_SoVITS/feature_extractor/cnhubert.py b/GPT_SoVITS/feature_extractor/cnhubert.py index f22b8d09..44b8f881 100644 --- a/GPT_SoVITS/feature_extractor/cnhubert.py +++ b/GPT_SoVITS/feature_extractor/cnhubert.py @@ -13,7 +13,7 @@ from transformers import ( HubertModel, ) -import utils +import GPT_SoVITS.utils as utils import torch.nn as nn cnhubert_base_path = None diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py index 459a3d36..564a4417 100644 --- a/GPT_SoVITS/inference_cli.py +++ b/GPT_SoVITS/inference_cli.py @@ -2,7 +2,7 @@ import argparse import os import soundfile as sf -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav i18n = I18nAuto() diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py index 379f7fa8..f32cc1cd 100644 --- a/GPT_SoVITS/inference_gui.py +++ b/GPT_SoVITS/inference_gui.py @@ -5,8 +5,7 @@ from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushB from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox import soundfile as sf -from tools.i18n.i18n import I18nAuto - +from GPT_SoVITS.tools.i18n.i18n import I18nAuto i18n = I18nAuto() from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav @@ -305,7 +304,7 @@ class GPTSoVITSGUI(QMainWindow): result = "Audio saved to " + output_wav_path - self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) + self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) self.output_text.append("处理结果:\n" + result) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 4bee27cd..0ca03f5d 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -124,12 +124,12 @@ def set_seed(seed): from time import time as ttime -from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule from peft import LoraConfig, get_peft_model -from text import cleaned_text_to_sequence -from text.cleaner import clean_text +from GPT_SoVITS.text import cleaned_text_to_sequence +from GPT_SoVITS.text.cleaner import clean_text -from tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language @@ -165,8 +165,8 @@ dict_language_v2 = { } dict_language = dict_language_v1 if version == "v1" else dict_language_v2 -tokenizer = AutoTokenizer.from_pretrained(bert_path) -bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +tokenizer = AutoTokenizer.from_pretrained(bert_path, local_files_only=True) +bert_model = AutoModelForMaskedLM.from_pretrained(bert_path, local_files_only=True) if is_half == True: bert_model = bert_model.half().to(device) else: @@ -406,6 +406,7 @@ def init_bigvgan(): bigvgan_model = bigvgan.BigVGAN.from_pretrained( "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False, + local_files_only=True ) # if True, RuntimeError: Ninja is required to load C++ extensions # remove weight norm in the model and set to eval mode bigvgan_model.remove_weight_norm() @@ -518,11 
+519,8 @@ def get_first(text): text = re.split(pattern, text)[0].strip() return text - -from text import chinese - - -def get_phones_and_bert(text, language, version, final=False): +from GPT_SoVITS.text import chinese +def get_phones_and_bert(text, language, version, final=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: formattext = text while " " in formattext: diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 311994b8..8ae259f0 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -50,10 +50,9 @@ bert_path = os.environ.get("bert_path", None) version = model_version = os.environ.get("version", "v2") import gradio as gr -from TTS_infer_pack.text_segmentation_method import get_method -from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config - -from tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/module/attentions.py b/GPT_SoVITS/module/attentions.py index 341de4ac..41fc0161 100644 --- a/GPT_SoVITS/module/attentions.py +++ b/GPT_SoVITS/module/attentions.py @@ -3,8 +3,8 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons -from module.modules import LayerNorm +from GPT_SoVITS.module import commons +from GPT_SoVITS.module.modules import LayerNorm class Encoder(nn.Module): @@ -325,7 +325,7 @@ class MultiHeadAttention(nn.Module): def _attention_bias_proximal(self, length): """Bias for self-attention to encourage attention to close positions. Args: - length: an integer scalar. + length: an integer scalar. Returns: a Tensor with shape [1, 1, length, length] """ diff --git a/GPT_SoVITS/module/attentions_onnx.py b/GPT_SoVITS/module/attentions_onnx.py index 9961f987..0148039d 100644 --- a/GPT_SoVITS/module/attentions_onnx.py +++ b/GPT_SoVITS/module/attentions_onnx.py @@ -3,7 +3,7 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons +from GPT_SoVITS.module import commons from typing import Optional @@ -288,7 +288,7 @@ class MultiHeadAttention(nn.Module): def _attention_bias_proximal(self, length): """Bias for self-attention to encourage attention to close positions. Args: - length: an integer scalar. + length: an integer scalar. 
Returns: a Tensor with shape [1, 1, length, length] """ diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index 1bda2b31..5c812829 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -5,10 +5,10 @@ import torch import torch.utils.data from tqdm import tqdm -from module.mel_processing import spectrogram_torch, spec_to_mel_torch -from text import cleaned_text_to_sequence +from GPT_SoVITS.module.mel_processing import spectrogram_torch, spec_to_mel_torch +from GPT_SoVITS.text import cleaned_text_to_sequence import torch.nn.functional as F -from tools.my_utils import load_audio +from GPT_SoVITS.tools.my_utils import load_audio version = os.environ.get("version", None) diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 21f60d99..c5432287 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -7,19 +7,19 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons -from module import modules -from module import attentions +from GPT_SoVITS.module import commons +from GPT_SoVITS.module import modules +from GPT_SoVITS.module import attentions from f5_tts.model import DiT from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from module.commons import init_weights, get_padding -from module.mrte_model import MRTE -from module.quantize import ResidualVectorQuantizer +from GPT_SoVITS.module.commons import init_weights, get_padding +from GPT_SoVITS.module.mrte_model import MRTE +from GPT_SoVITS.module.quantize import ResidualVectorQuantizer # from text import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 from torch.cuda.amp import autocast import contextlib import random diff --git a/GPT_SoVITS/module/models_onnx.py b/GPT_SoVITS/module/models_onnx.py index 8a3ad13f..504c5e6b 100644 --- a/GPT_SoVITS/module/models_onnx.py +++ b/GPT_SoVITS/module/models_onnx.py @@ -4,20 +4,20 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons -from module import modules -from module import attentions_onnx as attentions +from GPT_SoVITS.module import commons +from GPT_SoVITS.module import modules +from GPT_SoVITS.module import attentions_onnx as attentions from f5_tts.model import DiT from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from module.commons import init_weights, get_padding -from module.quantize import ResidualVectorQuantizer +from GPT_SoVITS.module.commons import init_weights, get_padding +from GPT_SoVITS.module.quantize import ResidualVectorQuantizer # from text import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 class StochasticDurationPredictor(nn.Module): diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index 7493f0b4..1b39f691 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -7,9 +7,9 @@ from torch.nn import functional as F from torch.nn import Conv1d from torch.nn.utils import weight_norm, remove_weight_norm -from module import commons -from module.commons import init_weights, get_padding -from module.transforms 
import piecewise_rational_quadratic_transform +from GPT_SoVITS.module import commons +from GPT_SoVITS.module.commons import init_weights, get_padding +from GPT_SoVITS.module.transforms import piecewise_rational_quadratic_transform import torch.distributions as D diff --git a/GPT_SoVITS/module/mrte_model.py b/GPT_SoVITS/module/mrte_model.py index e889b7e9..33c35601 100644 --- a/GPT_SoVITS/module/mrte_model.py +++ b/GPT_SoVITS/module/mrte_model.py @@ -3,7 +3,7 @@ import torch from torch import nn from torch.nn.utils import remove_weight_norm, weight_norm -from module.attentions import MultiHeadAttention +from GPT_SoVITS.module.attentions import MultiHeadAttention class MRTE(nn.Module): diff --git a/GPT_SoVITS/module/quantize.py b/GPT_SoVITS/module/quantize.py index 0afed835..186902a3 100644 --- a/GPT_SoVITS/module/quantize.py +++ b/GPT_SoVITS/module/quantize.py @@ -12,7 +12,7 @@ import typing as tp import torch from torch import nn -from module.core_vq import ResidualVectorQuantization +from GPT_SoVITS.module.core_vq import ResidualVectorQuantization @dataclass diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index fd680135..d541159f 100644 --- a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -1,18 +1,22 @@ import torch import torchaudio -from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule -from feature_extractor import cnhubert -from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 +from GPT_SoVITS.AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule +from GPT_SoVITS.feature_extractor import cnhubert +from GPT_SoVITS.module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 from torch import nn cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" cnhubert.cnhubert_base_path = cnhubert_base_path ssl_model = cnhubert.get_model() +from GPT_SoVITS.text import cleaned_text_to_sequence +import soundfile +from GPT_SoVITS.tools.my_utils import load_audio +import os import json import os import soundfile -from text import cleaned_text_to_sequence +from GPT_SoVITS.text import cleaned_text_to_sequence def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py index 8d83e79a..f6b3c789 100644 --- a/GPT_SoVITS/prepare_datasets/1-get-text.py +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -17,9 +17,9 @@ is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() version = os.environ.get("version", None) import traceback import os.path -from text.cleaner import clean_text +from GPT_SoVITS.text.cleaner import clean_text from transformers import AutoModelForMaskedLM, AutoTokenizer -from tools.my_utils import clean_path +from GPT_SoVITS.tools.my_utils import clean_path # inp_text=sys.argv[1] # inp_wav_dir=sys.argv[2] diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 3a84c014..fbf3da25 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -25,7 +25,7 @@ import librosa now_dir = os.getcwd() sys.path.append(now_dir) -from tools.my_utils import load_audio, clean_path +from GPT_SoVITS.tools.my_utils import load_audio, clean_path # from config import cnhubert_base_path # cnhubert.cnhubert_base_path=cnhubert_base_path diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py 
b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index ddb0607c..73b926b0 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -38,10 +38,10 @@ import logging import utils if version != "v3": - from module.models import SynthesizerTrn + from GPT_SoVITS.module.models import SynthesizerTrn else: - from module.models import SynthesizerTrnV3 as SynthesizerTrn -from tools.my_utils import clean_path + from GPT_SoVITS.module.models import SynthesizerTrnV3 as SynthesizerTrn +from GPT_SoVITS.tools.my_utils import clean_path logging.getLogger("numba").setLevel(logging.WARNING) # from config import pretrained_s2G diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index 4a2a1bac..338b2309 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -4,7 +4,7 @@ from time import time as ttime import shutil import os import torch -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto i18n = I18nAuto() diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py index 1176f0bc..9944351b 100644 --- a/GPT_SoVITS/s1_train.py +++ b/GPT_SoVITS/s1_train.py @@ -9,9 +9,9 @@ import platform from pathlib import Path import torch -from AR.data.data_module import Text2SemanticDataModule -from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from AR.utils.io import load_yaml_config +from GPT_SoVITS.AR.data.data_module import Text2SemanticDataModule +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule +from GPT_SoVITS.AR.utils.io import load_yaml_config from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger @@ -20,10 +20,12 @@ from pytorch_lightning.strategies import DDPStrategy logging.getLogger("numba").setLevel(logging.WARNING) logging.getLogger("matplotlib").setLevel(logging.WARNING) torch.set_float32_matmul_precision("high") +from GPT_SoVITS.AR.utils import get_newest_ckpt + from collections import OrderedDict -from AR.utils import get_newest_ckpt -from process_ckpt import my_save +from GPT_SoVITS.AR.utils import get_newest_ckpt +from GPT_SoVITS.process_ckpt import my_save class my_model_ckpt(ModelCheckpoint): diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py index ab461186..45b3bc05 100644 --- a/GPT_SoVITS/s2_train.py +++ b/GPT_SoVITS/s2_train.py @@ -24,19 +24,19 @@ logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) from random import randint -from module import commons -from module.data_utils import ( +from GPT_SoVITS.module import commons +from GPT_SoVITS.module.data_utils import ( DistributedBucketSampler, TextAudioSpeakerCollate, TextAudioSpeakerLoader, ) -from module.losses import discriminator_loss, feature_loss, generator_loss, kl_loss -from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from module.models import ( +from GPT_SoVITS.module.losses import discriminator_loss, feature_loss, generator_loss, kl_loss +from GPT_SoVITS.module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from GPT_SoVITS.module.models import ( MultiPeriodDiscriminator, SynthesizerTrn, ) -from process_ckpt import savee +from GPT_SoVITS.process_ckpt import savee torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = False @@ -71,7 +71,7 @@ def main(): def run(rank, n_gpus, hps): global global_step if rank == 0: - logger 
= utils.get_logger(hps.data.exp_dir) + logger = GPT_SoVITS.utils.get_logger(hps.data.exp_dir) logger.info(hps) # utils.check_git_hash(hps.s2_ckpt_dir) writer = SummaryWriter(log_dir=hps.s2_ckpt_dir) @@ -204,7 +204,7 @@ def run(rank, n_gpus, hps): net_d = net_d.to(device) try: # 如果能加载自动resume - _, _, _, epoch_str = utils.load_checkpoint( + _, _, _, epoch_str = GPT_SoVITS.utils.load_checkpoint( utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "D_*.pth"), net_d, optim_d, @@ -212,7 +212,7 @@ def run(rank, n_gpus, hps): if rank == 0: logger.info("loaded D") # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) - _, _, _, epoch_str = utils.load_checkpoint( + _, _, _, epoch_str = GPT_SoVITS.utils.load_checkpoint( utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "G_*.pth"), net_g, optim_g, @@ -479,30 +479,30 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade image_dict = None try: ###Some people installed the wrong version of matplotlib. image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy( + "slice/mel_org": GPT_SoVITS.utils.plot_spectrogram_to_numpy( y_mel[0].data.cpu().numpy(), ), - "slice/mel_gen": utils.plot_spectrogram_to_numpy( + "slice/mel_gen": GPT_SoVITS.utils.plot_spectrogram_to_numpy( y_hat_mel[0].data.cpu().numpy(), ), - "all/mel": utils.plot_spectrogram_to_numpy( + "all/mel": GPT_SoVITS.utils.plot_spectrogram_to_numpy( mel[0].data.cpu().numpy(), ), - "all/stats_ssl": utils.plot_spectrogram_to_numpy( + "all/stats_ssl": GPT_SoVITS.utils.plot_spectrogram_to_numpy( stats_ssl[0].data.cpu().numpy(), ), } except: pass if image_dict: - utils.summarize( + GPT_SoVITS.utils.summarize( writer=writer, global_step=global_step, images=image_dict, scalars=scalar_dict, ) else: - utils.summarize( + GPT_SoVITS.utils.summarize( writer=writer, global_step=global_step, scalars=scalar_dict, @@ -510,7 +510,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: if hps.train.if_save_latest == 0: - utils.save_checkpoint( + GPT_SoVITS.utils.save_checkpoint( net_g, optim_g, hps.train.learning_rate, @@ -520,7 +520,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade "G_{}.pth".format(global_step), ), ) - utils.save_checkpoint( + GPT_SoVITS.utils.save_checkpoint( net_d, optim_d, hps.train.learning_rate, @@ -531,7 +531,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade ), ) else: - utils.save_checkpoint( + GPT_SoVITS.utils.save_checkpoint( net_g, optim_g, hps.train.learning_rate, @@ -541,7 +541,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade "G_{}.pth".format(233333333333), ), ) - utils.save_checkpoint( + GPT_SoVITS.utils.save_checkpoint( net_d, optim_d, hps.train.learning_rate, @@ -644,7 +644,7 @@ def evaluate(hps, generator, eval_loader, writer_eval): ) image_dict.update( { - f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy( + f"gen/mel_{batch_idx}_{test}": GPT_SoVITS.utils.plot_spectrogram_to_numpy( y_hat_mel[0].cpu().numpy(), ), } @@ -656,7 +656,7 @@ def evaluate(hps, generator, eval_loader, writer_eval): ) image_dict.update( { - f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy()), + f"gt/mel_{batch_idx}": GPT_SoVITS.utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy()), }, 
) audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, : y_lengths[0]]}) @@ -666,7 +666,7 @@ def evaluate(hps, generator, eval_loader, writer_eval): # f"gen/audio_{batch_idx}_style_pred": y_hat[0, :, :] # }) - utils.summarize( + GPT_SoVITS.utils.summarize( writer=writer_eval, global_step=global_step, images=image_dict, diff --git a/GPT_SoVITS/text/__init__.py b/GPT_SoVITS/text/__init__.py index 82df1fbb..71bdf99f 100644 --- a/GPT_SoVITS/text/__init__.py +++ b/GPT_SoVITS/text/__init__.py @@ -1,11 +1,11 @@ import os # if os.environ.get("version","v1")=="v1": -# from text.symbols import symbols +# from GPT_SoVITS.text.symbols import symbols # else: -# from text.symbols2 import symbols +# from GPT_SoVITS.text.symbols2 import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} diff --git a/GPT_SoVITS/text/cantonese.py b/GPT_SoVITS/text/cantonese.py index 1f07c414..9228cb1e 100644 --- a/GPT_SoVITS/text/cantonese.py +++ b/GPT_SoVITS/text/cantonese.py @@ -4,8 +4,8 @@ import re import cn2an import ToJyutping -from text.symbols import punctuation -from text.zh_normalization.text_normlization import TextNormalizer +from GPT_SoVITS.text.symbols import punctuation +from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer normalizer = lambda x: cn2an.transform(x, "an2cn") @@ -195,7 +195,7 @@ def get_jyutping(text): def get_bert_feature(text, word2ph): - from text import chinese_bert + from GPT_SoVITS.text import chinese_bert return chinese_bert.get_bert_feature(text, word2ph) diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py index ce44215f..767bc1a6 100644 --- a/GPT_SoVITS/text/chinese.py +++ b/GPT_SoVITS/text/chinese.py @@ -4,9 +4,9 @@ import re import cn2an from pypinyin import lazy_pinyin, Style -from text.symbols import punctuation -from text.tone_sandhi import ToneSandhi -from text.zh_normalization.text_normlization import TextNormalizer +from GPT_SoVITS.text.symbols import punctuation +from GPT_SoVITS.text.tone_sandhi import ToneSandhi +from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer normalizer = lambda x: cn2an.transform(x, "an2cn") diff --git a/GPT_SoVITS/text/chinese2.py b/GPT_SoVITS/text/chinese2.py index 612aa3a5..ed3a5800 100644 --- a/GPT_SoVITS/text/chinese2.py +++ b/GPT_SoVITS/text/chinese2.py @@ -5,9 +5,9 @@ import cn2an from pypinyin import lazy_pinyin, Style from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials -from text.symbols import punctuation -from text.tone_sandhi import ToneSandhi -from text.zh_normalization.text_normlization import TextNormalizer +from GPT_SoVITS.text.symbols import punctuation +from GPT_SoVITS.text.tone_sandhi import ToneSandhi +from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer normalizer = lambda x: cn2an.transform(x, "an2cn") @@ -28,7 +28,7 @@ import jieba_fast.posseg as psg is_g2pw = True # True if is_g2pw_str.lower() == 'true' else False if is_g2pw: # print("当前使用g2pw进行拼音推理") - from text.g2pw import G2PWPinyin, correct_pronunciation + from GPT_SoVITS.text.g2pw import G2PWPinyin, correct_pronunciation parent_directory = os.path.dirname(current_file_path) g2pw = G2PWPinyin( diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index 7ba8f376..493ea333 100644 --- 
a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -1,14 +1,14 @@ -from text import cleaned_text_to_sequence +from GPT_SoVITS.text import cleaned_text_to_sequence import os # if os.environ.get("version","v1")=="v1": -# from text import chinese -# from text.symbols import symbols +# from GPT_SoVITS.text import chinese +# from GPT_SoVITS.text.symbols import symbols # else: -# from text import chinese2 as chinese -# from text.symbols2 import symbols +# from GPT_SoVITS.text import chinese2 as chinese +# from GPT_SoVITS.text.symbols2 import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 special = [ # ("%", "zh", "SP"), @@ -34,7 +34,7 @@ def clean_text(text, language, version=None): for special_s, special_l, target_symbol in special: if special_s in text and language == special_l: return clean_special(text, language, special_s, target_symbol, version) - language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) + language_module = __import__("GPT_SoVITS.text." + language_module_map[language], fromlist=[language_module_map[language]]) if hasattr(language_module, "text_normalize"): norm_text = language_module.text_normalize(text) else: @@ -69,7 +69,7 @@ def clean_special(text, language, special_s, target_symbol, version=None): 特殊静音段sp符号处理 """ text = text.replace(special_s, ",") - language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) + language_module = __import__("GPT_SoVITS.text."+language_module_map[language],fromlist=[language_module_map[language]]) norm_text = language_module.text_normalize(text) phones = language_module.g2p(norm_text) new_ph = [] diff --git a/GPT_SoVITS/text/english.py b/GPT_SoVITS/text/english.py index f6c69449..1ab530a1 100644 --- a/GPT_SoVITS/text/english.py +++ b/GPT_SoVITS/text/english.py @@ -4,9 +4,9 @@ import re import wordsegment from g2p_en import G2p -from text.symbols import punctuation +from GPT_SoVITS.text.symbols import punctuation -from text.symbols2 import symbols +from GPT_SoVITS.text.symbols2 import symbols from builtins import str as unicode from text.en_normalization.expend import normalize diff --git a/GPT_SoVITS/text/g2pw/__init__.py b/GPT_SoVITS/text/g2pw/__init__.py index 5ab811d9..36923bdc 100644 --- a/GPT_SoVITS/text/g2pw/__init__.py +++ b/GPT_SoVITS/text/g2pw/__init__.py @@ -1 +1 @@ -from text.g2pw.g2pw import * +from GPT_SoVITS.text.g2pw.g2pw import * diff --git a/GPT_SoVITS/text/japanese.py b/GPT_SoVITS/text/japanese.py index a54d0cf0..b7d22eb0 100644 --- a/GPT_SoVITS/text/japanese.py +++ b/GPT_SoVITS/text/japanese.py @@ -77,8 +77,7 @@ except Exception: pass -from text.symbols import punctuation - +from GPT_SoVITS.text.symbols import punctuation # Regular expression matching Japanese without punctuation marks: _japanese_characters = re.compile( r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" diff --git a/GPT_SoVITS/text/korean.py b/GPT_SoVITS/text/korean.py index 254b05cf..1bf61b68 100644 --- a/GPT_SoVITS/text/korean.py +++ b/GPT_SoVITS/text/korean.py @@ -56,7 +56,7 @@ if os.name == "nt": G2p = win_G2p -from text.symbols2 import symbols +from GPT_SoVITS.text.symbols2 import symbols # This is a list of Korean classifiers preceded by pure Korean numerals. 
_korean_classifiers = ( diff --git a/GPT_SoVITS/text/zh_normalization/__init__.py b/GPT_SoVITS/text/zh_normalization/__init__.py index 46b367a6..ad201c81 100644 --- a/GPT_SoVITS/text/zh_normalization/__init__.py +++ b/GPT_SoVITS/text/zh_normalization/__init__.py @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from text.zh_normalization.text_normlization import * +from GPT_SoVITS.text.zh_normalization.text_normlization import * diff --git a/tools/AP_BWE_main/24kto48k/readme.txt b/GPT_SoVITS/tools/AP_BWE_main/24kto48k/readme.txt similarity index 100% rename from tools/AP_BWE_main/24kto48k/readme.txt rename to GPT_SoVITS/tools/AP_BWE_main/24kto48k/readme.txt diff --git a/tools/AP_BWE_main/LICENSE b/GPT_SoVITS/tools/AP_BWE_main/LICENSE similarity index 100% rename from tools/AP_BWE_main/LICENSE rename to GPT_SoVITS/tools/AP_BWE_main/LICENSE diff --git a/tools/AP_BWE_main/README.md b/GPT_SoVITS/tools/AP_BWE_main/README.md similarity index 100% rename from tools/AP_BWE_main/README.md rename to GPT_SoVITS/tools/AP_BWE_main/README.md diff --git a/tools/AP_BWE_main/datasets1/__init__.py b/GPT_SoVITS/tools/AP_BWE_main/datasets1/__init__.py similarity index 100% rename from tools/AP_BWE_main/datasets1/__init__.py rename to GPT_SoVITS/tools/AP_BWE_main/datasets1/__init__.py diff --git a/tools/AP_BWE_main/datasets1/dataset.py b/GPT_SoVITS/tools/AP_BWE_main/datasets1/dataset.py similarity index 100% rename from tools/AP_BWE_main/datasets1/dataset.py rename to GPT_SoVITS/tools/AP_BWE_main/datasets1/dataset.py diff --git a/tools/AP_BWE_main/models/__init__.py b/GPT_SoVITS/tools/AP_BWE_main/models/__init__.py similarity index 100% rename from tools/AP_BWE_main/models/__init__.py rename to GPT_SoVITS/tools/AP_BWE_main/models/__init__.py diff --git a/tools/AP_BWE_main/models/model.py b/GPT_SoVITS/tools/AP_BWE_main/models/model.py similarity index 100% rename from tools/AP_BWE_main/models/model.py rename to GPT_SoVITS/tools/AP_BWE_main/models/model.py diff --git a/tools/__init__.py b/GPT_SoVITS/tools/__init__.py similarity index 100% rename from tools/__init__.py rename to GPT_SoVITS/tools/__init__.py diff --git a/tools/asr/config.py b/GPT_SoVITS/tools/asr/config.py similarity index 100% rename from tools/asr/config.py rename to GPT_SoVITS/tools/asr/config.py diff --git a/tools/asr/fasterwhisper_asr.py b/GPT_SoVITS/tools/asr/fasterwhisper_asr.py similarity index 92% rename from tools/asr/fasterwhisper_asr.py rename to GPT_SoVITS/tools/asr/fasterwhisper_asr.py index e570f174..436097a4 100644 --- a/tools/asr/fasterwhisper_asr.py +++ b/GPT_SoVITS/tools/asr/fasterwhisper_asr.py @@ -9,7 +9,7 @@ import torch from faster_whisper import WhisperModel from tqdm import tqdm -from tools.asr.config import check_fw_local_models +from GPT_SoVITS.tools.asr.config import check_fw_local_models # fmt: off language_code_list = [ @@ -72,8 +72,13 @@ def execute_asr(input_folder, output_folder, model_size, language, precision): if info.language == "zh": print("检测为中文文本, 转 FunASR 处理") +<<<<<<< HEAD:tools/asr/fasterwhisper_asr.py if "only_asr" not in globals(): from tools.asr.funasr_asr import only_asr # 如果用英文就不需要导入下载模型 +======= + if("only_asr" not in globals()): + from GPT_SoVITS.tools.asr.funasr_asr import only_asr #如果用英文就不需要导入下载模型 +>>>>>>> main:GPT_SoVITS/tools/asr/fasterwhisper_asr.py text = only_asr(file_path, language=info.language.lower()) if text == "": diff --git 
a/tools/asr/funasr_asr.py b/GPT_SoVITS/tools/asr/funasr_asr.py similarity index 100% rename from tools/asr/funasr_asr.py rename to GPT_SoVITS/tools/asr/funasr_asr.py diff --git a/tools/asr/models/.gitignore b/GPT_SoVITS/tools/asr/models/.gitignore similarity index 100% rename from tools/asr/models/.gitignore rename to GPT_SoVITS/tools/asr/models/.gitignore diff --git a/tools/audio_sr.py b/GPT_SoVITS/tools/audio_sr.py similarity index 91% rename from tools/audio_sr.py rename to GPT_SoVITS/tools/audio_sr.py index 58df6d20..9485a6ae 100644 --- a/tools/audio_sr.py +++ b/GPT_SoVITS/tools/audio_sr.py @@ -9,8 +9,8 @@ import torch import torchaudio.functional as aF # from attrdict import AttrDict####will be bug in py3.10 -from datasets1.dataset import amp_pha_stft, amp_pha_istft -from models.model import APNet_BWE_Model +from GPT_SoVITS.tools.AP_BWE_main.datasets1.dataset import amp_pha_stft, amp_pha_istft +from GPT_SoVITS.tools.AP_BWE_main.models.model import APNet_BWE_Model class AP_BWE: diff --git a/tools/cmd-denoise.py b/GPT_SoVITS/tools/cmd-denoise.py similarity index 100% rename from tools/cmd-denoise.py rename to GPT_SoVITS/tools/cmd-denoise.py diff --git a/tools/denoise-model/.gitignore b/GPT_SoVITS/tools/denoise-model/.gitignore similarity index 100% rename from tools/denoise-model/.gitignore rename to GPT_SoVITS/tools/denoise-model/.gitignore diff --git a/tools/i18n/i18n.py b/GPT_SoVITS/tools/i18n/i18n.py similarity index 100% rename from tools/i18n/i18n.py rename to GPT_SoVITS/tools/i18n/i18n.py diff --git a/tools/i18n/locale/en_US.json b/GPT_SoVITS/tools/i18n/locale/en_US.json similarity index 100% rename from tools/i18n/locale/en_US.json rename to GPT_SoVITS/tools/i18n/locale/en_US.json diff --git a/tools/i18n/locale/es_ES.json b/GPT_SoVITS/tools/i18n/locale/es_ES.json similarity index 100% rename from tools/i18n/locale/es_ES.json rename to GPT_SoVITS/tools/i18n/locale/es_ES.json diff --git a/tools/i18n/locale/fr_FR.json b/GPT_SoVITS/tools/i18n/locale/fr_FR.json similarity index 100% rename from tools/i18n/locale/fr_FR.json rename to GPT_SoVITS/tools/i18n/locale/fr_FR.json diff --git a/tools/i18n/locale/it_IT.json b/GPT_SoVITS/tools/i18n/locale/it_IT.json similarity index 100% rename from tools/i18n/locale/it_IT.json rename to GPT_SoVITS/tools/i18n/locale/it_IT.json diff --git a/tools/i18n/locale/ja_JP.json b/GPT_SoVITS/tools/i18n/locale/ja_JP.json similarity index 100% rename from tools/i18n/locale/ja_JP.json rename to GPT_SoVITS/tools/i18n/locale/ja_JP.json diff --git a/tools/i18n/locale/ko_KR.json b/GPT_SoVITS/tools/i18n/locale/ko_KR.json similarity index 100% rename from tools/i18n/locale/ko_KR.json rename to GPT_SoVITS/tools/i18n/locale/ko_KR.json diff --git a/tools/i18n/locale/pt_BR.json b/GPT_SoVITS/tools/i18n/locale/pt_BR.json similarity index 95% rename from tools/i18n/locale/pt_BR.json rename to GPT_SoVITS/tools/i18n/locale/pt_BR.json index 06fb1d60..fe1eeb58 100644 --- a/tools/i18n/locale/pt_BR.json +++ b/GPT_SoVITS/tools/i18n/locale/pt_BR.json @@ -101,6 +101,7 @@ "实际输入的目标文本(每句):": "Texto alvo realmente inserido (por frase):", "实际输入的目标文本:": "Texto alvo realmente inserido:", "导出文件格式": "Formato de arquivo de exportação", +<<<<<<< HEAD:tools/i18n/locale/pt_BR.json "已关闭": " Fechado", "已完成": " Concluído", "已开启": " Ativado", @@ -110,6 +111,21 @@ "开启": "Ativar ", "开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. 
Não preencher o texto de referência também equivale a ativar.", "微调训练": "Treinamento de ajuste fino", +======= + "开启GPT训练": "Ativar treinamento GPT", + "开启SSL提取": "Ativar extração SSL", + "开启SoVITS训练": "Ativar treinamento SoVITS", + "开启TTS推理WebUI": "Abrir TTS Inference WebUI", + "开启UVR5-WebUI": "Abrir UVR5-WebUI", + "开启一键三连": "Ativar um clique", + "开启打标WebUI": "Abrir Labeling WebUI", + "开启文本获取": "Ativar obtenção de texto", + "开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. Não preencher o texto de referência também equivale a ativGPT_SoVITS.AR.", + "开启离线批量ASR": "Ativar ASR offline em lote", + "开启语义token提取": "Ativar extração de token semântico", + "开启语音切割": "Ativar corte de voz", + "开启语音降噪": "Ativar redução de ruído de voz", +>>>>>>> main:GPT_SoVITS/tools/i18n/locale/pt_BR.json "怎么切": "Como cortar", "总训练轮数total_epoch": "Total de epoch de treinamento", "总训练轮数total_epoch,不建议太高": "Total de epoch de treinamento, não é recomendável um valor muito alto", diff --git a/tools/i18n/locale/ru_RU.json b/GPT_SoVITS/tools/i18n/locale/ru_RU.json similarity index 100% rename from tools/i18n/locale/ru_RU.json rename to GPT_SoVITS/tools/i18n/locale/ru_RU.json diff --git a/tools/i18n/locale/tr_TR.json b/GPT_SoVITS/tools/i18n/locale/tr_TR.json similarity index 100% rename from tools/i18n/locale/tr_TR.json rename to GPT_SoVITS/tools/i18n/locale/tr_TR.json diff --git a/tools/i18n/locale/zh_CN.json b/GPT_SoVITS/tools/i18n/locale/zh_CN.json similarity index 100% rename from tools/i18n/locale/zh_CN.json rename to GPT_SoVITS/tools/i18n/locale/zh_CN.json diff --git a/tools/i18n/locale/zh_HK.json b/GPT_SoVITS/tools/i18n/locale/zh_HK.json similarity index 100% rename from tools/i18n/locale/zh_HK.json rename to GPT_SoVITS/tools/i18n/locale/zh_HK.json diff --git a/tools/i18n/locale/zh_SG.json b/GPT_SoVITS/tools/i18n/locale/zh_SG.json similarity index 100% rename from tools/i18n/locale/zh_SG.json rename to GPT_SoVITS/tools/i18n/locale/zh_SG.json diff --git a/tools/i18n/locale/zh_TW.json b/GPT_SoVITS/tools/i18n/locale/zh_TW.json similarity index 100% rename from tools/i18n/locale/zh_TW.json rename to GPT_SoVITS/tools/i18n/locale/zh_TW.json diff --git a/tools/i18n/scan_i18n.py b/GPT_SoVITS/tools/i18n/scan_i18n.py similarity index 100% rename from tools/i18n/scan_i18n.py rename to GPT_SoVITS/tools/i18n/scan_i18n.py diff --git a/tools/my_utils.py b/GPT_SoVITS/tools/my_utils.py similarity index 96% rename from tools/my_utils.py rename to GPT_SoVITS/tools/my_utils.py index 44d326e1..f502012a 100644 --- a/tools/my_utils.py +++ b/GPT_SoVITS/tools/my_utils.py @@ -3,7 +3,7 @@ import traceback import ffmpeg import numpy as np import gradio as gr -from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.tools.i18n.i18n import I18nAuto import pandas as pd i18n = I18nAuto(language=os.environ.get("language", "Auto")) diff --git a/tools/slice_audio.py b/GPT_SoVITS/tools/slice_audio.py similarity index 100% rename from tools/slice_audio.py rename to GPT_SoVITS/tools/slice_audio.py diff --git a/tools/slicer2.py b/GPT_SoVITS/tools/slicer2.py similarity index 97% rename from tools/slicer2.py rename to GPT_SoVITS/tools/slicer2.py index 8d80f1b4..d84bc84f 100644 --- a/tools/slicer2.py +++ b/GPT_SoVITS/tools/slicer2.py @@ -1,230 +1,230 @@ -import numpy as np - - -# This function is obtained from librosa. 
-def get_rms( - y, - frame_length=2048, - hop_length=512, - pad_mode="constant", -): - padding = (int(frame_length // 2), int(frame_length // 2)) - y = np.pad(y, padding, mode=pad_mode) - - axis = -1 - # put our new within-frame axis at the end for now - out_strides = y.strides + tuple([y.strides[axis]]) - # Reduce the shape on the framing axis - x_shape_trimmed = list(y.shape) - x_shape_trimmed[axis] -= frame_length - 1 - out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) - xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) - if axis < 0: - target_axis = axis - 1 - else: - target_axis = axis + 1 - xw = np.moveaxis(xw, -1, target_axis) - # Downsample along the target axis - slices = [slice(None)] * xw.ndim - slices[axis] = slice(0, None, hop_length) - x = xw[tuple(slices)] - - # Calculate power - power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) - - return np.sqrt(power) - - -class Slicer: - def __init__( - self, - sr: int, - threshold: float = -40.0, - min_length: int = 5000, - min_interval: int = 300, - hop_size: int = 20, - max_sil_kept: int = 5000, - ): - if not min_length >= min_interval >= hop_size: - raise ValueError("The following condition must be satisfied: min_length >= min_interval >= hop_size") - if not max_sil_kept >= hop_size: - raise ValueError("The following condition must be satisfied: max_sil_kept >= hop_size") - min_interval = sr * min_interval / 1000 - self.threshold = 10 ** (threshold / 20.0) - self.hop_size = round(sr * hop_size / 1000) - self.win_size = min(round(min_interval), 4 * self.hop_size) - self.min_length = round(sr * min_length / 1000 / self.hop_size) - self.min_interval = round(min_interval / self.hop_size) - self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) - - def _apply_slice(self, waveform, begin, end): - if len(waveform.shape) > 1: - return waveform[:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)] - else: - return waveform[begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)] - - # @timeit - def slice(self, waveform): - if len(waveform.shape) > 1: - samples = waveform.mean(axis=0) - else: - samples = waveform - if samples.shape[0] <= self.min_length: - return [waveform] - rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0) - sil_tags = [] - silence_start = None - clip_start = 0 - for i, rms in enumerate(rms_list): - # Keep looping while frame is silent. - if rms < self.threshold: - # Record start of silent frames. - if silence_start is None: - silence_start = i - continue - # Keep looping while frame is not silent and silence start has not been recorded. - if silence_start is None: - continue - # Clear recorded silence start if interval is not enough or clip is too short - is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length - if not is_leading_silence and not need_slice_middle: - silence_start = None - continue - # Need slicing. Record the range of silent frames to be removed. 
- if i - silence_start <= self.max_sil_kept: - pos = rms_list[silence_start : i + 1].argmin() + silence_start - if silence_start == 0: - sil_tags.append((0, pos)) - else: - sil_tags.append((pos, pos)) - clip_start = pos - elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin() - pos += i - self.max_sil_kept - pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start - pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept - if silence_start == 0: - sil_tags.append((0, pos_r)) - clip_start = pos_r - else: - sil_tags.append((min(pos_l, pos), max(pos_r, pos))) - clip_start = max(pos_r, pos) - else: - pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start - pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept - if silence_start == 0: - sil_tags.append((0, pos_r)) - else: - sil_tags.append((pos_l, pos_r)) - clip_start = pos_r - silence_start = None - # Deal with trailing silence. - total_frames = rms_list.shape[0] - if silence_start is not None and total_frames - silence_start >= self.min_interval: - silence_end = min(total_frames, silence_start + self.max_sil_kept) - pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start - sil_tags.append((pos, total_frames + 1)) - # Apply and return slices. - ####音频+起始时间+终止时间 - if len(sil_tags) == 0: - return [[waveform, 0, int(total_frames * self.hop_size)]] - else: - chunks = [] - if sil_tags[0][0] > 0: - chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)]) - for i in range(len(sil_tags) - 1): - chunks.append( - [ - self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), - int(sil_tags[i][1] * self.hop_size), - int(sil_tags[i + 1][0] * self.hop_size), - ] - ) - if sil_tags[-1][1] < total_frames: - chunks.append( - [ - self._apply_slice(waveform, sil_tags[-1][1], total_frames), - int(sil_tags[-1][1] * self.hop_size), - int(total_frames * self.hop_size), - ] - ) - return chunks - - -def main(): - import os.path - from argparse import ArgumentParser - - import librosa - import soundfile - - parser = ArgumentParser() - parser.add_argument("audio", type=str, help="The audio to be sliced") - parser.add_argument("--out", type=str, help="Output directory of the sliced audio clips") - parser.add_argument( - "--db_thresh", - type=float, - required=False, - default=-40, - help="The dB threshold for silence detection", - ) - parser.add_argument( - "--min_length", - type=int, - required=False, - default=5000, - help="The minimum milliseconds required for each sliced audio clip", - ) - parser.add_argument( - "--min_interval", - type=int, - required=False, - default=300, - help="The minimum milliseconds for a silence part to be sliced", - ) - parser.add_argument( - "--hop_size", - type=int, - required=False, - default=10, - help="Frame length in milliseconds", - ) - parser.add_argument( - "--max_sil_kept", - type=int, - required=False, - default=500, - help="The maximum silence length kept around the sliced clip, presented in milliseconds", - ) - args = parser.parse_args() - out = args.out - if out is None: - out = os.path.dirname(os.path.abspath(args.audio)) - audio, sr = librosa.load(args.audio, sr=None, mono=False) - slicer = Slicer( - sr=sr, - threshold=args.db_thresh, - min_length=args.min_length, - min_interval=args.min_interval, - hop_size=args.hop_size, - 
max_sil_kept=args.max_sil_kept, - ) - chunks = slicer.slice(audio) - if not os.path.exists(out): - os.makedirs(out) - for i, chunk in enumerate(chunks): - if len(chunk.shape) > 1: - chunk = chunk.T - soundfile.write( - os.path.join( - out, - "%s_%d.wav" % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), - ), - chunk, - sr, - ) - - -if __name__ == "__main__": - main() +import numpy as np + + +# This function is obtained from librosa. +def get_rms( + y, + frame_length=2048, + hop_length=512, + pad_mode="constant", +): + padding = (int(frame_length // 2), int(frame_length // 2)) + y = np.pad(y, padding, mode=pad_mode) + + axis = -1 + # put our new within-frame axis at the end for now + out_strides = y.strides + tuple([y.strides[axis]]) + # Reduce the shape on the framing axis + x_shape_trimmed = list(y.shape) + x_shape_trimmed[axis] -= frame_length - 1 + out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) + xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) + if axis < 0: + target_axis = axis - 1 + else: + target_axis = axis + 1 + xw = np.moveaxis(xw, -1, target_axis) + # Downsample along the target axis + slices = [slice(None)] * xw.ndim + slices[axis] = slice(0, None, hop_length) + x = xw[tuple(slices)] + + # Calculate power + power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) + + return np.sqrt(power) + + +class Slicer: + def __init__( + self, + sr: int, + threshold: float = -40.0, + min_length: int = 5000, + min_interval: int = 300, + hop_size: int = 20, + max_sil_kept: int = 5000, + ): + if not min_length >= min_interval >= hop_size: + raise ValueError("The following condition must be satisfied: min_length >= min_interval >= hop_size") + if not max_sil_kept >= hop_size: + raise ValueError("The following condition must be satisfied: max_sil_kept >= hop_size") + min_interval = sr * min_interval / 1000 + self.threshold = 10 ** (threshold / 20.0) + self.hop_size = round(sr * hop_size / 1000) + self.win_size = min(round(min_interval), 4 * self.hop_size) + self.min_length = round(sr * min_length / 1000 / self.hop_size) + self.min_interval = round(min_interval / self.hop_size) + self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) + + def _apply_slice(self, waveform, begin, end): + if len(waveform.shape) > 1: + return waveform[:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)] + else: + return waveform[begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)] + + # @timeit + def slice(self, waveform): + if len(waveform.shape) > 1: + samples = waveform.mean(axis=0) + else: + samples = waveform + if samples.shape[0] <= self.min_length: + return [waveform] + rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0) + sil_tags = [] + silence_start = None + clip_start = 0 + for i, rms in enumerate(rms_list): + # Keep looping while frame is silent. + if rms < self.threshold: + # Record start of silent frames. + if silence_start is None: + silence_start = i + continue + # Keep looping while frame is not silent and silence start has not been recorded. + if silence_start is None: + continue + # Clear recorded silence start if interval is not enough or clip is too short + is_leading_silence = silence_start == 0 and i > self.max_sil_kept + need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length + if not is_leading_silence and not need_slice_middle: + silence_start = None + continue + # Need slicing. 
Record the range of silent frames to be removed. + if i - silence_start <= self.max_sil_kept: + pos = rms_list[silence_start : i + 1].argmin() + silence_start + if silence_start == 0: + sil_tags.append((0, pos)) + else: + sil_tags.append((pos, pos)) + clip_start = pos + elif i - silence_start <= self.max_sil_kept * 2: + pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin() + pos += i - self.max_sil_kept + pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept + if silence_start == 0: + sil_tags.append((0, pos_r)) + clip_start = pos_r + else: + sil_tags.append((min(pos_l, pos), max(pos_r, pos))) + clip_start = max(pos_r, pos) + else: + pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept + if silence_start == 0: + sil_tags.append((0, pos_r)) + else: + sil_tags.append((pos_l, pos_r)) + clip_start = pos_r + silence_start = None + # Deal with trailing silence. + total_frames = rms_list.shape[0] + if silence_start is not None and total_frames - silence_start >= self.min_interval: + silence_end = min(total_frames, silence_start + self.max_sil_kept) + pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start + sil_tags.append((pos, total_frames + 1)) + # Apply and return slices. + ####音频+起始时间+终止时间 + if len(sil_tags) == 0: + return [[waveform, 0, int(total_frames * self.hop_size)]] + else: + chunks = [] + if sil_tags[0][0] > 0: + chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)]) + for i in range(len(sil_tags) - 1): + chunks.append( + [ + self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), + int(sil_tags[i][1] * self.hop_size), + int(sil_tags[i + 1][0] * self.hop_size), + ] + ) + if sil_tags[-1][1] < total_frames: + chunks.append( + [ + self._apply_slice(waveform, sil_tags[-1][1], total_frames), + int(sil_tags[-1][1] * self.hop_size), + int(total_frames * self.hop_size), + ] + ) + return chunks + + +def main(): + import os.path + from argparse import ArgumentParser + + import librosa + import soundfile + + parser = ArgumentParser() + parser.add_argument("audio", type=str, help="The audio to be sliced") + parser.add_argument("--out", type=str, help="Output directory of the sliced audio clips") + parser.add_argument( + "--db_thresh", + type=float, + required=False, + default=-40, + help="The dB threshold for silence detection", + ) + parser.add_argument( + "--min_length", + type=int, + required=False, + default=5000, + help="The minimum milliseconds required for each sliced audio clip", + ) + parser.add_argument( + "--min_interval", + type=int, + required=False, + default=300, + help="The minimum milliseconds for a silence part to be sliced", + ) + parser.add_argument( + "--hop_size", + type=int, + required=False, + default=10, + help="Frame length in milliseconds", + ) + parser.add_argument( + "--max_sil_kept", + type=int, + required=False, + default=500, + help="The maximum silence length kept around the sliced clip, presented in milliseconds", + ) + args = parser.parse_args() + out = args.out + if out is None: + out = os.path.dirname(os.path.abspath(args.audio)) + audio, sr = librosa.load(args.audio, sr=None, mono=False) + slicer = Slicer( + sr=sr, + threshold=args.db_thresh, + min_length=args.min_length, + 
min_interval=args.min_interval, + hop_size=args.hop_size, + max_sil_kept=args.max_sil_kept, + ) + chunks = slicer.slice(audio) + if not os.path.exists(out): + os.makedirs(out) + for i, chunk in enumerate(chunks): + if len(chunk.shape) > 1: + chunk = chunk.T + soundfile.write( + os.path.join( + out, + "%s_%d.wav" % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + ), + chunk, + sr, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/subfix_webui.py b/GPT_SoVITS/tools/subfix_webui.py similarity index 100% rename from tools/subfix_webui.py rename to GPT_SoVITS/tools/subfix_webui.py diff --git a/tools/uvr5/bs_roformer/__init__.py b/GPT_SoVITS/tools/uvr5/bs_roformer/__init__.py similarity index 100% rename from tools/uvr5/bs_roformer/__init__.py rename to GPT_SoVITS/tools/uvr5/bs_roformer/__init__.py diff --git a/tools/uvr5/bs_roformer/attend.py b/GPT_SoVITS/tools/uvr5/bs_roformer/attend.py similarity index 100% rename from tools/uvr5/bs_roformer/attend.py rename to GPT_SoVITS/tools/uvr5/bs_roformer/attend.py diff --git a/tools/uvr5/bs_roformer/bs_roformer.py b/GPT_SoVITS/tools/uvr5/bs_roformer/bs_roformer.py similarity index 100% rename from tools/uvr5/bs_roformer/bs_roformer.py rename to GPT_SoVITS/tools/uvr5/bs_roformer/bs_roformer.py diff --git a/tools/uvr5/bs_roformer/mel_band_roformer.py b/GPT_SoVITS/tools/uvr5/bs_roformer/mel_band_roformer.py similarity index 100% rename from tools/uvr5/bs_roformer/mel_band_roformer.py rename to GPT_SoVITS/tools/uvr5/bs_roformer/mel_band_roformer.py diff --git a/tools/uvr5/bsroformer.py b/GPT_SoVITS/tools/uvr5/bsroformer.py similarity index 98% rename from tools/uvr5/bsroformer.py rename to GPT_SoVITS/tools/uvr5/bsroformer.py index ddcbfa74..0fbcdb4a 100644 --- a/tools/uvr5/bsroformer.py +++ b/GPT_SoVITS/tools/uvr5/bsroformer.py @@ -1,6 +1,7 @@ # This code is modified from https://github.com/ZFTurbo/ import os import warnings +import subprocess import librosa import numpy as np @@ -160,7 +161,7 @@ class Roformer_Loader: batch_data.append(part) batch_locations.append((i, length)) i += step - progress_bar.update(1) + progress_bGPT_SoVITS.AR.update(1) if len(batch_data) >= batch_size or (i >= mix.shape[1]): arr = torch.stack(batch_data, dim=0) @@ -189,7 +190,7 @@ class Roformer_Loader: # Remove pad estimated_sources = estimated_sources[..., border:-border] - progress_bar.close() + progress_bGPT_SoVITS.AR.close() if self.config["training"]["target_instrument"] is None: return {k: v for k, v in zip(self.config["training"]["instruments"], estimated_sources)} @@ -253,7 +254,10 @@ class Roformer_Loader: sf.write(path, data, sr) else: sf.write(path, data, sr) - os.system('ffmpeg -i "{}" -vn "{}" -q:a 2 -y'.format(path, path[:-3] + format)) + subprocess.run( + ["ffmpeg", "-i", path, "-vn", path[:-3] + format, "-q:a", "2", "-y"], + check=True, + ) try: os.remove(path) except: diff --git a/tools/uvr5/lib/lib_v5/dataset.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/dataset.py similarity index 100% rename from tools/uvr5/lib/lib_v5/dataset.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/dataset.py diff --git a/tools/uvr5/lib/lib_v5/layers.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers.py diff --git a/tools/uvr5/lib/lib_v5/layers_123812KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_123812KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_123812KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_123812KB.py diff 
--git a/tools/uvr5/lib/lib_v5/layers_123821KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_123821KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_123821KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_123821KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_33966KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_33966KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_33966KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_33966KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_537227KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_537227KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_537227KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_537227KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_537238KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_537238KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_537238KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_537238KB.py diff --git a/tools/uvr5/lib/lib_v5/layers_new.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_new.py similarity index 100% rename from tools/uvr5/lib/lib_v5/layers_new.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/layers_new.py diff --git a/tools/uvr5/lib/lib_v5/model_param_init.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/model_param_init.py similarity index 100% rename from tools/uvr5/lib/lib_v5/model_param_init.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/model_param_init.py diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json similarity index 100% rename from 
tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/2band_32000.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/2band_48000.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/3band_44100.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json 
b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_v2.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json similarity index 94% rename from tools/uvr5/lib/lib_v5/modelparams/4band_v3.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json index 2a73bc97..edb908b8 100644 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json +++ b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json @@ -1,54 +1,54 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 530, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 530, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 } \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/ensemble.json b/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/ensemble.json similarity index 100% rename from tools/uvr5/lib/lib_v5/modelparams/ensemble.json rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/ensemble.json diff --git a/tools/uvr5/lib/lib_v5/nets.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets.py diff --git a/tools/uvr5/lib/lib_v5/nets_123812KB.py 
b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_123812KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_123812KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_123812KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_123821KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_123821KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_123821KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_123821KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_33966KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_33966KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_33966KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_33966KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_537227KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_537227KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_537227KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_537227KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_537238KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_537238KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_537238KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_537238KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_61968KB.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_61968KB.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_61968KB.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_61968KB.py diff --git a/tools/uvr5/lib/lib_v5/nets_new.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_new.py similarity index 100% rename from tools/uvr5/lib/lib_v5/nets_new.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/nets_new.py diff --git a/tools/uvr5/lib/lib_v5/spec_utils.py b/GPT_SoVITS/tools/uvr5/lib/lib_v5/spec_utils.py similarity index 99% rename from tools/uvr5/lib/lib_v5/spec_utils.py rename to GPT_SoVITS/tools/uvr5/lib/lib_v5/spec_utils.py index da072e4b..d6cd1bf6 100644 --- a/tools/uvr5/lib/lib_v5/spec_utils.py +++ b/GPT_SoVITS/tools/uvr5/lib/lib_v5/spec_utils.py @@ -2,6 +2,8 @@ import hashlib import json import math import os +import subprocess +import sys import librosa import numpy as np @@ -671,6 +673,9 @@ if __name__ == "__main__": ] for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): - os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") + subprocess.run( + [sys.executable, "lib/align_tracks.py", e['file1'].strip('"'), e['file2'].strip('"')], + check=True, + ) # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) diff --git a/tools/uvr5/lib/name_params.json b/GPT_SoVITS/tools/uvr5/lib/name_params.json similarity index 100% rename from tools/uvr5/lib/name_params.json rename to GPT_SoVITS/tools/uvr5/lib/name_params.json diff --git a/tools/uvr5/lib/utils.py b/GPT_SoVITS/tools/uvr5/lib/utils.py similarity index 100% rename from tools/uvr5/lib/utils.py rename to GPT_SoVITS/tools/uvr5/lib/utils.py diff --git a/tools/uvr5/mdxnet.py b/GPT_SoVITS/tools/uvr5/mdxnet.py similarity index 91% rename from tools/uvr5/mdxnet.py rename to GPT_SoVITS/tools/uvr5/mdxnet.py index e1098273..dcbe6474 100644 --- a/tools/uvr5/mdxnet.py +++ b/GPT_SoVITS/tools/uvr5/mdxnet.py @@ -1,5 +1,6 @@ import os import logging +import subprocess logger = logging.getLogger(__name__) @@ -125,7 +126,7 @@ class Predictor: def demix_base(self, mixes, margin_size): chunked_sources = [] progress_bar = tqdm(total=len(mixes)) - progress_bar.set_description("Processing") + progress_bGPT_SoVITS.AR.set_description("Processing") for mix in mixes: cmix = mixes[mix] sources = [] @@ -161,12 +162,12 @@ class Predictor: end = None 
sources.append(tar_signal[:, start:end]) - progress_bar.update(1) + progress_bGPT_SoVITS.AR.update(1) chunked_sources.append(sources) _sources = np.concatenate(chunked_sources, axis=-1) # del self.model - progress_bar.close() + progress_bGPT_SoVITS.AR.close() return _sources def prediction(self, m, vocal_root, others_root, format): @@ -190,19 +191,15 @@ class Predictor: opt_path_vocal = path_vocal[:-4] + ".%s" % format opt_path_other = path_other[:-4] + ".%s" % format if os.path.exists(path_vocal): - os.system("ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal)) - if os.path.exists(opt_path_vocal): - try: - os.remove(path_vocal) - except: - pass + subprocess.run( + ["ffmpeg", "-i", path_vocal, "-vn", opt_path_vocal, "-q:a", "2", "-y"], + check=True, + ) if os.path.exists(path_other): - os.system("ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other)) - if os.path.exists(opt_path_other): - try: - os.remove(path_other) - except: - pass + subprocess.run( + ["ffmpeg", "-i", path_other, "-vn", opt_path_other, "-q:a", "2", "-y"], + check=True, + ) class MDXNetDereverb: diff --git a/tools/uvr5/uvr5_weights/.gitignore b/GPT_SoVITS/tools/uvr5/uvr5_weights/.gitignore similarity index 100% rename from tools/uvr5/uvr5_weights/.gitignore rename to GPT_SoVITS/tools/uvr5/uvr5_weights/.gitignore diff --git a/tools/uvr5/vr.py b/GPT_SoVITS/tools/uvr5/vr.py similarity index 94% rename from tools/uvr5/vr.py rename to GPT_SoVITS/tools/uvr5/vr.py index 4ca8a3b7..71bed80b 100644 --- a/tools/uvr5/vr.py +++ b/GPT_SoVITS/tools/uvr5/vr.py @@ -1,4 +1,5 @@ import os +import subprocess parent_directory = os.path.dirname(os.path.abspath(__file__)) import logging @@ -140,7 +141,10 @@ class AudioPre: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + subprocess.run( + ["ffmpeg", "-i", path, "-vn", opt_format_path, "-q:a", "2", "-y"], + check=True, + ) if os.path.exists(opt_format_path): try: os.remove(path) @@ -175,7 +179,10 @@ class AudioPre: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + subprocess.run( + ["ffmpeg", "-i", path, "-vn", opt_format_path, "-q:a", "2", "-y"], + check=True, + ) if os.path.exists(opt_format_path): try: os.remove(path) @@ -303,7 +310,10 @@ class AudioPreDeEcho: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + subprocess.run( + ["ffmpeg", "-i", path, "-vn", opt_format_path, "-q:a", "2", "-y"], + check=True, + ) if os.path.exists(opt_format_path): try: os.remove(path) @@ -334,7 +344,10 @@ class AudioPreDeEcho: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) + subprocess.run( + ["ffmpeg", "-i", path, "-vn", opt_format_path, "-q:a", "2", "-y"], + check=True, + ) if os.path.exists(opt_format_path): try: os.remove(path) diff --git a/tools/uvr5/webui.py b/GPT_SoVITS/tools/uvr5/webui.py similarity index 96% rename from tools/uvr5/webui.py rename to GPT_SoVITS/tools/uvr5/webui.py index a3d7fe46..ec1895f7 100644 --- a/tools/uvr5/webui.py +++ b/GPT_SoVITS/tools/uvr5/webui.py @@ -2,9 +2,8 @@ import os import traceback import gradio as gr import logging -from tools.i18n.i18n import I18nAuto -from tools.my_utils import clean_path - +from GPT_SoVITS.tools.i18n.i18n import I18nAuto +from 
GPT_SoVITS.tools.my_utils import clean_path i18n = I18nAuto() logger = logging.getLogger(__name__) @@ -14,6 +13,7 @@ import sys from mdxnet import MDXNetDereverb from vr import AudioPre, AudioPreDeEcho from bsroformer import Roformer_Loader +import subprocess try: import gradio.analytics as analytics @@ -100,7 +100,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format os.path.join(os.environ["TEMP"]), os.path.basename(inp_path), ) - os.system(f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y') + subprocess.run( + ["ffmpeg", "-i", inp_path, "-vn", "-acodec", "pcm_s16le", "-ac", "2", "-ar", "44100", tmp_path, "-y"], + check=True, + ) inp_path = tmp_path try: if done == 0: diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..084bdb35 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include GPT_SoVITS/configs/* \ No newline at end of file diff --git a/README.md b/README.md index 463649ac..952efc9d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -
+# Jarod's NOTE +Working on turning this into a package. Right now, the API *does in fact* accept requests, and the project can be installed as a package.
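For reference, a minimal, hypothetical sketch of calling the packaged API once the server from `api.py` is running on its default port 9880. The endpoint path and field names below are assumptions, not confirmed by this patch; check `api.py`, `test.py`, and `test_streaming.py` for the actual parameters.

```python
# Hypothetical request to the local GPT-SoVITS API server (see api.py / test.py).
# The endpoint and JSON field names are assumptions, not confirmed by this patch.
import requests

payload = {
    "refer_wav_path": "ref_audio.wav",              # reference audio path (assumed field name)
    "prompt_text": "transcript of the reference audio",
    "prompt_language": "en",
    "text": "Hello, this is a quick synthesis test.",
    "text_language": "en",
}

resp = requests.post("http://127.0.0.1:9880/", json=payload, timeout=300)
resp.raise_for_status()

with open("output.wav", "wb") as f:
    f.write(resp.content)  # the server is expected to return raw audio bytes
```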

GPT-SoVITS-WebUI

A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

@@ -142,6 +143,7 @@ As above, modify the corresponding parameters based on your actual situation, th ``` docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx ``` +Which will bootup local server that you can make requests to. Checkout `test.py` and `test_streaming.py` to get an idea for how you might be able to use the API. ## Pretrained Models @@ -163,6 +165,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker 5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have the similar effect with smaller disk footprint. +<<<<<<< HEAD ## Dataset Format The TTS annotation .list file format: @@ -359,10 +362,13 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p A custom list save path is enabled +======= +>>>>>>> main ## Credits -Special thanks to the following projects and contributors: +Special thanks to the RVC-Boss for getting this wonderful tool up and going, as well as all of the other attributions used to build it: +<<<<<<< HEAD ### Theoretical Research - [ar-vits](https://github.com/innnky/ar-vits) @@ -407,3 +413,6 @@ Thankful to @Naozumi520 for providing the Cantonese training set and for the gui +======= +**Original Repo:** https://github.com/RVC-Boss/GPT-SoVITS +>>>>>>> main diff --git a/api.py b/api.py index c1c917a0..73edbdc3 100644 --- a/api.py +++ b/api.py @@ -163,12 +163,12 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer import numpy as np from feature_extractor import cnhubert from io import BytesIO -from module.models import SynthesizerTrn, SynthesizerTrnV3 +from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3 from peft import LoraConfig, get_peft_model -from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from text import cleaned_text_to_sequence -from text.cleaner import clean_text -from module.mel_processing import spectrogram_torch +from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule +from GPT_SoVITS.text import cleaned_text_to_sequence +from GPT_SoVITS.text.cleaner import clean_text +from GPT_SoVITS.module.mel_processing import spectrogram_torch import config as global_config import logging import subprocess @@ -438,7 +438,7 @@ def get_bert_inf(phones, word2ph, norm_text, language): return bert -from text import chinese +from GPT_SoVITS.text import chinese def get_phones_and_bert(text, language, version, final=False): diff --git a/docker-compose.yaml b/docker-compose.yaml deleted file mode 100644 index aca8ab9e..00000000 --- a/docker-compose.yaml +++ /dev/null @@ -1,32 +0,0 @@ -version: '3.8' - -services: - gpt-sovits: - image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container. 
- container_name: gpt-sovits-container - environment: - - is_half=False - - is_share=False - volumes: - - ./output:/workspace/output - - ./logs:/workspace/logs - - ./SoVITS_weights:/workspace/SoVITS_weights - - ./reference:/workspace/reference - working_dir: /workspace - ports: - - "9880:9880" - - "9871:9871" - - "9872:9872" - - "9873:9873" - - "9874:9874" - shm_size: 16G - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: "all" - capabilities: [gpu] - stdin_open: true - tty: true - restart: unless-stopped diff --git a/dockerbuild.sh b/dockerbuild.sh deleted file mode 100755 index 3a4a1e18..00000000 --- a/dockerbuild.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# 获取当前日期,格式为 YYYYMMDD -DATE=$(date +%Y%m%d) -# 获取最新的 Git commit 哈希值的前 7 位 -COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7) - -# 构建 full 版本的镜像 -docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest . -# 为同一个镜像添加带日期的标签 -docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE -# 为同一个镜像添加带当前代码库Commit哈希值的标签 -docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH - - -# 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器) -docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite . -# 为同一个镜像添加带日期的标签 -docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite -# 为同一个镜像添加带当前代码库Commit哈希值的标签 -docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite diff --git a/go-webui.bat b/go-webui.bat index a2dfff6c..2ca0a440 100644 --- a/go-webui.bat +++ b/go-webui.bat @@ -1,2 +1,2 @@ -runtime\python.exe -I webui.py zh_CN +runtime\python.exe -I webui.py en_US pause diff --git a/infer_script.py b/infer_script.py new file mode 100644 index 00000000..4bb38f8c --- /dev/null +++ b/infer_script.py @@ -0,0 +1,272 @@ +''' +This is just an example inference script to test batching with llama, mainly for my reference in the future. +''' + +import os +import sys +import numpy as np +import soundfile as sf +import threading +import queue +import sounddevice as sd +import time +import speech_recognition as sr + +# Ensure that GPT_SoVITS is in the Python path +now_dir = os.getcwd() +sys.path.append(now_dir) +sys.path.append(os.path.join(now_dir, 'GPT_SoVITS')) +os.environ['CUDA_LAUNCH_BLOCKING'] = '1' + + +from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config + +from llama_cpp import Llama +import sys + +# Initialize the Llama model +llm = Llama( + model_path="ggml-model-q8_0.gguf", + n_gpu_layers=-1, # Uncomment to use GPU acceleration + seed=1337, # Uncomment to set a specific seed + n_ctx=2048, # Uncomment to increase the context window + chat_format="llama-3", + verbose=False +) + +from time import time + +def generate_chat_completion_openai_v1_stream(messages): + start = time() + stream = llm.create_chat_completion_openai_v1( + messages=messages, + temperature=0.8, # Adjust temperature as needed + top_p=0.95, # Adjust top_p as needed + top_k=40, # Adjust top_k as needed + max_tokens=50, # Adjust the maximum number of tokens as needed + # stop=["\n"], # Adjust the stop sequence as needed + stream=True # Enable streaming + ) + end = time() + total = end - start + print(total) + for chunk in stream: + if chunk.choices[0].delta.content is not None: + yield chunk.choices[0].delta.content + +def audio_playback_thread(audio_queue, sample_rate): + """ + Audio playback thread that plays audio fragments from the queue. 
+ """ + sd.default.samplerate = sample_rate + sd.default.channels = 1 + stream = sd.OutputStream(dtype='float32') + stream.start() + + try: + while True: + # Get the next audio fragment + audio_fragment = audio_queue.get() + try: + if audio_fragment is None: + # Sentinel value received, exit the loop + break + # Write the audio fragment to the stream + stream.write(audio_fragment) + finally: + # Mark the item as processed + audio_queue.task_done() + finally: + stream.stop() + stream.close() + +def main(): + + config_path = 'configs/tts_infer.yaml' + # GPT_model_path = 'pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt' + GPT_model_path = 'custom_trained.ckpt' + # SoVITS_model_path = 'pretrained_models/gsv-v2final-pretrained/s2G2333k.pth' + SoVITS_model_path = 'custom_trained.pth' + ref_audio_path = 'ref_audio.wav' + ref_text = 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか' + target_text = """hahahaha, well well, let me tell you about that! it was perhaps the most exquisite day of my life! Phew, I've never had one better! """ + output_path = 'output' + ref_language = 'ja' + target_language = 'ja' + + + # Ensure output directory exists + os.makedirs(output_path, exist_ok=True) + + # Initialize TTS configuration and pipeline + tts_config = TTS_Config(config_path) + tts_pipeline = TTS(tts_config) + + # Load model weights + tts_pipeline.init_t2s_weights(GPT_model_path) + tts_pipeline.init_vits_weights(SoVITS_model_path) + + # Prepare inputs for TTS + inputs = { + "text": target_text, + "text_lang": target_language.lower(), + "ref_audio_path": ref_audio_path, + "prompt_text": ref_text, + "prompt_lang": ref_language.lower(), + "top_k": 5, + "top_p": 1.0, + "temperature": 1.0, + "text_split_method": "cut0", + "batch_size": 1, + "batch_threshold": 0.75, + "split_bucket": True, + "speed_factor": 1.0, + "fragment_interval": 0.3, + "seed": 2855904637, + "return_fragment": True, + "parallel_infer": False, + "repetition_penalty": 1.35, + } + + # Run TTS inference + + system_message = '''You are a friendly AI named Vivy. + + HOW YOU SHOULD RESPOND: + - The responses should include only verbal responses, for example *laughs* should be replaced with haha + ''' + + # Initialize conversation history with system message + conversation_history = [ + {"role": "system", "content": f"{system_message}"} + ] + + # Create a queue for audio fragments + audio_queue = queue.Queue(maxsize=100) # Adjust maxsize based on your needs + + # Start the audio playback thread + playback_thread = threading.Thread( + target=audio_playback_thread, + args=(audio_queue, tts_pipeline.configs.sampling_rate) + ) + playback_thread.start() + + # Setup speech recognition + r = sr.Recognizer() + mic = sr.Microphone() + + try: + while True: + # Prompt for speech input instead of text input + while True: + print("\nPlease speak your message (say 'quit' to exit):") + with mic as source: + # Adjust for ambient noise to improve recognition accuracy + r.adjust_for_ambient_noise(source, duration=1.0) + print("Listening...") + audio_data = r.listen(source, timeout=None, phrase_time_limit=60) + try: + # Replace 'recognize_whisper' with your actual recognition method + # Ensure that the method is correctly implemented or available + user_input = r.recognize_whisper(audio_data=audio_data, model="base") + print("You said: " + user_input) + + # Check if the input is not empty or just whitespace + if user_input.strip() == "": + print("No speech detected. 
Please try again.") + continue # Continue listening + break # Valid input received, exit inner loop + except sr.UnknownValueError: + print("Sorry, I could not understand the audio. Please try again.") + continue # Continue listening + except sr.RequestError as e: + print(f"Could not request results from speech recognition service; {e}") + continue # Continue listening + + # Check if the user wants to quit + if user_input.lower() == "quit": + print("Exiting the application. Goodbye!") + sys.exit() + + # Append user message to conversation history + conversation_history.append({"role": "user", "content": user_input}) + + # Initialize variables to track character count and buffering + buffer = "" + char_count = 0 + waiting_for_punctuation = False + assistant_buffer = "" + + # Generate and print the chat completion with streaming + for token in generate_chat_completion_openai_v1_stream(conversation_history): + print(token, end="", flush=True) # Print each character as it's generated + buffer += token + assistant_buffer += token + char_count += len(token) + + if not waiting_for_punctuation: + if char_count >= 100: + waiting_for_punctuation = True # Start looking for punctuation + else: + if any(punct in token for punct in ['.', '!', '?']): + # Send the buffer to TTS + inputs["text"] = buffer + synthesis_result = tts_pipeline.run_generator(inputs) + # Consume the generator and put audio fragments into the queue + for sampling_rate, audio_fragment in synthesis_result: + audio_queue.put(audio_fragment) + #put sielnce into audio queue after tts sythesis generator has finished + silence_duration = 0.5 # in seconds + num_samples = int(sampling_rate * silence_duration) + silence = np.zeros(num_samples, dtype='float32') + audio_queue.put(silence) + + # Reset counters and buffer + char_count = 0 + buffer = "" + waiting_for_punctuation = False + + # Append assistant message to conversation history + conversation_history.append({"role": "assistant", "content": assistant_buffer}) + + # Handle any remaining text after the generator is done + if buffer.strip(): + inputs["text"] = buffer + synthesis_result = tts_pipeline.run_generator(inputs) + + # Consume the generator and put audio fragments into the queue + for sampling_rate, audio_fragment in synthesis_result: + audio_queue.put(audio_fragment) + #put sielnce into audio queue after tts sythesis generator has finished + silence_duration = 0.5 # in seconds + num_samples = int(sampling_rate * silence_duration) + silence = np.zeros(num_samples, dtype='float32') + audio_queue.put(silence) + + conversation_history.append({"role": "assistant", "content": buffer}) + buffer = "" + char_count = 0 + waiting_for_punctuation = False + finally: + # After all processing is done, send a sentinel to the audio queue and wait for threads to finish + audio_queue.put(None) + audio_queue.join() + playback_thread.join() + + + # text = input("GO:") + # inputs["text"] = text + # synthesis_result = tts_pipeline.run_generator(inputs) + # audio_data_list = list(synthesis_result) + # if audio_data_list: + # # Since return_fragment is False, we expect only one tuple in audio_data_list + # sampling_rate, audio_data = audio_data_list[0] + # output_wav_path = os.path.join(output_path, "output.wav") + # # Save the audio data to a WAV file + # sf.write(output_wav_path, audio_data, sampling_rate) + # print(f"Audio saved to {output_wav_path}") + # else: + # print("No audio data generated.") + +if __name__ == '__main__': + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 
100644 index 00000000..cd500100 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,64 @@ +[build-system] +requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "GPT_SoVITS" +dynamic = ["version"] +description = "A packaged version of GPT_SoVITS designed to be installable" +readme = "README.md" +license = {text = "MIT License"} +classifiers = [ + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", +] +dependencies = [ + "numpy==1.26.4", + "scipy", + "tensorboard", + "librosa==0.9.2", + "numba", + "pytorch-lightning", + "gradio>=4.0,<=4.24.0", + "ffmpeg-python", + "onnxruntime; sys_platform == 'darwin'", + "onnxruntime-gpu; sys_platform != 'darwin'", + "tqdm", + "funasr==1.0.27", + "cn2an", + "pypinyin", + "pyopenjtalk>=0.3.4", + "g2p_en", + "torchaudio", + "modelscope==1.10.0", + "sentencepiece", + "transformers", + "chardet", + "PyYAML", + "psutil", + "jieba_fast", + "jieba", + "LangSegment>=0.2.0", + "Faster_Whisper", + "wordsegment", + "rotary_embedding_torch", + "pyjyutping", + "g2pk2", + "ko_pron", + "opencc; sys_platform != 'linux'", + "opencc==1.1.1; sys_platform == 'linux'", + "python_mecab_ko; sys_platform != 'win32'", + "fastapi<0.112.2", + "sounddevice" +] + + +[project.urls] +Homepage = "https://github.com/JarodMica/GPT-SoVITS/tree/main" + +[project.scripts] +"gpt_sovits_api" = "GPT_SoVITS.api_v2:main" + +[tool.setuptools.packages.find] +include = ["GPT_SoVITS"] \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 00000000..ecc007e2 --- /dev/null +++ b/test.py @@ -0,0 +1,35 @@ +import requests + +# API configuration +host = '127.0.0.1' +port = 9880 +url = f'http://{host}:{port}/tts' + +# Parameters for the request +params = { + 'text': 'ははは、それはいいよね!でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか', + 'text_lang': 'ja', + 'ref_audio_path': 'test.wav', + 'prompt_lang': 'ja', + 'prompt_text': 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか', + 'text_split_method': 'cut0', + 'batch_size': 1, + 'media_type': 'wav', + 'streaming_mode': False, +} + +try: + # Send the GET request + response = requests.get(url, params=params) + + # Check if the request was successful + if response.status_code == 200: + # Save the audio content to a file + with open('output.wav', 'wb') as f: + f.write(response.content) + print('Audio saved to output.wav') + else: + print(f'Error: {response.status_code}') + print(response.json()) +except requests.exceptions.RequestException as e: + print(f'An error occurred: {e}') diff --git a/test_streaming.py b/test_streaming.py new file mode 100644 index 00000000..ca667096 --- /dev/null +++ b/test_streaming.py @@ -0,0 +1,89 @@ +import requests +import wave +import io +import sounddevice as sd + +# API configuration +host = '127.0.0.1' +port = 9880 +url = f'http://{host}:{port}/tts' + +# Parameters for the request +params = { + 'text': 'ふふふ、それはいいよね!でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか', + 'text_lang': 'ja', + 'ref_audio_path': 'test.wav', + 'prompt_lang': 'ja', + 'prompt_text': 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか', + 'text_split_method': 'cut0', + 'batch_size': 1, + 'media_type': 'wav', + 'streaming_mode': True, +} + +while True: + input("Waiting for enter") + try: + # Send the GET request with streaming enabled + response = requests.get(url, params=params, stream=True) + + # Check if the request was successful + if response.status_code == 200: + buffer = b'' # Buffer to hold data until header is 
processed + header_size = 44 # Standard WAV header size + header_parsed = False + stream = None + + for chunk in response.iter_content(chunk_size=4096): + if chunk: + if not header_parsed: + buffer += chunk + if len(buffer) >= header_size: + # Parse WAV header + wav_header = buffer[:header_size] + wav_file = wave.open(io.BytesIO(wav_header), 'rb') + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + sample_rate = wav_file.getframerate() + wav_file.close() + + # Map sample_width to dtype + dtype_map = {1: 'int8', 2: 'int16', 3: 'int24', 4: 'int32'} + dtype = dtype_map.get(sample_width, 'int16') + + # Open audio stream + stream = sd.RawOutputStream( + samplerate=sample_rate, + channels=channels, + dtype=dtype, + blocksize=0, # Use default block size + ) + + stream.start() + + # Write any remaining data after the header + data = buffer[header_size:] + if data: + stream.write(data) + header_parsed = True + buffer = b'' # Clear buffer + else: + # Play audio data + if stream: + stream.write(chunk) + # Clean up + if stream: + stream.stop() + stream.close() + print('Audio playback completed.') + else: + print(f'Error: {response.status_code}') + # Print the error message from the API + try: + print(response.json()) + except ValueError: + print(response.text) + except requests.exceptions.RequestException as e: + print(f'An error occurred: {e}') + except Exception as e: + print(f'An error occurred: {e}') diff --git a/webui.py b/webui.py index cddbb029..13112dc4 100644 --- a/webui.py +++ b/webui.py @@ -75,15 +75,15 @@ from config import ( webui_port_subfix, webui_port_uvr5, ) -from tools import my_utils -from tools.i18n.i18n import I18nAuto, scan_language_list +from GPT_SoVITS.tools import my_utils +from GPT_SoVITS.tools.i18n.i18n import I18nAuto, scan_language_list language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" os.environ["language"] = language i18n = I18nAuto(language=language) from multiprocessing import cpu_count -from tools.my_utils import check_details, check_for_existance +from GPT_SoVITS.tools.my_utils import check_details, check_for_existance # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu try: @@ -345,7 +345,12 @@ def kill_process(pid, process_name=""): if system == "Windows": cmd = "taskkill /t /f /pid %s" % pid # os.system(cmd) - subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + subprocess.run( + ["taskkill", "/t", "/f", "/pid", str(pid)], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) else: kill_proc_tree(pid) print(process_name + i18n("进程已终止"))
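One detail in `infer_script.py` worth isolating: streamed LLM tokens are buffered until roughly 100 characters have accumulated, and the buffer is only flushed to TTS once a later token contains sentence-ending punctuation. Below is a small self-contained sketch of that same policy; the helper name and the toy token list are invented for illustration, while the threshold and punctuation set mirror the script.

```python
# Sketch of the buffering policy used in infer_script.py: pass a character
# threshold first, then wait for the next token containing end punctuation
# before flushing the accumulated text as one TTS-sized chunk.
from typing import Iterable, Iterator


def chunk_stream(tokens: Iterable[str], min_chars: int = 100,
                 end_punct: str = ".!?") -> Iterator[str]:
    buffer = ""
    char_count = 0
    waiting_for_punctuation = False
    for token in tokens:
        buffer += token
        char_count += len(token)
        if not waiting_for_punctuation:
            if char_count >= min_chars:
                waiting_for_punctuation = True
        elif any(punct in token for punct in end_punct):
            yield buffer
            buffer = ""
            char_count = 0
            waiting_for_punctuation = False
    if buffer.strip():  # flush whatever remains after the stream ends
        yield buffer


# Toy usage: each yielded chunk is what the script would hand to the TTS pipeline.
for chunk in chunk_stream(["Hello", " world.", " This is", " a longer", " answer!"],
                          min_chars=10):
    print(repr(chunk))
```

Flushing on punctuation rather than on a hard character count keeps each TTS call on a sentence boundary, which is also why the script resets both the buffer and the counter after every flush.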
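As mentioned in the README section above, the streaming API can also be consumed without an audio device. The sketch below is a variant of `test_streaming.py` that writes the streamed `/tts` response straight to disk; the endpoint, port, and parameter names are copied from the test scripts, while the text and reference clip are placeholders.

```python
# Save the streamed /tts response to a WAV file instead of playing it live.
# With streaming_mode=True the server sends a WAV header followed by raw PCM,
# so concatenating the chunks in order yields a playable file.
import requests

params = {
    'text': 'This is a streaming smoke test.',
    'text_lang': 'en',
    'ref_audio_path': 'test.wav',  # placeholder reference clip
    'prompt_lang': 'en',
    'prompt_text': 'transcript of the reference clip',
    'text_split_method': 'cut0',
    'batch_size': 1,
    'media_type': 'wav',
    'streaming_mode': True,
}

with requests.get('http://127.0.0.1:9880/tts', params=params, stream=True) as response:
    response.raise_for_status()
    with open('streamed_output.wav', 'wb') as f:
        for chunk in response.iter_content(chunk_size=4096):
            if chunk:
                f.write(chunk)

print('Saved streamed_output.wav')
```

Because a streaming server generally cannot know the final data length up front, the RIFF size fields in the saved header may be placeholders; most players tolerate this, and re-muxing (for example with ffmpeg) rewrites the sizes if a strictly correct header is needed.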