Merge branch 'RVC-Boss:main' into main

This commit is contained in:
Cosmo Clara 2025-02-12 19:27:47 +08:00 committed by GitHub
commit 055c84df60
GPG Key ID: B5690EEEBB952194
91 changed files with 8533 additions and 353 deletions

.gitignore (vendored), 2 changes
View File

@@ -12,6 +12,8 @@ GPT_weights
SoVITS_weights
GPT_weights_v2
SoVITS_weights_v2
GPT_weights_v3
SoVITS_weights_v3
TEMP
weight.json
ffmpeg*

View File

@@ -145,45 +145,21 @@ class T2SBlock:
else:
attn = scaled_dot_product_attention(q, k, v, attn_mask)
attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim)
attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0)
attn = attn.transpose(1, 2).reshape(batch_size, q_len, -1)
attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b)
if padding_mask is not None:
for i in range(batch_size):
# mask = padding_mask[i,:,0]
if self.false.device != padding_mask.device:
self.false = self.false.to(padding_mask.device)
idx = torch.where(padding_mask[i,:,0]==self.false)[0]
x_item = x[i,idx,:].unsqueeze(0)
attn_item = attn[i,idx,:].unsqueeze(0)
x_item = x_item + attn_item
x_item = F.layer_norm(
x_item, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
)
x_item = x_item + self.mlp.forward(x_item)
x_item = F.layer_norm(
x_item,
[self.hidden_dim],
self.norm_w2,
self.norm_b2,
self.norm_eps2,
)
x[i,idx,:] = x_item.squeeze(0)
x = self.to_mask(x, padding_mask)
else:
x = x + attn
x = F.layer_norm(
x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
)
x = x + self.mlp.forward(x)
x = F.layer_norm(
x,
[self.hidden_dim],
self.norm_w2,
self.norm_b2,
self.norm_eps2,
)
x = x + attn
x = F.layer_norm(
x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
)
x = x + self.mlp.forward(x)
x = F.layer_norm(
x,
[self.hidden_dim],
self.norm_w2,
self.norm_b2,
self.norm_eps2,
)
return x, k_cache, v_cache
def decode_next_token(self, x:torch.Tensor, k_cache:torch.Tensor, v_cache:torch.Tensor, attn_mask:Optional[torch.Tensor]=None, torch_sdpa:bool=True):
@@ -206,8 +182,7 @@ class T2SBlock:
else:
attn = scaled_dot_product_attention(q, k, v, attn_mask)
attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim)
attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0)
attn = attn.transpose(1, 2).reshape(batch_size, q_len, -1)
attn = F.linear(attn, self.out_w, self.out_b)
x = x + attn
@@ -662,7 +637,7 @@ class Text2SemanticDecoder(nn.Module):
xy_attn_mask = xy_mask.logical_or(_xy_padding_mask)
xy_attn_mask = xy_attn_mask.unsqueeze(1).expand(-1, self.num_head, -1, -1)
xy_attn_mask = xy_attn_mask.bool()
xy_padding_mask = xy_padding_mask.view(bsz, src_len, 1).expand(-1, -1, self.model_dim)
xy_padding_mask = xy_padding_mask.view(bsz, src_len, 1)
###### decode #####
y_list = [None]*y.shape[0]
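The attention reshape change in the hunks above replaces the old permute/view round trip with a single transpose + reshape. A minimal standalone sketch (the dimensions are made up) suggesting the two forms produce the same `[batch, q_len, hidden_dim]` tensor:

```python
import torch

# attn as returned by scaled_dot_product_attention: [batch, n_head, q_len, head_dim]
batch_size, n_head, q_len, head_dim = 2, 4, 7, 16
hidden_dim = n_head * head_dim
attn = torch.randn(batch_size, n_head, q_len, head_dim)

# Old path: permute to [q_len, batch, n_head, head_dim], flatten, then view/transpose back
old = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, hidden_dim)
old = old.view(q_len, batch_size, hidden_dim).transpose(1, 0)

# New path: one transpose to [batch, q_len, n_head, head_dim], then merge the head dims
new = attn.transpose(1, 2).reshape(batch_size, q_len, -1)

assert torch.equal(old, new)  # both are [batch_size, q_len, hidden_dim]
```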

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 NVIDIA CORPORATION.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,266 @@
## BigVGAN: A Universal Neural Vocoder with Large-Scale Training
#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon
[[Paper]](https://arxiv.org/abs/2206.04658) - [[Code]](https://github.com/NVIDIA/BigVGAN) - [[Showcase]](https://bigvgan-demo.github.io/) - [[Project Page]](https://research.nvidia.com/labs/adlr/projects/bigvgan/) - [[Weights]](https://huggingface.co/collections/nvidia/bigvgan-66959df3d97fd7d98d97dc9a) - [[Demo]](https://huggingface.co/spaces/nvidia/BigVGAN)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/bigvgan-a-universal-neural-vocoder-with-large/speech-synthesis-on-libritts)](https://paperswithcode.com/sota/speech-synthesis-on-libritts?p=bigvgan-a-universal-neural-vocoder-with-large)
<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>
## News
- **Sep 2024 (v2.4):**
- We have updated the pretrained checkpoints trained for 5M steps. This is the final release of the BigVGAN-v2 checkpoints.
- **Jul 2024 (v2.3):**
- General refactor and code improvements for improved readability.
- Fully fused CUDA kernel of anti-aliased activation (upsampling + activation + downsampling) with an inference speed benchmark.
- **Jul 2024 (v2.2):** The repository now includes an interactive local demo using gradio.
- **Jul 2024 (v2.1):** BigVGAN is now integrated with 🤗 Hugging Face Hub with easy access to inference using pretrained checkpoints. We also provide an interactive demo on Hugging Face Spaces.
- **Jul 2024 (v2):** We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:
- Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our test shows 1.5 - 3x faster speed on a single A100 GPU.
- Improved discriminator and loss: BigVGAN-v2 is trained using a [multi-scale sub-band CQT discriminator](https://arxiv.org/abs/2311.14957) and a [multi-scale mel spectrogram loss](https://arxiv.org/abs/2306.06546).
- Larger training data: BigVGAN-v2 is trained using datasets containing diverse audio types, including speech in multiple languages, environmental sounds, and instruments.
- We provide pretrained checkpoints of BigVGAN-v2 using diverse audio configurations, supporting up to 44 kHz sampling rate and 512x upsampling ratio.
## Installation
The codebase has been tested on Python `3.10` and PyTorch `2.3.1` conda packages with either `pytorch-cuda=12.1` or `pytorch-cuda=11.8`. Below is an example command to create the conda environment:
```shell
conda create -n bigvgan python=3.10 pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
conda activate bigvgan
```
Clone the repository and install dependencies:
```shell
git clone https://github.com/NVIDIA/BigVGAN
cd BigVGAN
pip install -r requirements.txt
```
## Inference Quickstart using 🤗 Hugging Face Hub
The example below shows how to use BigVGAN: load the pretrained generator from the Hugging Face Hub, compute a mel spectrogram from an input waveform, and synthesize a waveform from that mel spectrogram.
```python
device = 'cuda'
import torch
import bigvgan
import librosa
from meldataset import get_mel_spectrogram
# instantiate the model. You can optionally set use_cuda_kernel=True for faster inference.
model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_24khz_100band_256x', use_cuda_kernel=False)
# remove weight norm in the model and set to eval mode
model.remove_weight_norm()
model = model.eval().to(device)
# load wav file and compute mel spectrogram
wav_path = '/path/to/your/audio.wav'
wav, sr = librosa.load(wav_path, sr=model.h.sampling_rate, mono=True) # wav is np.ndarray with shape [T_time] and values in [-1, 1]
wav = torch.FloatTensor(wav).unsqueeze(0) # wav is FloatTensor with shape [B(1), T_time]
# compute mel spectrogram from the ground truth audio
mel = get_mel_spectrogram(wav, model.h).to(device) # mel is FloatTensor with shape [B(1), C_mel, T_frame]
# generate waveform from mel
with torch.inference_mode():
wav_gen = model(mel) # wav_gen is FloatTensor with shape [B(1), 1, T_time] and values in [-1, 1]
wav_gen_float = wav_gen.squeeze(0).cpu() # wav_gen is FloatTensor with shape [1, T_time]
# you can convert the generated waveform to 16 bit linear PCM
wav_gen_int16 = (wav_gen_float * 32767.0).numpy().astype('int16') # wav_gen is now np.ndarray with shape [1, T_time] and int16 dtype
```
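As an optional follow-up (not part of the official snippet), the 16-bit waveform can be written to disk, for example with `scipy`; the output path below is a placeholder:

```python
from scipy.io import wavfile

# wav_gen_int16 has shape [1, T_time]; wavfile.write expects [T_time] (or [T_time, channels])
wavfile.write('/path/to/your/output.wav', model.h.sampling_rate, wav_gen_int16[0])
```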
## Local gradio demo <a href='https://github.com/gradio-app/gradio'><img src='https://img.shields.io/github/stars/gradio-app/gradio'></a>
You can run a local gradio demo using the commands below:
```shell
pip install -r demo/requirements.txt
python demo/app.py
```
## Training
Create symbolic links to the root of the dataset. The codebase uses filelists with paths relative to the dataset root. Below are example commands for the LibriTTS dataset:
```shell
cd filelists/LibriTTS && \
ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \
ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \
ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \
ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \
ln -s /path/to/your/LibriTTS/dev-other dev-other && \
ln -s /path/to/your/LibriTTS/test-clean test-clean && \
ln -s /path/to/your/LibriTTS/test-other test-other && \
cd ../..
```
Train the BigVGAN model. Below is an example command for training BigVGAN-v2 on the LibriTTS dataset at 24 kHz with the full 100-band mel spectrogram as input:
```shell
python train.py \
--config configs/bigvgan_v2_24khz_100band_256x.json \
--input_wavs_dir filelists/LibriTTS \
--input_training_file filelists/LibriTTS/train-full.txt \
--input_validation_file filelists/LibriTTS/val-full.txt \
--list_input_unseen_wavs_dir filelists/LibriTTS filelists/LibriTTS \
--list_input_unseen_validation_file filelists/LibriTTS/dev-clean.txt filelists/LibriTTS/dev-other.txt \
--checkpoint_path exp/bigvgan_v2_24khz_100band_256x
```
## Synthesis
Synthesize from a trained BigVGAN model. Below is an example command for generating audio from the model.
It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`.
```shell
python inference.py \
--checkpoint_file /path/to/your/bigvgan_v2_24khz_100band_256x/bigvgan_generator.pt \
--input_wavs_dir /path/to/your/input_wav \
--output_dir /path/to/your/output_wav
```
`inference_e2e.py` supports synthesis directly from the mel spectrogram saved in `.npy` format, with shapes `[1, channel, frame]` or `[channel, frame]`.
It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`.
Make sure the STFT hyperparameters used to compute the mel spectrogram match those of the model, as defined in the corresponding model's `config.json`.
```shell
python inference_e2e.py \
--checkpoint_file /path/to/your/bigvgan_v2_24khz_100band_256x/bigvgan_generator.pt \
--input_mels_dir /path/to/your/input_mel \
--output_dir /path/to/your/output_wav
```
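If you need to prepare such `.npy` mel inputs yourself, one possible sketch (reusing `get_mel_spectrogram` and the loaded `model` from the quickstart above; paths are placeholders) is:

```python
import numpy as np
import torch
import librosa
from meldataset import get_mel_spectrogram

wav, sr = librosa.load('/path/to/your/audio.wav', sr=model.h.sampling_rate, mono=True)
mel = get_mel_spectrogram(torch.FloatTensor(wav).unsqueeze(0), model.h)  # [1, C_mel, T_frame]
np.save('/path/to/your/input_mel/audio.npy', mel.squeeze(0).cpu().numpy())  # saved as [channel, frame]
```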
## Using Custom CUDA Kernel for Synthesis
You can enable the fast CUDA inference kernel by passing the `use_cuda_kernel` parameter when instantiating BigVGAN:
```python
generator = BigVGAN(h, use_cuda_kernel=True)
```
You can also pass `--use_cuda_kernel` to `inference.py` and `inference_e2e.py` to enable this feature.
When applied for the first time, it builds the kernel using `nvcc` and `ninja`. If the build succeeds, the kernel is saved to `alias_free_activation/cuda/build` and the model automatically loads the kernel. The codebase has been tested using CUDA `12.1`.
Please make sure both are installed on your system and that the `nvcc` version matches the CUDA version your PyTorch build uses.
We recommend running `test_cuda_vs_torch_model.py` first to build the kernel and check its correctness. See the example command below and its output, which ends with `[Success] test CUDA fused vs. plain torch BigVGAN inference`:
```shell
python tests/test_cuda_vs_torch_model.py \
--checkpoint_file /path/to/your/bigvgan_generator.pt
```
```shell
loading plain Pytorch BigVGAN
...
loading CUDA kernel BigVGAN with auto-build
Detected CUDA files, patching ldflags
Emitting ninja build file /path/to/your/BigVGAN/alias_free_activation/cuda/build/build.ninja..
Building extension module anti_alias_activation_cuda...
...
Loading extension module anti_alias_activation_cuda...
...
Loading '/path/to/your/bigvgan_generator.pt'
...
[Success] test CUDA fused vs. plain torch BigVGAN inference
> mean_difference=0.0007238413265440613
...
```
If you see `[Fail] test CUDA fused vs. plain torch BigVGAN inference`, the CUDA kernel inference is incorrect. Please check whether the `nvcc` installed on your system is compatible with your PyTorch version.
## Pretrained Models
We provide the [pretrained models on Hugging Face Collections](https://huggingface.co/collections/nvidia/bigvgan-66959df3d97fd7d98d97dc9a).
You can download the generator weights (named `bigvgan_generator.pt`) and the discriminator/optimizer states (named `bigvgan_discriminator_optimizer.pt`) from the listed model repositories.
| Model Name | Sampling Rate | Mel band | fmax | Upsampling Ratio | Params | Dataset | Steps | Fine-Tuned |
|:--------------------------------------------------------------------------------------------------------:|:-------------:|:--------:|:-----:|:----------------:|:------:|:--------------------------:|:-----:|:----------:|
| [bigvgan_v2_44khz_128band_512x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_512x) | 44 kHz | 128 | 22050 | 512 | 122M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_44khz_128band_256x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_256x) | 44 kHz | 128 | 22050 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_24khz_100band_256x](https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x) | 24 kHz | 100 | 12000 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_22khz_80band_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_256x) | 22 kHz | 80 | 11025 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_22khz_80band_fmax8k_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_fmax8k_256x) | 22 kHz | 80 | 8000 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_24khz_100band](https://huggingface.co/nvidia/bigvgan_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 112M | LibriTTS | 5M | No |
| [bigvgan_base_24khz_100band](https://huggingface.co/nvidia/bigvgan_base_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 14M | LibriTTS | 5M | No |
| [bigvgan_22khz_80band](https://huggingface.co/nvidia/bigvgan_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 112M | LibriTTS + VCTK + LJSpeech | 5M | No |
| [bigvgan_base_22khz_80band](https://huggingface.co/nvidia/bigvgan_base_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 14M | LibriTTS + VCTK + LJSpeech | 5M | No |
The paper results are based on the original 24 kHz BigVGAN models (`bigvgan_24khz_100band` and `bigvgan_base_24khz_100band`) trained on the LibriTTS dataset.
We also provide 22 kHz BigVGAN models with a band-limited setup (i.e., fmax=8000) for TTS applications.
Note that the checkpoints use the `snakebeta` activation with log-scale parameterization, which provides the best overall quality.
You can fine-tune the models in two steps (see the sketch after this list):
1. downloading the checkpoints (both the generator weights and the discriminator/optimizer states)
2. resuming training on your audio dataset by pointing `--checkpoint_path` at the directory containing those checkpoints when launching `train.py`
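A minimal sketch of these two steps, assuming `huggingface_hub` is installed; the local directory and dataset filelists are placeholders, and the `train.py` flags mirror the Training section above:

```python
from huggingface_hub import snapshot_download

# Step 1: download generator weights and discriminator/optimizer states locally
ckpt_dir = snapshot_download(
    repo_id="nvidia/bigvgan_v2_24khz_100band_256x",
    local_dir="exp/finetune_bigvgan_v2_24khz_100band_256x",
)
print(f"checkpoints downloaded to {ckpt_dir}")

# Step 2: resume training from that directory (run from the shell), e.g.:
#   python train.py \
#       --config configs/bigvgan_v2_24khz_100band_256x.json \
#       --input_wavs_dir /path/to/your/dataset \
#       --input_training_file /path/to/your/train_filelist.txt \
#       --input_validation_file /path/to/your/val_filelist.txt \
#       --checkpoint_path exp/finetune_bigvgan_v2_24khz_100band_256x
```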
## Training Details of BigVGAN-v2
Compared to the original BigVGAN, the pretrained BigVGAN-v2 checkpoints were trained with `batch_size=32`, a longer `segment_size=65536`, and 8 A100 GPUs.
Note that the BigVGAN-v2 `json` config files in `./configs` use `batch_size=4` by default so that training fits on a single A100 GPU. You can adjust `batch_size` to match your GPUs.
When training BigVGAN-v2 from scratch with a small batch size, it can encounter the early divergence problem mentioned in the paper. In that case, we recommend lowering the `clip_grad_norm` value (e.g., to `100`) for the early training iterations (e.g., the first 20,000 steps) and then increasing it back to the default `500`.
## Evaluation Results of BigVGAN-v2
Below are the objective results of the 24 kHz model (`bigvgan_v2_24khz_100band_256x`) obtained from the LibriTTS `dev` sets. BigVGAN-v2 shows noticeable improvements in the metrics. The model also exhibits reduced perceptual artifacts, especially for non-speech audio.
| Model | Dataset | Steps | PESQ(↑) | M-STFT(↓) | MCD(↓) | Periodicity(↓) | V/UV F1(↑) |
|:----------:|:-----------------------:|:-----:|:---------:|:----------:|:----------:|:--------------:|:----------:|
| BigVGAN | LibriTTS | 1M | 4.027 | 0.7997 | 0.3745 | 0.1018 | 0.9598 |
| BigVGAN | LibriTTS | 5M | 4.256 | 0.7409 | 0.2988 | 0.0809 | 0.9698 |
| BigVGAN-v2 | Large-scale Compilation | 3M | 4.359 | 0.7134 | 0.3060 | 0.0621 | 0.9777 |
| BigVGAN-v2 | Large-scale Compilation | 5M | **4.362** | **0.7026** | **0.2903** | **0.0593** | **0.9793** |
## Speed Benchmark
Below are the speed and VRAM usage benchmark results of BigVGAN from `tests/test_cuda_vs_torch_model.py`, using `bigvgan_v2_24khz_100band_256x` as a reference model.
| GPU | num_mel_frame | use_cuda_kernel | Speed (kHz) | Real-time Factor | VRAM (GB) |
|:--------------------------:|:-------------:|:---------------:|:-----------:|:----------------:|:---------:|
| NVIDIA A100 | 256 | False | 1672.1 | 69.7x | 1.3 |
| | | True | 3916.5 | 163.2x | 1.3 |
| | 2048 | False | 1899.6 | 79.2x | 1.7 |
| | | True | 5330.1 | 222.1x | 1.7 |
| | 16384 | False | 1973.8 | 82.2x | 5.0 |
| | | True | 5761.7 | 240.1x | 4.4 |
| NVIDIA GeForce RTX 3080 | 256 | False | 841.1 | 35.0x | 1.3 |
| | | True | 1598.1 | 66.6x | 1.3 |
| | 2048 | False | 929.9 | 38.7x | 1.7 |
| | | True | 1971.3 | 82.1x | 1.6 |
| | 16384 | False | 943.4 | 39.3x | 5.0 |
| | | True | 2026.5 | 84.4x | 3.9 |
| NVIDIA GeForce RTX 2080 Ti | 256 | False | 515.6 | 21.5x | 1.3 |
| | | True | 811.3 | 33.8x | 1.3 |
| | 2048 | False | 576.5 | 24.0x | 1.7 |
| | | True | 1023.0 | 42.6x | 1.5 |
| | 16384 | False | 589.4 | 24.6x | 5.0 |
| | | True | 1068.1 | 44.5x | 3.2 |
## Acknowledgements
We thank Vijay Anand Korthikanti and Kevin J. Shih for their generous support in implementing the CUDA kernel for inference.
## References
- [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator)
- [Snake](https://github.com/EdwardDixon/snake) (for periodic activation)
- [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing)
- [Julius](https://github.com/adefossez/julius) (for low-pass filter)
- [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator)
- [descript-audio-codec](https://github.com/descriptinc/descript-audio-codec) and [vocos](https://github.com/gemelo-ai/vocos) (for multi-band multi-scale STFT discriminator and multi-scale mel spectrogram loss)
- [Amphion](https://github.com/open-mmlab/Amphion) (for multi-scale sub-band CQT discriminator)

View File

@@ -0,0 +1,126 @@
# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
# LICENSE is in incl_licenses directory.
import torch
from torch import nn, sin, pow
from torch.nn import Parameter
class Snake(nn.Module):
"""
Implementation of a sine-based periodic activation function
Shape:
- Input: (B, C, T)
- Output: (B, C, T), same shape as the input
Parameters:
- alpha - trainable parameter
References:
- This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
https://arxiv.org/abs/2006.08195
Examples:
>>> a1 = Snake(256)
>>> x = torch.randn(256)
>>> x = a1(x)
"""
def __init__(
self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
):
"""
Initialization.
INPUT:
- in_features: shape of the input
- alpha: trainable parameter
alpha is initialized to 1 by default, higher values = higher-frequency.
alpha will be trained along with the rest of your model.
"""
super(Snake, self).__init__()
self.in_features = in_features
# Initialize alpha
self.alpha_logscale = alpha_logscale
if self.alpha_logscale: # Log scale alphas initialized to zeros
self.alpha = Parameter(torch.zeros(in_features) * alpha)
else: # Linear scale alphas initialized to ones
self.alpha = Parameter(torch.ones(in_features) * alpha)
self.alpha.requires_grad = alpha_trainable
self.no_div_by_zero = 0.000000001
def forward(self, x):
"""
Forward pass of the function.
Applies the function to the input elementwise.
Snake = x + 1/a * sin^2 (xa)
"""
alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # Line up with x to [B, C, T]
if self.alpha_logscale:
alpha = torch.exp(alpha)
x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
return x
class SnakeBeta(nn.Module):
"""
A modified Snake function which uses separate parameters for the magnitude of the periodic components
Shape:
- Input: (B, C, T)
- Output: (B, C, T), same shape as the input
Parameters:
- alpha - trainable parameter that controls frequency
- beta - trainable parameter that controls magnitude
References:
- This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
https://arxiv.org/abs/2006.08195
Examples:
>>> a1 = SnakeBeta(256)
>>> x = torch.randn(256)
>>> x = a1(x)
"""
def __init__(
self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
):
"""
Initialization.
INPUT:
- in_features: shape of the input
- alpha - trainable parameter that controls frequency
- beta - trainable parameter that controls magnitude
alpha is initialized to 1 by default, higher values = higher-frequency.
beta is initialized to 1 by default, higher values = higher-magnitude.
alpha will be trained along with the rest of your model.
"""
super(SnakeBeta, self).__init__()
self.in_features = in_features
# Initialize alpha
self.alpha_logscale = alpha_logscale
if self.alpha_logscale: # Log scale alphas initialized to zeros
self.alpha = Parameter(torch.zeros(in_features) * alpha)
self.beta = Parameter(torch.zeros(in_features) * alpha)
else: # Linear scale alphas initialized to ones
self.alpha = Parameter(torch.ones(in_features) * alpha)
self.beta = Parameter(torch.ones(in_features) * alpha)
self.alpha.requires_grad = alpha_trainable
self.beta.requires_grad = alpha_trainable
self.no_div_by_zero = 0.000000001
def forward(self, x):
"""
Forward pass of the function.
Applies the function to the input elementwise.
SnakeBeta = x + 1/b * sin^2 (xa)
"""
alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # Line up with x to [B, C, T]
beta = self.beta.unsqueeze(0).unsqueeze(-1)
if self.alpha_logscale:
alpha = torch.exp(alpha)
beta = torch.exp(beta)
x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
return x
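A brief usage sketch for these activations, assuming the module is importable as `activations` (as in `bigvgan.py`) and the expected `[B, C, T]` layout; the sizes are arbitrary:

```python
import torch
from activations import Snake, SnakeBeta

x = torch.randn(2, 256, 100)                   # [B, C, T]
snake = Snake(256, alpha_logscale=True)        # log-scale alpha, as used by the pretrained checkpoints
snakebeta = SnakeBeta(256, alpha_logscale=True)

y, z = snake(x), snakebeta(x)
assert y.shape == x.shape and z.shape == x.shape  # elementwise activation, shape preserved
```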

View File

@@ -0,0 +1,77 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
import torch
import torch.nn as nn
from alias_free_activation.torch.resample import UpSample1d, DownSample1d
# load fused CUDA kernel: this enables importing anti_alias_activation_cuda
from alias_free_activation.cuda import load
anti_alias_activation_cuda = load.load()
class FusedAntiAliasActivation(torch.autograd.Function):
"""
Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
The hyperparameters are hard-coded in the kernel to maximize speed.
NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
"""
@staticmethod
def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
activation_results = anti_alias_activation_cuda.forward(
inputs, up_ftr, down_ftr, alpha, beta
)
return activation_results
@staticmethod
def backward(ctx, output_grads):
raise NotImplementedError
return output_grads, None, None
class Activation1d(nn.Module):
def __init__(
self,
activation,
up_ratio: int = 2,
down_ratio: int = 2,
up_kernel_size: int = 12,
down_kernel_size: int = 12,
fused: bool = True,
):
super().__init__()
self.up_ratio = up_ratio
self.down_ratio = down_ratio
self.act = activation
self.upsample = UpSample1d(up_ratio, up_kernel_size)
self.downsample = DownSample1d(down_ratio, down_kernel_size)
self.fused = fused # Whether to use fused CUDA kernel or not
def forward(self, x):
if not self.fused:
x = self.upsample(x)
x = self.act(x)
x = self.downsample(x)
return x
else:
if self.act.__class__.__name__ == "Snake":
beta = self.act.alpha.data # Snake uses same params for alpha and beta
else:
beta = (
self.act.beta.data
) # Snakebeta uses different params for alpha and beta
alpha = self.act.alpha.data
if (
not self.act.alpha_logscale
): # Exp baked into cuda kernel, cancel it out with a log
alpha = torch.log(alpha)
beta = torch.log(beta)
x = FusedAntiAliasActivation.apply(
x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta
)
return x
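A hedged correctness check in the spirit of `test_cuda_vs_torch_model.py`: compare this fused `Activation1d` against the plain torch implementation on random input. It assumes a CUDA device and a successful kernel build; the printed difference is only illustrative:

```python
import torch
from activations import SnakeBeta
from alias_free_activation.cuda.activation1d import Activation1d as CudaActivation1d
from alias_free_activation.torch.act import Activation1d as TorchActivation1d

x = torch.randn(1, 80, 1024, device="cuda")
act_cuda = CudaActivation1d(activation=SnakeBeta(80, alpha_logscale=True)).cuda()
act_torch = TorchActivation1d(activation=SnakeBeta(80, alpha_logscale=True)).cuda()
act_torch.act.load_state_dict(act_cuda.act.state_dict())  # use identical alpha/beta in both branches

with torch.inference_mode():
    diff = (act_cuda(x) - act_torch(x)).abs().mean().item()
print(f"mean difference: {diff}")
```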

View File

@@ -0,0 +1,23 @@
/* coding=utf-8
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/extension.h>
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
}

View File

@@ -0,0 +1,246 @@
/* coding=utf-8
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include "type_shim.h"
#include <assert.h>
#include <cfloat>
#include <limits>
#include <stdint.h>
#include <c10/macros/Macros.h>
namespace
{
// Hard-coded hyperparameters
// WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
constexpr int BUFFER_SIZE = 32;
constexpr int FILTER_SIZE = 12;
constexpr int HALF_FILTER_SIZE = 6;
constexpr int UPSAMPLE_REPLICATION_PAD = 5; // 5 on each side, matching torch impl
constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5; // matching torch impl
constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl
template <typename input_t, typename output_t, typename acc_t>
__global__ void anti_alias_activation_forward(
output_t *dst,
const input_t *src,
const input_t *up_ftr,
const input_t *down_ftr,
const input_t *alpha,
const input_t *beta,
int batch_size,
int channels,
int seq_len)
{
// Up and downsample filters
input_t up_filter[FILTER_SIZE];
input_t down_filter[FILTER_SIZE];
// Load data from global memory including extra indices reserved for replication paddings
input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};
// Output stores downsampled output before writing to dst
output_t output[BUFFER_SIZE];
// blockDim/threadIdx = (128, 1, 1)
// gridDim/blockIdx = (seq_blocks, channels, batches)
int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
int local_offset = threadIdx.x * BUFFER_SIZE;
int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;
// intermediate have double the seq_len
int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;
// Get values needed for replication padding before moving pointer
const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
input_t seq_left_most_value = right_most_pntr[0];
input_t seq_right_most_value = right_most_pntr[seq_len - 1];
// Move src and dst pointers
src += block_offset + local_offset;
dst += block_offset + local_offset;
// Alpha and beta values for snake activations. Applies exp by default
alpha = alpha + blockIdx.y;
input_t alpha_val = expf(alpha[0]);
beta = beta + blockIdx.y;
input_t beta_val = expf(beta[0]);
#pragma unroll
for (int it = 0; it < FILTER_SIZE; it += 1)
{
up_filter[it] = up_ftr[it];
down_filter[it] = down_ftr[it];
}
// Apply replication padding for upsampling, matching torch impl
#pragma unroll
for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
{
int element_index = seq_offset + it; // index for element
if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
{
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
}
if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
{
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
}
if ((element_index >= 0) && (element_index < seq_len))
{
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
}
}
// Apply upsampling strided convolution and write to intermediates. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampling conv later
#pragma unroll
for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
{
input_t acc = 0.0;
int element_index = intermediate_seq_offset + it; // index for intermediate
#pragma unroll
for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
{
if ((element_index + f_idx) >= 0)
{
acc += up_filter[f_idx] * elements[it + f_idx];
}
}
intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
}
// Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampling conv later
double no_div_by_zero = 0.000000001;
#pragma unroll
for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
{
intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0 / (beta_val + no_div_by_zero)) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
}
// Apply replication padding before downsampling conv from intermediates
#pragma unroll
for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
{
intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
}
#pragma unroll
for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
{
intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
}
// Apply downsample strided convolution (assuming stride=2) from intermediates
#pragma unroll
for (int it = 0; it < BUFFER_SIZE; it += 1)
{
input_t acc = 0.0;
#pragma unroll
for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
{
// Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
}
output[it] = acc;
}
// Write output to dst
#pragma unroll
for (int it = 0; it < BUFFER_SIZE; it += ELEMENTS_PER_LDG_STG)
{
int element_index = seq_offset + it;
if (element_index < seq_len)
{
dst[it] = output[it];
}
}
}
template <typename input_t, typename output_t, typename acc_t>
void dispatch_anti_alias_activation_forward(
output_t *dst,
const input_t *src,
const input_t *up_ftr,
const input_t *down_ftr,
const input_t *alpha,
const input_t *beta,
int batch_size,
int channels,
int seq_len)
{
if (seq_len == 0)
{
return;
}
else
{
// Use 128 threads per block to maximize GPU utilization
constexpr int threads_per_block = 128;
constexpr int seq_len_per_block = 4096;
int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;
dim3 blocks(blocks_per_seq_len, channels, batch_size);
dim3 threads(threads_per_block, 1, 1);
anti_alias_activation_forward<input_t, output_t, acc_t>
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
}
}
}
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
{
// Input is a 3d tensor with dimensions [batches, channels, seq_len]
const int batches = input.size(0);
const int channels = input.size(1);
const int seq_len = input.size(2);
// Output
auto act_options = input.options().requires_grad(false);
torch::Tensor anti_alias_activation_results =
torch::empty({batches, channels, seq_len}, act_options);
void *input_ptr = static_cast<void *>(input.data_ptr());
void *up_filter_ptr = static_cast<void *>(up_filter.data_ptr());
void *down_filter_ptr = static_cast<void *>(down_filter.data_ptr());
void *alpha_ptr = static_cast<void *>(alpha.data_ptr());
void *beta_ptr = static_cast<void *>(beta.data_ptr());
void *anti_alias_activation_results_ptr = static_cast<void *>(anti_alias_activation_results.data_ptr());
DISPATCH_FLOAT_HALF_AND_BFLOAT(
input.scalar_type(),
"dispatch anti alias activation_forward",
dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
reinterpret_cast<const scalar_t *>(input_ptr),
reinterpret_cast<const scalar_t *>(up_filter_ptr),
reinterpret_cast<const scalar_t *>(down_filter_ptr),
reinterpret_cast<const scalar_t *>(alpha_ptr),
reinterpret_cast<const scalar_t *>(beta_ptr),
batches,
channels,
seq_len););
return anti_alias_activation_results;
}

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,29 @@
/* coding=utf-8
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This code is copied from NVIDIA apex:
* https://github.com/NVIDIA/apex
* with minor changes. */
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif

View File

@@ -0,0 +1,86 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
import os
import pathlib
import subprocess
from torch.utils import cpp_extension
"""
Setting this param to a list has a problem of generating different compilation commands (with a different order of architectures) and leading to recompilation of fused kernels.
Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below
"""
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
def load():
# Check if cuda 11 is installed for compute capability 8.0
cc_flag = []
_, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) >= 11:
cc_flag.append("-gencode")
cc_flag.append("arch=compute_80,code=sm_80")
# Build path
srcpath = pathlib.Path(__file__).parent.absolute()
buildpath = srcpath / "build"
_create_build_dir(buildpath)
# Helper function to build the kernels.
def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
return cpp_extension.load(
name=name,
sources=sources,
build_directory=buildpath,
extra_cflags=[
"-O3",
],
extra_cuda_cflags=[
"-O3",
"-gencode",
"arch=compute_70,code=sm_70",
"--use_fast_math",
]
+ extra_cuda_flags
+ cc_flag,
verbose=True,
)
extra_cuda_flags = [
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
]
sources = [
srcpath / "anti_alias_activation.cpp",
srcpath / "anti_alias_activation_cuda.cu",
]
anti_alias_activation_cuda = _cpp_extention_load_helper(
"anti_alias_activation_cuda", sources, extra_cuda_flags
)
return anti_alias_activation_cuda
def _get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
[cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
def _create_build_dir(buildpath):
try:
os.mkdir(buildpath)
except OSError:
if not os.path.isdir(buildpath):
print(f"Creation of the build directory {buildpath} failed")

View File

@@ -0,0 +1,92 @@
/* coding=utf-8
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <ATen/ATen.h>
#include "compat.h"
#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...) \
switch (TYPE) \
{ \
case at::ScalarType::Float: \
{ \
using scalar_t = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: \
{ \
using scalar_t = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: \
{ \
using scalar_t = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
switch (TYPEIN) \
{ \
case at::ScalarType::Float: \
{ \
using scalar_t_in = float; \
switch (TYPEOUT) \
{ \
case at::ScalarType::Float: \
{ \
using scalar_t_out = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: \
{ \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: \
{ \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
} \
break; \
} \
case at::ScalarType::Half: \
{ \
using scalar_t_in = at::Half; \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: \
{ \
using scalar_t_in = at::BFloat16; \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
}

View File

@@ -0,0 +1,6 @@
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
from .filter import *
from .resample import *
from .act import *

View File

@@ -0,0 +1,30 @@
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
import torch.nn as nn
from alias_free_activation.torch.resample import UpSample1d, DownSample1d
class Activation1d(nn.Module):
def __init__(
self,
activation,
up_ratio: int = 2,
down_ratio: int = 2,
up_kernel_size: int = 12,
down_kernel_size: int = 12,
):
super().__init__()
self.up_ratio = up_ratio
self.down_ratio = down_ratio
self.act = activation
self.upsample = UpSample1d(up_ratio, up_kernel_size)
self.downsample = DownSample1d(down_ratio, down_kernel_size)
# x: [B,C,T]
def forward(self, x):
x = self.upsample(x)
x = self.act(x)
x = self.downsample(x)
return x

View File

@@ -0,0 +1,101 @@
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
if "sinc" in dir(torch):
sinc = torch.sinc
else:
# This code is adopted from adefossez's julius.core.sinc under the MIT License
# https://adefossez.github.io/julius/julius/core.html
# LICENSE is in incl_licenses directory.
def sinc(x: torch.Tensor):
"""
Implementation of sinc, i.e. sin(pi * x) / (pi * x)
__Warning__: Different to julius.sinc, the input is multiplied by `pi`!
"""
return torch.where(
x == 0,
torch.tensor(1.0, device=x.device, dtype=x.dtype),
torch.sin(math.pi * x) / math.pi / x,
)
# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
# https://adefossez.github.io/julius/julius/lowpass.html
# LICENSE is in incl_licenses directory.
def kaiser_sinc_filter1d(
cutoff, half_width, kernel_size
): # return filter [1,1,kernel_size]
even = kernel_size % 2 == 0
half_size = kernel_size // 2
# For kaiser window
delta_f = 4 * half_width
A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
if A > 50.0:
beta = 0.1102 * (A - 8.7)
elif A >= 21.0:
beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
else:
beta = 0.0
window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
# ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
if even:
time = torch.arange(-half_size, half_size) + 0.5
else:
time = torch.arange(kernel_size) - half_size
if cutoff == 0:
filter_ = torch.zeros_like(time)
else:
filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
"""
Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal.
"""
filter_ /= filter_.sum()
filter = filter_.view(1, 1, kernel_size)
return filter
class LowPassFilter1d(nn.Module):
def __init__(
self,
cutoff=0.5,
half_width=0.6,
stride: int = 1,
padding: bool = True,
padding_mode: str = "replicate",
kernel_size: int = 12,
):
"""
kernel_size should be an even number for the StyleGAN3 setup; in this implementation, an odd number is also possible.
"""
super().__init__()
if cutoff < -0.0:
raise ValueError("Minimum cutoff must be larger than zero.")
if cutoff > 0.5:
raise ValueError("A cutoff above 0.5 does not make sense.")
self.kernel_size = kernel_size
self.even = kernel_size % 2 == 0
self.pad_left = kernel_size // 2 - int(self.even)
self.pad_right = kernel_size // 2
self.stride = stride
self.padding = padding
self.padding_mode = padding_mode
filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
self.register_buffer("filter", filter)
# Input [B, C, T]
def forward(self, x):
_, C, _ = x.shape
if self.padding:
x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
return out
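A quick shape check for the filter on its own (sizes are arbitrary): with `padding=True` and `stride=2`, the time axis is halved.

```python
import torch
from alias_free_activation.torch.filter import LowPassFilter1d

x = torch.randn(1, 4, 128)  # [B, C, T]
lp = LowPassFilter1d(cutoff=0.25, half_width=0.3, stride=2, kernel_size=12)
print(lp(x).shape)  # expected: torch.Size([1, 4, 64])
```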

View File

@@ -0,0 +1,58 @@
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
import torch.nn as nn
from torch.nn import functional as F
from alias_free_activation.torch.filter import LowPassFilter1d
from alias_free_activation.torch.filter import kaiser_sinc_filter1d
class UpSample1d(nn.Module):
def __init__(self, ratio=2, kernel_size=None):
super().__init__()
self.ratio = ratio
self.kernel_size = (
int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
)
self.stride = ratio
self.pad = self.kernel_size // ratio - 1
self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
self.pad_right = (
self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
)
filter = kaiser_sinc_filter1d(
cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size
)
self.register_buffer("filter", filter)
# x: [B, C, T]
def forward(self, x):
_, C, _ = x.shape
x = F.pad(x, (self.pad, self.pad), mode="replicate")
x = self.ratio * F.conv_transpose1d(
x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
)
x = x[..., self.pad_left : -self.pad_right]
return x
class DownSample1d(nn.Module):
def __init__(self, ratio=2, kernel_size=None):
super().__init__()
self.ratio = ratio
self.kernel_size = (
int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
)
self.lowpass = LowPassFilter1d(
cutoff=0.5 / ratio,
half_width=0.6 / ratio,
stride=ratio,
kernel_size=self.kernel_size,
)
def forward(self, x):
xx = self.lowpass(x)
return xx
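An illustrative round-trip check (arbitrary sizes): upsampling by 2 doubles the time axis, and chaining the matching downsampler restores the original length.

```python
import torch
from alias_free_activation.torch.resample import UpSample1d, DownSample1d

x = torch.randn(1, 4, 128)                 # [B, C, T]
up, down = UpSample1d(ratio=2), DownSample1d(ratio=2)
y = up(x)                                  # [1, 4, 256]
z = down(y)                                # [1, 4, 128]
print(y.shape, z.shape)
```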

View File

@@ -0,0 +1,493 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import os
import json
from pathlib import Path
from typing import Optional, Union, Dict
import torch
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d
from torch.nn.utils import weight_norm, remove_weight_norm
import activations
from utils0 import init_weights, get_padding
from alias_free_activation.torch.act import Activation1d as TorchActivation1d
from env import AttrDict
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
def load_hparams_from_json(path) -> AttrDict:
with open(path) as f:
data = f.read()
return AttrDict(json.loads(data))
class AMPBlock1(torch.nn.Module):
"""
AMPBlock applies Snake / SnakeBeta activation functions with trainable parameters that control periodicity, defined for each layer.
AMPBlock1 has additional self.convs2 that contains additional Conv1d layers with a fixed dilation=1 followed by each layer in self.convs1
Args:
h (AttrDict): Hyperparameters.
channels (int): Number of convolution channels.
kernel_size (int): Size of the convolution kernel. Default is 3.
dilation (tuple): Dilation rates for the convolutions. Each dilation layer has two convolutions. Default is (1, 3, 5).
activation (str): Activation function type. Should be either 'snake' or 'snakebeta'. Default is None.
"""
def __init__(
self,
h: AttrDict,
channels: int,
kernel_size: int = 3,
dilation: tuple = (1, 3, 5),
activation: str = None,
):
super().__init__()
self.h = h
self.convs1 = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=d,
padding=get_padding(kernel_size, d),
)
)
for d in dilation
]
)
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
)
for _ in range(len(dilation))
]
)
self.convs2.apply(init_weights)
self.num_layers = len(self.convs1) + len(
self.convs2
) # Total number of conv layers
# Select which Activation1d, lazy-load cuda version to ensure backward compatibility
if self.h.get("use_cuda_kernel", False):
from alias_free_activation.cuda.activation1d import (
Activation1d as CudaActivation1d,
)
Activation1d = CudaActivation1d
else:
Activation1d = TorchActivation1d
# Activation functions
if activation == "snake":
self.activations = nn.ModuleList(
[
Activation1d(
activation=activations.Snake(
channels, alpha_logscale=h.snake_logscale
)
)
for _ in range(self.num_layers)
]
)
elif activation == "snakebeta":
self.activations = nn.ModuleList(
[
Activation1d(
activation=activations.SnakeBeta(
channels, alpha_logscale=h.snake_logscale
)
)
for _ in range(self.num_layers)
]
)
else:
raise NotImplementedError(
"activation incorrectly specified. check the config file and look for 'activation'."
)
def forward(self, x):
acts1, acts2 = self.activations[::2], self.activations[1::2]
for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
xt = a1(x)
xt = c1(xt)
xt = a2(xt)
xt = c2(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
class AMPBlock2(torch.nn.Module):
"""
AMPBlock applies Snake / SnakeBeta activation functions with trainable parameters that control periodicity, defined for each layer.
Unlike AMPBlock1, AMPBlock2 does not contain extra Conv1d layers with fixed dilation=1
Args:
h (AttrDict): Hyperparameters.
channels (int): Number of convolution channels.
kernel_size (int): Size of the convolution kernel. Default is 3.
dilation (tuple): Dilation rates for the convolutions. Each dilation layer has two convolutions. Default is (1, 3, 5).
activation (str): Activation function type. Should be either 'snake' or 'snakebeta'. Default is None.
"""
def __init__(
self,
h: AttrDict,
channels: int,
kernel_size: int = 3,
dilation: tuple = (1, 3, 5),
activation: str = None,
):
super().__init__()
self.h = h
self.convs = nn.ModuleList(
[
weight_norm(
Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=d,
padding=get_padding(kernel_size, d),
)
)
for d in dilation
]
)
self.convs.apply(init_weights)
self.num_layers = len(self.convs) # Total number of conv layers
# Select which Activation1d, lazy-load cuda version to ensure backward compatibility
if self.h.get("use_cuda_kernel", False):
from alias_free_activation.cuda.activation1d import (
Activation1d as CudaActivation1d,
)
Activation1d = CudaActivation1d
else:
Activation1d = TorchActivation1d
# Activation functions
if activation == "snake":
self.activations = nn.ModuleList(
[
Activation1d(
activation=activations.Snake(
channels, alpha_logscale=h.snake_logscale
)
)
for _ in range(self.num_layers)
]
)
elif activation == "snakebeta":
self.activations = nn.ModuleList(
[
Activation1d(
activation=activations.SnakeBeta(
channels, alpha_logscale=h.snake_logscale
)
)
for _ in range(self.num_layers)
]
)
else:
raise NotImplementedError(
"activation incorrectly specified. check the config file and look for 'activation'."
)
def forward(self, x):
for c, a in zip(self.convs, self.activations):
xt = a(x)
xt = c(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs:
remove_weight_norm(l)
class BigVGAN(
torch.nn.Module,
PyTorchModelHubMixin,
# library_name="bigvgan",
# repo_url="https://github.com/NVIDIA/BigVGAN",
# docs_url="https://github.com/NVIDIA/BigVGAN/blob/main/README.md",
# pipeline_tag="audio-to-audio",
# license="mit",
# tags=["neural-vocoder", "audio-generation", "arxiv:2206.04658"],
):
"""
BigVGAN is a neural vocoder model that applies anti-aliased periodic activation for residual blocks (resblocks).
New in BigVGAN-v2: it can optionally use optimized CUDA kernels for AMP (anti-aliased multi-periodicity) blocks.
Args:
h (AttrDict): Hyperparameters.
use_cuda_kernel (bool): If set to True, loads optimized CUDA kernels for AMP. This should be used for inference only, as training is not supported with CUDA kernels.
Note:
- The `use_cuda_kernel` parameter should be used for inference only, as training with CUDA kernels is not supported.
- Ensure that the activation function is correctly specified in the hyperparameters (h.activation).
"""
def __init__(self, h: AttrDict, use_cuda_kernel: bool = False):
super().__init__()
self.h = h
self.h["use_cuda_kernel"] = use_cuda_kernel
# Select which Activation1d, lazy-load cuda version to ensure backward compatibility
if self.h.get("use_cuda_kernel", False):
from alias_free_activation.cuda.activation1d import (
Activation1d as CudaActivation1d,
)
Activation1d = CudaActivation1d
else:
Activation1d = TorchActivation1d
self.num_kernels = len(h.resblock_kernel_sizes)
self.num_upsamples = len(h.upsample_rates)
# Pre-conv
self.conv_pre = weight_norm(
Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
)
# Define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
if h.resblock == "1":
resblock_class = AMPBlock1
elif h.resblock == "2":
resblock_class = AMPBlock2
else:
raise ValueError(
f"Incorrect resblock class specified in hyperparameters. Got {h.resblock}"
)
# Transposed conv-based upsamplers. does not apply anti-aliasing
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
self.ups.append(
nn.ModuleList(
[
weight_norm(
ConvTranspose1d(
h.upsample_initial_channel // (2**i),
h.upsample_initial_channel // (2 ** (i + 1)),
k,
u,
padding=(k - u) // 2,
)
)
]
)
)
# Residual blocks using anti-aliased multi-periodicity composition modules (AMP)
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = h.upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(
zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
):
self.resblocks.append(
resblock_class(h, ch, k, d, activation=h.activation)
)
# Post-conv
activation_post = (
activations.Snake(ch, alpha_logscale=h.snake_logscale)
if h.activation == "snake"
else (
activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale)
if h.activation == "snakebeta"
else None
)
)
if activation_post is None:
raise NotImplementedError(
"activation incorrectly specified. check the config file and look for 'activation'."
)
self.activation_post = Activation1d(activation=activation_post)
# Whether to use bias for the final conv_post. Default to True for backward compatibility
self.use_bias_at_final = h.get("use_bias_at_final", True)
self.conv_post = weight_norm(
Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final)
)
# Weight initialization
for i in range(len(self.ups)):
self.ups[i].apply(init_weights)
self.conv_post.apply(init_weights)
# Final tanh activation. Defaults to True for backward compatibility
self.use_tanh_at_final = h.get("use_tanh_at_final", True)
def forward(self, x):
# Pre-conv
x = self.conv_pre(x)
for i in range(self.num_upsamples):
# Upsampling
for i_up in range(len(self.ups[i])):
x = self.ups[i][i_up](x)
# AMP blocks
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
# Post-conv
x = self.activation_post(x)
x = self.conv_post(x)
# Final tanh activation
if self.use_tanh_at_final:
x = torch.tanh(x)
else:
x = torch.clamp(x, min=-1.0, max=1.0) # Bound the output to [-1, 1]
return x
def remove_weight_norm(self):
try:
# print("Removing weight norm...")
for l in self.ups:
for l_i in l:
remove_weight_norm(l_i)
for l in self.resblocks:
l.remove_weight_norm()
remove_weight_norm(self.conv_pre)
remove_weight_norm(self.conv_post)
except ValueError:
print("[INFO] Model already removed weight norm. Skipping!")
pass
# Additional methods for huggingface_hub support
def _save_pretrained(self, save_directory: Path) -> None:
"""Save weights and config.json from a Pytorch model to a local directory."""
model_path = save_directory / "bigvgan_generator.pt"
torch.save({"generator": self.state_dict()}, model_path)
config_path = save_directory / "config.json"
with open(config_path, "w") as config_file:
json.dump(self.h, config_file, indent=4)
@classmethod
def _from_pretrained(
cls,
*,
model_id: str,
revision: str,
cache_dir: str,
force_download: bool,
proxies: Optional[Dict],
resume_download: bool,
local_files_only: bool,
token: Union[str, bool, None],
map_location: str = "cpu", # Additional argument
strict: bool = False, # Additional argument
use_cuda_kernel: bool = False,
**model_kwargs,
):
"""Load Pytorch pretrained weights and return the loaded model."""
# Download and load hyperparameters (h) used by BigVGAN
if os.path.isdir(model_id):
# print("Loading config.json from local directory")
config_file = os.path.join(model_id, "config.json")
else:
config_file = hf_hub_download(
repo_id=model_id,
filename="config.json",
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
token=token,
local_files_only=local_files_only,
)
h = load_hparams_from_json(config_file)
# instantiate BigVGAN using h
if use_cuda_kernel:
print(
f"[WARNING] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!"
)
print(
f"[WARNING] You need nvcc and ninja installed in your system that matches your PyTorch build is using to build the kernel. If not, the model will fail to initialize or generate incorrect waveform!"
)
print(
f"[WARNING] For detail, see the official GitHub repository: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis"
)
model = cls(h, use_cuda_kernel=use_cuda_kernel)
# Download and load pretrained generator weight
if os.path.isdir(model_id):
# print("Loading weights from local directory")
model_file = os.path.join(model_id, "bigvgan_generator.pt")
else:
# print(f"Loading weights from {model_id}")
model_file = hf_hub_download(
repo_id=model_id,
filename="bigvgan_generator.pt",
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
token=token,
local_files_only=local_files_only,
)
checkpoint_dict = torch.load(model_file, map_location=map_location)
try:
model.load_state_dict(checkpoint_dict["generator"])
except RuntimeError:
print(
"[INFO] The pretrained checkpoint does not contain weight norm. Removing weight norm from the model before loading the checkpoint!"
)
model.remove_weight_norm()
model.load_state_dict(checkpoint_dict["generator"])
return model
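For reference, a minimal usage sketch of the class above, assuming BigVGAN mixes in huggingface_hub's PyTorchModelHubMixin (the _save_pretrained/_from_pretrained hooks suggest it does); the checkpoint id and the mel tensor below are illustrative placeholders, not something this diff ships:
# Usage sketch (illustrative, not part of bigvgan.py).
import torch
from bigvgan import BigVGAN

model = BigVGAN.from_pretrained("nvidia/bigvgan_v2_24khz_100band_256x", use_cuda_kernel=False)
model.remove_weight_norm()  # fold weight norm into the conv weights for inference
model.eval()

mel = torch.randn(1, model.h.num_mels, 200)  # [B, num_mels, frames]; replace with a real log-mel
with torch.no_grad():
    wav = model(mel)  # [B, 1, frames * prod(upsample_rates)], bounded to [-1, 1]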

View File

@ -0,0 +1,45 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 32,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [4,4,2,2,2,2],
"upsample_kernel_sizes": [8,8,4,4,4,4],
"upsample_initial_channel": 1536,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"activation": "snakebeta",
"snake_logscale": true,
"resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"segment_size": 8192,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"sampling_rate": 22050,
"fmin": 0,
"fmax": 8000,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,45 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 32,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [4,4,2,2,2,2],
"upsample_kernel_sizes": [8,8,4,4,4,4],
"upsample_initial_channel": 1536,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"activation": "snakebeta",
"snake_logscale": true,
"resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"segment_size": 8192,
"num_mels": 100,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"sampling_rate": 24000,
"fmin": 0,
"fmax": 12000,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,45 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 32,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [8,8,2,2],
"upsample_kernel_sizes": [16,16,4,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"activation": "snakebeta",
"snake_logscale": true,
"resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"segment_size": 8192,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"sampling_rate": 22050,
"fmin": 0,
"fmax": 8000,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,45 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 32,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [8,8,2,2],
"upsample_kernel_sizes": [16,16,4,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"activation": "snakebeta",
"snake_logscale": true,
"resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"segment_size": 8192,
"num_mels": 100,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"sampling_rate": 24000,
"fmin": 0,
"fmax": 12000,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,61 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 4,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [4,4,2,2,2,2],
"upsample_kernel_sizes": [8,8,4,4,4,4],
"upsample_initial_channel": 1536,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"use_tanh_at_final": false,
"use_bias_at_final": false,
"activation": "snakebeta",
"snake_logscale": true,
"use_cqtd_instead_of_mrd": true,
"cqtd_filters": 128,
"cqtd_max_filters": 1024,
"cqtd_filters_scale": 1,
"cqtd_dilations": [1, 2, 4],
"cqtd_hop_lengths": [512, 256, 256],
"cqtd_n_octaves": [9, 9, 9],
"cqtd_bins_per_octaves": [24, 36, 48],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"use_multiscale_melloss": true,
"lambda_melloss": 15,
"clip_grad_norm": 500,
"segment_size": 65536,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"sampling_rate": 22050,
"fmin": 0,
"fmax": null,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,61 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 4,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [4,4,2,2,2,2],
"upsample_kernel_sizes": [8,8,4,4,4,4],
"upsample_initial_channel": 1536,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"use_tanh_at_final": false,
"use_bias_at_final": false,
"activation": "snakebeta",
"snake_logscale": true,
"use_cqtd_instead_of_mrd": true,
"cqtd_filters": 128,
"cqtd_max_filters": 1024,
"cqtd_filters_scale": 1,
"cqtd_dilations": [1, 2, 4],
"cqtd_hop_lengths": [512, 256, 256],
"cqtd_n_octaves": [9, 9, 9],
"cqtd_bins_per_octaves": [24, 36, 48],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"use_multiscale_melloss": true,
"lambda_melloss": 15,
"clip_grad_norm": 500,
"segment_size": 65536,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"sampling_rate": 22050,
"fmin": 0,
"fmax": 8000,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,61 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 4,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [4,4,2,2,2,2],
"upsample_kernel_sizes": [8,8,4,4,4,4],
"upsample_initial_channel": 1536,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"use_tanh_at_final": false,
"use_bias_at_final": false,
"activation": "snakebeta",
"snake_logscale": true,
"use_cqtd_instead_of_mrd": true,
"cqtd_filters": 128,
"cqtd_max_filters": 1024,
"cqtd_filters_scale": 1,
"cqtd_dilations": [1, 2, 4],
"cqtd_hop_lengths": [512, 256, 256],
"cqtd_n_octaves": [9, 9, 9],
"cqtd_bins_per_octaves": [24, 36, 48],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"use_multiscale_melloss": true,
"lambda_melloss": 15,
"clip_grad_norm": 500,
"segment_size": 65536,
"num_mels": 100,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"sampling_rate": 24000,
"fmin": 0,
"fmax": null,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,61 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 4,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [4,4,2,2,2,2],
"upsample_kernel_sizes": [8,8,4,4,4,4],
"upsample_initial_channel": 1536,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"use_tanh_at_final": false,
"use_bias_at_final": false,
"activation": "snakebeta",
"snake_logscale": true,
"use_cqtd_instead_of_mrd": true,
"cqtd_filters": 128,
"cqtd_max_filters": 1024,
"cqtd_filters_scale": 1,
"cqtd_dilations": [1, 2, 4],
"cqtd_hop_lengths": [512, 256, 256],
"cqtd_n_octaves": [9, 9, 9],
"cqtd_bins_per_octaves": [24, 36, 48],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"use_multiscale_melloss": true,
"lambda_melloss": 15,
"clip_grad_norm": 500,
"segment_size": 65536,
"num_mels": 128,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"sampling_rate": 44100,
"fmin": 0,
"fmax": null,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,61 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 4,
"learning_rate": 0.0001,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.9999996,
"seed": 1234,
"upsample_rates": [8,4,2,2,2,2],
"upsample_kernel_sizes": [16,8,4,4,4,4],
"upsample_initial_channel": 1536,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"use_tanh_at_final": false,
"use_bias_at_final": false,
"activation": "snakebeta",
"snake_logscale": true,
"use_cqtd_instead_of_mrd": true,
"cqtd_filters": 128,
"cqtd_max_filters": 1024,
"cqtd_filters_scale": 1,
"cqtd_dilations": [1, 2, 4],
"cqtd_hop_lengths": [512, 256, 256],
"cqtd_n_octaves": [9, 9, 9],
"cqtd_bins_per_octaves": [24, 36, 48],
"mpd_reshapes": [2, 3, 5, 7, 11],
"use_spectral_norm": false,
"discriminator_channel_mult": 1,
"use_multiscale_melloss": true,
"lambda_melloss": 15,
"clip_grad_norm": 500,
"segment_size": 65536,
"num_mels": 128,
"num_freq": 2049,
"n_fft": 2048,
"hop_size": 512,
"win_size": 2048,
"sampling_rate": 44100,
"fmin": 0,
"fmax": null,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}
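Note that in every config above the product of upsample_rates equals hop_size (here 8·4·2·2·2·2 = 512), since the generator turns one mel frame into hop_size waveform samples. A small sanity-check sketch, assuming the JSON has been loaded from a placeholder path:
# Sanity-check sketch (illustrative): the total upsampling factor must equal hop_size.
import json
import math

with open("config.json") as f:  # path is a placeholder
    h = json.load(f)

assert math.prod(h["upsample_rates"]) == h["hop_size"], (h["upsample_rates"], h["hop_size"])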

View File

@ -0,0 +1,651 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv2d
from torch.nn.utils import weight_norm, spectral_norm
from torchaudio.transforms import Spectrogram, Resample
from env import AttrDict
from utils import get_padding
import typing
from typing import Optional, List, Union, Dict, Tuple
class DiscriminatorP(torch.nn.Module):
def __init__(
self,
h: AttrDict,
period: int,
kernel_size: int = 5,
stride: int = 3,
use_spectral_norm: bool = False,
):
super().__init__()
self.period = period
self.d_mult = h.discriminator_channel_mult
norm_f = weight_norm if not use_spectral_norm else spectral_norm
self.convs = nn.ModuleList(
[
norm_f(
Conv2d(
1,
int(32 * self.d_mult),
(kernel_size, 1),
(stride, 1),
padding=(get_padding(5, 1), 0),
)
),
norm_f(
Conv2d(
int(32 * self.d_mult),
int(128 * self.d_mult),
(kernel_size, 1),
(stride, 1),
padding=(get_padding(5, 1), 0),
)
),
norm_f(
Conv2d(
int(128 * self.d_mult),
int(512 * self.d_mult),
(kernel_size, 1),
(stride, 1),
padding=(get_padding(5, 1), 0),
)
),
norm_f(
Conv2d(
int(512 * self.d_mult),
int(1024 * self.d_mult),
(kernel_size, 1),
(stride, 1),
padding=(get_padding(5, 1), 0),
)
),
norm_f(
Conv2d(
int(1024 * self.d_mult),
int(1024 * self.d_mult),
(kernel_size, 1),
1,
padding=(2, 0),
)
),
]
)
self.conv_post = norm_f(
Conv2d(int(1024 * self.d_mult), 1, (3, 1), 1, padding=(1, 0))
)
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, 0.1)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
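The 1d-to-2d view above is the multi-period trick: the waveform is reflection-padded to a multiple of the period and folded into a [B, C, T/period, period] grid, so the (kernel_size, 1) convolutions only mix samples that are exactly one period apart. A standalone sketch of that fold, with placeholder shapes:
# Standalone sketch (illustrative) of the period fold used by DiscriminatorP.
import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 100)             # [B, C, T]
period = 3
b, c, t = x.shape
if t % period != 0:
    x = F.pad(x, (0, period - t % period), "reflect")
    t = x.shape[-1]
x = x.view(b, c, t // period, period)  # -> [1, 1, 34, 3]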
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, h: AttrDict):
super().__init__()
self.mpd_reshapes = h.mpd_reshapes
print(f"mpd_reshapes: {self.mpd_reshapes}")
self.discriminators = nn.ModuleList(
[
DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm)
for rs in self.mpd_reshapes
]
)
def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[
List[torch.Tensor],
List[torch.Tensor],
List[List[torch.Tensor]],
List[List[torch.Tensor]],
]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorR(nn.Module):
def __init__(self, cfg: AttrDict, resolution: List[int]):
super().__init__()
self.resolution = resolution
assert (
len(self.resolution) == 3
), f"MRD layer requires list with len=3, got {self.resolution}"
self.lrelu_slope = 0.1
norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm
if hasattr(cfg, "mrd_use_spectral_norm"):
print(
f"[INFO] overriding MRD use_spectral_norm as {cfg.mrd_use_spectral_norm}"
)
norm_f = (
weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm
)
self.d_mult = cfg.discriminator_channel_mult
if hasattr(cfg, "mrd_channel_mult"):
print(f"[INFO] overriding mrd channel multiplier as {cfg.mrd_channel_mult}")
self.d_mult = cfg.mrd_channel_mult
self.convs = nn.ModuleList(
[
norm_f(nn.Conv2d(1, int(32 * self.d_mult), (3, 9), padding=(1, 4))),
norm_f(
nn.Conv2d(
int(32 * self.d_mult),
int(32 * self.d_mult),
(3, 9),
stride=(1, 2),
padding=(1, 4),
)
),
norm_f(
nn.Conv2d(
int(32 * self.d_mult),
int(32 * self.d_mult),
(3, 9),
stride=(1, 2),
padding=(1, 4),
)
),
norm_f(
nn.Conv2d(
int(32 * self.d_mult),
int(32 * self.d_mult),
(3, 9),
stride=(1, 2),
padding=(1, 4),
)
),
norm_f(
nn.Conv2d(
int(32 * self.d_mult),
int(32 * self.d_mult),
(3, 3),
padding=(1, 1),
)
),
]
)
self.conv_post = norm_f(
nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))
)
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
fmap = []
x = self.spectrogram(x)
x = x.unsqueeze(1)
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, self.lrelu_slope)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
n_fft, hop_length, win_length = self.resolution
x = F.pad(
x,
(int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)),
mode="reflect",
)
x = x.squeeze(1)
x = torch.stft(
x,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
center=False,
return_complex=True,
)
x = torch.view_as_real(x) # [B, F, TT, 2]
mag = torch.norm(x, p=2, dim=-1) # [B, F, TT]
return mag
class MultiResolutionDiscriminator(nn.Module):
def __init__(self, cfg, debug=False):
super().__init__()
self.resolutions = cfg.resolutions
assert (
len(self.resolutions) == 3
), f"MRD requires list of list with len=3, each element having a list with len=3. Got {self.resolutions}"
self.discriminators = nn.ModuleList(
[DiscriminatorR(cfg, resolution) for resolution in self.resolutions]
)
def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[
List[torch.Tensor],
List[torch.Tensor],
List[List[torch.Tensor]],
List[List[torch.Tensor]],
]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(x=y)
y_d_g, fmap_g = d(x=y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
# Method based on descript-audio-codec: https://github.com/descriptinc/descript-audio-codec
# Modified code adapted from https://github.com/gemelo-ai/vocos under the MIT license.
# LICENSE is in incl_licenses directory.
class DiscriminatorB(nn.Module):
def __init__(
self,
window_length: int,
channels: int = 32,
hop_factor: float = 0.25,
bands: Tuple[Tuple[float, float], ...] = (
(0.0, 0.1),
(0.1, 0.25),
(0.25, 0.5),
(0.5, 0.75),
(0.75, 1.0),
),
):
super().__init__()
self.window_length = window_length
self.hop_factor = hop_factor
self.spec_fn = Spectrogram(
n_fft=window_length,
hop_length=int(window_length * hop_factor),
win_length=window_length,
power=None,
)
n_fft = window_length // 2 + 1
bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
self.bands = bands
convs = lambda: nn.ModuleList(
[
weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
weight_norm(
nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
),
weight_norm(
nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
),
weight_norm(
nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
),
weight_norm(
nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))
),
]
)
self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
self.conv_post = weight_norm(
nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1))
)
def spectrogram(self, x: torch.Tensor) -> List[torch.Tensor]:
# Remove DC offset
x = x - x.mean(dim=-1, keepdims=True)
# Peak normalize the volume of input audio
x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
x = self.spec_fn(x)
x = torch.view_as_real(x)
x = x.permute(0, 3, 2, 1) # [B, F, T, C] -> [B, C, T, F]
# Split into bands
x_bands = [x[..., b[0] : b[1]] for b in self.bands]
return x_bands
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
x_bands = self.spectrogram(x.squeeze(1))
fmap = []
x = []
for band, stack in zip(x_bands, self.band_convs):
for i, layer in enumerate(stack):
band = layer(band)
band = torch.nn.functional.leaky_relu(band, 0.1)
if i > 0:
fmap.append(band)
x.append(band)
x = torch.cat(x, dim=-1)
x = self.conv_post(x)
fmap.append(x)
return x, fmap
# Method based on descript-audio-codec: https://github.com/descriptinc/descript-audio-codec
# Modified code adapted from https://github.com/gemelo-ai/vocos under the MIT license.
# LICENSE is in incl_licenses directory.
class MultiBandDiscriminator(nn.Module):
def __init__(
self,
h,
):
"""
Multi-band multi-scale STFT discriminator, with the architecture based on https://github.com/descriptinc/descript-audio-codec
and the modified code adapted from https://github.com/gemelo-ai/vocos.
"""
super().__init__()
# fft_sizes (list[int]): Tuple of window lengths for FFT. Defaults to [2048, 1024, 512] if not set in h.
self.fft_sizes = h.get("mbd_fft_sizes", [2048, 1024, 512])
self.discriminators = nn.ModuleList(
[DiscriminatorB(window_length=w) for w in self.fft_sizes]
)
def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[
List[torch.Tensor],
List[torch.Tensor],
List[List[torch.Tensor]],
List[List[torch.Tensor]],
]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for d in self.discriminators:
y_d_r, fmap_r = d(x=y)
y_d_g, fmap_g = d(x=y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
# Adapted from https://github.com/open-mmlab/Amphion/blob/main/models/vocoders/gan/discriminator/mssbcqtd.py under the MIT license.
# LICENSE is in incl_licenses directory.
class DiscriminatorCQT(nn.Module):
def __init__(self, cfg: AttrDict, hop_length: int, n_octaves: int, bins_per_octave: int):
super().__init__()
self.cfg = cfg
self.filters = cfg["cqtd_filters"]
self.max_filters = cfg["cqtd_max_filters"]
self.filters_scale = cfg["cqtd_filters_scale"]
self.kernel_size = (3, 9)
self.dilations = cfg["cqtd_dilations"]
self.stride = (1, 2)
self.in_channels = cfg["cqtd_in_channels"]
self.out_channels = cfg["cqtd_out_channels"]
self.fs = cfg["sampling_rate"]
self.hop_length = hop_length
self.n_octaves = n_octaves
self.bins_per_octave = bins_per_octave
# Lazy-load
from nnAudio import features
self.cqt_transform = features.cqt.CQT2010v2(
sr=self.fs * 2,
hop_length=self.hop_length,
n_bins=self.bins_per_octave * self.n_octaves,
bins_per_octave=self.bins_per_octave,
output_format="Complex",
pad_mode="constant",
)
self.conv_pres = nn.ModuleList()
for _ in range(self.n_octaves):
self.conv_pres.append(
nn.Conv2d(
self.in_channels * 2,
self.in_channels * 2,
kernel_size=self.kernel_size,
padding=self.get_2d_padding(self.kernel_size),
)
)
self.convs = nn.ModuleList()
self.convs.append(
nn.Conv2d(
self.in_channels * 2,
self.filters,
kernel_size=self.kernel_size,
padding=self.get_2d_padding(self.kernel_size),
)
)
in_chs = min(self.filters_scale * self.filters, self.max_filters)
for i, dilation in enumerate(self.dilations):
out_chs = min(
(self.filters_scale ** (i + 1)) * self.filters, self.max_filters
)
self.convs.append(
weight_norm(
nn.Conv2d(
in_chs,
out_chs,
kernel_size=self.kernel_size,
stride=self.stride,
dilation=(dilation, 1),
padding=self.get_2d_padding(self.kernel_size, (dilation, 1)),
)
)
)
in_chs = out_chs
out_chs = min(
(self.filters_scale ** (len(self.dilations) + 1)) * self.filters,
self.max_filters,
)
self.convs.append(
weight_norm(
nn.Conv2d(
in_chs,
out_chs,
kernel_size=(self.kernel_size[0], self.kernel_size[0]),
padding=self.get_2d_padding(
(self.kernel_size[0], self.kernel_size[0])
),
)
)
)
self.conv_post = weight_norm(
nn.Conv2d(
out_chs,
self.out_channels,
kernel_size=(self.kernel_size[0], self.kernel_size[0]),
padding=self.get_2d_padding((self.kernel_size[0], self.kernel_size[0])),
)
)
self.activation = torch.nn.LeakyReLU(negative_slope=0.1)
self.resample = Resample(orig_freq=self.fs, new_freq=self.fs * 2)
self.cqtd_normalize_volume = self.cfg.get("cqtd_normalize_volume", False)
if self.cqtd_normalize_volume:
print(
f"[INFO] cqtd_normalize_volume set to True. Will apply DC offset removal & peak volume normalization in CQTD!"
)
def get_2d_padding(
self,
kernel_size: typing.Tuple[int, int],
dilation: typing.Tuple[int, int] = (1, 1),
):
return (
((kernel_size[0] - 1) * dilation[0]) // 2,
((kernel_size[1] - 1) * dilation[1]) // 2,
)
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
fmap = []
if self.cqtd_normalize_volume:
# Remove DC offset
x = x - x.mean(dim=-1, keepdims=True)
# Peak normalize the volume of input audio
x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
x = self.resample(x)
z = self.cqt_transform(x)
z_amplitude = z[:, :, :, 0].unsqueeze(1)
z_phase = z[:, :, :, 1].unsqueeze(1)
z = torch.cat([z_amplitude, z_phase], dim=1)
z = torch.permute(z, (0, 1, 3, 2)) # [B, C, W, T] -> [B, C, T, W]
latent_z = []
for i in range(self.n_octaves):
latent_z.append(
self.conv_pres[i](
z[
:,
:,
:,
i * self.bins_per_octave : (i + 1) * self.bins_per_octave,
]
)
)
latent_z = torch.cat(latent_z, dim=-1)
for i, l in enumerate(self.convs):
latent_z = l(latent_z)
latent_z = self.activation(latent_z)
fmap.append(latent_z)
latent_z = self.conv_post(latent_z)
return latent_z, fmap
class MultiScaleSubbandCQTDiscriminator(nn.Module):
def __init__(self, cfg: AttrDict):
super().__init__()
self.cfg = cfg
# Using get with defaults
self.cfg["cqtd_filters"] = self.cfg.get("cqtd_filters", 32)
self.cfg["cqtd_max_filters"] = self.cfg.get("cqtd_max_filters", 1024)
self.cfg["cqtd_filters_scale"] = self.cfg.get("cqtd_filters_scale", 1)
self.cfg["cqtd_dilations"] = self.cfg.get("cqtd_dilations", [1, 2, 4])
self.cfg["cqtd_in_channels"] = self.cfg.get("cqtd_in_channels", 1)
self.cfg["cqtd_out_channels"] = self.cfg.get("cqtd_out_channels", 1)
# Multi-scale params to loop over
self.cfg["cqtd_hop_lengths"] = self.cfg.get("cqtd_hop_lengths", [512, 256, 256])
self.cfg["cqtd_n_octaves"] = self.cfg.get("cqtd_n_octaves", [9, 9, 9])
self.cfg["cqtd_bins_per_octaves"] = self.cfg.get(
"cqtd_bins_per_octaves", [24, 36, 48]
)
self.discriminators = nn.ModuleList(
[
DiscriminatorCQT(
self.cfg,
hop_length=self.cfg["cqtd_hop_lengths"][i],
n_octaves=self.cfg["cqtd_n_octaves"][i],
bins_per_octave=self.cfg["cqtd_bins_per_octaves"][i],
)
for i in range(len(self.cfg["cqtd_hop_lengths"]))
]
)
def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[
List[torch.Tensor],
List[torch.Tensor],
List[List[torch.Tensor]],
List[List[torch.Tensor]],
]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for disc in self.discriminators:
y_d_r, fmap_r = disc(y)
y_d_g, fmap_g = disc(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class CombinedDiscriminator(nn.Module):
"""
Wrapper for chaining multiple discriminator architectures.
Example: combine mbd and cqtd as a single class.
"""
def __init__(self, list_discriminator: List[nn.Module]):
super().__init__()
self.discrimiantor = nn.ModuleList(list_discriminator)
def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[
List[torch.Tensor],
List[torch.Tensor],
List[List[torch.Tensor]],
List[List[torch.Tensor]],
]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for disc in self.discrimiantor:
y_d_r, y_d_g, fmap_r, fmap_g = disc(y, y_hat)
y_d_rs.extend(y_d_r)
fmap_rs.extend(fmap_r)
y_d_gs.extend(y_d_g)
fmap_gs.extend(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
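A hedged sketch of the CombinedDiscriminator wiring described in the docstring, combining the multi-band and multi-scale sub-band CQT discriminators; it assumes this file is importable as discriminators, that nnAudio is installed for the CQT transform, and the waveform tensors are placeholders:
# Illustrative sketch; assumes this file imports as `discriminators` and nnAudio is installed.
import torch
from env import AttrDict
from discriminators import (
    MultiBandDiscriminator,
    MultiScaleSubbandCQTDiscriminator,
    CombinedDiscriminator,
)

h = AttrDict({
    "sampling_rate": 24000,
    "cqtd_hop_lengths": [512, 256, 256],
    "cqtd_n_octaves": [9, 9, 9],
    "cqtd_bins_per_octaves": [24, 36, 48],
})
disc = CombinedDiscriminator([MultiBandDiscriminator(h), MultiScaleSubbandCQTDiscriminator(h)])

y = torch.randn(2, 1, 16384)      # real waveform batch (placeholder values)
y_hat = torch.randn(2, 1, 16384)  # generated waveform batch (placeholder values)
y_d_rs, y_d_gs, fmap_rs, fmap_gs = disc(y, y_hat)  # per-discriminator logits and feature maps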

18
GPT_SoVITS/BigVGAN/env.py Normal file
View File

@ -0,0 +1,18 @@
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import os
import shutil
class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
def build_env(config, config_name, path):
t_path = os.path.join(path, config_name)
if config != t_path:
os.makedirs(path, exist_ok=True)
shutil.copyfile(config, os.path.join(path, config_name))

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 Jungil Kong
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 Edward Dixon
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2019, Seungwon Park 박승원
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,16 @@
Copyright 2020 Alexandre Défossez
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023-present, Descript
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Charactr Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Amphion
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,89 @@
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import argparse
import json
import torch
import librosa
from utils import load_checkpoint
from meldataset import get_mel_spectrogram
from scipy.io.wavfile import write
from env import AttrDict
from meldataset import MAX_WAV_VALUE
from bigvgan import BigVGAN as Generator
h = None
device = None
torch.backends.cudnn.benchmark = False
def inference(a, h):
generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device)
state_dict_g = load_checkpoint(a.checkpoint_file, device)
generator.load_state_dict(state_dict_g["generator"])
filelist = os.listdir(a.input_wavs_dir)
os.makedirs(a.output_dir, exist_ok=True)
generator.eval()
generator.remove_weight_norm()
with torch.no_grad():
for i, filname in enumerate(filelist):
# Load the ground truth audio and resample if necessary
wav, sr = librosa.load(
os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True
)
wav = torch.FloatTensor(wav).to(device)
# Compute mel spectrogram from the ground truth audio
x = get_mel_spectrogram(wav.unsqueeze(0), generator.h)
y_g_hat = generator(x)
audio = y_g_hat.squeeze()
audio = audio * MAX_WAV_VALUE
audio = audio.cpu().numpy().astype("int16")
output_file = os.path.join(
a.output_dir, os.path.splitext(filname)[0] + "_generated.wav"
)
write(output_file, h.sampling_rate, audio)
print(output_file)
def main():
print("Initializing Inference Process..")
parser = argparse.ArgumentParser()
parser.add_argument("--input_wavs_dir", default="test_files")
parser.add_argument("--output_dir", default="generated_files")
parser.add_argument("--checkpoint_file", required=True)
parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
a = parser.parse_args()
config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json")
with open(config_file) as f:
data = f.read()
global h
json_config = json.loads(data)
h = AttrDict(json_config)
torch.manual_seed(h.seed)
global device
if torch.cuda.is_available():
torch.cuda.manual_seed(h.seed)
device = torch.device("cuda")
else:
device = torch.device("cpu")
inference(a, h)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,102 @@
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
from __future__ import absolute_import, division, print_function, unicode_literals
import glob
import os
import numpy as np
import argparse
import json
import torch
from scipy.io.wavfile import write
from env import AttrDict
from meldataset import MAX_WAV_VALUE
from bigvgan import BigVGAN as Generator
h = None
device = None
torch.backends.cudnn.benchmark = False
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print(f"Loading '{filepath}'")
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def scan_checkpoint(cp_dir, prefix):
pattern = os.path.join(cp_dir, prefix + "*")
cp_list = glob.glob(pattern)
if len(cp_list) == 0:
return ""
return sorted(cp_list)[-1]
def inference(a, h):
generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device)
state_dict_g = load_checkpoint(a.checkpoint_file, device)
generator.load_state_dict(state_dict_g["generator"])
filelist = os.listdir(a.input_mels_dir)
os.makedirs(a.output_dir, exist_ok=True)
generator.eval()
generator.remove_weight_norm()
with torch.no_grad():
for i, filname in enumerate(filelist):
# Load the mel spectrogram in .npy format
x = np.load(os.path.join(a.input_mels_dir, filname))
x = torch.FloatTensor(x).to(device)
if len(x.shape) == 2:
x = x.unsqueeze(0)
y_g_hat = generator(x)
audio = y_g_hat.squeeze()
audio = audio * MAX_WAV_VALUE
audio = audio.cpu().numpy().astype("int16")
output_file = os.path.join(
a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav"
)
write(output_file, h.sampling_rate, audio)
print(output_file)
def main():
print("Initializing Inference Process..")
parser = argparse.ArgumentParser()
parser.add_argument("--input_mels_dir", default="test_mel_files")
parser.add_argument("--output_dir", default="generated_files_from_mel")
parser.add_argument("--checkpoint_file", required=True)
parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
a = parser.parse_args()
config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json")
with open(config_file) as f:
data = f.read()
global h
json_config = json.loads(data)
h = AttrDict(json_config)
torch.manual_seed(h.seed)
global device
if torch.cuda.is_available():
torch.cuda.manual_seed(h.seed)
device = torch.device("cuda")
else:
device = torch.device("cpu")
inference(a, h)
if __name__ == "__main__":
main()

254
GPT_SoVITS/BigVGAN/loss.py Normal file
View File

@ -0,0 +1,254 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import torch
import torch.nn.functional as F
import torch.nn as nn
from librosa.filters import mel as librosa_mel_fn
from scipy import signal
import typing
from typing import Optional, List, Union, Dict, Tuple
from collections import namedtuple
import math
import functools
# Adapted from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/loss.py under the MIT license.
# LICENSE is in incl_licenses directory.
class MultiScaleMelSpectrogramLoss(nn.Module):
"""Compute distance between mel spectrograms. Can be used
in a multi-scale way.
Parameters
----------
n_mels : List[int]
Number of mels per STFT, by default [5, 10, 20, 40, 80, 160, 320],
window_lengths : List[int], optional
Length of each window of each STFT, by default [32, 64, 128, 256, 512, 1024, 2048]
loss_fn : typing.Callable, optional
How to compare each loss, by default nn.L1Loss()
clamp_eps : float, optional
Clamp on the log magnitude, below, by default 1e-5
mag_weight : float, optional
Weight of the raw magnitude portion of the loss, by default 0.0 (no amplification of the magnitude part)
log_weight : float, optional
Weight of log magnitude portion of loss, by default 1.0
pow : float, optional
Power to raise magnitude to before taking log, by default 1.0
weight : float, optional
Weight of this loss, by default 1.0
match_stride : bool, optional
Whether to match the stride of convolutional layers, by default False
Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
Additional code copied and modified from https://github.com/descriptinc/audiotools/blob/master/audiotools/core/audio_signal.py
"""
def __init__(
self,
sampling_rate: int,
n_mels: List[int] = [5, 10, 20, 40, 80, 160, 320],
window_lengths: List[int] = [32, 64, 128, 256, 512, 1024, 2048],
loss_fn: typing.Callable = nn.L1Loss(),
clamp_eps: float = 1e-5,
mag_weight: float = 0.0,
log_weight: float = 1.0,
pow: float = 1.0,
weight: float = 1.0,
match_stride: bool = False,
mel_fmin: List[float] = [0, 0, 0, 0, 0, 0, 0],
mel_fmax: List[float] = [None, None, None, None, None, None, None],
window_type: str = "hann",
):
super().__init__()
self.sampling_rate = sampling_rate
STFTParams = namedtuple(
"STFTParams",
["window_length", "hop_length", "window_type", "match_stride"],
)
self.stft_params = [
STFTParams(
window_length=w,
hop_length=w // 4,
match_stride=match_stride,
window_type=window_type,
)
for w in window_lengths
]
self.n_mels = n_mels
self.loss_fn = loss_fn
self.clamp_eps = clamp_eps
self.log_weight = log_weight
self.mag_weight = mag_weight
self.weight = weight
self.mel_fmin = mel_fmin
self.mel_fmax = mel_fmax
self.pow = pow
@staticmethod
@functools.lru_cache(None)
def get_window(
window_type,
window_length,
):
return signal.get_window(window_type, window_length)
@staticmethod
@functools.lru_cache(None)
def get_mel_filters(sr, n_fft, n_mels, fmin, fmax):
return librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
def mel_spectrogram(
self,
wav,
n_mels,
fmin,
fmax,
window_length,
hop_length,
match_stride,
window_type,
):
"""
Mirrors AudioSignal.mel_spectrogram used by BigVGAN-v2 training from:
https://github.com/descriptinc/audiotools/blob/master/audiotools/core/audio_signal.py
"""
B, C, T = wav.shape
if match_stride:
assert (
hop_length == window_length // 4
), "For match_stride, hop must equal n_fft // 4"
right_pad = math.ceil(T / hop_length) * hop_length - T
pad = (window_length - hop_length) // 2
else:
right_pad = 0
pad = 0
wav = torch.nn.functional.pad(wav, (pad, pad + right_pad), mode="reflect")
window = self.get_window(window_type, window_length)
window = torch.from_numpy(window).to(wav.device).float()
stft = torch.stft(
wav.reshape(-1, T),
n_fft=window_length,
hop_length=hop_length,
window=window,
return_complex=True,
center=True,
)
_, nf, nt = stft.shape
stft = stft.reshape(B, C, nf, nt)
if match_stride:
"""
Drop the first two and the last two frames, which were added by the padding, so that num_frames * hop_length = num_samples.
"""
stft = stft[..., 2:-2]
magnitude = torch.abs(stft)
nf = magnitude.shape[2]
mel_basis = self.get_mel_filters(
self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax
)
mel_basis = torch.from_numpy(mel_basis).to(wav.device)
mel_spectrogram = magnitude.transpose(2, -1) @ mel_basis.T
mel_spectrogram = mel_spectrogram.transpose(-1, 2)
return mel_spectrogram
def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
"""Computes mel loss between an estimate and a reference
signal.
Parameters
----------
x : torch.Tensor
Estimate signal
y : torch.Tensor
Reference signal
Returns
-------
torch.Tensor
Mel loss.
"""
loss = 0.0
for n_mels, fmin, fmax, s in zip(
self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params
):
kwargs = {
"n_mels": n_mels,
"fmin": fmin,
"fmax": fmax,
"window_length": s.window_length,
"hop_length": s.hop_length,
"match_stride": s.match_stride,
"window_type": s.window_type,
}
x_mels = self.mel_spectrogram(x, **kwargs)
y_mels = self.mel_spectrogram(y, **kwargs)
x_logmels = torch.log(
x_mels.clamp(min=self.clamp_eps).pow(self.pow)
) / torch.log(torch.tensor(10.0))
y_logmels = torch.log(
y_mels.clamp(min=self.clamp_eps).pow(self.pow)
) / torch.log(torch.tensor(10.0))
loss += self.log_weight * self.loss_fn(x_logmels, y_logmels)
loss += self.mag_weight * self.loss_fn(x_logmels, y_logmels)
return loss
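A short usage sketch of the multi-scale mel loss above, with the lambda_melloss = 15 weighting used by the BigVGAN-v2 configs earlier in this diff; the module name and the audio tensors are placeholders:
# Illustrative sketch; assumes this file is importable as `loss`.
import torch
from loss import MultiScaleMelSpectrogramLoss

mel_loss = MultiScaleMelSpectrogramLoss(sampling_rate=24000)
y = torch.randn(2, 1, 24000)        # reference audio (placeholder), [B, C, T]
y_hat = torch.randn(2, 1, 24000)    # generated audio (placeholder)
loss_mel = mel_loss(y_hat, y) * 15  # lambda_melloss = 15 in the v2 configs above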
# Loss functions
def feature_loss(
fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]]
) -> torch.Tensor:
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
loss += torch.mean(torch.abs(rl - gl))
return loss * 2 # This equates to lambda=2.0 for the feature matching loss
def discriminator_loss(
disc_real_outputs: List[torch.Tensor], disc_generated_outputs: List[torch.Tensor]
) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]:
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
r_loss = torch.mean((1 - dr) ** 2)
g_loss = torch.mean(dg**2)
loss += r_loss + g_loss
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())
return loss, r_losses, g_losses
def generator_loss(
disc_outputs: List[torch.Tensor],
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
loss = 0
gen_losses = []
for dg in disc_outputs:
l = torch.mean((1 - dg) ** 2)
gen_losses.append(l)
loss += l
return loss, gen_losses
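The three helpers above implement the least-squares GAN objective plus an L1 feature-matching term scaled by 2: the discriminator pushes real outputs toward 1 and generated outputs toward 0, while the generator pushes its outputs toward 1. A toy sketch on dummy discriminator outputs:
# Toy sketch (illustrative) of the LSGAN losses above; assumes this file imports as `loss`.
import torch
from loss import feature_loss, discriminator_loss, generator_loss

disc_real = [torch.full((2, 10), 0.9)]  # one discriminator's outputs on real audio
disc_fake = [torch.full((2, 10), 0.1)]  # ... and on generated audio

d_loss, r_losses, g_losses = discriminator_loss(disc_real, disc_fake)  # ~0.02
g_adv, gen_losses = generator_loss(disc_fake)                          # ~0.81
fm = feature_loss([[torch.zeros(2, 4)]], [[torch.ones(2, 4)]])         # |0 - 1| * 2 = 2.0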

View File

@ -0,0 +1,396 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import math
import os
import random
import torch
import torch.utils.data
import numpy as np
import librosa
from librosa.filters import mel as librosa_mel_fn
import pathlib
from tqdm import tqdm
from typing import List, Tuple, Optional
from env import AttrDict
MAX_WAV_VALUE = 32767.0  # NOTE: 32768.0 - 1, to prevent int16 overflow (overflow results in a popping sound in corner cases)
def dynamic_range_compression(x, C=1, clip_val=1e-5):
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
def dynamic_range_decompression(x, C=1):
return np.exp(x) / C
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
return torch.log(torch.clamp(x, min=clip_val) * C)
def dynamic_range_decompression_torch(x, C=1):
return torch.exp(x) / C
def spectral_normalize_torch(magnitudes):
return dynamic_range_compression_torch(magnitudes)
def spectral_de_normalize_torch(magnitudes):
return dynamic_range_decompression_torch(magnitudes)
mel_basis_cache = {}
hann_window_cache = {}
def mel_spectrogram(
y: torch.Tensor,
n_fft: int,
num_mels: int,
sampling_rate: int,
hop_size: int,
win_size: int,
fmin: int,
fmax: int = None,
center: bool = False,
) -> torch.Tensor:
"""
Calculate the mel spectrogram of an input signal.
This function uses slaney norm for the librosa mel filterbank (using librosa.filters.mel) and uses Hann window for STFT (using torch.stft).
Args:
y (torch.Tensor): Input signal.
n_fft (int): FFT size.
num_mels (int): Number of mel bins.
sampling_rate (int): Sampling rate of the input signal.
hop_size (int): Hop size for STFT.
win_size (int): Window size for STFT.
fmin (int): Minimum frequency for mel filterbank.
fmax (int): Maximum frequency for mel filterbank. If None, defaults to half the sampling rate (fmax = sr / 2.0) inside librosa_mel_fn
center (bool): Whether to pad the input to center the frames. Default is False.
Returns:
torch.Tensor: Mel spectrogram.
"""
if torch.min(y) < -1.0:
print(f"[WARNING] Min value of input waveform signal is {torch.min(y)}")
if torch.max(y) > 1.0:
print(f"[WARNING] Max value of input waveform signal is {torch.max(y)}")
device = y.device
key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}"
if key not in mel_basis_cache:
mel = librosa_mel_fn(
sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
)
mel_basis_cache[key] = torch.from_numpy(mel).float().to(device)
hann_window_cache[key] = torch.hann_window(win_size).to(device)
mel_basis = mel_basis_cache[key]
hann_window = hann_window_cache[key]
padding = (n_fft - hop_size) // 2
y = torch.nn.functional.pad(
y.unsqueeze(1), (padding, padding), mode="reflect"
).squeeze(1)
spec = torch.stft(
y,
n_fft,
hop_length=hop_size,
win_length=win_size,
window=hann_window,
center=center,
pad_mode="reflect",
normalized=False,
onesided=True,
return_complex=True,
)
spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
mel_spec = torch.matmul(mel_basis, spec)
mel_spec = spectral_normalize_torch(mel_spec)
return mel_spec
def get_mel_spectrogram(wav, h):
"""
Generate mel spectrogram from a waveform using given hyperparameters.
Args:
wav (torch.Tensor): Input waveform.
h: Hyperparameters object with attributes n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax.
Returns:
torch.Tensor: Mel spectrogram.
"""
return mel_spectrogram(
wav,
h.n_fft,
h.num_mels,
h.sampling_rate,
h.hop_size,
h.win_size,
h.fmin,
h.fmax,
)
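# --- Illustrative usage sketch (not part of the upstream BigVGAN dataset code) ---
# Shows which hyperparameter fields get_mel_spectrogram() reads from `h`. The values
# below are placeholders for illustration, not the configs shipped with this PR.
if __name__ == "__main__":
    _h = AttrDict(
        n_fft=1024,
        num_mels=100,
        sampling_rate=24000,
        hop_size=256,
        win_size=1024,
        fmin=0,
        fmax=None,  # None lets librosa default fmax to sampling_rate / 2
    )
    _wav = torch.rand(1, 24000) * 2 - 1  # [B, T] waveform in [-1, 1]
    _mel = get_mel_spectrogram(_wav, _h)
    print(_mel.shape)  # [B, num_mels, frames]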
def get_dataset_filelist(a):
training_files = []
validation_files = []
list_unseen_validation_files = []
with open(a.input_training_file, "r", encoding="utf-8") as fi:
training_files = [
os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
for x in fi.read().split("\n")
if len(x) > 0
]
print(f"first training file: {training_files[0]}")
with open(a.input_validation_file, "r", encoding="utf-8") as fi:
validation_files = [
os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
for x in fi.read().split("\n")
if len(x) > 0
]
print(f"first validation file: {validation_files[0]}")
for i in range(len(a.list_input_unseen_validation_file)):
with open(a.list_input_unseen_validation_file[i], "r", encoding="utf-8") as fi:
unseen_validation_files = [
os.path.join(a.list_input_unseen_wavs_dir[i], x.split("|")[0] + ".wav")
for x in fi.read().split("\n")
if len(x) > 0
]
print(
f"first unseen {i}th validation fileset: {unseen_validation_files[0]}"
)
list_unseen_validation_files.append(unseen_validation_files)
return training_files, validation_files, list_unseen_validation_files
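# --- Illustrative usage sketch (not part of the upstream BigVGAN dataset code) ---
# get_dataset_filelist() reads pipe-separated filelists and only uses the first field
# (the basename without ".wav"); anything after the first "|" is ignored here. The
# temporary directory and filelist contents below are placeholders for illustration.
if __name__ == "__main__":
    import tempfile
    from types import SimpleNamespace

    _tmp = tempfile.mkdtemp()
    _filelist = os.path.join(_tmp, "train.txt")
    with open(_filelist, "w", encoding="utf-8") as f:
        f.write("speaker1/utt_0001|some transcript\nspeaker1/utt_0002|another transcript\n")
    _a = SimpleNamespace(
        input_training_file=_filelist,
        input_validation_file=_filelist,
        input_wavs_dir=os.path.join(_tmp, "wavs"),
        list_input_unseen_validation_file=[_filelist],
        list_input_unseen_wavs_dir=[os.path.join(_tmp, "wavs")],
    )
    _train, _val, _unseen = get_dataset_filelist(_a)
    print(_train[0])  # <tmp>/wavs/speaker1/utt_0001.wav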
class MelDataset(torch.utils.data.Dataset):
def __init__(
self,
training_files: List[str],
hparams: AttrDict,
segment_size: int,
n_fft: int,
num_mels: int,
hop_size: int,
win_size: int,
sampling_rate: int,
fmin: int,
fmax: Optional[int],
split: bool = True,
shuffle: bool = True,
device: str = None,
fmax_loss: Optional[int] = None,
fine_tuning: bool = False,
base_mels_path: str = None,
is_seen: bool = True,
):
self.audio_files = training_files
random.seed(1234)
if shuffle:
random.shuffle(self.audio_files)
self.hparams = hparams
self.is_seen = is_seen
if self.is_seen:
self.name = pathlib.Path(self.audio_files[0]).parts[0]
else:
self.name = "-".join(pathlib.Path(self.audio_files[0]).parts[:2]).strip("/")
self.segment_size = segment_size
self.sampling_rate = sampling_rate
self.split = split
self.n_fft = n_fft
self.num_mels = num_mels
self.hop_size = hop_size
self.win_size = win_size
self.fmin = fmin
self.fmax = fmax
self.fmax_loss = fmax_loss
self.device = device
self.fine_tuning = fine_tuning
self.base_mels_path = base_mels_path
print("[INFO] checking dataset integrity...")
for i in tqdm(range(len(self.audio_files))):
assert os.path.exists(
self.audio_files[i]
), f"{self.audio_files[i]} not found"
def __getitem__(
self, index: int
) -> Tuple[torch.Tensor, torch.Tensor, str, torch.Tensor]:
try:
filename = self.audio_files[index]
# Use librosa.load that ensures loading waveform into mono with [-1, 1] float values
# Audio is ndarray with shape [T_time]. Disable auto-resampling here to minimize overhead
# The on-the-fly resampling during training will be done only for the obtained random chunk
audio, source_sampling_rate = librosa.load(filename, sr=None, mono=True)
# Main logic that uses <mel, audio> pair for training BigVGAN
if not self.fine_tuning:
if self.split: # Training step
# Obtain randomized audio chunk
if source_sampling_rate != self.sampling_rate:
# Adjust segment size to crop if the source sr is different
target_segment_size = math.ceil(
self.segment_size
* (source_sampling_rate / self.sampling_rate)
)
else:
target_segment_size = self.segment_size
# Compute upper bound index for the random chunk
random_chunk_upper_bound = max(
0, audio.shape[0] - target_segment_size
)
# Crop or pad audio to obtain random chunk with target_segment_size
if audio.shape[0] >= target_segment_size:
audio_start = random.randint(0, random_chunk_upper_bound)
audio = audio[audio_start : audio_start + target_segment_size]
else:
audio = np.pad(
audio,
(0, target_segment_size - audio.shape[0]),
mode="constant",
)
# Resample audio chunk to self.sampling_rate
if source_sampling_rate != self.sampling_rate:
audio = librosa.resample(
audio,
orig_sr=source_sampling_rate,
target_sr=self.sampling_rate,
)
if audio.shape[0] > self.segment_size:
# trim last elements to match self.segment_size (e.g., 16385 for 44khz downsampled to 24khz -> 16384)
audio = audio[: self.segment_size]
else: # Validation step
# Resample full audio clip to target sampling rate
if source_sampling_rate != self.sampling_rate:
audio = librosa.resample(
audio,
orig_sr=source_sampling_rate,
target_sr=self.sampling_rate,
)
# Trim last elements to match audio length to self.hop_size * n for evaluation
if (audio.shape[0] % self.hop_size) != 0:
audio = audio[: -(audio.shape[0] % self.hop_size)]
# BigVGAN is trained using volume-normalized waveform
audio = librosa.util.normalize(audio) * 0.95
# Cast ndarray to torch tensor
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0) # [B(1), self.segment_size]
# Compute mel spectrogram corresponding to audio
mel = mel_spectrogram(
audio,
self.n_fft,
self.num_mels,
self.sampling_rate,
self.hop_size,
self.win_size,
self.fmin,
self.fmax,
center=False,
) # [B(1), self.num_mels, self.segment_size // self.hop_size]
# Fine-tuning logic that uses pre-computed mel. Example: Using TTS model-generated mel as input
else:
# For fine-tuning, assert that the waveform is in the defined sampling_rate
# Fine-tuning won't support on-the-fly resampling to be fool-proof (the dataset should have been prepared properly)
assert (
source_sampling_rate == self.sampling_rate
), f"For fine_tuning, waveform must be in the specified sampling rate {self.sampling_rate}, got {source_sampling_rate}"
# Cast ndarray to torch tensor
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0) # [B(1), T_time]
# Load pre-computed mel from disk
mel = np.load(
os.path.join(
self.base_mels_path,
os.path.splitext(os.path.split(filename)[-1])[0] + ".npy",
)
)
mel = torch.from_numpy(mel)
if len(mel.shape) < 3:
mel = mel.unsqueeze(0) # ensure [B, C, T]
if self.split:
frames_per_seg = math.ceil(self.segment_size / self.hop_size)
if audio.size(1) >= self.segment_size:
mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
mel = mel[:, :, mel_start : mel_start + frames_per_seg]
audio = audio[
:,
mel_start
* self.hop_size : (mel_start + frames_per_seg)
* self.hop_size,
]
# Pad pre-computed mel and audio to matching lengths so fine-tuning proceeds without error.
# NOTE: this may introduce a single-frame misalignment of the <pre-computed mel, audio> pair.
# To remove possible misalignment, it is recommended to prepare the <pre-computed mel, audio> pair where the audio length is an integer multiple of self.hop_size
mel = torch.nn.functional.pad(
mel, (0, frames_per_seg - mel.size(2)), "constant"
)
audio = torch.nn.functional.pad(
audio, (0, self.segment_size - audio.size(1)), "constant"
)
# Compute mel_loss used by spectral regression objective. Uses self.fmax_loss instead (usually None)
mel_loss = mel_spectrogram(
audio,
self.n_fft,
self.num_mels,
self.sampling_rate,
self.hop_size,
self.win_size,
self.fmin,
self.fmax_loss,
center=False,
) # [B(1), self.num_mels, self.segment_size // self.hop_size]
# Shape sanity checks
assert (
audio.shape[1] == mel.shape[2] * self.hop_size
and audio.shape[1] == mel_loss.shape[2] * self.hop_size
), f"Audio length must be mel frame length * hop_size. Got audio shape {audio.shape} mel shape {mel.shape} mel_loss shape {mel_loss.shape}"
return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
# If an error occurs while loading the data, skip this sample and load another random sample into the batch
except Exception as e:
if self.fine_tuning:
raise e # Terminate training if it is fine-tuning. The dataset should have been prepared properly.
else:
print(
f"[WARNING] Failed to load waveform, skipping! filename: {filename} Error: {e}"
)
return self[random.randrange(len(self))]
def __len__(self):
return len(self.audio_files)
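# --- Illustrative usage sketch (not part of the upstream BigVGAN dataset code) ---
# End-to-end example: write a synthetic wav, build MelDataset on it, and pull one
# <mel, audio, filename, mel_loss> item through a DataLoader. The hyperparameter values
# are placeholders; MelDataset stores `hparams`, but the code above does not read from it
# inside __getitem__, so a minimal AttrDict is enough for this sketch.
if __name__ == "__main__":
    import tempfile
    from scipy.io import wavfile
    from torch.utils.data import DataLoader

    _sr = 24000
    _wav_path = os.path.join(tempfile.mkdtemp(), "example.wav")
    _sine = (0.5 * np.sin(2 * np.pi * 220.0 * np.arange(_sr) / _sr)).astype(np.float32)
    wavfile.write(_wav_path, _sr, _sine)

    _dataset = MelDataset(
        training_files=[_wav_path],
        hparams=AttrDict(sampling_rate=_sr),
        segment_size=8192,
        n_fft=1024,
        num_mels=100,
        hop_size=256,
        win_size=1024,
        sampling_rate=_sr,
        fmin=0,
        fmax=None,
        fmax_loss=None,
    )
    _mel, _audio, _name, _mel_loss = next(iter(DataLoader(_dataset, batch_size=1)))
    print(_mel.shape, _audio.shape, _mel_loss.shape)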

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,4 @@
| Field | Response |
| :--------------------------------------------------------------------------------------------------------- | :--------------------------------------------------- |
| Participation considerations from adversely impacted groups and protected classes in model design and testing: | None |
| Measures taken to mitigate against unwanted bias: | No measures taken to mitigate against unwanted bias. |

View File

@ -0,0 +1,13 @@
| Field | Response |
| :---------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Intended Application & Domain: | Generating waveform from mel spectrogram. |
| Model Type: | Convolutional Neural Network (CNN) |
| Intended Users: | This model is intended for developers to synthesize and generate waveforms from the AI-generated mel spectrograms. |
| Output: | Audio Waveform |
| Describe how the model works: | Model generates audio waveform corresponding to the input mel spectrogram. |
| Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable |
| Technical Limitations: | This may not perform well on synthetically-generated mel spectrograms that deviate significantly from the profile of mel spectrograms on which this was trained. |
| Verified to have met prescribed NVIDIA quality standards: | Yes |
| Performance Metrics: | Perceptual Evaluation of Speech Quality (PESQ), Virtual Speech Quality Objective Listener (VISQOL), Multi-resolution STFT (MRSTFT), Mel cepstral distortion (MCD), Periodicity RMSE, Voice/Unvoiced F1 Score (V/UV F1) |
| Potential Known Risks: | This model may generate low-quality or distorted soundwaves. |
| Licensing: | https://github.com/NVIDIA/BigVGAN/blob/main/LICENSE |

View File

@ -0,0 +1,126 @@
# Model Overview
## Description:
BigVGAN is a generative AI model specialized in synthesizing audio waveforms using Mel spectrogram as inputs.
<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>
BigVGAN is a fully convolutional architecture with several upsampling blocks using transposed convolution followed by multiple residual dilated convolution layers.
BigVGAN consists of a novel module, called anti-aliased multi-periodicity composition (AMP), which is specifically designed for generating waveforms. AMP is specialized in synthesizing high-frequency and periodic soundwaves drawing inspiration from audio signal processing principles.
It applies a periodic activation function, called Snake, which provides an inductive bias to the architecture in generating periodic soundwaves. It also applies anti-aliasing filters to reduce undesired artifacts in the generated waveforms. <br>
This model is ready for commercial use.<br>
## References(s):
- [BigVGAN: A Universal Neural Vocoder with Large-Scale Training](https://arxiv.org/abs/2206.04658) <br>
- [Project Page](https://research.nvidia.com/labs/adlr/projects/bigvgan/) <br>
- [Audio Demo](https://bigvgan-demo.github.io/) <br>
## Model Architecture:
**Architecture Type:** Convolution Neural Network (CNN) <br>
**Network Architecture:** You can see the details of this model on this link: https://github.com/NVIDIA/BigVGAN and the related paper can be found here: https://arxiv.org/abs/2206.04658<br>
**Model Version:** 2.0 <br>
## Input:
**Input Type:** Audio <br>
**Input Format:** Mel Spectrogram <br>
**Input Parameters:** None <br>
**Other Properties Related to Input:** The input mel spectrogram has shape `[batch, channels, frames]`, where `channels` refers to the number of mel bands defined by the model and `frames` refers to the temporal length. The model supports arbitrarily long `frames` that fit into GPU memory.
## Output:
**Output Type:** Audio <br>
**Output Format:** Audio Waveform <br>
**Output Parameters:** None <br>
**Other Properties Related to Output:** The output audio waveform has shape `[batch, 1, time]`, where `1` refers to the mono audio channel and `time` refers to the temporal length. `time` is a fixed integer multiple of the input `frames`, determined by the model's upsampling ratio (`time = upsampling ratio * frames`). The output audio waveform consists of float values in the range `[-1, 1]`.
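For illustration, assuming a hypothetical upsampling ratio of 256, an input mel spectrogram of shape `[1, 100, 128]` would yield an output waveform of shape `[1, 1, 256 * 128] = [1, 1, 32768]`.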
## Software Integration:
**Runtime Engine(s):** PyTorch
**Supported Hardware Microarchitecture Compatibility:** NVIDIA Ampere, NVIDIA Hopper, NVIDIA Lovelace, NVIDIA Turing, NVIDIA Volta <br>
## Preferred/Supported Operating System(s):
Linux
## Model Version(s):
v2.0
## Training, Testing, and Evaluation Datasets:
### Training Dataset:
The dataset contains diverse audio types, including speech in multiple languages, environmental sounds, and instruments.
**Links:**
- [AAM: Artificial Audio Multitracks Dataset](https://zenodo.org/records/5794629)
- [AudioCaps](https://audiocaps.github.io/)
- [AudioSet](https://research.google.com/audioset/index.html)
- [common-accent](https://huggingface.co/datasets/DTU54DL/common-accent)
- [Crowd Sourced Emotional Multimodal Actors Dataset (CREMA-D)](https://ieeexplore.ieee.org/document/6849440)
- [DCASE2017 Challenge, Task 4: Large-scale weakly supervised sound event detection for smart cars](https://dcase.community/challenge2017/task-large-scale-sound-event-detection)
- [FSDnoisy18k](https://zenodo.org/records/2529934)
- [Free Universal Sound Separation Dataset](https://zenodo.org/records/3694384)
- [Greatest Hits dataset](https://andrewowens.com/vis/)
- [GTZAN](https://ieeexplore.ieee.org/document/1021072)
- [JL corpus](https://www.kaggle.com/datasets/tli725/jl-corpus)
- [Medley-solos-DB: a cross-collection dataset for musical instrument recognition](https://zenodo.org/records/3464194)
- [MUSAN: A Music, Speech, and Noise Corpus](https://www.openslr.org/17/)
- [MusicBench](https://huggingface.co/datasets/amaai-lab/MusicBench)
- [MusicCaps](https://www.kaggle.com/datasets/googleai/musiccaps)
- [MusicNet](https://www.kaggle.com/datasets/imsparsh/musicnet-dataset)
- [NSynth](https://magenta.tensorflow.org/datasets/nsynth)
- [OnAir-Music-Dataset](https://github.com/sevagh/OnAir-Music-Dataset)
- [Audio Piano Triads Dataset](https://zenodo.org/records/4740877)
- [Pitch Audio Dataset (Surge synthesizer)](https://zenodo.org/records/4677097)
- [SONYC Urban Sound Tagging (SONYC-UST): a multilabel dataset from an urban acoustic sensor network](https://zenodo.org/records/3966543)
- [VocalSound: A Dataset for Improving Human Vocal Sounds Recognition](https://arxiv.org/abs/2205.03433)
- [WavText5K](https://github.com/microsoft/WavText5K)
- [CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages](https://github.com/Kyubyong/css10)
- [Hi-Fi Multi-Speaker English TTS Dataset (Hi-Fi TTS)](https://www.openslr.org/109/)
- [IIIT-H Indic Speech Databases](http://festvox.org/databases/iiit_voices/)
- [Libri-Light: A Benchmark for ASR with Limited or No Supervision](https://arxiv.org/abs/1912.07875)
- [LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech](https://www.openslr.org/60)
- [LibriTTS-R: A Restored Multi-Speaker Text-to-Speech Corpus](https://www.openslr.org/141/)
- [The SIWIS French Speech Synthesis Database](https://datashare.ed.ac.uk/handle/10283/2353)
- [Crowdsourced high-quality Colombian Spanish speech data set](https://openslr.org/72/)
- [TTS-Portuguese Corpus](https://github.com/Edresson/TTS-Portuguese-Corpus)
- [CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit](https://datashare.ed.ac.uk/handle/10283/3443)
\*\* Data Collection Method by dataset <br>
- Human <br>
\*\* Labeling Method by dataset (for those with labels) <br>
- Hybrid: Automated, Human, Unknown <br>
### Evaluating Dataset:
Properties: The audio generation quality of BigVGAN is evaluated using `dev` splits of the [LibriTTS dataset](https://www.openslr.org/60/) and [Hi-Fi TTS dataset](https://www.openslr.org/109/). The datasets include speech in English language with equal balance of genders.
\*\* Data Collection Method by dataset <br>
- Human <br>
\*\* Labeling Method by dataset <br>
- Automated <br>
## Inference:
**Engine:** PyTorch <br>
**Test Hardware:** NVIDIA A100 GPU <br>
## Ethical Considerations:
NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. For more detailed information on ethical considerations for this model, please see the Model Card++ Explainability, Bias, Safety & Security, and Privacy Subcards. Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).

View File

@ -0,0 +1,14 @@
| Field | Response |
| :------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------- |
| Generatable or reverse engineerable personal information? | None |
| Protected class data used to create this model? | None |
| Was consent obtained for any personal data used? | Not Applicable (No Personal Data) |
| How often is dataset reviewed? | Before Release |
| Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable |
| If personal data was collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable |
| If personal data was collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable |
| If personal data was collected for the development of this AI model, was it minimized to only what was required? | Not Applicable |
| Is data in dataset traceable? | Yes |
| Is there provenance for all datasets used in training? | Yes |
| Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
| Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |

View File

@ -0,0 +1,6 @@
| Field | Response |
| :---------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Model Application(s): | Synthetic Audio Generation |
| Describe the life critical impact (if present). | Not Applicable |
| Use Case Restrictions: | None |
| Model and dataset restrictions: | The Principle of Least Privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints are adhered to. |

View File

@ -0,0 +1,13 @@
torch
numpy
librosa>=0.8.1
scipy
tensorboard
soundfile
matplotlib
pesq
auraloss
tqdm
nnAudio
ninja
huggingface_hub>=0.23.4

View File

@ -0,0 +1,65 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
import os
import sys
# to import modules from parent_dir
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_dir)
import torch
from alias_free_activation.cuda import activation1d
from activations import Snake
def test_load_fused_kernels():
try:
print("[Success] load_fused_kernels")
except ImportError as e:
print("[Fail] load_fused_kernels")
raise e
def test_anti_alias_activation():
data = torch.rand((10, 10, 200), device="cuda")
# Check activations.Snake cuda vs. torch
fused_anti_alias_activation = activation1d.Activation1d(
activation=Snake(10), fused=True
).cuda()
fused_activation_output = fused_anti_alias_activation(data)
torch_anti_alias_activation = activation1d.Activation1d(
activation=Snake(10), fused=False
).cuda()
torch_activation_output = torch_anti_alias_activation(data)
test_result = (fused_activation_output - torch_activation_output).abs()
while test_result.dim() != 1:
test_result = test_result.mean(dim=-1)
diff = test_result.mean(dim=-1)
if diff <= 1e-3:
print(
f"\n[Success] test_fused_anti_alias_activation"
f"\n > mean_difference={diff}"
f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}"
f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
)
else:
print(
f"\n[Fail] test_fused_anti_alias_activation"
f"\n > mean_difference={diff}, "
f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, "
f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
)
if __name__ == "__main__":
from alias_free_activation.cuda import load
load.load()
test_load_fused_kernels()
test_anti_alias_activation()

View File

@ -0,0 +1,66 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
import os
import sys
# to import modules from parent_dir
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_dir)
import torch
from alias_free_activation.cuda import activation1d
from activations import SnakeBeta
def test_load_fused_kernels():
try:
print("[Success] load_fused_kernels")
except ImportError as e:
print("[Fail] load_fused_kernels")
raise e
def test_anti_alias_activation():
data = torch.rand((10, 10, 200), device="cuda")
# Check activations.SnakeBeta cuda vs. torch
fused_anti_alias_activation = activation1d.Activation1d(
activation=SnakeBeta(10), fused=True
).cuda()
fused_activation_output = fused_anti_alias_activation(data)
torch_anti_alias_activation = activation1d.Activation1d(
activation=SnakeBeta(10), fused=False
).cuda()
torch_activation_output = torch_anti_alias_activation(data)
test_result = (fused_activation_output - torch_activation_output).abs()
while test_result.dim() != 1:
test_result = test_result.mean(dim=-1)
diff = test_result.mean(dim=-1)
if diff <= 1e-3:
print(
f"\n[Success] test_fused_anti_alias_activation"
f"\n > mean_difference={diff}"
f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}"
f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
)
else:
print(
f"\n[Fail] test_fused_anti_alias_activation"
f"\n > mean_difference={diff}, "
f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, "
f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
)
if __name__ == "__main__":
from alias_free_activation.cuda import load
load.load()
test_load_fused_kernels()
test_anti_alias_activation()

View File

@ -0,0 +1,221 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
import os
import sys
# to import modules from parent_dir
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_dir)
import torch
import json
from env import AttrDict
from bigvgan import BigVGAN
from time import time
from tqdm import tqdm
from meldataset import mel_spectrogram, MAX_WAV_VALUE
from scipy.io.wavfile import write
import numpy as np
import argparse
torch.backends.cudnn.benchmark = True
# For easier debugging
torch.set_printoptions(linewidth=200, threshold=10_000)
def generate_soundwave(duration=5.0, sr=24000):
t = np.linspace(0, duration, int(sr * duration), False, dtype=np.float32)
modulation = np.sin(2 * np.pi * t / duration)
min_freq = 220
max_freq = 1760
frequencies = min_freq + (max_freq - min_freq) * (modulation + 1) / 2
soundwave = np.sin(2 * np.pi * frequencies * t)
soundwave = soundwave / np.max(np.abs(soundwave)) * 0.95
return soundwave, sr
def get_mel(x, h):
return mel_spectrogram(
x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax
)
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print(f"Loading '{filepath}'")
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Test script to check CUDA kernel correctness."
)
parser.add_argument(
"--checkpoint_file",
type=str,
required=True,
help="Path to the checkpoint file. Assumes config.json exists in the directory.",
)
args = parser.parse_args()
config_file = os.path.join(os.path.split(args.checkpoint_file)[0], "config.json")
with open(config_file) as f:
config = f.read()
json_config = json.loads(config)
h = AttrDict({**json_config})
print("loading plain Pytorch BigVGAN")
generator_original = BigVGAN(h).to("cuda")
print("loading CUDA kernel BigVGAN with auto-build")
generator_cuda_kernel = BigVGAN(h, use_cuda_kernel=True).to("cuda")
state_dict_g = load_checkpoint(args.checkpoint_file, "cuda")
generator_original.load_state_dict(state_dict_g["generator"])
generator_cuda_kernel.load_state_dict(state_dict_g["generator"])
generator_original.remove_weight_norm()
generator_original.eval()
generator_cuda_kernel.remove_weight_norm()
generator_cuda_kernel.eval()
# define number of samples and length of mel frame to benchmark
num_sample = 10
num_mel_frame = 16384
# CUDA kernel correctness check
diff = 0.0
for i in tqdm(range(num_sample)):
# Random mel
data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
with torch.inference_mode():
audio_original = generator_original(data)
with torch.inference_mode():
audio_cuda_kernel = generator_cuda_kernel(data)
# Both outputs should be (almost) the same
test_result = (audio_original - audio_cuda_kernel).abs()
diff += test_result.mean(dim=-1).item()
diff /= num_sample
if (
diff <= 2e-3
): # We can expect a small difference (~1e-3) which does not affect perceptual quality
print(
f"\n[Success] test CUDA fused vs. plain torch BigVGAN inference"
f"\n > mean_difference={diff}"
f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}"
f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}"
)
else:
print(
f"\n[Fail] test CUDA fused vs. plain torch BigVGAN inference"
f"\n > mean_difference={diff}"
f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}, "
f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}"
)
del data, audio_original, audio_cuda_kernel
# Variables for tracking total time and VRAM usage
toc_total_original = 0
toc_total_cuda_kernel = 0
vram_used_original_total = 0
vram_used_cuda_kernel_total = 0
audio_length_total = 0
# Measure Original inference in isolation
for i in tqdm(range(num_sample)):
torch.cuda.reset_peak_memory_stats(device="cuda")
data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
torch.cuda.synchronize()
tic = time()
with torch.inference_mode():
audio_original = generator_original(data)
torch.cuda.synchronize()
toc = time() - tic
toc_total_original += toc
vram_used_original_total += torch.cuda.max_memory_allocated(device="cuda")
del data, audio_original
torch.cuda.empty_cache()
# Measure CUDA kernel inference in isolation
for i in tqdm(range(num_sample)):
torch.cuda.reset_peak_memory_stats(device="cuda")
data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
torch.cuda.synchronize()
tic = time()
with torch.inference_mode():
audio_cuda_kernel = generator_cuda_kernel(data)
torch.cuda.synchronize()
toc = time() - tic
toc_total_cuda_kernel += toc
audio_length_total += audio_cuda_kernel.shape[-1]
vram_used_cuda_kernel_total += torch.cuda.max_memory_allocated(device="cuda")
del data, audio_cuda_kernel
torch.cuda.empty_cache()
# Calculate metrics
audio_second = audio_length_total / h.sampling_rate
khz_original = audio_length_total / toc_total_original / 1000
khz_cuda_kernel = audio_length_total / toc_total_cuda_kernel / 1000
vram_used_original_gb = vram_used_original_total / num_sample / (1024 ** 3)
vram_used_cuda_kernel_gb = vram_used_cuda_kernel_total / num_sample / (1024 ** 3)
# Print results
print(
f"Original BigVGAN: took {toc_total_original:.2f} seconds to generate {audio_second:.2f} seconds of audio, {khz_original:.1f}kHz, {audio_second / toc_total_original:.1f} faster than realtime, VRAM used {vram_used_original_gb:.1f} GB"
)
print(
f"CUDA kernel BigVGAN: took {toc_total_cuda_kernel:.2f} seconds to generate {audio_second:.2f} seconds of audio, {khz_cuda_kernel:.1f}kHz, {audio_second / toc_total_cuda_kernel:.1f} faster than realtime, VRAM used {vram_used_cuda_kernel_gb:.1f} GB"
)
print(f"speedup of CUDA kernel: {khz_cuda_kernel / khz_original}")
print(f"VRAM saving of CUDA kernel: {vram_used_original_gb / vram_used_cuda_kernel_gb}")
# Use artificial sine waves for inference test
audio_real, sr = generate_soundwave(duration=5.0, sr=h.sampling_rate)
audio_real = torch.tensor(audio_real).to("cuda")
# Compute mel spectrogram from the ground truth audio
x = get_mel(audio_real.unsqueeze(0), h)
with torch.inference_mode():
y_g_hat_original = generator_original(x)
y_g_hat_cuda_kernel = generator_cuda_kernel(x)
audio_real = audio_real.squeeze()
audio_real = audio_real * MAX_WAV_VALUE
audio_real = audio_real.cpu().numpy().astype("int16")
audio_original = y_g_hat_original.squeeze()
audio_original = audio_original * MAX_WAV_VALUE
audio_original = audio_original.cpu().numpy().astype("int16")
audio_cuda_kernel = y_g_hat_cuda_kernel.squeeze()
audio_cuda_kernel = audio_cuda_kernel * MAX_WAV_VALUE
audio_cuda_kernel = audio_cuda_kernel.cpu().numpy().astype("int16")
os.makedirs("tmp", exist_ok=True)
output_file_real = os.path.join("tmp", "audio_real.wav")
output_file_original = os.path.join("tmp", "audio_generated_original.wav")
output_file_cuda_kernel = os.path.join("tmp", "audio_generated_cuda_kernel.wav")
write(output_file_real, h.sampling_rate, audio_real)
write(output_file_original, h.sampling_rate, audio_original)
write(output_file_cuda_kernel, h.sampling_rate, audio_cuda_kernel)
print("Example generated audios of original vs. fused CUDA kernel written to tmp!")
print("Done")

777
GPT_SoVITS/BigVGAN/train.py Normal file
View File

@ -0,0 +1,777 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
import itertools
import os
import time
import argparse
import json
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DistributedSampler, DataLoader
import torch.multiprocessing as mp
from torch.distributed import init_process_group
from torch.nn.parallel import DistributedDataParallel
from env import AttrDict, build_env
from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist, MAX_WAV_VALUE
from bigvgan import BigVGAN
from discriminators import (
MultiPeriodDiscriminator,
MultiResolutionDiscriminator,
MultiBandDiscriminator,
MultiScaleSubbandCQTDiscriminator,
)
from loss import (
feature_loss,
generator_loss,
discriminator_loss,
MultiScaleMelSpectrogramLoss,
)
from utils import (
plot_spectrogram,
plot_spectrogram_clipped,
scan_checkpoint,
load_checkpoint,
save_checkpoint,
save_audio,
)
import torchaudio as ta
from pesq import pesq
from tqdm import tqdm
import auraloss
torch.backends.cudnn.benchmark = False
def train(rank, a, h):
if h.num_gpus > 1:
# initialize distributed
init_process_group(
backend=h.dist_config["dist_backend"],
init_method=h.dist_config["dist_url"],
world_size=h.dist_config["world_size"] * h.num_gpus,
rank=rank,
)
# Set seed and device
torch.cuda.manual_seed(h.seed)
torch.cuda.set_device(rank)
device = torch.device(f"cuda:{rank:d}")
# Define BigVGAN generator
generator = BigVGAN(h).to(device)
# Define discriminators. MPD is used by default
mpd = MultiPeriodDiscriminator(h).to(device)
# Define additional discriminators. BigVGAN-v1 uses UnivNet's MRD as default
# New in BigVGAN-v2: option to switch to new discriminators: MultiBandDiscriminator / MultiScaleSubbandCQTDiscriminator
if h.get("use_mbd_instead_of_mrd", False): # Switch to MBD
print(
"[INFO] using MultiBandDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator"
)
# Variable name is kept as "mrd" for backward compatibility & minimal code change
mrd = MultiBandDiscriminator(h).to(device)
elif h.get("use_cqtd_instead_of_mrd", False): # Switch to CQTD
print(
"[INFO] using MultiScaleSubbandCQTDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator"
)
mrd = MultiScaleSubbandCQTDiscriminator(h).to(device)
else: # Fallback to original MRD in BigVGAN-v1
mrd = MultiResolutionDiscriminator(h).to(device)
# New in BigVGAN-v2: option to switch to multi-scale L1 mel loss
if h.get("use_multiscale_melloss", False):
print(
"[INFO] using multi-scale Mel l1 loss of BigVGAN-v2 instead of the original single-scale loss"
)
fn_mel_loss_multiscale = MultiScaleMelSpectrogramLoss(
sampling_rate=h.sampling_rate
) # NOTE: accepts waveform as input
else:
fn_mel_loss_singlescale = F.l1_loss
# Print the model & number of parameters, and create or scan the latest checkpoint from checkpoints directory
if rank == 0:
print(generator)
print(mpd)
print(mrd)
print(f"Generator params: {sum(p.numel() for p in generator.parameters())}")
print(f"Discriminator mpd params: {sum(p.numel() for p in mpd.parameters())}")
print(f"Discriminator mrd params: {sum(p.numel() for p in mrd.parameters())}")
os.makedirs(a.checkpoint_path, exist_ok=True)
print(f"Checkpoints directory: {a.checkpoint_path}")
if os.path.isdir(a.checkpoint_path):
# New in v2.1: If the step prefix pattern-based checkpoints are not found, also check for renamed files in Hugging Face Hub to resume training
cp_g = scan_checkpoint(
a.checkpoint_path, prefix="g_", renamed_file="bigvgan_generator.pt"
)
cp_do = scan_checkpoint(
a.checkpoint_path,
prefix="do_",
renamed_file="bigvgan_discriminator_optimizer.pt",
)
# Load the latest checkpoint if exists
steps = 0
if cp_g is None or cp_do is None:
state_dict_do = None
last_epoch = -1
else:
state_dict_g = load_checkpoint(cp_g, device)
state_dict_do = load_checkpoint(cp_do, device)
generator.load_state_dict(state_dict_g["generator"])
mpd.load_state_dict(state_dict_do["mpd"])
mrd.load_state_dict(state_dict_do["mrd"])
steps = state_dict_do["steps"] + 1
last_epoch = state_dict_do["epoch"]
# Initialize DDP, optimizers, and schedulers
if h.num_gpus > 1:
generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
mrd = DistributedDataParallel(mrd, device_ids=[rank]).to(device)
optim_g = torch.optim.AdamW(
generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2]
)
optim_d = torch.optim.AdamW(
itertools.chain(mrd.parameters(), mpd.parameters()),
h.learning_rate,
betas=[h.adam_b1, h.adam_b2],
)
if state_dict_do is not None:
optim_g.load_state_dict(state_dict_do["optim_g"])
optim_d.load_state_dict(state_dict_do["optim_d"])
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=h.lr_decay, last_epoch=last_epoch
)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
optim_d, gamma=h.lr_decay, last_epoch=last_epoch
)
# Define training and validation datasets
"""
unseen_validation_filelist will contain sample filepaths outside the seen training & validation dataset
Example: trained on LibriTTS, validate on VCTK
"""
training_filelist, validation_filelist, list_unseen_validation_filelist = (
get_dataset_filelist(a)
)
trainset = MelDataset(
training_filelist,
h,
h.segment_size,
h.n_fft,
h.num_mels,
h.hop_size,
h.win_size,
h.sampling_rate,
h.fmin,
h.fmax,
shuffle=False if h.num_gpus > 1 else True,
fmax_loss=h.fmax_for_loss,
device=device,
fine_tuning=a.fine_tuning,
base_mels_path=a.input_mels_dir,
is_seen=True,
)
train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None
train_loader = DataLoader(
trainset,
num_workers=h.num_workers,
shuffle=False,
sampler=train_sampler,
batch_size=h.batch_size,
pin_memory=True,
drop_last=True,
)
if rank == 0:
validset = MelDataset(
validation_filelist,
h,
h.segment_size,
h.n_fft,
h.num_mels,
h.hop_size,
h.win_size,
h.sampling_rate,
h.fmin,
h.fmax,
False,
False,
fmax_loss=h.fmax_for_loss,
device=device,
fine_tuning=a.fine_tuning,
base_mels_path=a.input_mels_dir,
is_seen=True,
)
validation_loader = DataLoader(
validset,
num_workers=1,
shuffle=False,
sampler=None,
batch_size=1,
pin_memory=True,
drop_last=True,
)
list_unseen_validset = []
list_unseen_validation_loader = []
for i in range(len(list_unseen_validation_filelist)):
unseen_validset = MelDataset(
list_unseen_validation_filelist[i],
h,
h.segment_size,
h.n_fft,
h.num_mels,
h.hop_size,
h.win_size,
h.sampling_rate,
h.fmin,
h.fmax,
False,
False,
fmax_loss=h.fmax_for_loss,
device=device,
fine_tuning=a.fine_tuning,
base_mels_path=a.input_mels_dir,
is_seen=False,
)
unseen_validation_loader = DataLoader(
unseen_validset,
num_workers=1,
shuffle=False,
sampler=None,
batch_size=1,
pin_memory=True,
drop_last=True,
)
list_unseen_validset.append(unseen_validset)
list_unseen_validation_loader.append(unseen_validation_loader)
# Tensorboard logger
sw = SummaryWriter(os.path.join(a.checkpoint_path, "logs"))
if a.save_audio: # Also save audio to disk if --save_audio is set to True
os.makedirs(os.path.join(a.checkpoint_path, "samples"), exist_ok=True)
"""
Validation loop, "mode" parameter is automatically defined as (seen or unseen)_(name of the dataset).
If the name of the dataset contains "nonspeech", it skips PESQ calculation to prevent errors
"""
def validate(rank, a, h, loader, mode="seen"):
assert rank == 0, "validate should only run on rank=0"
generator.eval()
torch.cuda.empty_cache()
val_err_tot = 0
val_pesq_tot = 0
val_mrstft_tot = 0
# Modules for evaluation metrics
pesq_resampler = ta.transforms.Resample(h.sampling_rate, 16000).cuda()
loss_mrstft = auraloss.freq.MultiResolutionSTFTLoss(device="cuda")
if a.save_audio: # Also save audio to disk if --save_audio is set to True
os.makedirs(
os.path.join(a.checkpoint_path, "samples", f"gt_{mode}"),
exist_ok=True,
)
os.makedirs(
os.path.join(a.checkpoint_path, "samples", f"{mode}_{steps:08d}"),
exist_ok=True,
)
with torch.no_grad():
print(f"step {steps} {mode} speaker validation...")
# Loop over validation set and compute metrics
for j, batch in enumerate(tqdm(loader)):
x, y, _, y_mel = batch
y = y.to(device)
if hasattr(generator, "module"):
y_g_hat = generator.module(x.to(device))
else:
y_g_hat = generator(x.to(device))
y_mel = y_mel.to(device, non_blocking=True)
y_g_hat_mel = mel_spectrogram(
y_g_hat.squeeze(1),
h.n_fft,
h.num_mels,
h.sampling_rate,
h.hop_size,
h.win_size,
h.fmin,
h.fmax_for_loss,
)
min_t = min(y_mel.size(-1), y_g_hat_mel.size(-1))
val_err_tot += F.l1_loss(y_mel[...,:min_t], y_g_hat_mel[...,:min_t]).item()
# PESQ calculation. only evaluate PESQ if it's speech signal (nonspeech PESQ will error out)
if (
not "nonspeech" in mode
): # Skips if the name of dataset (in mode string) contains "nonspeech"
# Resample to 16000 for pesq
y_16k = pesq_resampler(y)
y_g_hat_16k = pesq_resampler(y_g_hat.squeeze(1))
y_int_16k = (y_16k[0] * MAX_WAV_VALUE).short().cpu().numpy()
y_g_hat_int_16k = (
(y_g_hat_16k[0] * MAX_WAV_VALUE).short().cpu().numpy()
)
val_pesq_tot += pesq(16000, y_int_16k, y_g_hat_int_16k, "wb")
# MRSTFT calculation
min_t = min(y.size(-1), y_g_hat.size(-1))
val_mrstft_tot += loss_mrstft(y_g_hat[...,:min_t], y[...,:min_t]).item()
# Log audio and figures to Tensorboard
if j % a.eval_subsample == 0: # Subsample every nth from validation set
if steps >= 0:
sw.add_audio(f"gt_{mode}/y_{j}", y[0], steps, h.sampling_rate)
if (
a.save_audio
): # Also save audio to disk if --save_audio is set to True
save_audio(
y[0],
os.path.join(
a.checkpoint_path,
"samples",
f"gt_{mode}",
f"{j:04d}.wav",
),
h.sampling_rate,
)
sw.add_figure(
f"gt_{mode}/y_spec_{j}",
plot_spectrogram(x[0]),
steps,
)
sw.add_audio(
f"generated_{mode}/y_hat_{j}",
y_g_hat[0],
steps,
h.sampling_rate,
)
if (
a.save_audio
): # Also save audio to disk if --save_audio is set to True
save_audio(
y_g_hat[0, 0],
os.path.join(
a.checkpoint_path,
"samples",
f"{mode}_{steps:08d}",
f"{j:04d}.wav",
),
h.sampling_rate,
)
# Spectrogram of synthesized audio
y_hat_spec = mel_spectrogram(
y_g_hat.squeeze(1),
h.n_fft,
h.num_mels,
h.sampling_rate,
h.hop_size,
h.win_size,
h.fmin,
h.fmax,
)
sw.add_figure(
f"generated_{mode}/y_hat_spec_{j}",
plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()),
steps,
)
"""
Visualization of spectrogram difference between GT and synthesized audio, difference higher than 1 is clipped for better visualization.
"""
spec_delta = torch.clamp(
torch.abs(x[0] - y_hat_spec.squeeze(0).cpu()),
min=1e-6,
max=1.0,
)
sw.add_figure(
f"delta_dclip1_{mode}/spec_{j}",
plot_spectrogram_clipped(spec_delta.numpy(), clip_max=1.0),
steps,
)
val_err = val_err_tot / (j + 1)
val_pesq = val_pesq_tot / (j + 1)
val_mrstft = val_mrstft_tot / (j + 1)
# Log evaluation metrics to Tensorboard
sw.add_scalar(f"validation_{mode}/mel_spec_error", val_err, steps)
sw.add_scalar(f"validation_{mode}/pesq", val_pesq, steps)
sw.add_scalar(f"validation_{mode}/mrstft", val_mrstft, steps)
generator.train()
# If the checkpoint is loaded, start with validation loop
if steps != 0 and rank == 0 and not a.debug:
if not a.skip_seen:
validate(
rank,
a,
h,
validation_loader,
mode=f"seen_{train_loader.dataset.name}",
)
for i in range(len(list_unseen_validation_loader)):
validate(
rank,
a,
h,
list_unseen_validation_loader[i],
mode=f"unseen_{list_unseen_validation_loader[i].dataset.name}",
)
# Exit the script if --evaluate is set to True
if a.evaluate:
exit()
# Main training loop
generator.train()
mpd.train()
mrd.train()
for epoch in range(max(0, last_epoch), a.training_epochs):
if rank == 0:
start = time.time()
print(f"Epoch: {epoch + 1}")
if h.num_gpus > 1:
train_sampler.set_epoch(epoch)
for i, batch in enumerate(train_loader):
if rank == 0:
start_b = time.time()
x, y, _, y_mel = batch
x = x.to(device, non_blocking=True)
y = y.to(device, non_blocking=True)
y_mel = y_mel.to(device, non_blocking=True)
y = y.unsqueeze(1)
y_g_hat = generator(x)
y_g_hat_mel = mel_spectrogram(
y_g_hat.squeeze(1),
h.n_fft,
h.num_mels,
h.sampling_rate,
h.hop_size,
h.win_size,
h.fmin,
h.fmax_for_loss,
)
optim_d.zero_grad()
# MPD
y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(
y_df_hat_r, y_df_hat_g
)
# MRD
y_ds_hat_r, y_ds_hat_g, _, _ = mrd(y, y_g_hat.detach())
loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(
y_ds_hat_r, y_ds_hat_g
)
loss_disc_all = loss_disc_s + loss_disc_f
# Set clip_grad_norm value
clip_grad_norm = h.get("clip_grad_norm", 1000.0) # Default to 1000
# Whether to freeze D for initial training steps
if steps >= a.freeze_step:
loss_disc_all.backward()
grad_norm_mpd = torch.nn.utils.clip_grad_norm_(
mpd.parameters(), clip_grad_norm
)
grad_norm_mrd = torch.nn.utils.clip_grad_norm_(
mrd.parameters(), clip_grad_norm
)
optim_d.step()
else:
print(
f"[WARNING] skipping D training for the first {a.freeze_step} steps"
)
grad_norm_mpd = 0.0
grad_norm_mrd = 0.0
# Generator
optim_g.zero_grad()
# L1 Mel-Spectrogram Loss
lambda_melloss = h.get(
"lambda_melloss", 45.0
) # Defaults to 45 in BigVGAN-v1 if not set
if h.get("use_multiscale_melloss", False): # uses wav <y, y_g_hat> for loss
loss_mel = fn_mel_loss_multiscale(y, y_g_hat) * lambda_melloss
else: # Uses mel <y_mel, y_g_hat_mel> for loss
loss_mel = fn_mel_loss_singlescale(y_mel, y_g_hat_mel) * lambda_melloss
# MPD loss
y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
# MRD loss
y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = mrd(y, y_g_hat)
loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
if steps >= a.freeze_step:
loss_gen_all = (
loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel
)
else:
print(
f"[WARNING] using regression loss only for G for the first {a.freeze_step} steps"
)
loss_gen_all = loss_mel
loss_gen_all.backward()
grad_norm_g = torch.nn.utils.clip_grad_norm_(
generator.parameters(), clip_grad_norm
)
optim_g.step()
if rank == 0:
# STDOUT logging
if steps % a.stdout_interval == 0:
mel_error = (
loss_mel.item() / lambda_melloss
) # Log training mel regression loss to stdout
print(
f"Steps: {steps:d}, "
f"Gen Loss Total: {loss_gen_all:4.3f}, "
f"Mel Error: {mel_error:4.3f}, "
f"s/b: {time.time() - start_b:4.3f} "
f"lr: {optim_g.param_groups[0]['lr']:4.7f} "
f"grad_norm_g: {grad_norm_g:4.3f}"
)
# Checkpointing
if steps % a.checkpoint_interval == 0 and steps != 0:
checkpoint_path = f"{a.checkpoint_path}/g_{steps:08d}"
save_checkpoint(
checkpoint_path,
{
"generator": (
generator.module if h.num_gpus > 1 else generator
).state_dict()
},
)
checkpoint_path = f"{a.checkpoint_path}/do_{steps:08d}"
save_checkpoint(
checkpoint_path,
{
"mpd": (mpd.module if h.num_gpus > 1 else mpd).state_dict(),
"mrd": (mrd.module if h.num_gpus > 1 else mrd).state_dict(),
"optim_g": optim_g.state_dict(),
"optim_d": optim_d.state_dict(),
"steps": steps,
"epoch": epoch,
},
)
# Tensorboard summary logging
if steps % a.summary_interval == 0:
mel_error = (
loss_mel.item() / lambda_melloss
) # Log training mel regression loss to tensorboard
sw.add_scalar("training/gen_loss_total", loss_gen_all.item(), steps)
sw.add_scalar("training/mel_spec_error", mel_error, steps)
sw.add_scalar("training/fm_loss_mpd", loss_fm_f.item(), steps)
sw.add_scalar("training/gen_loss_mpd", loss_gen_f.item(), steps)
sw.add_scalar("training/disc_loss_mpd", loss_disc_f.item(), steps)
sw.add_scalar("training/grad_norm_mpd", grad_norm_mpd, steps)
sw.add_scalar("training/fm_loss_mrd", loss_fm_s.item(), steps)
sw.add_scalar("training/gen_loss_mrd", loss_gen_s.item(), steps)
sw.add_scalar("training/disc_loss_mrd", loss_disc_s.item(), steps)
sw.add_scalar("training/grad_norm_mrd", grad_norm_mrd, steps)
sw.add_scalar("training/grad_norm_g", grad_norm_g, steps)
sw.add_scalar(
"training/learning_rate_d", scheduler_d.get_last_lr()[0], steps
)
sw.add_scalar(
"training/learning_rate_g", scheduler_g.get_last_lr()[0], steps
)
sw.add_scalar("training/epoch", epoch + 1, steps)
# Validation
if steps % a.validation_interval == 0:
# Plot training input x so far used
for i_x in range(x.shape[0]):
sw.add_figure(
f"training_input/x_{i_x}",
plot_spectrogram(x[i_x].cpu()),
steps,
)
sw.add_audio(
f"training_input/y_{i_x}",
y[i_x][0],
steps,
h.sampling_rate,
)
# Seen and unseen speakers validation loops
if not a.debug and steps != 0:
validate(
rank,
a,
h,
validation_loader,
mode=f"seen_{train_loader.dataset.name}",
)
for i in range(len(list_unseen_validation_loader)):
validate(
rank,
a,
h,
list_unseen_validation_loader[i],
mode=f"unseen_{list_unseen_validation_loader[i].dataset.name}",
)
steps += 1
# BigVGAN-v2 learning rate scheduler is changed from epoch-level to step-level
scheduler_g.step()
scheduler_d.step()
if rank == 0:
print(
f"Time taken for epoch {epoch + 1} is {int(time.time() - start)} sec\n"
)
def main():
print("Initializing Training Process..")
parser = argparse.ArgumentParser()
parser.add_argument("--group_name", default=None)
parser.add_argument("--input_wavs_dir", default="LibriTTS")
parser.add_argument("--input_mels_dir", default="ft_dataset")
parser.add_argument(
"--input_training_file", default="tests/LibriTTS/train-full.txt"
)
parser.add_argument(
"--input_validation_file", default="tests/LibriTTS/val-full.txt"
)
parser.add_argument(
"--list_input_unseen_wavs_dir",
nargs="+",
default=["tests/LibriTTS", "tests/LibriTTS"],
)
parser.add_argument(
"--list_input_unseen_validation_file",
nargs="+",
default=["tests/LibriTTS/dev-clean.txt", "tests/LibriTTS/dev-other.txt"],
)
parser.add_argument("--checkpoint_path", default="exp/bigvgan")
parser.add_argument("--config", default="")
parser.add_argument("--training_epochs", default=100000, type=int)
parser.add_argument("--stdout_interval", default=5, type=int)
parser.add_argument("--checkpoint_interval", default=50000, type=int)
parser.add_argument("--summary_interval", default=100, type=int)
parser.add_argument("--validation_interval", default=50000, type=int)
parser.add_argument(
"--freeze_step",
default=0,
type=int,
help="freeze D for the first specified steps. G only uses regression loss for these steps.",
)
parser.add_argument("--fine_tuning", default=False, type=bool)
parser.add_argument(
"--debug",
default=False,
type=bool,
help="debug mode. skips validation loop throughout training",
)
parser.add_argument(
"--evaluate",
default=False,
type=bool,
help="only run evaluation from checkpoint and exit",
)
parser.add_argument(
"--eval_subsample",
default=5,
type=int,
help="subsampling during evaluation loop",
)
parser.add_argument(
"--skip_seen",
default=False,
type=bool,
help="skip seen dataset. useful for test set inference",
)
parser.add_argument(
"--save_audio",
default=False,
type=bool,
help="save audio of test set inference to disk",
)
a = parser.parse_args()
with open(a.config) as f:
data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)
build_env(a.config, "config.json", a.checkpoint_path)
torch.manual_seed(h.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(h.seed)
h.num_gpus = torch.cuda.device_count()
h.batch_size = int(h.batch_size / h.num_gpus)
print(f"Batch size per GPU: {h.batch_size}")
else:
pass
if h.num_gpus > 1:
mp.spawn(
train,
nprocs=h.num_gpus,
args=(
a,
h,
),
)
else:
train(0, a, h)
if __name__ == "__main__":
main()
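# --- Illustrative launch example (not part of the upstream BigVGAN training script) ---
# All flags below are defined by the argparse setup above; the config path and data
# locations are placeholders and must point to a real BigVGAN config and dataset.
#
#   python train.py \
#       --config /path/to/config.json \
#       --input_wavs_dir LibriTTS \
#       --input_training_file tests/LibriTTS/train-full.txt \
#       --input_validation_file tests/LibriTTS/val-full.txt \
#       --checkpoint_path exp/bigvgan \
#       --checkpoint_interval 50000 \
#       --validation_interval 50000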

View File

@ -0,0 +1,99 @@
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import glob
import os
import matplotlib
import torch
from torch.nn.utils import weight_norm
matplotlib.use("Agg")
import matplotlib.pylab as plt
from meldataset import MAX_WAV_VALUE
from scipy.io.wavfile import write
def plot_spectrogram(spectrogram):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
plt.colorbar(im, ax=ax)
fig.canvas.draw()
plt.close()
return fig
def plot_spectrogram_clipped(spectrogram, clip_max=2.0):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(
spectrogram,
aspect="auto",
origin="lower",
interpolation="none",
vmin=1e-6,
vmax=clip_max,
)
plt.colorbar(im, ax=ax)
fig.canvas.draw()
plt.close()
return fig
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def apply_weight_norm(m):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
weight_norm(m)
def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print(f"Loading '{filepath}'")
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def save_checkpoint(filepath, obj):
print(f"Saving checkpoint to {filepath}")
torch.save(obj, filepath)
print("Complete.")
def scan_checkpoint(cp_dir, prefix, renamed_file=None):
# Fallback to original scanning logic first
pattern = os.path.join(cp_dir, prefix + "????????")
cp_list = glob.glob(pattern)
if len(cp_list) > 0:
last_checkpoint_path = sorted(cp_list)[-1]
print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'")
return last_checkpoint_path
# If no pattern-based checkpoints are found, check for renamed file
if renamed_file:
renamed_path = os.path.join(cp_dir, renamed_file)
if os.path.isfile(renamed_path):
print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'")
return renamed_path
return None
def save_audio(audio, path, sr):
# audio: 1-D torch tensor, expected in [-1, 1]
audio = audio * MAX_WAV_VALUE
audio = audio.cpu().numpy().astype("int16")
write(path, sr, audio)
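# --- Illustrative usage sketch (not part of the upstream BigVGAN utils code) ---
# Round trip of save_checkpoint / scan_checkpoint / load_checkpoint on a throwaway
# directory. The "g_" prefix plus 8-digit step suffix matches the pattern
# scan_checkpoint() globs for; the tensor payload is a placeholder.
if __name__ == "__main__":
    import tempfile

    _cp_dir = tempfile.mkdtemp()
    save_checkpoint(
        os.path.join(_cp_dir, "g_00001000"),
        {"generator": {"dummy_weight": torch.zeros(1)}},
    )
    _latest = scan_checkpoint(_cp_dir, prefix="g_", renamed_file="bigvgan_generator.pt")
    _state = load_checkpoint(_latest, device="cpu")
    print(sorted(_state["generator"].keys()))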

View File

@ -79,7 +79,7 @@ def set_seed(seed:int):
except:
pass
return seed
class TTS_Config:
default_configs={
"default":{
@ -118,18 +118,18 @@ class TTS_Config:
# "auto_yue",#多语种启动切分识别语种
def __init__(self, configs: Union[dict, str]=None):
# 设置默认配置文件路径
configs_base_path:str = "GPT_SoVITS/configs/"
os.makedirs(configs_base_path, exist_ok=True)
self.configs_path:str = os.path.join(configs_base_path, "tts_infer.yaml")
if configs in ["", None]:
if not os.path.exists(self.configs_path):
self.save_configs()
print(f"Create default config file at {self.configs_path}")
configs:dict = deepcopy(self.default_configs)
if isinstance(configs, str):
self.configs_path = configs
configs:dict = self._load_configs(self.configs_path)
@ -142,8 +142,8 @@ class TTS_Config:
default_config_key = "default"if version=="v1" else "default_v2"
self.configs:dict = configs.get("custom", deepcopy(self.default_configs[default_config_key]))
self.device = self.configs.get("device", torch.device("cpu"))
self.is_half = self.configs.get("is_half", False)
self.version = version
@ -153,7 +153,7 @@ class TTS_Config:
self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None)
self.languages = self.v2_languages if self.version=="v2" else self.v1_languages
if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)):
self.t2s_weights_path = self.default_configs[default_config_key]['t2s_weights_path']
print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}")
@ -167,8 +167,8 @@ class TTS_Config:
self.cnhuhbert_base_path = self.default_configs[default_config_key]['cnhuhbert_base_path']
print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}")
self.update_configs()
self.max_sec = None
self.hz:int = 50
self.semantic_frame_rate:str = "25hz"
@ -180,7 +180,7 @@ class TTS_Config:
self.n_speakers:int = 300
def _load_configs(self, configs_path: str)->dict:
if os.path.exists(configs_path):
...
@ -189,14 +189,14 @@ class TTS_Config:
self.save_configs(configs_path)
with open(configs_path, 'r') as f:
configs = yaml.load(f, Loader=yaml.FullLoader)
return configs
def save_configs(self, configs_path:str=None)->None:
configs=deepcopy(self.default_configs)
if self.configs is not None:
configs["custom"] = self.update_configs()
if configs_path is None:
configs_path = self.configs_path
with open(configs_path, 'w') as f:
@ -217,7 +217,7 @@ class TTS_Config:
def update_version(self, version:str)->None:
self.version = version
self.languages = self.v2_languages if self.version=="v2" else self.v1_languages
def __str__(self):
self.configs = self.update_configs()
string = "TTS Config".center(100, '-') + '\n'
@ -225,7 +225,7 @@ class TTS_Config:
string += f"{str(k).ljust(20)}: {str(v)}\n"
string += "-" * 100 + '\n'
return string
def __repr__(self):
return self.__str__()
@ -242,21 +242,21 @@ class TTS:
self.configs = configs
else:
self.configs:TTS_Config = TTS_Config(configs)
self.t2s_model:Text2SemanticLightningModule = None
self.vits_model:SynthesizerTrn = None
self.bert_tokenizer:AutoTokenizer = None
self.bert_model:AutoModelForMaskedLM = None
self.cnhuhbert_model:CNHubert = None
self._init_models()
self.text_preprocessor:TextPreprocessor = \
TextPreprocessor(self.bert_model,
self.bert_tokenizer,
self.configs.device)
self.prompt_cache:dict = {
"ref_audio_path" : None,
"prompt_semantic": None,
@ -268,8 +268,8 @@ class TTS:
"norm_text" : None,
"aux_ref_audio_paths": [],
}
self.stop_flag:bool = False
self.precision:torch.dtype = torch.float16 if self.configs.is_half else torch.float32
@ -279,9 +279,9 @@ class TTS:
self.init_bert_weights(self.configs.bert_base_path)
self.init_cnhuhbert_weights(self.configs.cnhuhbert_base_path)
# self.enable_half_precision(self.configs.is_half)
def init_cnhuhbert_weights(self, base_path: str):
print(f"Loading CNHuBERT weights from {base_path}")
self.cnhuhbert_model = CNHubert(base_path)
@ -289,9 +289,9 @@ class TTS:
self.cnhuhbert_model = self.cnhuhbert_model.to(self.configs.device)
if self.configs.is_half and str(self.configs.device)!="cpu":
self.cnhuhbert_model = self.cnhuhbert_model.half()
def init_bert_weights(self, base_path: str):
print(f"Loading BERT weights from {base_path}")
self.bert_tokenizer = AutoTokenizer.from_pretrained(base_path)
@ -300,7 +300,7 @@ class TTS:
self.bert_model = self.bert_model.to(self.configs.device)
if self.configs.is_half and str(self.configs.device)!="cpu":
self.bert_model = self.bert_model.half()
def init_vits_weights(self, weights_path: str):
print(f"Loading VITS weights from {weights_path}")
self.configs.vits_weights_path = weights_path
@ -311,11 +311,11 @@ class TTS:
else:
self.configs.update_version("v2")
self.configs.save_configs()
hps["model"]["version"] = self.configs.version
self.configs.filter_length = hps["data"]["filter_length"]
self.configs.segment_size = hps["train"]["segment_size"]
self.configs.sampling_rate = hps["data"]["sampling_rate"]
self.configs.hop_length = hps["data"]["hop_length"]
self.configs.win_length = hps["data"]["win_length"]
self.configs.n_speakers = hps["data"]["n_speakers"]
@ -330,7 +330,7 @@ class TTS:
if hasattr(vits_model, "enc_q"):
del vits_model.enc_q
vits_model = vits_model.to(self.configs.device)
vits_model = vits_model.eval()
vits_model.load_state_dict(dict_s2["weight"], strict=False)
@ -338,7 +338,7 @@ class TTS:
if self.configs.is_half and str(self.configs.device)!="cpu":
self.vits_model = self.vits_model.half()
def init_t2s_weights(self, weights_path: str):
print(f"Loading Text2Semantic weights from {weights_path}")
self.configs.t2s_weights_path = weights_path
@ -354,18 +354,18 @@ class TTS:
self.t2s_model = t2s_model
if self.configs.is_half and str(self.configs.device)!="cpu":
self.t2s_model = self.t2s_model.half()
def enable_half_precision(self, enable: bool = True, save: bool = True):
'''
To enable half precision for the TTS model.
Args:
enable: bool, whether to enable half precision.
'''
if str(self.configs.device) == "cpu" and enable:
print("Half precision is not supported on CPU.")
return
self.configs.is_half = enable
self.precision = torch.float16 if enable else torch.float32
if save:
@ -388,7 +388,7 @@ class TTS:
self.bert_model = self.bert_model.float()
if self.cnhuhbert_model is not None:
self.cnhuhbert_model = self.cnhuhbert_model.float()
def set_device(self, device: torch.device, save: bool = True):
'''
To set the device for all models.
@ -406,10 +406,10 @@ class TTS:
self.bert_model = self.bert_model.to(device)
if self.cnhuhbert_model is not None:
self.cnhuhbert_model = self.cnhuhbert_model.to(device)
def set_ref_audio(self, ref_audio_path:str):
'''
To set the reference audio for the TTS model,
including the prompt_semantic and refer_spec.
Args:
ref_audio_path: str, the path of the reference audio.
@ -417,9 +417,9 @@ class TTS:
self._set_prompt_semantic(ref_audio_path)
self._set_ref_spec(ref_audio_path)
self._set_ref_audio_path(ref_audio_path)
def _set_ref_audio_path(self, ref_audio_path):
self.prompt_cache["ref_audio_path"] = ref_audio_path
self.prompt_cache["ref_audio_path"] = ref_audio_path
def _set_ref_spec(self, ref_audio_path):
spec = self._get_ref_spec(ref_audio_path)
@ -472,10 +472,10 @@ class TTS:
1, 2
) # .float()
codes = self.vits_model.extract_latent(hubert_feature)
prompt_semantic = codes[0, 0].to(self.configs.device)
self.prompt_cache["prompt_semantic"] = prompt_semantic
def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length:int=None):
seq = sequences[0]
ndim = seq.dim()
@ -496,12 +496,12 @@ class TTS:
padded_sequences.append(padded_seq)
batch = torch.stack(padded_sequences)
return batch
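# Hedged illustration of batch_sequences (the padding details are only partly visible in this
# hunk, so the shapes below are an assumption): with seqs = [torch.ones(3), torch.ones(5)],
# self.batch_sequences(seqs, axis=0, pad_value=0) would stack them into a (2, 5) tensor,
# zero-padding the shorter item.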
def to_batch(self, data:list,
prompt_data:dict=None,
batch_size:int=5,
threshold:float=0.75,
split_bucket:bool=True,
device:torch.device=torch.device("cpu"),
precision:torch.dtype=torch.float32,
):
@ -514,8 +514,8 @@ class TTS:
batch_index_list = []
if split_bucket:
index_and_len_list.sort(key=lambda x: x[1])
index_and_len_list = np.array(index_and_len_list, dtype=np.int64)
batch_index_list_len = 0
pos = 0
while pos <index_and_len_list.shape[0]:
@ -531,16 +531,16 @@ class TTS:
pos = pos_end
break
pos_end=pos_end-1
assert batch_index_list_len == len(data)
else:
for i in range(len(data)):
if i%batch_size == 0:
batch_index_list.append([])
batch_index_list[-1].append(i)
for batch_idx, index_list in enumerate(batch_index_list):
item_list = [data[idx] for idx in index_list]
phones_list = []
@ -568,19 +568,19 @@ class TTS:
all_bert_max_len = max(all_bert_max_len, all_bert_features.shape[-1])
all_phones_max_len = max(all_phones_max_len, all_phones.shape[-1])
phones_list.append(phones)
phones_len_list.append(phones.shape[-1])
all_phones_list.append(all_phones)
all_phones_len_list.append(all_phones.shape[-1])
all_bert_features_list.append(all_bert_features)
norm_text_batch.append(item["norm_text"])
phones_batch = phones_list
all_phones_batch = all_phones_list
all_bert_features_batch = all_bert_features_list
max_len = max(all_bert_max_len, all_phones_max_len)
# phones_batch = self.batch_sequences(phones_list, axis=0, pad_value=0, max_length=max_len)
#### Pad phones and bert_features directly. The padding strategy affects the T2S model's output, but it does not directly affect the repetition probability; the masking strategy is the main factor there.
@ -589,16 +589,16 @@ class TTS:
# all_bert_features_batch = torch.zeros((len(all_bert_features_list), 1024, max_len), dtype=precision, device=device)
# for idx, item in enumerate(all_bert_features_list):
# all_bert_features_batch[idx, :, : item.shape[-1]] = item
# #### Alternatively, embed phones and project bert_features first, then pad to the same length (the padding strategy affects the T2S model's output but not the repetition probability directly; the masking strategy is the main factor there).
# all_phones_list = [self.t2s_model.model.ar_text_embedding(item.to(self.t2s_model.device)) for item in all_phones_list]
# all_phones_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) for item in all_phones_list]
# all_phones_batch = torch.stack(all_phones_list, dim=0)
# all_bert_features_list = [self.t2s_model.model.bert_proj(item.to(self.t2s_model.device).transpose(0, 1)) for item in all_bert_features_list]
# all_bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) for item in all_bert_features_list]
# all_bert_features_batch = torch.stack(all_bert_features_list, dim=0)
batch = {
"phones": phones_batch,
"phones_len": torch.LongTensor(phones_len_list).to(device),
@ -609,17 +609,17 @@ class TTS:
"max_len": max_len,
}
_data.append(batch)
return _data, batch_index_list
def recovery_order(self, data:list, batch_index_list:list)->list:
'''
Recover the original order of the audio according to the batch_index_list.
Args:
data (List[list[np.ndarray]]): the out-of-order audio fragments.
batch_index_list (List[list[int]]): the batch index list.
Returns:
list (List[np.ndarray]): the data in the original order.
'''
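# Hedged illustration of the intended behaviour (a, b, c are placeholder fragments):
#   data = [[a, b], [c]], batch_index_list = [[2, 0], [1]]
#   recovery_order(data, batch_index_list) -> [b, c, a]
#   i.e. each fragment is written back to the original index recorded for its batch slot.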
@ -635,14 +635,14 @@ class TTS:
Stop the inference process.
'''
self.stop_flag = True
@torch.no_grad()
def run(self, inputs:dict):
"""
Text to speech inference.
Args:
inputs (dict):
{
"text": "", # str.(required) text to be synthesized
"text_lang: "", # str.(required) language of the text to be synthesized
@ -734,7 +734,7 @@ class TTS:
if not os.path.exists(ref_audio_path):
raise ValueError(f"{ref_audio_path} not exists")
self.set_ref_audio(ref_audio_path)
aux_ref_audio_paths = aux_ref_audio_paths if aux_ref_audio_paths is not None else []
paths = set(aux_ref_audio_paths)&set(self.prompt_cache["aux_ref_audio_paths"])
if not (len(list(paths)) == len(aux_ref_audio_paths) == len(self.prompt_cache["aux_ref_audio_paths"])):
@ -744,10 +744,10 @@ class TTS:
if path in [None, ""]:
continue
if not os.path.exists(path):
print(i18n("音频文件不存在,跳过:{}").format(path))
print(i18n("音频文件不存在,跳过:"), path)
continue
self.prompt_cache["refer_spec"].append(self._get_ref_spec(path))
if not no_prompt_text:
prompt_text = prompt_text.strip("\n")
if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_lang != "en" else "."
@ -757,7 +757,7 @@ class TTS:
self.prompt_cache["prompt_lang"] = prompt_lang
phones, bert_features, norm_text = \
self.text_preprocessor.segment_and_extract_feature_for_text(
prompt_text,
prompt_text,
prompt_lang,
self.configs.version)
self.prompt_cache["phones"] = phones
@ -778,26 +778,26 @@ class TTS:
return
batch_index_list:list = None
data, batch_index_list = self.to_batch(data,
prompt_data=self.prompt_cache if not no_prompt_text else None,
batch_size=batch_size,
threshold=batch_threshold,
split_bucket=split_bucket,
device=self.configs.device,
precision=self.precision
)
else:
print(i18n("############ 切分文本 ############"))
print(f'############ {i18n("切分文本")} ############')
texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method)
data = []
for i in range(len(texts)):
if i%batch_size == 0:
data.append([])
data[-1].append(texts[i])
def make_batch(batch_texts):
batch_data = []
print(i18n("############ 提取文本Bert特征 ############"))
print(f'############ {i18n("提取文本Bert特征")} ############')
for text in tqdm(batch_texts):
phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(text, text_lang, self.configs.version)
if phones is None:
@ -810,9 +810,9 @@ class TTS:
batch_data.append(res)
if len(batch_data) == 0:
return None
batch, _ = self.to_batch(batch_data,
prompt_data=self.prompt_cache if not no_prompt_text else None,
batch_size=batch_size,
threshold=batch_threshold,
split_bucket=False,
device=self.configs.device,
@ -868,10 +868,10 @@ class TTS:
t_34 += t4 - t3
refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
batch_audio_fragment = []
# ## vits并行推理 method 1
# pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
# pred_semantic_len = torch.LongTensor([item.shape[0] for item in pred_semantic_list]).to(self.configs.device)
@ -914,10 +914,10 @@ class TTS:
t_45 += t5 - t4
if return_fragment:
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
yield self.audio_postprocess([batch_audio_fragment],
self.configs.sampling_rate,
None,
speed_factor,
False,
fragment_interval
)
@ -935,10 +935,10 @@ class TTS:
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
dtype=np.int16)
return
yield self.audio_postprocess(audio,
self.configs.sampling_rate,
batch_index_list,
speed_factor,
split_bucket,
fragment_interval
)
@ -958,7 +958,7 @@ class TTS:
raise e
finally:
self.empty_cache()
def empty_cache(self):
try:
gc.collect() # trigger garbage collection to keep memory from growing over time
@ -967,13 +967,13 @@ class TTS:
elif str(self.configs.device) == "mps":
torch.mps.empty_cache()
except:
pass
def audio_postprocess(self,
audio:List[torch.Tensor],
sr:int,
batch_index_list:list=None,
speed_factor:float=1.0,
split_bucket:bool=True,
fragment_interval:float=0.3
)->Tuple[int, np.ndarray]:
@ -982,36 +982,36 @@ class TTS:
dtype=self.precision,
device=self.configs.device
)
for i, batch in enumerate(audio):
for j, audio_fragment in enumerate(batch):
max_audio=torch.abs(audio_fragment).max()  # simple guard against 16-bit clipping
if max_audio>1: audio_fragment/=max_audio
audio_fragment:torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0)
audio[i][j] = audio_fragment.cpu().numpy()
if split_bucket:
audio = self.recovery_order(audio, batch_index_list)
else:
# audio = [item for batch in audio for item in batch]
audio = sum(audio, [])
audio = np.concatenate(audio, 0)
audio = (audio * 32768).astype(np.int16)
# try:
# if speed_factor != 1.0:
# audio = speed_change(audio, speed=speed_factor, sr=int(sr))
# except Exception as e:
# print(f"Failed to change speed of audio: \n{e}")
return sr, audio
def speed_change(input_audio:np.ndarray, speed:float, sr:int):
# Convert the NumPy array to a raw PCM byte stream
raw_audio = input_audio.astype(np.int16).tobytes()

View File

@ -49,18 +49,18 @@ def merge_short_text_in_array(texts:str, threshold:int) -> list:
class TextPreprocessor:
def __init__(self, bert_model:AutoModelForMaskedLM,
tokenizer:AutoTokenizer, device:torch.device):
self.bert_model = bert_model
self.tokenizer = tokenizer
self.device = device
def preprocess(self, text:str, lang:str, text_split_method:str, version:str="v2")->List[Dict]:
print(i18n("############ 切分文本 ############"))
print(f'############ {i18n("切分文本")} ############')
text = self.replace_consecutive_punctuation(text)
texts = self.pre_seg_text(text, lang, text_split_method)
result = []
print(i18n("############ 提取文本Bert特征 ############"))
print(f'############ {i18n("提取文本Bert特征")} ############')
for text in tqdm(texts):
phones, bert_features, norm_text = self.segment_and_extract_feature_for_text(text, lang, version)
if phones is None or norm_text=="":
@ -77,14 +77,14 @@ class TextPreprocessor:
text = text.strip("\n")
if len(text) == 0:
return []
if (text[0] not in splits and len(get_first(text)) < 4):
text = "" + text if lang != "en" else "." + text
print(i18n("实际输入的目标文本:"))
print(text)
seg_method = get_seg_method(text_split_method)
text = seg_method(text)
while "\n\n" in text:
text = text.replace("\n\n", "\n")
@ -93,29 +93,29 @@ class TextPreprocessor:
_texts = merge_short_text_in_array(_texts, 5)
texts = []
for text in _texts:
# Skip blank lines in the target text so they do not cause errors
if (len(text.strip()) == 0):
continue
if not re.sub("\W+", "", text):
# If the segment is pure punctuation/symbols, skip it
continue
if (text[-1] not in splits): text += "。" if lang != "en" else "."
# Split overly long sentences so that BERT (max 510 chars) does not error out
if (len(text) > 510):
texts.extend(split_big_text(text))
else:
texts.append(text)
print(i18n("实际输入的目标文本(切句后):"))
print(texts)
return texts
def segment_and_extract_feature_for_text(self, text:str, language:str, version:str="v1")->Tuple[list, torch.Tensor, str]:
return self.get_phones_and_bert(text, language, version)
def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False):
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
language = language.replace("all_","")
@ -199,7 +199,7 @@ class TextPreprocessor:
phone_level_feature.append(repeat_feature)
phone_level_feature = torch.cat(phone_level_feature, dim=0)
return phone_level_feature.T
def clean_text_inf(self, text:str, language:str, version:str="v2"):
phones, word2ph, norm_text = clean_text(text, language, version)
phones = cleaned_text_to_sequence(phones, version)
@ -228,13 +228,10 @@ class TextPreprocessor:
else:
_text.append(text)
return _text
def replace_consecutive_punctuation(self,text):
punctuations = ''.join(re.escape(p) for p in punctuation)
pattern = f'([{punctuations}])([{punctuations}])+'
result = re.sub(pattern, r'\1', text)
return result
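# Hedged example (assuming '!' and '?' are members of the module's punctuation set):
#   replace_consecutive_punctuation("wait!!??ok") -> "wait!ok"
#   a run of consecutive punctuation marks is collapsed to its first character.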

View File

@ -0,0 +1,13 @@
# from f5_tts.model.cfm import CFM
#
# from f5_tts.model.backbones.unett import UNetT
from GPT_SoVITS.f5_tts.model.backbones.dit import DiT
# from f5_tts.model.backbones.dit import DiTNoCond
# from f5_tts.model.backbones.dit import DiTNoCondNoT
# from f5_tts.model.backbones.mmdit import MMDiT
# from f5_tts.model.trainer import Trainer
# __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
# __all__ = ["CFM", "UNetT", "DiTNoCond","DiT", "MMDiT"]

View File

@ -0,0 +1,20 @@
## Backbones quick introduction
### unett.py
- flat unet transformer
- structure same as in e2-tts & voicebox paper except using rotary pos emb
- update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat
### dit.py
- adaln-zero dit
- embedded timestep as condition
- concatted noised_input + masked_cond + embedded_text, linear proj in
- possible abs pos emb & convnextv2 blocks for embedded text before concat
- possible long skip connection (first layer to last layer)
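
A minimal smoke-test sketch of this backbone (hedged: the hyperparameters and tensor sizes below are illustrative assumptions for a shape check, not the values the released checkpoints use, and it assumes the GPT-SoVITS source tree and `x_transformers` are importable):

```python
import torch
from GPT_SoVITS.f5_tts.model.backbones.dit import DiT

# small, assumed hyperparameters purely for a quick shape check
model = DiT(dim=512, depth=4, heads=8, dim_head=64,
            mel_dim=100, text_dim=256, conv_layers=2)

b, n = 2, 200                                    # batch size, mel frames
x0     = torch.randn(b, 100, n)                  # noised mel, channels-first (b, mel_dim, n)
cond0  = torch.randn(b, 100, n)                  # masked condition mel
text0  = torch.randn(b, 256, n)                  # condition features (b, text_dim, n)
x_lens = torch.full((b,), n, dtype=torch.long)   # valid frames per sample
t      = torch.rand(b)                           # flow-matching time step
dt     = torch.rand(b)                           # dt_base_bootstrap

out = model(x0, cond0, x_lens, t, dt, text0)     # -> (b, n, mel_dim)
```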
### mmdit.py
- sd3 structure
- timestep as condition
- left stream: text embedded and applied a abs pos emb
- right stream: masked_cond & noised_input concatted and with same conv pos emb as unett

View File

@ -0,0 +1,169 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""
from __future__ import annotations
import torch
from torch import nn
import torch.nn.functional as F
from x_transformers.x_transformers import RotaryEmbedding
from GPT_SoVITS.f5_tts.model.modules import (
TimestepEmbedding,
ConvNeXtV2Block,
ConvPositionEmbedding,
DiTBlock,
AdaLayerNormZero_Final,
precompute_freqs_cis,
get_pos_embed_indices,
)
from module.commons import sequence_mask
class TextEmbedding(nn.Module):
def __init__(self, text_dim, conv_layers=0, conv_mult=2):
super().__init__()
if conv_layers > 0:
self.extra_modeling = True
self.precompute_max_pos = 4096 # ~44s of 24khz audio
self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
self.text_blocks = nn.Sequential(
*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
)
else:
self.extra_modeling = False
def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
batch, text_len = text.shape[0], text.shape[1]
if drop_text: # cfg for text
text = torch.zeros_like(text)
# possible extra modeling
if self.extra_modeling:
# sinus pos emb
batch_start = torch.zeros((batch,), dtype=torch.long)
pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
text_pos_embed = self.freqs_cis[pos_idx]
# print(23333333,text.shape,text_pos_embed.shape)#torch.Size([7, 465, 256]) torch.Size([7, 465, 256])
text = text + text_pos_embed
# convnextv2 blocks
text = self.text_blocks(text)
return text
# noised input audio and context mixing embedding
class InputEmbedding(nn.Module):
def __init__(self, mel_dim, text_dim, out_dim):
super().__init__()
self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
if drop_audio_cond: # cfg for cond audio
cond = torch.zeros_like(cond)
x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
x = self.conv_pos_embed(x) + x
return x
# Transformer backbone using DiT blocks
class DiT(nn.Module):
def __init__(
self,
*,
dim,
depth=8,
heads=8,
dim_head=64,
dropout=0.1,
ff_mult=4,
mel_dim=100,
text_dim=None,
conv_layers=0,
long_skip_connection=False,
):
super().__init__()
self.time_embed = TimestepEmbedding(dim)
self.d_embed = TimestepEmbedding(dim)
if text_dim is None:
text_dim = mel_dim
self.text_embed = TextEmbedding(text_dim, conv_layers=conv_layers)
self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
self.rotary_embed = RotaryEmbedding(dim_head)
self.dim = dim
self.depth = depth
self.transformer_blocks = nn.ModuleList(
[DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
)
self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
self.proj_out = nn.Linear(dim, mel_dim)
def forward(  # x, prompt_x, x_lens, t, style, cond
self,  # d is channel, n is T
x0: float["b d n"],  # noised input audio, channels-first (transposed to "b n d" below) # noqa: F722
cond0: float["b d n"],  # masked cond audio # noqa: F722
x_lens,
time: float["b"] | float[""], # time step # noqa: F821 F722
dt_base_bootstrap,
text0,  # condition feature, channels-first like x0/cond0
###no-use
drop_audio_cond=False, # cfg for cond audio
drop_text=False, # cfg for text
# mask: bool["b n"] | None = None, # noqa: F722
):
x=x0.transpose(2,1)
cond=cond0.transpose(2,1)
text=text0.transpose(2,1)
mask = sequence_mask(x_lens,max_length=x.size(1)).to(x.device)
batch, seq_len = x.shape[0], x.shape[1]
if time.ndim == 0:
time = time.repeat(batch)
# t: conditioning time, c: context (text + masked cond audio), x: noised input audio
t = self.time_embed(time)
dt = self.d_embed(dt_base_bootstrap)
t+=dt
text_embed = self.text_embed(text, seq_len, drop_text=drop_text)###need to change
x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
rope = self.rotary_embed.forward_from_seq_len(seq_len)
if self.long_skip_connection is not None:
residual = x
for block in self.transformer_blocks:
x = block(x, t, mask=mask, rope=rope)
if self.long_skip_connection is not None:
x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
x = self.norm_out(x, t)
output = self.proj_out(x)
return output

View File

@ -0,0 +1,146 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""
from __future__ import annotations
import torch
from torch import nn
from x_transformers.x_transformers import RotaryEmbedding
from f5_tts.model.modules import (
TimestepEmbedding,
ConvPositionEmbedding,
MMDiTBlock,
AdaLayerNormZero_Final,
precompute_freqs_cis,
get_pos_embed_indices,
)
# text embedding
class TextEmbedding(nn.Module):
def __init__(self, out_dim, text_num_embeds):
super().__init__()
self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim) # will use 0 as filler token
self.precompute_max_pos = 1024
self.register_buffer("freqs_cis", precompute_freqs_cis(out_dim, self.precompute_max_pos), persistent=False)
def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]: # noqa: F722
text = text + 1
if drop_text:
text = torch.zeros_like(text)
text = self.text_embed(text)
# sinus pos emb
batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
batch_text_len = text.shape[1]
pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
text_pos_embed = self.freqs_cis[pos_idx]
text = text + text_pos_embed
return text
# noised input & masked cond audio embedding
class AudioEmbedding(nn.Module):
def __init__(self, in_dim, out_dim):
super().__init__()
self.linear = nn.Linear(2 * in_dim, out_dim)
self.conv_pos_embed = ConvPositionEmbedding(out_dim)
def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False): # noqa: F722
if drop_audio_cond:
cond = torch.zeros_like(cond)
x = torch.cat((x, cond), dim=-1)
x = self.linear(x)
x = self.conv_pos_embed(x) + x
return x
# Transformer backbone using MM-DiT blocks
class MMDiT(nn.Module):
def __init__(
self,
*,
dim,
depth=8,
heads=8,
dim_head=64,
dropout=0.1,
ff_mult=4,
text_num_embeds=256,
mel_dim=100,
):
super().__init__()
self.time_embed = TimestepEmbedding(dim)
self.text_embed = TextEmbedding(dim, text_num_embeds)
self.audio_embed = AudioEmbedding(mel_dim, dim)
self.rotary_embed = RotaryEmbedding(dim_head)
self.dim = dim
self.depth = depth
self.transformer_blocks = nn.ModuleList(
[
MMDiTBlock(
dim=dim,
heads=heads,
dim_head=dim_head,
dropout=dropout,
ff_mult=ff_mult,
context_pre_only=i == depth - 1,
)
for i in range(depth)
]
)
self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
self.proj_out = nn.Linear(dim, mel_dim)
def forward(
self,
x: float["b n d"], # nosied input audio # noqa: F722
cond: float["b n d"], # masked cond audio # noqa: F722
text: int["b nt"], # text # noqa: F722
time: float["b"] | float[""], # time step # noqa: F821 F722
drop_audio_cond, # cfg for cond audio
drop_text, # cfg for text
mask: bool["b n"] | None = None, # noqa: F722
):
batch = x.shape[0]
if time.ndim == 0:
time = time.repeat(batch)
# t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
t = self.time_embed(time)
c = self.text_embed(text, drop_text=drop_text)
x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)
seq_len = x.shape[1]
text_len = text.shape[1]
rope_audio = self.rotary_embed.forward_from_seq_len(seq_len)
rope_text = self.rotary_embed.forward_from_seq_len(text_len)
for block in self.transformer_blocks:
c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)
x = self.norm_out(x, t)
output = self.proj_out(x)
return output

View File

@ -0,0 +1,219 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""
from __future__ import annotations
from typing import Literal
import torch
from torch import nn
import torch.nn.functional as F
from x_transformers import RMSNorm
from x_transformers.x_transformers import RotaryEmbedding
from f5_tts.model.modules import (
TimestepEmbedding,
ConvNeXtV2Block,
ConvPositionEmbedding,
Attention,
AttnProcessor,
FeedForward,
precompute_freqs_cis,
get_pos_embed_indices,
)
# Text embedding
class TextEmbedding(nn.Module):
def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
super().__init__()
self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
if conv_layers > 0:
self.extra_modeling = True
self.precompute_max_pos = 4096 # ~44s of 24khz audio
self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
self.text_blocks = nn.Sequential(
*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
)
else:
self.extra_modeling = False
def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
batch, text_len = text.shape[0], text.shape[1]
text = F.pad(text, (0, seq_len - text_len), value=0)
if drop_text: # cfg for text
text = torch.zeros_like(text)
text = self.text_embed(text) # b n -> b n d
# possible extra modeling
if self.extra_modeling:
# sinus pos emb
batch_start = torch.zeros((batch,), dtype=torch.long)
pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
text_pos_embed = self.freqs_cis[pos_idx]
text = text + text_pos_embed
# convnextv2 blocks
text = self.text_blocks(text)
return text
# noised input audio and context mixing embedding
class InputEmbedding(nn.Module):
def __init__(self, mel_dim, text_dim, out_dim):
super().__init__()
self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
if drop_audio_cond: # cfg for cond audio
cond = torch.zeros_like(cond)
x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
x = self.conv_pos_embed(x) + x
return x
# Flat UNet Transformer backbone
class UNetT(nn.Module):
def __init__(
self,
*,
dim,
depth=8,
heads=8,
dim_head=64,
dropout=0.1,
ff_mult=4,
mel_dim=100,
text_num_embeds=256,
text_dim=None,
conv_layers=0,
skip_connect_type: Literal["add", "concat", "none"] = "concat",
):
super().__init__()
assert depth % 2 == 0, "UNet-Transformer's depth should be even."
self.time_embed = TimestepEmbedding(dim)
if text_dim is None:
text_dim = mel_dim
self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
self.rotary_embed = RotaryEmbedding(dim_head)
# transformer layers & skip connections
self.dim = dim
self.skip_connect_type = skip_connect_type
needs_skip_proj = skip_connect_type == "concat"
self.depth = depth
self.layers = nn.ModuleList([])
for idx in range(depth):
is_later_half = idx >= (depth // 2)
attn_norm = RMSNorm(dim)
attn = Attention(
processor=AttnProcessor(),
dim=dim,
heads=heads,
dim_head=dim_head,
dropout=dropout,
)
ff_norm = RMSNorm(dim)
ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
skip_proj = nn.Linear(dim * 2, dim, bias=False) if needs_skip_proj and is_later_half else None
self.layers.append(
nn.ModuleList(
[
skip_proj,
attn_norm,
attn,
ff_norm,
ff,
]
)
)
self.norm_out = RMSNorm(dim)
self.proj_out = nn.Linear(dim, mel_dim)
def forward(
self,
x: float["b n d"], # nosied input audio # noqa: F722
cond: float["b n d"], # masked cond audio # noqa: F722
text: int["b nt"], # text # noqa: F722
time: float["b"] | float[""], # time step # noqa: F821 F722
drop_audio_cond, # cfg for cond audio
drop_text, # cfg for text
mask: bool["b n"] | None = None, # noqa: F722
):
batch, seq_len = x.shape[0], x.shape[1]
if time.ndim == 0:
time = time.repeat(batch)
# t: conditioning time, c: context (text + masked cond audio), x: noised input audio
t = self.time_embed(time)
text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
# postfix time t to input x, [b n d] -> [b n+1 d]
x = torch.cat([t.unsqueeze(1), x], dim=1) # pack t to x
if mask is not None:
mask = F.pad(mask, (1, 0), value=1)
rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)
# flat unet transformer
skip_connect_type = self.skip_connect_type
skips = []
for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
layer = idx + 1
# skip connection logic
is_first_half = layer <= (self.depth // 2)
is_later_half = not is_first_half
if is_first_half:
skips.append(x)
if is_later_half:
skip = skips.pop()
if skip_connect_type == "concat":
x = torch.cat((x, skip), dim=-1)
x = maybe_skip_proj(x)
elif skip_connect_type == "add":
x = x + skip
# attention and feedforward blocks
x = attn(attn_norm(x), rope=rope, mask=mask) + x
x = ff(ff_norm(x)) + x
assert len(skips) == 0
x = self.norm_out(x)[:, 1:, :] # unpack t from x
return self.proj_out(x)

View File

@ -0,0 +1,664 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""
from __future__ import annotations
import math
from typing import Optional
import torch
import torch.nn.functional as F
import torchaudio
from librosa.filters import mel as librosa_mel_fn
from torch import nn
from x_transformers.x_transformers import apply_rotary_pos_emb
# raw wav to mel spec
mel_basis_cache = {}
hann_window_cache = {}
def get_bigvgan_mel_spectrogram(
waveform,
n_fft=1024,
n_mel_channels=100,
target_sample_rate=24000,
hop_length=256,
win_length=1024,
fmin=0,
fmax=None,
center=False,
): # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
device = waveform.device
key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
if key not in mel_basis_cache:
mel = librosa_mel_fn(sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax)
mel_basis_cache[key] = torch.from_numpy(mel).float().to(device) # TODO: why they need .float()?
hann_window_cache[key] = torch.hann_window(win_length).to(device)
mel_basis = mel_basis_cache[key]
hann_window = hann_window_cache[key]
padding = (n_fft - hop_length) // 2
waveform = torch.nn.functional.pad(waveform.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1)
spec = torch.stft(
waveform,
n_fft,
hop_length=hop_length,
win_length=win_length,
window=hann_window,
center=center,
pad_mode="reflect",
normalized=False,
onesided=True,
return_complex=True,
)
spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
mel_spec = torch.matmul(mel_basis, spec)
mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
return mel_spec
def get_vocos_mel_spectrogram(
waveform,
n_fft=1024,
n_mel_channels=100,
target_sample_rate=24000,
hop_length=256,
win_length=1024,
):
mel_stft = torchaudio.transforms.MelSpectrogram(
sample_rate=target_sample_rate,
n_fft=n_fft,
win_length=win_length,
hop_length=hop_length,
n_mels=n_mel_channels,
power=1,
center=True,
normalized=False,
norm=None,
).to(waveform.device)
if len(waveform.shape) == 3:
waveform = waveform.squeeze(1) # 'b 1 nw -> b nw'
assert len(waveform.shape) == 2
mel = mel_stft(waveform)
mel = mel.clamp(min=1e-5).log()
return mel
class MelSpec(nn.Module):
def __init__(
self,
n_fft=1024,
hop_length=256,
win_length=1024,
n_mel_channels=100,
target_sample_rate=24_000,
mel_spec_type="vocos",
):
super().__init__()
assert mel_spec_type in ["vocos", "bigvgan"], "Only two mel extraction backends are supported: vocos or bigvgan"
self.n_fft = n_fft
self.hop_length = hop_length
self.win_length = win_length
self.n_mel_channels = n_mel_channels
self.target_sample_rate = target_sample_rate
if mel_spec_type == "vocos":
self.extractor = get_vocos_mel_spectrogram
elif mel_spec_type == "bigvgan":
self.extractor = get_bigvgan_mel_spectrogram
self.register_buffer("dummy", torch.tensor(0), persistent=False)
def forward(self, wav):
if self.dummy.device != wav.device:
self.to(wav.device)
mel = self.extractor(
waveform=wav,
n_fft=self.n_fft,
n_mel_channels=self.n_mel_channels,
target_sample_rate=self.target_sample_rate,
hop_length=self.hop_length,
win_length=self.win_length,
)
return mel
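# Hedged shape check (24 kHz mono input, default "vocos" settings above):
#   MelSpec()(torch.randn(2, 24000)).shape -> (2, 100, ~94) log-mel frames
#   (hop_length=256 with center=True gives roughly n_samples/256 + 1 frames)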
# sinusoidal position embedding
class SinusPositionEmbedding(nn.Module):
def __init__(self, dim):
super().__init__()
self.dim = dim
def forward(self, x, scale=1000):
device = x.device
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
return emb
# convolutional position embedding
class ConvPositionEmbedding(nn.Module):
def __init__(self, dim, kernel_size=31, groups=16):
super().__init__()
assert kernel_size % 2 != 0
self.conv1d = nn.Sequential(
nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
nn.Mish(),
nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
nn.Mish(),
)
def forward(self, x: float["b n d"], mask: bool["b n"] | None = None): # noqa: F722
if mask is not None:
mask = mask[..., None]
x = x.masked_fill(~mask, 0.0)
x = x.permute(0, 2, 1)
x = self.conv1d(x)
out = x.permute(0, 2, 1)
if mask is not None:
out = out.masked_fill(~mask, 0.0)
return out
# rotary positional embedding related
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
# proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
# has some connection to NTK literature
# https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
# https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
theta *= theta_rescale_factor ** (dim / (dim - 2))
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
t = torch.arange(end, device=freqs.device) # type: ignore
freqs = torch.outer(t, freqs).float() # type: ignore
freqs_cos = torch.cos(freqs) # real part
freqs_sin = torch.sin(freqs) # imaginary part
return torch.cat([freqs_cos, freqs_sin], dim=-1)
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
# length = length if isinstance(length, int) else length.max()
scale = scale * torch.ones_like(start, dtype=torch.float32) # in case scale is a scalar
pos = (
start.unsqueeze(1)
+ (torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) * scale.unsqueeze(1)).long()
)
# avoid extra long error.
pos = torch.where(pos < max_pos, pos, max_pos - 1)
return pos
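# Hedged usage sketch (dimensions are illustrative):
#   freqs = precompute_freqs_cis(64, 4096)                   # (4096, 64): cos half | sin half
#   start = torch.zeros(2, dtype=torch.long)                 # per-sample start positions
#   idx   = get_pos_embed_indices(start, 200, max_pos=4096)  # (2, 200), clamped below max_pos
#   rope_table = freqs[idx]                                   # (2, 200, 64) per-position table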
# Global Response Normalization layer (Instance Normalization ?)
class GRN(nn.Module):
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
self.beta = nn.Parameter(torch.zeros(1, 1, dim))
def forward(self, x):
Gx = torch.norm(x, p=2, dim=1, keepdim=True)
Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
return self.gamma * (x * Nx) + self.beta + x
# ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
# ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
class ConvNeXtV2Block(nn.Module):
def __init__(
self,
dim: int,
intermediate_dim: int,
dilation: int = 1,
):
super().__init__()
padding = (dilation * (7 - 1)) // 2
self.dwconv = nn.Conv1d(
dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
) # depthwise conv
self.norm = nn.LayerNorm(dim, eps=1e-6)
self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers
self.act = nn.GELU()
self.grn = GRN(intermediate_dim)
self.pwconv2 = nn.Linear(intermediate_dim, dim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
residual = x
x = x.transpose(1, 2) # b n d -> b d n
x = self.dwconv(x)
x = x.transpose(1, 2) # b d n -> b n d
x = self.norm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.grn(x)
x = self.pwconv2(x)
return residual + x
# AdaLayerNormZero
# return with modulated x for attn input, and params for later mlp modulation
class AdaLayerNormZero(nn.Module):
def __init__(self, dim):
super().__init__()
self.silu = nn.SiLU()
self.linear = nn.Linear(dim, dim * 6)
self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
def forward(self, x, emb=None):
emb = self.linear(self.silu(emb))
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
# AdaLayerNormZero for final layer
# return only the modulated x for attn input, since there is no further mlp modulation
class AdaLayerNormZero_Final(nn.Module):
def __init__(self, dim):
super().__init__()
self.silu = nn.SiLU()
self.linear = nn.Linear(dim, dim * 2)
self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
def forward(self, x, emb):
emb = self.linear(self.silu(emb))
scale, shift = torch.chunk(emb, 2, dim=1)
x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
return x
# FeedForward
class FeedForward(nn.Module):
def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
super().__init__()
inner_dim = int(dim * mult)
dim_out = dim_out if dim_out is not None else dim
activation = nn.GELU(approximate=approximate)
project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
def forward(self, x):
return self.ff(x)
# Attention with possible joint part
# modified from diffusers/src/diffusers/models/attention_processor.py
class Attention(nn.Module):
def __init__(
self,
processor: JointAttnProcessor | AttnProcessor,
dim: int,
heads: int = 8,
dim_head: int = 64,
dropout: float = 0.0,
context_dim: Optional[int] = None, # if not None -> joint attention
context_pre_only=None,
):
super().__init__()
if not hasattr(F, "scaled_dot_product_attention"):
raise ImportError("Attention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
self.processor = processor
self.dim = dim
self.heads = heads
self.inner_dim = dim_head * heads
self.dropout = dropout
self.context_dim = context_dim
self.context_pre_only = context_pre_only
self.to_q = nn.Linear(dim, self.inner_dim)
self.to_k = nn.Linear(dim, self.inner_dim)
self.to_v = nn.Linear(dim, self.inner_dim)
if self.context_dim is not None:
self.to_k_c = nn.Linear(context_dim, self.inner_dim)
self.to_v_c = nn.Linear(context_dim, self.inner_dim)
if self.context_pre_only is not None:
self.to_q_c = nn.Linear(context_dim, self.inner_dim)
self.to_out = nn.ModuleList([])
self.to_out.append(nn.Linear(self.inner_dim, dim))
self.to_out.append(nn.Dropout(dropout))
if self.context_pre_only is not None and not self.context_pre_only:
self.to_out_c = nn.Linear(self.inner_dim, dim)
def forward(
self,
x: float["b n d"], # noised input x # noqa: F722
c: float["b n d"] = None, # context c # noqa: F722
mask: bool["b n"] | None = None, # noqa: F722
rope=None, # rotary position embedding for x
c_rope=None, # rotary position embedding for c
) -> torch.Tensor:
if c is not None:
return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
else:
return self.processor(self, x, mask=mask, rope=rope)
# Attention processor
# from torch.nn.attention import SDPBackend
# torch.backends.cuda.enable_flash_sdp(True)
class AttnProcessor:
def __init__(self):
pass
def __call__(
self,
attn: Attention,
x: float["b n d"], # noised input x # noqa: F722
mask: bool["b n"] | None = None, # noqa: F722
rope=None, # rotary position embedding
) -> torch.FloatTensor:
batch_size = x.shape[0]
# `sample` projections.
query = attn.to_q(x)
key = attn.to_k(x)
value = attn.to_v(x)
# apply rotary position embedding
if rope is not None:
freqs, xpos_scale = rope
q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
# attention
inner_dim = key.shape[-1]
head_dim = inner_dim // attn.heads
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
# mask. e.g. inference got a batch with different target durations, mask out the padding
if mask is not None:
attn_mask = mask
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
# print(3433333333,attn_mask.shape)
attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
else:
attn_mask = None
# with torch.nn.attention.sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]):
# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=True):
# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=False):
# print(torch.backends.cuda.flash_sdp_enabled())
# print(torch.backends.cuda.mem_efficient_sdp_enabled())
# print(torch.backends.cuda.math_sdp_enabled())
x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
x = x.to(query.dtype)
# linear proj
x = attn.to_out[0](x)
# dropout
x = attn.to_out[1](x)
if mask is not None:
mask = mask.unsqueeze(-1)
x = x.masked_fill(~mask, 0.0)
return x
# Joint Attention processor for MM-DiT
# modified from diffusers/src/diffusers/models/attention_processor.py
class JointAttnProcessor:
def __init__(self):
pass
def __call__(
self,
attn: Attention,
x: float["b n d"], # noised input x # noqa: F722
c: float["b nt d"] = None, # context c, here text # noqa: F722
mask: bool["b n"] | None = None, # noqa: F722
rope=None, # rotary position embedding for x
c_rope=None, # rotary position embedding for c
) -> torch.FloatTensor:
residual = x
batch_size = c.shape[0]
# `sample` projections.
query = attn.to_q(x)
key = attn.to_k(x)
value = attn.to_v(x)
# `context` projections.
c_query = attn.to_q_c(c)
c_key = attn.to_k_c(c)
c_value = attn.to_v_c(c)
# apply rope for context and noised input independently
if rope is not None:
freqs, xpos_scale = rope
q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
if c_rope is not None:
freqs, xpos_scale = c_rope
q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
# attention
query = torch.cat([query, c_query], dim=1)
key = torch.cat([key, c_key], dim=1)
value = torch.cat([value, c_value], dim=1)
inner_dim = key.shape[-1]
head_dim = inner_dim // attn.heads
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
# mask. e.g. inference got a batch with different target durations, mask out the padding
if mask is not None:
attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
else:
attn_mask = None
x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
x = x.to(query.dtype)
# Split the attention outputs.
x, c = (
x[:, : residual.shape[1]],
x[:, residual.shape[1] :],
)
# linear proj
x = attn.to_out[0](x)
# dropout
x = attn.to_out[1](x)
if not attn.context_pre_only:
c = attn.to_out_c(c)
if mask is not None:
mask = mask.unsqueeze(-1)
x = x.masked_fill(~mask, 0.0)
# c = c.masked_fill(~mask, 0.) # no mask for c (text)
return x, c
# DiT Block
class DiTBlock(nn.Module):
def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
super().__init__()
self.attn_norm = AdaLayerNormZero(dim)
self.attn = Attention(
processor=AttnProcessor(),
dim=dim,
heads=heads,
dim_head=dim_head,
dropout=dropout,
)
self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
# pre-norm & modulation for attention input
norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
# attention
attn_output = self.attn(x=norm, mask=mask, rope=rope)
# process attention output for input x
x = x + gate_msa.unsqueeze(1) * attn_output
norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
ff_output = self.ff(norm)
x = x + gate_mlp.unsqueeze(1) * ff_output
return x
# MMDiT Block https://arxiv.org/abs/2403.03206
class MMDiTBlock(nn.Module):
r"""
modified from diffusers/src/diffusers/models/attention.py
notes.
_c: context related. text, cond, etc. (left part in sd3 fig2.b)
_x: noised input related. (right part)
context_pre_only: the last layer only does prenorm + modulation since it has no more ffn
"""
def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
super().__init__()
self.context_pre_only = context_pre_only
self.attn_norm_c = AdaLayerNormZero_Final(dim) if context_pre_only else AdaLayerNormZero(dim)
self.attn_norm_x = AdaLayerNormZero(dim)
self.attn = Attention(
processor=JointAttnProcessor(),
dim=dim,
heads=heads,
dim_head=dim_head,
dropout=dropout,
context_dim=dim,
context_pre_only=context_pre_only,
)
if not context_pre_only:
self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
else:
self.ff_norm_c = None
self.ff_c = None
self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
def forward(self, x, c, t, mask=None, rope=None, c_rope=None): # x: noised input, c: context, t: time embedding
# pre-norm & modulation for attention input
if self.context_pre_only:
norm_c = self.attn_norm_c(c, t)
else:
norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)
# attention
x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)
# process attention output for context c
if self.context_pre_only:
c = None
else: # if not last layer
c = c + c_gate_msa.unsqueeze(1) * c_attn_output
norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
c_ff_output = self.ff_c(norm_c)
c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
# process attention output for input x
x = x + x_gate_msa.unsqueeze(1) * x_attn_output
norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
x_ff_output = self.ff_x(norm_x)
x = x + x_gate_mlp.unsqueeze(1) * x_ff_output
return c, x
# time step conditioning embedding
class TimestepEmbedding(nn.Module):
def __init__(self, dim, freq_embed_dim=256):
super().__init__()
self.time_embed = SinusPositionEmbedding(freq_embed_dim)
self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
def forward(self, timestep: float["b"]): # noqa: F821
time_hidden = self.time_embed(timestep)
time_hidden = time_hidden.to(timestep.dtype)
time = self.time_mlp(time_hidden) # b d
return time
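# Hedged shape check: TimestepEmbedding(512)(torch.rand(4)) -> tensor of shape (4, 512)
# (sinusoidal embedding of the scalar timestep followed by a two-layer MLP)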

View File

@ -7,8 +7,7 @@
Recognize all input as Japanese
'''
import logging
import traceback
import traceback,torchaudio,warnings
logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)
@ -17,6 +16,8 @@ logging.getLogger("asyncio").setLevel(logging.ERROR)
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
warnings.simplefilter(action='ignore', category=FutureWarning)
import LangSegment, os, re, sys, json
import pdb
import torch
@ -25,20 +26,17 @@ try:
import gradio.analytics as analytics
analytics.version_check = lambda:None
except:...
version=model_version=os.environ.get("version","v2")
pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth","GPT_SoVITS/pretrained_models/s2Gv3.pth"]
pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"]
version=os.environ.get("version","v2")
pretrained_sovits_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", "GPT_SoVITS/pretrained_models/s2G488k.pth"]
pretrained_gpt_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"]
_ =[[],[]]
for i in range(2):
if os.path.exists(pretrained_gpt_name[i]):
_[0].append(pretrained_gpt_name[i])
if os.path.exists(pretrained_sovits_name[i]):
_[-1].append(pretrained_sovits_name[i])
for i in range(3):
if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i])
if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i])
pretrained_gpt_name,pretrained_sovits_name = _
if os.path.exists(f"./weight.json"):
pass
@ -83,7 +81,7 @@ from feature_extractor import cnhubert
cnhubert.cnhubert_base_path = cnhubert_base_path
from module.models import SynthesizerTrn
from GPT_SoVITS.module.models import SynthesizerTrn,SynthesizerTrnV3
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
@ -184,39 +182,47 @@ if is_half == True:
else:
ssl_model = ssl_model.to(device)
resample_transform_dict={}
def resample(audio_tensor, sr0):
global resample_transform_dict
if sr0 not in resample_transform_dict:
resample_transform_dict[sr0] = torchaudio.transforms.Resample(
sr0, 24000
).to(device)
return resample_transform_dict[sr0](audio_tensor)
def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
global vq_model, hps, version, dict_language
dict_s2 = torch.load(sovits_path, map_location="cpu")
hps = dict_s2["config"]
hps = DictToAttrRecursive(hps)
hps.model.semantic_frame_rate = "25hz"
if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322:
hps.model.version = "v1"
global vq_model, hps, version, model_version, dict_language
'''
Rough checkpoint sizes used to guess the model family (thresholds match the code below):
v1 fine-tuned:  ~82942KB   (threshold  82978KB)
v2 fine-tuned:  ~83014KB   (threshold  100MB)
v1 base:        ~103490KB  (threshold  103520KB)
v2 base:        ~103551KB  (threshold  700MB)
v3:             ~750MB
i.e. the thresholds ~82978K / ~100M / ~103520K / ~700M separate v1 / v2 / v1base / v2base / v3
version:
symbols version and timbre_embedding version
model_version:
sovits is v1/v2 (VITS) or v3 (shortcut CFM DiT)
'''
size=os.path.getsize(sovits_path)
if size<82978*1024:
model_version=version="v1"
elif size<100*1024*1024:
model_version=version="v2"
elif size<103520*1024:
model_version=version="v1"
elif size<700*1024*1024:
model_version = version = "v2"
else:
hps.model.version = "v2"
version = hps.model.version
# print("sovits版本:",hps.model.version)
vq_model = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model
)
if ("pretrained" not in sovits_path):
del vq_model.enc_q
if is_half == True:
vq_model = vq_model.half().to(device)
else:
vq_model = vq_model.to(device)
vq_model.eval()
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
dict_language = dict_language_v1 if version =='v1' else dict_language_v2
with open("./weight.json")as f:
data=f.read()
data=json.loads(data)
data["SoVITS"][version]=sovits_path
with open("./weight.json","w")as f:f.write(json.dumps(data))
version = "v2"
model_version="v3"
if prompt_language is not None and text_language is not None:
if prompt_language in list(dict_language.keys()):
prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language}
@ -228,13 +234,59 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
else:
text_update = {'__type__':'update', 'value':''}
text_language_update = {'__type__':'update', 'value':i18n("中文")}
return {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update
if model_version=="v3":
visible_sample_steps=True
visible_inp_refs=False
else:
visible_sample_steps=False
visible_inp_refs=True
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False}
dict_s2 = torch.load(sovits_path, map_location="cpu")
hps = dict_s2["config"]
hps = DictToAttrRecursive(hps)
hps.model.semantic_frame_rate = "25hz"
if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322:
hps.model.version = "v1"
else:
hps.model.version = "v2"
version=hps.model.version
# print("sovits版本:",hps.model.version)
if model_version!="v3":
vq_model = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model
)
model_version=version
else:
vq_model = SynthesizerTrnV3(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model
)
if ("pretrained" not in sovits_path):
try:
del vq_model.enc_q
except:pass
if is_half == True:
vq_model = vq_model.half().to(device)
else:
vq_model = vq_model.to(device)
vq_model.eval()
print("loading sovits_%s"%model_version,vq_model.load_state_dict(dict_s2["weight"], strict=False))
dict_language = dict_language_v1 if version =='v1' else dict_language_v2
with open("./weight.json")as f:
data=f.read()
data=json.loads(data)
data["SoVITS"][version]=sovits_path
with open("./weight.json","w")as f:f.write(json.dumps(data))
change_sovits_weights(sovits_path)
def change_gpt_weights(gpt_path):
global hz, max_sec, t2s_model, config
hz = 50
@ -247,8 +299,8 @@ def change_gpt_weights(gpt_path):
t2s_model = t2s_model.half()
t2s_model = t2s_model.to(device)
t2s_model.eval()
total = sum([param.nelement() for param in t2s_model.parameters()])
print("Number of parameter: %.2fM" % (total / 1e6))
# total = sum([param.nelement() for param in t2s_model.parameters()])
# print("Number of parameter: %.2fM" % (total / 1e6))
with open("./weight.json")as f:
data=f.read()
data=json.loads(data)
@ -257,6 +309,25 @@ def change_gpt_weights(gpt_path):
change_gpt_weights(gpt_path)
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import torch,soundfile
now_dir = os.getcwd()
import soundfile
def init_bigvgan():
global model
from BigVGAN import bigvgan
model = bigvgan.BigVGAN.from_pretrained("%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False) # if True, RuntimeError: Ninja is required to load C++ extensions
# remove weight norm in the model and set to eval mode
model.remove_weight_norm()
model = model.eval()
if is_half == True:
model = model.half().to(device)
else:
model = model.to(device)
if model_version!="v3":model=None
else:init_bigvgan()
def get_spepc(hps, filename):
@ -371,6 +442,30 @@ def get_phones_and_bert(text,language,version,final=False):
return phones,bert.to(dtype),norm_text
from module.mel_processing import spectrogram_torch,spec_to_mel_torch
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
spec=spectrogram_torch(y,n_fft,sampling_rate,hop_size,win_size,center)
mel=spec_to_mel_torch(spec,n_fft,num_mels,sampling_rate,fmin,fmax)
return mel
mel_fn_args = {
"n_fft": 1024,
"win_size": 1024,
"hop_size": 256,
"num_mels": 100,
"sampling_rate": 24000,
"fmin": 0,
"fmax": None,
"center": False
}
spec_min = -12
spec_max = 2
def norm_spec(x):
return (x - spec_min) / (spec_max - spec_min) * 2 - 1
def denorm_spec(x):
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
mel_fn=lambda x: mel_spectrogram(x, **mel_fn_args)
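A quick, self-contained sanity check of the normalization pair above (a minimal sketch; the constants are copied from the code, and only torch is assumed):

```python
import torch

spec_min, spec_max = -12, 2  # same fixed log-mel range as above

def norm_spec(x):
    return (x - spec_min) / (spec_max - spec_min) * 2 - 1   # [-12, 2] -> [-1, 1]

def denorm_spec(x):
    return (x + 1) / 2 * (spec_max - spec_min) + spec_min    # [-1, 1] -> [-12, 2]

mel = torch.empty(1, 100, 200).uniform_(spec_min, spec_max)  # fake 100-bin mel
assert torch.allclose(denorm_spec(norm_spec(mel)), mel, atol=1e-5)
```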
def merge_short_text_in_array(texts, threshold):
if (len(texts)) < 2:
@ -392,8 +487,7 @@ def merge_short_text_in_array(texts, threshold):
##ref_wav_path+prompt_text+prompt_language+text (single)+text_language+top_k+top_p+temperature
# cache_tokens={} # no cleanup mechanism implemented yet
cache= {}
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free
=False,speed=1,if_freeze=False,inp_refs=None):
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1,if_freeze=False,inp_refs=None,sample_steps=8):
global cache
if ref_wav_path:pass
else:gr.Warning(i18n('请上传参考音频'))
@ -402,6 +496,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
t = []
if prompt_text is None or len(prompt_text) == 0:
ref_free = True
if model_version=="v3":ref_free=False # s2v3 does not support ref_free yet
t0 = ttime()
prompt_language = dict_language[prompt_language]
text_language = dict_language[text_language]
@ -463,6 +558,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
texts = process_text(texts)
texts = merge_short_text_in_array(texts, 5)
audio_opt = []
### s2v3 does not support ref_free yet
if not ref_free:
phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language, version)
@ -504,18 +600,60 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
cache[i_text]=pred_semantic
t3 = ttime()
refers=[]
if(inp_refs):
for path in inp_refs:
try:
refer = get_spepc(hps, path.name).to(dtype).to(device)
refers.append(refer)
except:
traceback.print_exc()
if(len(refers)==0):refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
audio = (vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers,speed=speed).detach().cpu().numpy()[0, 0])
max_audio=np.abs(audio).max()#简单防止16bit爆音
if max_audio>1:audio/=max_audio
### the following logic (and inp_refs) does not apply to v3
if model_version!="v3":
refers=[]
if(inp_refs):
for path in inp_refs:
try:
refer = get_spepc(hps, path.name).to(dtype).to(device)
refers.append(refer)
except:
traceback.print_exc()
if(len(refers)==0):refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
audio = (vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers,speed=speed).detach().cpu().numpy()[0, 0])
else:
refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) ####### resample/slice to 32k here: the source audio is 24k and there is no separate 32k source, so this cannot be split into two paths
phoneme_ids0=torch.LongTensor(phones1).to(device).unsqueeze(0)
phoneme_ids1=torch.LongTensor(phones2).to(device).unsqueeze(0)
fea_ref,ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer)
ref_audio, sr = torchaudio.load(ref_wav_path)
ref_audio=ref_audio.to(device).float()
if (ref_audio.shape[0] == 2):
ref_audio = ref_audio.mean(0).unsqueeze(0)
if sr!=24000:
ref_audio=resample(ref_audio,sr)
mel2 = mel_fn(ref_audio.to(dtype))
mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2])
mel2 = mel2[:, :, :T_min]
fea_ref = fea_ref[:, :, :T_min]
if (T_min > 468):
mel2 = mel2[:, :, -468:]
fea_ref = fea_ref[:, :, -468:]
T_min = 468
chunk_len = 934 - T_min
fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge)
cfm_resss = []
idx = 0
while (1):
fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len]
if (fea_todo_chunk.shape[-1] == 0): break
idx += chunk_len
fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
cfm_res = vq_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
cfm_res = cfm_res[:, :, mel2.shape[2]:]
mel2 = cfm_res[:, :, -468:]
fea_ref = fea_todo_chunk[:, :, -468:]
cfm_resss.append(cfm_res)
cfm_res = torch.cat(cfm_resss, 2)
cfm_res = denorm_spec(cfm_res)
if model==None:init_bigvgan()
with torch.inference_mode():
wav_gen = model(cfm_res)
audio=wav_gen[0][0].cpu().detach().numpy()
max_audio=np.abs(audio).max() # simple guard against 16-bit clipping
if max_audio>1:audio/=max_audio
audio_opt.append(audio)
audio_opt.append(zero_wav)
t4 = ttime()
@ -524,9 +662,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
print("%.3f\t%.3f\t%.3f\t%.3f" %
(t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3]))
)
yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(
np.int16
)
sr=hps.data.sampling_rate if model_version!="v3"else 24000
yield sr, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
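The v3 branch above decodes in a sliding window: the reference mel (`mel2`) and reference features (`fea_ref`) are capped at 468 frames, each CFM call sees roughly 934 frames at most (reference plus new chunk), and after every chunk the tail of the freshly generated mel becomes the new reference. A stripped-down sketch of just that windowing arithmetic, with `cfm_infer` as a hypothetical stand-in for the real `vq_model.cfm.inference` call (the 468/934 constants are copied from the code above):

```python
import torch

MAX_REF = 468   # max reference frames kept as rolling context
WINDOW = 934    # rough per-call frame budget (reference + new chunk)

def cfm_infer(fea, ref_mel):
    """Stand-in for vq_model.cfm.inference: fake mel, one frame per feature frame."""
    b, _, t = fea.shape
    return torch.zeros(b, ref_mel.shape[1], t)

def chunked_decode(fea_ref, mel_ref, fea_todo):
    t_min = min(mel_ref.shape[2], fea_ref.shape[2], MAX_REF)
    mel_ref, fea_ref = mel_ref[:, :, -t_min:], fea_ref[:, :, -t_min:]
    chunk_len = WINDOW - t_min
    outs, idx = [], 0
    while True:
        chunk = fea_todo[:, :, idx:idx + chunk_len]
        if chunk.shape[-1] == 0:
            break
        idx += chunk_len
        fea = torch.cat([fea_ref, chunk], dim=2)
        mel = cfm_infer(fea, mel_ref)[:, :, mel_ref.shape[2]:]   # drop the reference part
        mel_ref, fea_ref = mel[:, :, -MAX_REF:], chunk[:, :, -MAX_REF:]  # roll the context
        outs.append(mel)
    return torch.cat(outs, dim=2)

out = chunked_decode(torch.zeros(1, 512, 300), torch.zeros(1, 100, 300),
                     torch.zeros(1, 512, 2000))
print(out.shape)  # (1, 100, 2000): one mel frame per to-do feature frame
```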
def split(todo_text):
@ -650,8 +787,8 @@ def change_choices():
return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"}
SoVITS_weight_root=["SoVITS_weights_v2","SoVITS_weights"]
GPT_weight_root=["GPT_weights_v2","GPT_weights"]
SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"]
GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"]
for path in SoVITS_weight_root+GPT_weight_root:
os.makedirs(path,exist_ok=True)
@ -696,14 +833,15 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Row():
inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频超过会报错"), type="filepath", scale=13)
with gr.Column(scale=13):
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True,scale=1)
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。"), value=False, interactive=True, show_label=True,scale=1)
gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。")))
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1)
with gr.Column(scale=14):
prompt_language = gr.Dropdown(
label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文"),
)
inp_refs = gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple")
inp_refs = gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple")if model_version!="v3"else gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple",visible=False)
sample_steps = gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),value=32,choices=[4,8,16,32],visible=True)if model_version=="v3"else gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),value=8,choices=[4,8,16,32],visible=False)
gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"),'h3'))
with gr.Row():
with gr.Column(scale=13):
@ -735,10 +873,10 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
inference_button.click(
get_tts_wav,
[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs],
[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps],
[output],
)
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language])
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free])
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
# gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))

View File

@ -9,7 +9,7 @@ import torch.utils.data
from tqdm import tqdm
from module import commons
from module.mel_processing import spectrogram_torch
from module.mel_processing import spectrogram_torch,spec_to_mel_torch
from text import cleaned_text_to_sequence
from utils import load_wav_to_torch, load_filepaths_and_text
import torch.nn.functional as F
@ -170,8 +170,6 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, (
ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir)
return reference_mel, ssl, wav2, mel
class TextAudioSpeakerCollate():
""" Zero-pads model inputs and targets
"""
@ -232,7 +230,232 @@ class TextAudioSpeakerCollate():
text_lengths[i] = text.size(0)
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, hparams, val=False):
exp_dir = hparams.exp_dir
self.path2 = "%s/2-name2text.txt" % exp_dir
self.path4 = "%s/4-cnhubert" % exp_dir
self.path5 = "%s/5-wav32k" % exp_dir
assert os.path.exists(self.path2)
assert os.path.exists(self.path4)
assert os.path.exists(self.path5)
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # strip the .pt suffix
names5 = set(os.listdir(self.path5))
self.phoneme_data = {}
with open(self.path2, "r", encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
for line in lines:
tmp = line.split("\t")
if (len(tmp) != 4):
continue
self.phoneme_data[tmp[0]] = [tmp[1]]
self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5)
tmp = self.audiopaths_sid_text
leng = len(tmp)
min_num = 100
if (leng < min_num):
self.audiopaths_sid_text = []
for _ in range(max(2, int(min_num / leng))):
self.audiopaths_sid_text += tmp
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.val = val
random.seed(1234)
random.shuffle(self.audiopaths_sid_text)
print("phoneme_data_len:", len(self.phoneme_data.keys()))
print("wav_data_len:", len(self.audiopaths_sid_text))
audiopaths_sid_text_new = []
lengths = []
skipped_phone = 0
skipped_dur = 0
for audiopath in tqdm(self.audiopaths_sid_text):
try:
phoneme = self.phoneme_data[audiopath][0]
phoneme = phoneme.split(' ')
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except Exception:
print(f"{audiopath} not in self.phoneme_data !")
skipped_phone += 1
continue
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
duration = size / self.sampling_rate / 2
if duration == 0:
print(f"Zero duration for {audiopath}, skipping...")
skipped_dur += 1
continue
if 54 > duration > 0.6 or self.val:
audiopaths_sid_text_new.append([audiopath, phoneme_ids])
lengths.append(size // (2 * self.hop_length))
else:
skipped_dur += 1
continue
print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur)
print("total left: ", len(audiopaths_sid_text_new))
assert len(audiopaths_sid_text_new) > 1 # must at least fill a batch; TODO here
self.audiopaths_sid_text = audiopaths_sid_text_new
self.lengths = lengths
self.spec_min=-12
self.spec_max=2
self.filter_length_mel=self.win_length_mel=1024
self.hop_length_mel=256
self.n_mel_channels=100
self.sampling_rate_mel=24000
self.mel_fmin=0
self.mel_fmax=None
def norm_spec(self, x):
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
def get_audio_text_speaker_pair(self, audiopath_sid_text):
audiopath, phoneme_ids = audiopath_sid_text
text = torch.FloatTensor(phoneme_ids)
try:
spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
if (ssl.shape[-1] != spec.shape[-1]):
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
except:
traceback.print_exc()
mel = torch.zeros(100, 180)
# wav = torch.zeros(1, 96 * self.hop_length)
spec = torch.zeros(1025, 96)
ssl = torch.zeros(1, 768, 96)
text = text[-1:]
print("load audio or ssl error!!!!!!", audiopath)
return (ssl, spec, mel, text)
def get_audio(self, filename):
audio_array = load_audio(filename,self.sampling_rate) # load_audio already normalizes to [-1, 1]; no need to divide by 32768 again
audio=torch.FloatTensor(audio_array)#/32768
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
audio_array24 = load_audio(filename,24000) # load_audio already normalizes to [-1, 1]; no need to divide by 32768 again ###### could be sped up with GPU resampling
audio24=torch.FloatTensor(audio_array24)#/32768
audio_norm24 = audio24
audio_norm24 = audio_norm24.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False)
mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax)
mel = torch.squeeze(mel, 0)
mel=self.norm_spec(mel)
# print(1111111,spec.shape,mel.shape)
return spec, mel
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
def __getitem__(self, index):
# with torch.no_grad():
return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
def __len__(self):
return len(self.audiopaths_sid_text)
class TextAudioSpeakerCollateV3():
""" Zero-pads model inputs and targets
"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
def __call__(self, batch):
"""Collate's training batch from normalized text, audio and speaker identities
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
#ssl, spec, wav,mel, text
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
#(ssl, spec,mel, text)
max_ssl_len = max([x[0].size(2) for x in batch])
max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
# max_ssl_len = int(8 * ((max_ssl_len // 8) + 1))
# max_ssl_len1=max_ssl_len
max_spec_len = max([x[1].size(1) for x in batch])
max_spec_len = int(2 * ((max_spec_len // 2) + 1))
# max_wav_len = max([x[2].size(1) for x in batch])
max_text_len = max([x[3].size(0) for x in batch])
max_mel_len=int(max_ssl_len1*1.25*1.5) ### 24000/256 vs 32000/640 (=16000/320); see the frame-rate note after this class
ssl_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
text_lengths = torch.LongTensor(len(batch))
# wav_lengths = torch.LongTensor(len(batch))
mel_lengths = torch.LongTensor(len(batch))
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_mel_len)
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
text_padded = torch.LongTensor(len(batch), max_text_len)
# wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
spec_padded.zero_()
mel_padded.zero_()
ssl_padded.zero_()
text_padded.zero_()
# wav_padded.zero_()
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
# ssl, spec, wav,mel, text
ssl = row[0]
ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
ssl_lengths[i] = ssl.size(2)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
# wav = row[2]
# wav_padded[i, :, :wav.size(1)] = wav
# wav_lengths[i] = wav.size(1)
mel = row[2]
mel_padded[i, :, :mel.size(1)] = mel
mel_lengths[i] = mel.size(1)
text = row[3]
text_padded[i, :text.size(0)] = text
text_lengths[i] = text.size(0)
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths
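A note on the `max_mel_len = int(max_ssl_len1 * 1.25 * 1.5)` factor above: 1.25 × 1.5 = 1.875 appears to be the ratio between the 24 kHz mel frame rate (hop 256) and the 50 Hz spectrogram/SSL frame rate hinted at by the inline comment, and it matches the `F.interpolate(..., scale_factor=1.875)` calls in `SynthesizerTrnV3`:

```latex
\frac{f_{\text{mel}}}{f_{\text{spec}}}
  = \frac{24000/256}{32000/640}
  = \frac{93.75\,\text{Hz}}{50\,\text{Hz}}
  = 1.875
```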
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""

View File

@ -12,7 +12,7 @@ from torch.nn import functional as F
from module import commons
from module import modules
from module import attentions
from f5_tts.model import DiT
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from module.commons import init_weights, get_padding
@ -22,7 +22,7 @@ from module.quantize import ResidualVectorQuantizer
from text import symbols as symbols_v1
from text import symbols2 as symbols_v2
from torch.cuda.amp import autocast
import contextlib
import contextlib,random
class StochasticDurationPredictor(nn.Module):
@ -243,7 +243,7 @@ class TextEncoder(nn.Module):
)
y = self.ssl_proj(y * y_mask) * y_mask
y = self.encoder_ssl(y * y_mask, y_mask)
text_mask = torch.unsqueeze(
@ -371,6 +371,37 @@ class PosteriorEncoder(nn.Module):
return z, m, logs, x_mask
class Encoder(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
def forward(self, x, x_lengths, g=None):
if(g!=None):
g = g.detach()
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
return stats, x_mask
class WNEncoder(nn.Module):
def __init__(
self,
@ -1028,3 +1059,218 @@ class SynthesizerTrn(nn.Module):
ssl = self.ssl_proj(x)
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
return codes.transpose(0, 1)
class CFM(torch.nn.Module):
def __init__(
self,
in_channels,dit
):
super().__init__()
self.sigma_min = 1e-6
self.estimator = dit
self.in_channels = in_channels
self.criterion = torch.nn.MSELoss()
@torch.inference_mode()
def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, inference_cfg_rate=0):
"""Forward diffusion"""
B, T = mu.size(0), mu.size(1)
x = torch.randn([B, self.in_channels, T], device=mu.device,dtype=mu.dtype) * temperature
prompt_len = prompt.size(-1)
prompt_x = torch.zeros_like(x,dtype=mu.dtype)
prompt_x[..., :prompt_len] = prompt[..., :prompt_len]
x[..., :prompt_len] = 0
mu=mu.transpose(2,1)
t = 0
d = 1 / n_timesteps
for j in range(n_timesteps):
t_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * t
d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d
# v_pred = model(x, t_tensor, d_tensor, **extra_args)
v_pred = self.estimator(x, prompt_x, x_lens, t_tensor,d_tensor, mu,drop_audio_cond=False,drop_text=False).transpose(2, 1)
if inference_cfg_rate>1e-5:
neg = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu, drop_audio_cond=True, drop_text=True).transpose(2, 1)
v_pred=v_pred+(v_pred-neg)*inference_cfg_rate
x = x + d * v_pred
t = t + d
x[:, :, :prompt_len] = 0
return x
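For reference, the Euler update implemented by the loop above, with classifier-free guidance mixed in when `inference_cfg_rate` (written λ below) is non-negligible; v_θ(·|c) is the DiT estimate with conditioning and v_θ(·|∅) the estimate with audio and text dropped (the prompt region of x is re-zeroed after integration):

```latex
x_{t+d} = x_t + d\left[\, v_\theta(x_t, t \mid c)
  + \lambda\,\big(v_\theta(x_t, t \mid c) - v_\theta(x_t, t \mid \varnothing)\big) \right],
\qquad d = \frac{1}{n_{\text{timesteps}}}
```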
def forward(self, x1, x_lens, prompt_lens, mu):
b, _, t = x1.shape
# random timestep
t = torch.rand([b], device=mu.device, dtype=x1.dtype)
x0 = torch.randn_like(x1,device=mu.device)
vt = x1 - x0
xt = x0 + t[:, None, None] * vt
dt = torch.zeros_like(t,device=mu.device)
prompt = torch.zeros_like(x1)
for bib in range(b):
prompt[bib, :, :prompt_lens[bib]] = x1[bib, :, :prompt_lens[bib]]
xt[bib, :, :prompt_lens[bib]] = 0
gailv=0.3 # "gailv" = probability of taking the shortcut (self-consistency) branch # if ttime()>1736250488 else 0.1
if random.random() < gailv:
base = torch.randint(2, 8, (t.shape[0],), device=mu.device)
d = 1/torch.pow(2, base)
d_input = d.clone()
d_input[d_input < 1e-2] = 0
# with torch.no_grad():
v_pred_1 = self.estimator(xt, prompt, x_lens, t, d_input, mu).transpose(2, 1).detach()
# v_pred_1 = self.diffusion(xt, t, d_input, cond=conditioning).detach()
x_mid = xt + d[:, None, None] * v_pred_1
# v_pred_2 = self.diffusion(x_mid, t+d, d_input, cond=conditioning).detach()
v_pred_2 = self.estimator(x_mid, prompt, x_lens, t+d, d_input, mu).transpose(2, 1).detach()
vt = (v_pred_1 + v_pred_2) / 2
vt = vt.detach()
dt = 2*d
vt_pred = self.estimator(xt, prompt, x_lens, t,dt, mu).transpose(2,1)
loss = 0
# print(45555555,estimator_out.shape,u.shape,x_lens,prompt_lens)#45555555 torch.Size([7, 465, 100]) torch.Size([7, 100, 465]) tensor([461, 461, 451, 451, 442, 442, 442], device='cuda:0') tensor([ 96, 93, 185, 59, 244, 262, 294], device='cuda:0')
for bib in range(b):
loss += self.criterion(vt_pred[bib, :, prompt_lens[bib]:x_lens[bib]], vt[bib, :, prompt_lens[bib]:x_lens[bib]])
loss /= b
return loss#, estimator_out + (1 - self.sigma_min) * z
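The `forward` pass above mixes plain conditional flow matching (target velocity `x1 - x0`, `dt = 0`) with a shortcut-style self-consistency target: with probability ~0.3 it samples a dyadic step size `d = 2^-k`, queries the estimator at `t` and `t + d`, and regresses the average of the two predicted velocities as the target for a step of size `2d`. A toy sketch of just that target construction, with a hypothetical stand-in `estimator` (the real one is the DiT above; the `d < 1e-2` clamp and prompt masking are omitted for brevity):

```python
import torch
import torch.nn.functional as F

def estimator(xt, t, d):
    """Stand-in velocity predictor; the real model also sees the prompt/condition."""
    return torch.zeros_like(xt)

b, c, frames = 4, 100, 128
x1 = torch.randn(b, c, frames)                 # target mel
x0 = torch.randn_like(x1)                      # noise
t = torch.rand(b)
xt = x0 + t[:, None, None] * (x1 - x0)         # linear interpolation path

if torch.rand(()) < 0.3:                       # shortcut / self-consistency branch
    base = torch.randint(2, 8, (b,))
    d = 1.0 / torch.pow(2, base)               # dyadic sub-step sizes
    v1 = estimator(xt, t, d).detach()
    x_mid = xt + d[:, None, None] * v1         # half step forward
    v2 = estimator(x_mid, t + d, d).detach()
    vt_target = (v1 + v2) / 2                  # target for a step of size 2d
    dt = 2 * d
else:                                          # plain flow-matching branch
    vt_target = x1 - x0
    dt = torch.zeros(b)

v_pred = estimator(xt, t, dt)
loss = F.mse_loss(v_pred, vt_target)
```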
class SynthesizerTrnV3(nn.Module):
"""
Synthesizer for Training
"""
def __init__(self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_speakers=0,
gin_channels=0,
use_sdp=True,
semantic_frame_rate=None,
freeze_quantizer=None,
**kwargs):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.n_speakers = n_speakers
self.gin_channels = gin_channels
self.model_dim=512
self.use_sdp = use_sdp
self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout)
# self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback
# self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
# upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
# self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
# gin_channels=gin_channels)
# self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
ssl_dim = 768
assert semantic_frame_rate in ['25hz', "50hz"]
self.semantic_frame_rate = semantic_frame_rate
if semantic_frame_rate == '25hz':
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
else:
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
self.quantizer = ResidualVectorQuantizer(
dimension=ssl_dim,
n_q=1,
bins=1024
)
self.freeze_quantizer=freeze_quantizer
inter_channels2=512
self.bridge=nn.Sequential(
nn.Conv1d(inter_channels, inter_channels2, 1, stride=1),
nn.LeakyReLU()
)
self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels)
self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1)
self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim
def forward(self, ssl, y, mel,ssl_lengths,y_lengths, text, text_lengths,mel_lengths):#ssl_lengths no need now
with autocast(enabled=False):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
ge = self.ref_enc(y[:,:704] * y_mask, y_mask)
maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
with maybe_no_grad:
if self.freeze_quantizer:
self.ssl_proj.eval()#
self.quantizer.eval()
ssl = self.ssl_proj(ssl)
quantized, codes, commit_loss, quantized_list = self.quantizer(
ssl, layers=[0]
)
with maybe_no_grad:
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
fea=self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT
fea, y_mask_ = self.wns1(fea, mel_lengths, ge)##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate.
B=ssl.shape[0]
prompt_len_max = mel_lengths*2/3
prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long)
minn=min(mel.shape[-1],fea.shape[-1])
mel=mel[:,:,:minn]
fea=fea[:,:,:minn]
cfm_loss= self.cfm(mel, mel_lengths, prompt_len, fea)
return cfm_loss
@torch.no_grad()
def decode_encp(self, codes,text, refer,ge=None):
# print(2333333,refer.shape)
# ge=None
if(ge==None):
refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype)
ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask)
y_lengths = torch.LongTensor([int(codes.size(2)*2)]).to(codes.device)
y_lengths1 = torch.LongTensor([int(codes.size(2)*2.5*1.5)]).to(codes.device)
text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
quantized = self.quantizer.decode(codes)
if self.semantic_frame_rate == '25hz':
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
fea=self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT
#### more WN parameters to learn the mel
fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
return fea,ge
def extract_latent(self, x):
ssl = self.ssl_proj(x)
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
return codes.transpose(0,1)

View File

@ -118,6 +118,7 @@ def main(args):
)
logger = TensorBoardLogger(name=output_dir.stem, save_dir=output_dir)
os.environ["MASTER_ADDR"]="localhost"
os.environ["USE_LIBUV"] = "0"
trainer: Trainer = Trainer(
max_epochs=config["train"]["epochs"],
accelerator="gpu" if torch.cuda.is_available() else "cpu",

View File

@ -75,7 +75,7 @@ def run(rank, n_gpus, hps):
dist.init_process_group(
backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
init_method="env://",
init_method="env://?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
@ -193,7 +193,7 @@ def run(rank, n_gpus, hps):
try: # 如果能加载自动resume
_, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "D_*.pth"),
utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_*.pth"),
net_d,
optim_d,
) # D多半加载没事
@ -201,7 +201,7 @@ def run(rank, n_gpus, hps):
logger.info("loaded D")
# _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
_, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "G_*.pth"),
utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_*.pth"),
net_g,
optim_g,
)
@ -455,7 +455,7 @@ def train_and_evaluate(
hps.train.learning_rate,
epoch,
os.path.join(
"%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(global_step)
"%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(global_step)
),
)
utils.save_checkpoint(
@ -464,7 +464,7 @@ def train_and_evaluate(
hps.train.learning_rate,
epoch,
os.path.join(
"%s/logs_s2" % hps.data.exp_dir, "D_{}.pth".format(global_step)
"%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(global_step)
),
)
else:
@ -474,7 +474,7 @@ def train_and_evaluate(
hps.train.learning_rate,
epoch,
os.path.join(
"%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(233333333333)
"%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(233333333333)
),
)
utils.save_checkpoint(
@ -483,7 +483,7 @@ def train_and_evaluate(
hps.train.learning_rate,
epoch,
os.path.join(
"%s/logs_s2" % hps.data.exp_dir, "D_{}.pth".format(233333333333)
"%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(233333333333)
),
)
if rank == 0 and hps.train.if_save_every_weights == True:

413
GPT_SoVITS/s2_train_v3.py Normal file
View File

@ -0,0 +1,413 @@
import warnings
warnings.filterwarnings("ignore")
import utils, os
hps = utils.get_hparams(stage=2)
os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",")
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.multiprocessing as mp
import torch.distributed as dist, traceback
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import logging, traceback
logging.getLogger("matplotlib").setLevel(logging.INFO)
logging.getLogger("h5py").setLevel(logging.INFO)
logging.getLogger("numba").setLevel(logging.INFO)
from random import randint
from module import commons
from module.data_utils import (
TextAudioSpeakerLoaderV3 as TextAudioSpeakerLoader,
TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate,
DistributedBucketSampler,
)
from module.models import (
SynthesizerTrnV3 as SynthesizerTrn,
MultiPeriodDiscriminator,
)
from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss
from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
from process_ckpt import savee
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = False
### fp32 is faster on A100 anyway, so let's try tf32
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision("medium") # lowest precision but fastest (only marginally); has no effect on results
# from config import pretrained_s2G,pretrained_s2D
global_step = 0
device = "cpu" # non-CUDA devices (e.g. MPS) will be added once optimized
def main():
if torch.cuda.is_available():
n_gpus = torch.cuda.device_count()
else:
n_gpus = 1
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
mp.spawn(
run,
nprocs=n_gpus,
args=(
n_gpus,
hps,
),
)
def run(rank, n_gpus, hps):
global global_step
if rank == 0:
logger = utils.get_logger(hps.data.exp_dir)
logger.info(hps)
# utils.check_git_hash(hps.s2_ckpt_dir)
writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
dist.init_process_group(
backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
init_method="env://?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
torch.manual_seed(hps.train.seed)
if torch.cuda.is_available():
torch.cuda.set_device(rank)
train_dataset = TextAudioSpeakerLoader(hps.data) ########
train_sampler = DistributedBucketSampler(
train_dataset,
hps.train.batch_size,
[
32,
300,
400,
500,
600,
700,
800,
900,
1000,
# 1100,
# 1200,
# 1300,
# 1400,
# 1500,
# 1600,
# 1700,
# 1800,
# 1900,
],
num_replicas=n_gpus,
rank=rank,
shuffle=True,
)
collate_fn = TextAudioSpeakerCollate()
train_loader = DataLoader(
train_dataset,
num_workers=6,
shuffle=False,
pin_memory=True,
collate_fn=collate_fn,
batch_sampler=train_sampler,
persistent_workers=True,
prefetch_factor=4,
)
# if rank == 0:
# eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, val=True)
# eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False,
# batch_size=1, pin_memory=True,
# drop_last=False, collate_fn=collate_fn)
net_g = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model,
).cuda(rank) if torch.cuda.is_available() else SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model,
).to(device)
# net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device)
# for name, param in net_g.named_parameters():
# if not param.requires_grad:
# print(name, "not requires_grad")
optim_g = torch.optim.AdamW(
filter(lambda p: p.requires_grad, net_g.parameters()), ### same lr for all layers by default
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps,
)
# optim_d = torch.optim.AdamW(
# net_d.parameters(),
# hps.train.learning_rate,
# betas=hps.train.betas,
# eps=hps.train.eps,
# )
if torch.cuda.is_available():
net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
else:
net_g = net_g.to(device)
# net_d = net_d.to(device)
try: # auto-resume if a checkpoint can be loaded
# _, _, _, epoch_str = utils.load_checkpoint(
# utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_*.pth"),
# net_d,
# optim_d,
# ) # D多半加载没事
# if rank == 0:
# logger.info("loaded D")
# _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
_, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_*.pth"),
net_g,
optim_g,
)
global_step = (epoch_str - 1) * len(train_loader)
# epoch_str = 1
# global_step = 0
except: # on first run, when no checkpoint can be loaded, load the pretrained model instead
# traceback.print_exc()
epoch_str = 1
global_step = 0
if hps.train.pretrained_s2G != "" and hps.train.pretrained_s2G != None and os.path.exists(hps.train.pretrained_s2G):
if rank == 0:
logger.info("loaded pretrained %s" % hps.train.pretrained_s2G)
print(
net_g.module.load_state_dict(
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
strict=False,
) if torch.cuda.is_available() else net_g.load_state_dict(
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
strict=False,
)
) ## testing: do not load the optimizer
# if hps.train.pretrained_s2D != ""and hps.train.pretrained_s2D != None and os.path.exists(hps.train.pretrained_s2D):
# if rank == 0:
# logger.info("loaded pretrained %s" % hps.train.pretrained_s2D)
# print(
# net_d.module.load_state_dict(
# torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"]
# ) if torch.cuda.is_available() else net_d.load_state_dict(
# torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"]
# )
# )
# scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
# scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=hps.train.lr_decay, last_epoch=-1
)
# scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
# optim_d, gamma=hps.train.lr_decay, last_epoch=-1
# )
for _ in range(epoch_str):
scheduler_g.step()
# scheduler_d.step()
scaler = GradScaler(enabled=hps.train.fp16_run)
net_d=optim_d=scheduler_d=None
for epoch in range(epoch_str, hps.train.epochs + 1):
if rank == 0:
train_and_evaluate(
rank,
epoch,
hps,
[net_g, net_d],
[optim_g, optim_d],
[scheduler_g, scheduler_d],
scaler,
# [train_loader, eval_loader], logger, [writer, writer_eval])
[train_loader, None],
logger,
[writer, writer_eval],
)
else:
train_and_evaluate(
rank,
epoch,
hps,
[net_g, net_d],
[optim_g, optim_d],
[scheduler_g, scheduler_d],
scaler,
[train_loader, None],
None,
None,
)
scheduler_g.step()
# scheduler_d.step()
def train_and_evaluate(
rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers
):
net_g, net_d = nets
optim_g, optim_d = optims
# scheduler_g, scheduler_d = schedulers
train_loader, eval_loader = loaders
if writers is not None:
writer, writer_eval = writers
train_loader.batch_sampler.set_epoch(epoch)
global global_step
net_g.train()
# net_d.train()
# for batch_idx, (
# ssl,
# ssl_lengths,
# spec,
# spec_lengths,
# y,
# y_lengths,
# text,
# text_lengths,
# ) in enumerate(tqdm(train_loader)):
for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate(tqdm(train_loader)):
if torch.cuda.is_available():
spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(
rank, non_blocking=True
)
mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(
rank, non_blocking=True
)
ssl = ssl.cuda(rank, non_blocking=True)
ssl.requires_grad = False
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda(
rank, non_blocking=True
)
else:
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
mel, mel_lengths = mel.to(device), mel_lengths.to(device)
ssl = ssl.to(device)
ssl.requires_grad = False
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
text, text_lengths = text.to(device), text_lengths.to(device)
with autocast(enabled=hps.train.fp16_run):
cfm_loss = net_g(ssl, spec, mel,ssl_lengths,spec_lengths, text, text_lengths,mel_lengths)
loss_gen_all=cfm_loss
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]['lr']
# losses = [commit_loss,cfm_loss,mel_loss,loss_disc, loss_gen, loss_fm, loss_mel, loss_kl]
losses = [cfm_loss]
logger.info('Train Epoch: {} [{:.0f}%]'.format(
epoch,
100. * batch_idx / len(train_loader)))
logger.info([x.item() for x in losses] + [global_step, lr])
scalar_dict = {"loss/g/total": loss_gen_all, "learning_rate": lr, "grad_norm_g": grad_norm_g}
# image_dict = {
# "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
# "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
# "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
# "all/stats_ssl": utils.plot_spectrogram_to_numpy(stats_ssl[0].data.cpu().numpy()),
# }
utils.summarize(
writer=writer,
global_step=global_step,
# images=image_dict,
scalars=scalar_dict)
# if global_step % hps.train.eval_interval == 0:
# # evaluate(hps, net_g, eval_loader, writer_eval)
# utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch,os.path.join(hps.s2_ckpt_dir, "G_{}.pth".format(global_step)),scaler)
# # utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch,os.path.join(hps.s2_ckpt_dir, "D_{}.pth".format(global_step)),scaler)
# # keep_ckpts = getattr(hps.train, 'keep_ckpts', 3)
# # if keep_ckpts > 0:
# # utils.clean_checkpoints(path_to_models=hps.s2_ckpt_dir, n_ckpts_to_keep=keep_ckpts, sort_by_time=True)
global_step += 1
if epoch % hps.train.save_every_epoch == 0 and rank == 0:
if hps.train.if_save_latest == 0:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(
"%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(global_step)
),
)
# utils.save_checkpoint(
# net_d,
# optim_d,
# hps.train.learning_rate,
# epoch,
# os.path.join(
# "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(global_step)
# ),
# )
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(
"%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(233333333333)
),
)
# utils.save_checkpoint(
# net_d,
# optim_d,
# hps.train.learning_rate,
# epoch,
# os.path.join(
# "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(233333333333)
# ),
# )
if rank == 0 and hps.train.if_save_every_weights == True:
if hasattr(net_g, "module"):
ckpt = net_g.module.state_dict()
else:
ckpt = net_g.state_dict()
logger.info(
"saving ckpt %s_e%s:%s"
% (
hps.name,
epoch,
savee(
ckpt,
hps.name + "_e%s_s%s" % (epoch, global_step),
epoch,
global_step,
hps,
),
)
)
if rank == 0:
logger.info("====> Epoch: {}".format(epoch))
if __name__ == "__main__":
main()

View File

@ -3,8 +3,8 @@
import sys
import re
import cn2an
import ToJyutping
from pyjyutping import jyutping
from text.symbols import punctuation
from text.zh_normalization.text_normlization import TextNormalizer
@ -173,12 +173,24 @@ def jyuping_to_initials_finals_tones(jyuping_syllables):
def get_jyutping(text):
jp = jyutping.convert(text)
# print(1111111,jp)
for symbol in punctuation:
jp = jp.replace(symbol, " " + symbol + " ")
jp_array = jp.split()
return jp_array
jyutping_array = []
punct_pattern = re.compile(r"^[{}]+$".format(re.escape("".join(punctuation))))
syllables = ToJyutping.get_jyutping_list(text)
for word, syllable in syllables:
if punct_pattern.match(word):
puncts = re.split(r"([{}])".format(re.escape("".join(punctuation))), word)
for punct in puncts:
if len(punct) > 0:
jyutping_array.append(punct)
else:
# match multiple jyutping, e.g. "liu4 ge3", or a single jyutping, e.g. "liu4"
if not re.search(r"^([a-z]+[1-6]+[ ]?)+$", syllable):
raise ValueError(f"Failed to convert {word} to jyutping: {syllable}")
jyutping_array.append(syllable)
return jyutping_array
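The rewritten `get_jyutping` switches from pyjyutping to ToJyutping's `get_jyutping_list`, which yields (character, romanization) pairs; punctuation comes back as its own token and anything that fails the syllable pattern raises. A rough usage sketch of that call (the printed romanizations are illustrative, not verified output):

```python
import ToJyutping

# Each entry is a (character, jyutping) pair, e.g. roughly 你 -> nei5, 好 -> hou2, 嗎 -> maa1.
for word, syllable in ToJyutping.get_jyutping_list("你好嗎"):
    print(word, syllable)
```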
def get_bert_feature(text, word2ph):

View File

@ -269,12 +269,12 @@ Use v2 from v1 environment:
- [ ] **Features:**
- [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min).
- [x] TTS speaking speed control.
- [ ] ~~Enhanced TTS emotion control.~~
- [ ] ~~Enhanced TTS emotion control.~~ Consider using pretrained GPT models fine-tuned on emotion presets for better emotion control instead.
- [ ] Experiment with changing SoVITS token inputs to probability distribution of GPT vocabs (transformer latent).
- [x] Improve English and Japanese text frontend.
- [ ] Develop tiny and larger-sized TTS models.
- [x] Colab scripts.
- [ ] Try expand training dataset (2k hours -> 10k hours).
- [x] Try expanding the training dataset (2k hours -> 10k hours).
- [x] Better SoVITS base model (enhanced audio quality).
- [ ] model mix
@ -321,9 +321,12 @@ Special thanks to the following projects and contributors:
- [contentvec](https://github.com/auspicious3000/contentvec/)
- [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### Pretrained Models
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### Text Frontend for Inference
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [LangSegment](https://github.com/juntaosun/LangSegment)

View File

@ -160,9 +160,9 @@
1-修复无参考文本模式问题
2-优化中英文文本前端
3-api格式优化
4-cmd格式问题修复
5-训练数据处理阶段不支持的语言提示报错
@ -235,13 +235,15 @@
### 20240821
1-fast_inference分支合并进mainhttps://github.com/RVC-Boss/GPT-SoVITS/pull/1490
2-支持通过ssml标签优化数字、电话、时间日期等https://github.com/RVC-Boss/GPT-SoVITS/issues/1508
3-api修复优化https://github.com/RVC-Boss/GPT-SoVITS/pull/1503
4-修复了参考音频混合只能上传一条的bug:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
5-增加了各种数据集检查,若缺失会弹出warning:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
### 20250211
1-增加gpt-sovits-v3模型

View File

@ -130,7 +130,7 @@
#### Major Fixes:
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) fixed the issue of WebUI's GPT fine-tuning not reading BERT feature of Chinese input texts, causing inconsistency with inference and potential quality degradation.
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) fixed the issue of WebUI's GPT fine-tuning not reading BERT feature of Chinese input texts, causing inconsistency with inference and potential quality degradation.
**Caution: If you have previously fine-tuned with a large amount of data, it is recommended to retune the model to improve quality.**
### 20240706 Update
@ -145,8 +145,8 @@
#### Major Fixes:
6. The accelerated inference code from [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) has been validated and merged into the main branch, ensuring consistent inference effects with the base.
It also supports accelerated inference in no-reference text mode.
6. The accelerated inference code from [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) has been validated and merged into the main branch, ensuring consistent inference effects with the base.
It also supports accelerated inference in no-reference text mode.
**Future updates will continue to verify the consistency of changes in the `fast_inference` branch**.
@ -160,7 +160,7 @@
#### Major Fixes:
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) supported speech rate adjustment for synthesis.
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) supported speech rate adjustment for synthesis.
Enabled freezing randomness while only adjusting the speech rate.
### 20240806 Update
@ -176,3 +176,14 @@
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) added support for Cantonese ASR.
6. Added support for GPT-SoVITS v2.
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) optimized timing logic.
### 20240821 Update
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) Merge the `fast_inference` branch into the main branch.
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) Support for optimizing numbers, phone numbers, dates, and times using SSML tags.
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) Fixed and optimized API.
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Fixed the bug where only one reference audio could be uploaded for mixing; added various dataset checks, with warnings popping up if files are missing.
### 20250211 Update
1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) Added GPT-SoVITS v3 Model.

View File

@ -175,3 +175,14 @@
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 広東語ASRのサポートを追加しました。
6. GPT-SoVITS v2 のサポートを追加しました。
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) タイミングロジックを最適化しました。
### 20240821 更新
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` ブランチをメインブランチにマージしました。
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSMLタグを使用して数字、電話番号、日付、時間などの最適化をサポートしました。
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) APIの修正と最適化を行いました。
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 参照音声のミキシングで1つしかアップロードできないバグを修正し、データセットの各種チェックを追加してファイルが欠落している場合に警告を表示するようにしました。
### 20250211 更新
1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 モデルを追加しました。

View File

@ -1,7 +1,7 @@
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
パワフルな数発音声変換・音声合成 WebUI。<br><br>
パワフルなFew-Shot音声変換・音声合成 WebUI。<br><br>
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS)
@ -20,17 +20,17 @@
## 機能:
1. **ゼロショット TTS:** 5 秒間のボーカルサンプルを入力すると、即座にテキストから音声に変換されます。
1. **Zero-Shot TTS:** たった5秒間の音声サンプルで、即座にテキストからその音声に変換できます。
2. **数ショット TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声の類似性とリアリズムを向上。
2. **Few-Shot TTS:** わずか1分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
3. **多言語サポート:** 現在、英語、日本語、韓語、粵語、中国語をサポートしています。
3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。
4. **WebUI ツール:** 統合されたツールは、音声伴奏の分離、トレーニングセットの自動セグメンテーション、中国語 ASR、テキストラベリングが含まれ、初心者がトレーニングデータセットと GPT/SoVITS モデルを作成するのを支援します。
4. **WebUI ツール:** 統合されたツールは、音声伴奏BGM等の分離、トレーニングセットの自動セグメンテーション、ASR中国語のみ、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成やGPT/SoVITSモデルのトレーニング等を非常に簡単に行えます。
**[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!**
未見の話者数ショット微調整デモ:
声の事前学習無しかつFew-Shotでトレーニングされたモデルのデモ:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
@ -49,7 +49,7 @@ _注記: numba==0.56.4 は py<3.11 が必要です_
### Windows
Windows ユーザーの方へWindows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します。
Windows ユーザー:Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します。
### Linux
@ -61,7 +61,7 @@ bash install.sh
### macOS
**注MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練します。**
**注MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練することを強く推奨します。**
1. `xcode-select --install` を実行して、Xcodeコマンドラインツールをインストールします。
2. `brew install ffmpeg` を実行してFFmpegをインストールします。
@ -94,7 +94,7 @@ conda install -c conda-forge 'ffmpeg<7'
##### Windows ユーザー
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートディレクトリに置きます。
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
##### MacOS ユーザー
```bash
@ -111,13 +111,13 @@ pip install -r requirementx.txt
#### docker-compose.yaml の設定
0. イメージのタグについて:コードベースの更新が速、イメージのパッケージングとテストが遅いため、[Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) で現在パッケージされている最新のイメージをご覧になり、ご自身の状況に応じて選択するか、またはご自身のニーズに応じて Dockerfile を使用してローカルで構築してください。
0. イメージのタグについて:コードベースの更新が速い割に、イメージのパッケージングとテストが遅いため、[Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) で現在パッケージされている最新のイメージをご覧になり、ご自身の状況に応じて選択するか、またはご自身のニーズに応じて Dockerfile を使用してローカルでビルドしてください。
1. 環境変数:
- `is_half`:半精度/倍精度の制御。"SSL 抽出"ステップ中に`4-cnhubert/5-wav32k`ディレクトリ内の内容が正しく生成されない場合、通常これが原因です。実際の状況に応じて True または False に調整してください。
2. ボリューム設定:コンテナ内のアプリケーションのルートディレクトリは`/workspace`に設定されます。デフォルトの`docker-compose.yaml`には、アップロード/ダウンロードの内容の実例がいくつか記載されています。
3. `shm_size`Windows の Docker Desktop のデフォルトの利用可能メモリが小さすぎるため、異常な動作を引き起こす可能性があります。状況に応じて適宜設定してください。
3. `shm_size`Windows の Docker Desktop のデフォルトの利用可能メモリは小さすぎるため、うまく動作しない可能性があります。状況に応じて適宜設定してください。
4. `deploy`セクションの GPU に関連する内容は、システムと実際の状況に応じて慎重に設定してください。
#### docker compose で実行する
@ -140,11 +140,11 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
2. [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。中国語TTSのみ
3. UVR5ボーカル/伴奏分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
3. UVR5ボーカル/伴奏BGM等分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
4. 中国語ASR追加機能の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
5. 英語または日本語のASR追加機能場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より少ないディスク容量で同様の効果を持つ可能性があります。
5. 英語または日本語のASR追加機能を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
## データセット形式
@ -185,11 +185,11 @@ V1に切り替えたい場合は
```bash
python webui.py v1 <言語(オプション)>
```
またはWebUIで手動でバージョンを切り替えます
またはWebUIで手動でバージョンを切り替えてください
### 微調整
#### パス自動補完がサポートされました
#### パス自動補完のサポート
1.音声パスを入力する

View File

@ -175,4 +175,15 @@
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 인식 로직을 최적화했습니다.
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 광동어 ASR 지원을 추가했습니다.
6. GPT-SoVITS v2 지원을 추가했습니다.
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) 타이밍 로직을 최적화했습니다.
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) 타이밍 로직을 최적화했습니다.
### 20240821 업데이트
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` 브랜치를 메인 브랜치에 병합.
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML 태그를 사용하여 숫자, 전화번호, 날짜 및 시간 최적화 지원.
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API 수정 및 최적화.
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 믹싱을 위한 참조 오디오를 하나만 업로드할 수 있는 버그 수정, 다양한 데이터셋 검사 추가 및 파일이 누락된 경우 경고 팝업.
### 20250211 업데이트
1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 모델 추가.

View File

@ -130,7 +130,7 @@
#### Büyük Düzeltmeler:
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI'nin GPT ince ayarının, Çince giriş metinlerinin BERT özelliğini okumaması sorununu düzeltti, bu da çıkarım ile tutarsızlığa ve potansiyel kalite düşüşüne neden oluyordu.
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI'nin GPT ince ayarının, Çince giriş metinlerinin BERT özelliğini okumaması sorununu düzeltti, bu da çıkarım ile tutarsızlığa ve potansiyel kalite düşüşüne neden oluyordu.
**Dikkat: Daha önce büyük miktarda veri ile ince ayar yaptıysanız, modelin kalitesini artırmak için yeniden ayar yapmanız önerilir.**
### 20240706 Güncellemesi
@ -160,7 +160,7 @@
#### Büyük Düzeltmeler:
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) sentez için konuşma hızı ayarlamasını destekledi.
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) sentez için konuşma hızı ayarlamasını destekledi.
Konuşma hızını ayarlarken rastgeleliği dondurmayı etkinleştirdi.
### 20240806 Güncellemesi
@ -175,4 +175,15 @@
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU tanıma mantığını optimize etti.
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) Kantonca ASR desteği ekledi.
6. GPT-SoVITS v2 desteği eklendi.
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) zamanlama mantığını optimize etti.
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) zamanlama mantığını optimize etti.
### 20240821 Güncelleme
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` dalını ana dala birleştir.
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML etiketlerini kullanarak sayıları, telefon numaralarını, tarihleri ve saatleri optimize etme desteği.
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API düzeltildi ve optimize edildi.
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Karıştırmak için yalnızca bir referans sesi yüklenebiliyordu hatası düzeltildi, çeşitli veri seti kontrolleri eklendi ve eksik dosyalar için uyarılar çıkar.
### 20250211 Güncelleme
1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 Modeli eklendi.

View File

@ -27,10 +27,11 @@ LangSegment>=0.2.0
Faster_Whisper
wordsegment
rotary_embedding_torch
pyjyutping
ToJyutping
g2pk2
ko_pron
opencc; sys_platform != 'linux'
opencc==1.1.1; sys_platform == 'linux'
python_mecab_ko; sys_platform != 'win32'
fastapi<0.112.2
x_transformers
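The requirements hunk above swaps `pyjyutping` for `ToJyutping`, pins `fastapi<0.112.2`, adds `x_transformers`, and selects the `opencc` build per platform via PEP 508 environment markers. A minimal sketch of how such a marker is evaluated, assuming the third-party `packaging` library is available (it is not part of this diff; the variable names are illustrative):

# Sketch: evaluating pip-style environment markers such as
# "opencc==1.1.1; sys_platform == 'linux'" outside of pip.
from packaging.markers import Marker

linux_pin = Marker("sys_platform == 'linux'")   # guards the opencc==1.1.1 line
other_oses = Marker("sys_platform != 'linux'")  # guards the unpinned opencc line

# Evaluate against the current interpreter's environment:
print(other_oses.evaluate())

# Or against a hypothetical environment, e.g. a Linux host:
print(linux_pin.evaluate({"sys_platform": "linux"}))   # True
print(other_oses.evaluate({"sys_platform": "linux"}))  # False

pip applies the same logic at install time, so only one of the two `opencc` lines takes effect on any given platform.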

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5 opened ",
"UVR5进程输出信息": "UVR5 process output log",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset",
"batch_size": "Batch Size",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy",
"max:归一化后最大值多少": "Loudness multiplier after normalized",
"max_sil_kept:切完后静音最多留多长": "Maximum length for silence to be kept",
@ -66,6 +67,9 @@
"中文": "Chinese",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Chinese Tutorialhttps://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "Chinese-English Mixed",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Primary Reference Audio (Please upload reference audio within 3-10 seconds, exceeding this limit will cause an error!)",
"主参考音频的文本": "Text of Primary Reference Audio",
"主参考音频的语种": "Language of Primary Reference Audio",
"也可批量输入音频文件, 二选一, 优先读文件夹": "Multiple audio files can also be imported. If a folder path exists, this input is ignored.",
"人声伴奏分离批量处理, 使用UVR5模型。": "Batch processing for vocal and instrumental separation, using the UVR5 model.",
"人声提取激进程度": "Vocal extraction aggressiveness",
@ -73,13 +77,24 @@
"以下模型不存在:": "No Such Model:",
"伴奏人声分离&去混响&去回声": "Vocals/Accompaniment Separation & Reverberation Removal",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "When using the no-reference text mode, it is recommended to use a fine-tuned GPT. If the reference audio is unclear and you don't know what to write, you can enable this feature, which will ignore the reference text you've entered.",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Recommended to use a Finetune-GPT when using Prompt-Free Mode. If reference audio is not clear or unsure what to write, enable this option to ignore the reference text.",
"保存频率save_every_epoch": "Save frequency (save_every_epoch):",
"保持随机": "Keep Random",
"关闭TTS推理WebUI": "Close TTS Inference WebUI",
"关闭UVR5-WebUI": "Close UVR5-WebUI",
"关闭打标WebUI": "Close Labeling WebUI",
"凑50字一切": "Slice per 50 characters",
"凑四句一切": "Slice once every 4 sentences",
"分桶处理模式已关闭": "Bucket Processing Mode Disabled",
"分桶处理模式已开启": "Bucket Processing Mode Enabled",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "Segmented Return Mode does not support Bucket Processing, Bucket Processing Disabled automatically",
"分段返回模式已开启": "Segmented Return Mode Enabled",
"分段间隔(秒)": "Segment Interval (Seconds)",
"分段间隔过小已自动设置为0.01": "Segment Interval too short, automatically set to 0.01",
"切分": "Segmentation",
"切分后文本": "Inference Text After Segmentation",
"切分后的子音频的输出根目录": "Audio slicer output folder",
"切分文本": "Segment Text",
"切割使用的进程数": "CPU threads used for audio slicing",
"刷新模型路径": "refreshing model paths",
"前端处理后的文本(每句):": "Processed text from the frontend (per sentence):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optional: Upload multiple reference audio files by dragging and dropping them (recommended to be of the same gender), and average their tone. If this option is left blank, the tone will be controlled by the single reference audio on the left. If fine-tuning the model, it is recommended that all reference audio files have tones within the fine-tuning training set; the pretrained model can be ignored.",
"合成语音": "Start inference",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "An example of a valid folder path format: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simply copy the address from the file manager's address bar).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Support for Phoneme Conversion, Manual Phoneme Editing, and Step-by-Step Speech Synthesis will be added in the future.",
"启用并行推理版本": "Enable Parallel Inference Version",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "Please fill in the segmented audio files' directory! The full path of the audio file = the directory concatenated with the filename corresponding to the waveform in the list file (not the full path). If left blank, the absolute full path in the .list file will be used.",
"多语种混合": "Multilingual Mixed",
"多语种混合(粤语)": "Multilingual Mixed(Yue)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "Actual Input Target Text (per sentence):",
"实际输入的目标文本:": "Actual Input Target Text:",
"导出文件格式": "Export file format",
"并行推理": "Parallel Inference",
"并行推理模式已关闭": "Parallel Inference Mode Disabled",
"并行推理模式已开启": "Parallel Inference Mode Enabled",
"开启GPT训练": "Start GPT training",
"开启SSL提取": "Start SSL extracting",
"开启SoVITS训练": "Start SoVITS training",
@ -107,6 +127,7 @@
"开启打标WebUI": "Open Labeling WebUI",
"开启文本获取": "Start speech-to-text",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Enable no reference mode. If you don't fill 'Text for reference audio', no reference mode will be enabled.",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "Enable Reference Text-Free Mode. Leaving the reference text field blank is equivalent to enabling this mode. v3 does not support this mode currently, using it will cause an error.",
"开启离线批量ASR": "Start batch ASR",
"开启语义token提取": "Start semantics token extraction",
"开启语音切割": "Start audio slicer",
@ -122,7 +143,11 @@
"按中文句号。切": "Slice by Chinese punct",
"按标点符号切": "Slice by every punct",
"按英文句号.切": "Slice by English punct",
"推理设置": "Inference Settings",
"提取文本Bert特征": "Extract Text BERT Features",
"数据分桶(并行推理时会降低一点计算量)": "Data Bucketing (Reduces Computation Cost in Parallel Inference)",
"数据类型精度": "Computing precision",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Text Segmentation Tool. Very long text may not yield good synthesis results, so Segmentation is Recommended. Synthesis will be performed based on line breaks and then concatenated.",
"文本模块学习率权重": "Text model learning rate weighting",
"文本进程输出信息": "Text processing output",
"施工中,请静候佳音": "In construction, please wait",
@ -146,6 +171,7 @@
"终止SSL提取进程": "Stop SSL extraction",
"终止SoVITS训练": "Stop SoVITS training",
"终止一键三连": "Stop one-click formatting",
"终止合成": "Terminate Synthesis",
"终止文本获取进程": "Stop speech-to-text",
"终止语义token提取进程": "Stop semantics token extraction",
"终止语音切割": "Stop audio cutting",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "Sematics token extraction output log",
"语速": "Speech rate",
"语速调整,高为更快": "Adjust speech rate, higher for faster",
"语速调节不支持分桶处理,已自动关闭分桶处理": "Speech Rate Adjustment does not support Bucket Processing, Bucket Processing Disabled automatically",
"语音切割进程输出信息": "Audio slicer output log",
"语音降噪进程输出信息": "Voice Denoiser Process Output Information",
"请上传3~10秒内参考音频超过会报错": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.",
@ -166,9 +193,11 @@
"请填入正确的List路径": "Please Fill in the Correct List Path",
"请填入正确的音频文件夹路径": "Please Fill in the Correct Audio Folder Path",
"请输入有效文本": "Please enter valid text.",
"路径不存在,使用默认配置": "Paths Not Found, Using Default Config",
"路径不能为空": "Expected No Empty Path",
"路径错误": "Path Error",
"转换": "Convert",
"辅参考音频(可选多个,或不选)": "Secondary Reference Audio (Multiple Optional, or None)",
"输入待处理音频文件夹路径": "Enter the path of the audio folder to be processed:",
"输入文件夹路径": "Input folder path",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "output folder (logs/{experiment name}) should have files and folders starts with 23456.",
@ -176,13 +205,19 @@
"输出文件夹路径": "Output folder path",
"输出的语音": "Inference Result",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Choose the models from SoVITS_weights and GPT_weights. The default one is a pretrain, so you can experience zero shot TTS.",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Sampling Steps: If feel noisy, try increasing, if feel slow, try decreasing",
"重复惩罚": "Repetition Penalty",
"降噪结果输出文件夹": "Denoised Results Output Folder",
"降噪音频文件输入文件夹": "Denoising Audio File Input Folder",
"随机种子": "Random Seed",
"需要合成的切分前文本": "Inference Text Before Segmentation",
"需要合成的文本": "Inference text",
"需要合成的文本的语种": "Language of the Inference Text",
"需要合成的语种": "Inference text language",
"韩文": "Korean",
"韩英混合": "Korean-English Mixed",
"音频加载失败": "Failed to Load Audio",
"音频文件不存在,跳过:": "Audio File Not Found, Skipping: ",
"音频自动切分输入路径,可文件可文件夹": "Audio slicer input (file or folder)",
"预训练的GPT模型路径": "Pretrained GPT model path",
"预训练的SSL模型路径": "Pretrained SSL model path",

View File

@ -9,7 +9,7 @@
"*请上传并填写参考信息": "*Por favor, suba y complete la información de referencia",
"*请填写需要合成的目标文本和语种模式": "*Por favor, complete el texto objetivo a sintetizar y el modo de idioma",
".list标注文件的路径": "Ruta del archivo de anotación .list",
".限制范围越小判别效果越好。": "#!.限制范围越小判别效果越好。",
".限制范围越小判别效果越好。": ".Cuanto más pequeño sea el rango, mejor será el efecto de discriminación.",
"0-前置数据集获取工具": "0-Herramienta de obtención de conjunto de datos previo",
"0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-Herramienta de separación de voz y acompañamiento UVR5 y eliminación de reverberación y retardo",
"0b-语音切分工具": "0b-Herramienta de división de voz",
@ -52,6 +52,7 @@
"UVR5已开启": "UVR5 está habilitado",
"UVR5进程输出信息": "Información de salida del proceso UVR5",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proporción de mezcla de audio normalizado que entra",
"batch_size": "Tamaño de lote",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: cómo calcular la curva de volumen, cuanto más pequeño, mayor precisión pero mayor carga computacional (mayor precisión no significa mejor rendimiento)",
"max:归一化后最大值多少": "max: valor máximo después de la normalización",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept: duración máxima del silencio después del corte",
@ -66,6 +67,9 @@
"中文": "Chino",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Documentación del tutorial en chino: https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "Chino e inglés mezclados",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio de referencia principal (Por favor, suba un audio de referencia de entre 3 y 10 segundos, si supera este límite se producirá un error)",
"主参考音频的文本": "Texto del audio de referencia principal",
"主参考音频的语种": "Idioma del audio de referencia principal",
"也可批量输入音频文件, 二选一, 优先读文件夹": "También se pueden ingresar archivos de audio por lotes, seleccionar uno, prioridad para leer carpetas",
"人声伴奏分离批量处理, 使用UVR5模型。": "Procesamiento por lotes de separación de voz y acompañamiento utilizando el modelo UVR5",
"人声提取激进程度": "Nivel de agresividad en la extracción de voz",
@ -73,13 +77,24 @@
"以下模型不存在:": "No Existe tal Modelo:",
"伴奏人声分离&去混响&去回声": "Separación de acompañamiento y voz principal y eliminación de reverberación y eco",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "Se recomienda usar un GPT ajustado en modo sin texto de referencia; habilítelo si no puede entender el audio de referencia (si no sabe qué escribir). Una vez habilitado, ignorará el texto de referencia ingresado.",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Se recomienda usar un GPT ajustado cuando se use el modo sin texto de referencia. Si el audio de referencia no es claro o no sabe qué escribir, habilite esta opción para ignorar el texto de referencia.",
"保存频率save_every_epoch": "Frecuencia de guardado (cada epoch)",
"保持随机": "Mantener aleatorio",
"关闭TTS推理WebUI": "Cerrar TTS Inference WebUI",
"关闭UVR5-WebUI": "Cerrar UVR5-WebUI",
"关闭打标WebUI": "Cerrar Labeling WebUI",
"凑50字一切": "Todo para alcanzar las 50 palabras",
"凑四句一切": "Completa cuatro oraciones para rellenar todo",
"分桶处理模式已关闭": "Modo de procesamiento por lotes deshabilitado",
"分桶处理模式已开启": "Modo de procesamiento por lotes habilitado",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "El modo de retorno segmentado no es compatible con el procesamiento por lotes, se ha deshabilitado automáticamente",
"分段返回模式已开启": "Modo de retorno segmentado habilitado",
"分段间隔(秒)": "Intervalo de segmentación (segundos)",
"分段间隔过小已自动设置为0.01": "El intervalo de segmentación es demasiado pequeño, se ha ajustado automáticamente a 0.01",
"切分": "Segmentación",
"切分后文本": "Texto después de la segmentación",
"切分后的子音频的输出根目录": "Directorio raíz de salida de los sub-audios después de la división",
"切分文本": "Segmentar texto",
"切割使用的进程数": "Número de procesos utilizados para la división",
"刷新模型路径": "Actualizar la ruta del modelo",
"前端处理后的文本(每句):": "Texto después del procesamiento previo (por frase):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Sube varios archivos de audio de referencia arrastrándolos y soltándolos (se recomienda que sean del mismo género) y promedia sus tonos. Si esta opción se deja en blanco, el tono será controlado por el único audio de referencia a la izquierda. Si se está afinando el modelo, se recomienda que todos los archivos de audio de referencia tengan tonos dentro del conjunto de entrenamiento de ajuste fino; se puede ignorar el modelo preentrenado.",
"合成语音": "Síntesis de voz",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Ejemplo de formato de ruta de carpeta válida: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simplemente copie desde la barra de direcciones del administrador de archivos).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Se añadirá soporte para conversión de fonemas, edición manual de fonemas y síntesis de voz por pasos en el futuro.",
"启用并行推理版本": "Habilitar versión de inferencia paralela",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "Ingrese el directorio donde se encuentran los audios después de la división. La ruta completa de los archivos de audio leídos = este directorio + nombre de archivo correspondiente en el archivo .list (no la ruta completa). Si se deja en blanco, se utilizará la ruta completa del archivo .list.",
"多语种混合": "Mezcla de varios idiomas",
"多语种混合(粤语)": "Mezcla Multilingüe (Cantonés)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "Texto objetivo realmente ingresado (por frase):",
"实际输入的目标文本:": "Texto objetivo realmente ingresado:",
"导出文件格式": "Formato de archivo de exportación",
"并行推理": "Inferencia paralela",
"并行推理模式已关闭": "Modo de inferencia paralela deshabilitado",
"并行推理模式已开启": "Modo de inferencia paralela habilitado",
"开启GPT训练": "Iniciar entrenamiento de GPT",
"开启SSL提取": "Habilitar la extracción SSL",
"开启SoVITS训练": "Iniciar entrenamiento de SoVITS",
@ -107,6 +127,7 @@
"开启打标WebUI": "Abrir Labeling WebUI",
"开启文本获取": "Habilitar la obtención de texto",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Habilitar el modo sin texto de referencia. No llenar el texto de referencia también lo habilita.",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "Habilitar modo sin texto de referencia. No completar el campo de texto de referencia equivale a habilitar este modo. v3 no es compatible con este modo actualmente y su uso generará un error.",
"开启离线批量ASR": "Habilitar ASR en lote fuera de línea",
"开启语义token提取": "Habilitar la extracción de tokens semánticos",
"开启语音切割": "Habilitar la división de voz",
@ -122,7 +143,11 @@
"按中文句号。切": "Cortar según puntos en chino",
"按标点符号切": "Cortar según los signos de puntuación",
"按英文句号.切": "Cortar por puntos en inglés.",
"推理设置": "Configuración de inferencia",
"提取文本Bert特征": "Extraer características de texto con BERT",
"数据分桶(并行推理时会降低一点计算量)": "Agrupación de datos (Reduce el costo computacional en inferencia paralela)",
"数据类型精度": "precisión del tipo de datos",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Herramienta de segmentación de texto. Un texto demasiado largo puede no producir buenos resultados, por lo que se recomienda segmentarlo. La síntesis se realizará por separado según los saltos de línea y luego se unirá.",
"文本模块学习率权重": "Peso de la tasa de aprendizaje del módulo de texto",
"文本进程输出信息": "Información de salida del proceso de obtención de texto",
"施工中,请静候佳音": "En construcción, por favor espere pacientemente",
@ -146,6 +171,7 @@
"终止SSL提取进程": "Terminar el proceso de extracción SSL",
"终止SoVITS训练": "Detener entrenamiento de SoVITS",
"终止一键三连": "Terminar el proceso de un solo paso de formateo",
"终止合成": "Terminar síntesis",
"终止文本获取进程": "Terminar el proceso de obtención de texto",
"终止语义token提取进程": "Terminar el proceso de extracción de tokens semánticos",
"终止语音切割": "Terminar la división de voz",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "Información de salida del proceso de extracción de tokens semánticos",
"语速": "Velocidad de habla",
"语速调整,高为更快": "Ajustar la velocidad de habla, más alta para más rápido",
"语速调节不支持分桶处理,已自动关闭分桶处理": "El ajuste de velocidad de voz no es compatible con el procesamiento por lotes, se ha deshabilitado automáticamente",
"语音切割进程输出信息": "Información de salida del proceso de división de voz",
"语音降噪进程输出信息": "Información de salida del proceso de reducción de ruido de voz",
"请上传3~10秒内参考音频超过会报错": "Por favor, suba un audio de referencia de entre 3 y 10 segundos, ¡más de eso causará un error!",
@ -166,9 +193,11 @@
"请填入正确的List路径": "Por Favor, Introduzca la Ruta Correcta de la Lista",
"请填入正确的音频文件夹路径": "Por Favor, Introduzca la Ruta Correcta de la Carpeta de Audio",
"请输入有效文本": "Por favor, introduzca un texto válido",
"路径不存在,使用默认配置": "Ruta no encontrada, usando configuración predeterminada",
"路径不能为空": "Se Espera que la Ruta No Esté Vacía",
"路径错误": "Error de Ruta",
"转换": "Convertir",
"辅参考音频(可选多个,或不选)": "Audio de referencia secundario (Opcional, se pueden seleccionar varios o ninguno)",
"输入待处理音频文件夹路径": "Ingrese la ruta de la carpeta de audio a procesar",
"输入文件夹路径": "Ingrese la ruta de la carpeta",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "Debe haber archivos y carpetas que comiencen con 23456 en el directorio logs/nombre del experimento",
@ -176,13 +205,19 @@
"输出文件夹路径": "Ruta de la carpeta de salida",
"输出的语音": "Audio de salida",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Seleccione el modelo almacenado en SoVITS_weights y GPT_weights después del entrenamiento. Uno de ellos es el modelo base, útil para experimentar con TTS de 5 segundos sin entrenamiento.",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Pasos de muestreo: si se siente ruidoso, intente aumentarlo; si es lento, intente reducirlo",
"重复惩罚": "Penalización por repetición",
"降噪结果输出文件夹": "Carpeta de salida de los resultados de reducción de ruido",
"降噪音频文件输入文件夹": "Carpeta de entrada de archivos de audio para reducción de ruido",
"随机种子": "Semilla aleatoria",
"需要合成的切分前文本": "Texto antes de la segmentación para la síntesis",
"需要合成的文本": "Texto a sintetizar",
"需要合成的文本的语种": "Idioma del texto a sintetizar",
"需要合成的语种": "Idioma para la síntesis",
"韩文": "Coreano",
"韩英混合": "Mezcla Coreano-Inglés",
"音频加载失败": "Error al Cargar el Audio",
"音频文件不存在,跳过:": "Archivo de audio no encontrado, omitiendo: ",
"音频自动切分输入路径,可文件可文件夹": "Ruta de entrada para la división automática de audio, puede ser un archivo o una carpeta",
"预训练的GPT模型路径": "Ruta del modelo GPT preentrenado",
"预训练的SSL模型路径": "Ruta del modelo SSL preentrenado",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5 est activé",
"UVR5进程输出信息": "Informations de processus UVR5",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion d'audio normalisé mélangé",
"batch_size": "Taille de lot",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: comment calculer la courbe de volume, plus petit pour une précision plus élevée mais une charge de calcul plus élevée (ce n'est pas une meilleure précision)",
"max:归一化后最大值多少": "max: valeur maximale après normalisation",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept: durée maximale de silence après la coupe",
@ -66,6 +67,9 @@
"中文": "Chinois",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Documentation du tutoriel en chinoishttps://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "Mélange de chinois et d'anglais",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio de référence principal (Veuillez télécharger un audio de référence entre 3 et 10 secondes, dépasser cette limite entraînera une erreur !)",
"主参考音频的文本": "Texte de laudio de référence principal",
"主参考音频的语种": "Langue de laudio de référence principal",
"也可批量输入音频文件, 二选一, 优先读文件夹": "Également possible d'entrer en lot des fichiers audio, au choix, privilégiez la lecture du dossier",
"人声伴奏分离批量处理, 使用UVR5模型。": "Traitement par lot de séparation voix-accompagnement en utilisant le modèle UVR5.",
"人声提取激进程度": "Degré d'extraction des voix",
@ -73,13 +77,24 @@
"以下模型不存在:": "Aucun Modèle de ce Type:",
"伴奏人声分离&去混响&去回声": "Séparation de la voix et de l'accompagnement, suppression de la réverbération et de l'écho",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "Il est recommandé d'utiliser GPT finement ajusté en mode sans texte de référence. Si vous ne comprenez pas ce que dit l'audio de référence (vous ne savez pas quoi écrire), vous pouvez l'activer ; une fois activé, ignorez le texte de référence saisi.",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Il est recommandé dutiliser un GPT ajusté pour le mode sans texte de référence. Si laudio de référence est incompréhensible ou si vous ne savez pas quoi écrire, activez cette option pour ignorer le texte de référence.",
"保存频率save_every_epoch": "Fréquence de sauvegarde (sauvegarder à chaque époque)",
"保持随机": "Garder aléatoire",
"关闭TTS推理WebUI": "Fermer TTS Inference WebUI",
"关闭UVR5-WebUI": "Fermer UVR5-WebUI",
"关闭打标WebUI": "Fermer Labeling WebUI",
"凑50字一切": "Assembler 50 mots tout",
"凑四句一切": "Composez quatre phrases pour tout remplir",
"分桶处理模式已关闭": "Mode de traitement par regroupement désactivé",
"分桶处理模式已开启": "Mode de traitement par regroupement activé",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "Le mode de retour segmenté ne prend pas en charge le regroupement des données, désactivation automatique",
"分段返回模式已开启": "Mode de retour segmenté activé",
"分段间隔(秒)": "Intervalle de segmentation (secondes)",
"分段间隔过小已自动设置为0.01": "Lintervalle de segmentation est trop court, réglé automatiquement à 0.01",
"切分": "Segmentation",
"切分后文本": "Texte après segmentation",
"切分后的子音频的输出根目录": "Répertoire racine de sortie des sous-audios après découpage",
"切分文本": "Segmenter le texte",
"切割使用的进程数": "Nombre de processus utilisés pour le découpage",
"刷新模型路径": "Actualiser le chemin du modèle",
"前端处理后的文本(每句):": "Texte après traitement frontal (par phrase):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optionnel : Téléchargez plusieurs fichiers audio de référence en les faisant glisser (recommandé d'être du même genre) et fusionnez leur tonalité. Si cette option est laissée vide, la tonalité sera contrôlée par l'unique fichier audio de référence à gauche. Si vous ajustez le modèle, il est recommandé que tous les fichiers audio de référence aient des tonalités dans l'ensemble d'entraînement d'ajustement ; le modèle pré-entrainé peut être ignoré.",
"合成语音": "Synthèse vocale",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemple de format de chemin de dossier valide : E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copiez-le depuis la barre d'adresse de l'explorateur de fichiers).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Le support pour la conversion phonémique, lédition manuelle des phonèmes et la synthèse vocale par étapes sera ajouté ultérieurement.",
"启用并行推理版本": "Activer la version dinférence parallèle",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "Veuillez indiquer le répertoire contenant les audio découpés ! Le chemin complet du fichier audio à lire = ce répertoire - nom du fichier correspondant à l'onde dans le fichier .list (pas le chemin complet). Si laissé vide, le chemin absolu dans le fichier .list sera utilisé.",
"多语种混合": "Mélange multilingue",
"多语种混合(粤语)": "Mélange Multilingue (Cantonais)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "Texte cible réellement saisi (par phrase):",
"实际输入的目标文本:": "Texte cible réellement saisi:",
"导出文件格式": "Format d'exportation du fichier",
"并行推理": "Inférence parallèle",
"并行推理模式已关闭": "Mode dinférence parallèle désactivé",
"并行推理模式已开启": "Mode dinférence parallèle activé",
"开启GPT训练": "Activer l'entraînement GPT",
"开启SSL提取": "Activer l'extraction SSL",
"开启SoVITS训练": "Activer l'entraînement SoVITS",
@ -107,6 +127,7 @@
"开启打标WebUI": "Ouvrir Labeling WebUI",
"开启文本获取": "Activer l'extraction de texte",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Activer le mode sans texte de référence. Laisser le texte de référence vide équivaut également à activer le mode.",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "Activer le mode sans texte de référence. Ne pas remplir le texte de référence revient à activer ce mode. v3 ne prend pas encore en charge ce mode et lutiliser entraînera une erreur.",
"开启离线批量ASR": "Activer la transcription automatique hors ligne en masse",
"开启语义token提取": "Activer l'extraction de jetons sémantiques",
"开启语音切割": "Activer le découpage vocal",
@ -122,7 +143,11 @@
"按中文句号。切": "Couper selon les points en chinois.",
"按标点符号切": "Couper selon les signes de ponctuation",
"按英文句号.切": "Découpez par des points en anglais",
"推理设置": "Paramètres dinférence",
"提取文本Bert特征": "Extraire les caractéristiques du texte avec BERT",
"数据分桶(并行推理时会降低一点计算量)": "Regroupement des données (Réduit le coût de calcul en inférence parallèle)",
"数据类型精度": "précision du type de données",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Outil de segmentation de texte. Un texte trop long peut donner un mauvais résultat, il est donc recommandé de le segmenter. La synthèse se fera selon les sauts de ligne puis sera assemblée.",
"文本模块学习率权重": "Poids du taux d'apprentissage du module de texte",
"文本进程输出信息": "Informations de processus de texte",
"施工中,请静候佳音": "En construction, veuillez attendre patiemment",
@ -146,6 +171,7 @@
"终止SSL提取进程": "Arrêter le processus d'extraction SSL",
"终止SoVITS训练": "Arrêter l'entraînement SoVITS",
"终止一键三连": "Arrêter l'un clic trois connexions",
"终止合成": "Terminer la synthèse",
"终止文本获取进程": "Arrêter le processus d'extraction de texte",
"终止语义token提取进程": "Arrêter le processus d'extraction de jetons sémantiques",
"终止语音切割": "Arrêter le découpage vocal",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "Informations de processus d'extraction de jetons sémantiques",
"语速": "Débit de parole",
"语速调整,高为更快": "Ajuster la vitesse de parole, plus élevée pour plus rapide",
"语速调节不支持分桶处理,已自动关闭分桶处理": "Le réglage de la vitesse vocale ne prend pas en charge le regroupement des données, désactivation automatique",
"语音切割进程输出信息": "Informations de processus de découpage vocal",
"语音降噪进程输出信息": "Informations de sortie du processus de réduction du bruit vocal",
"请上传3~10秒内参考音频超过会报错": "Veuillez télécharger une référence audio de 3 à 10 secondes ; les fichiers plus longs généreront une erreur!",
@ -166,9 +193,11 @@
"请填入正确的List路径": "Veuillez Remplir le Chemin Correct de la Liste",
"请填入正确的音频文件夹路径": "Veuillez Remplir le Chemin Correct du Dossier Audio",
"请输入有效文本": "Veuillez entrer un texte valide",
"路径不存在,使用默认配置": "Chemin introuvable, utilisation de la configuration par défaut",
"路径不能为空": "Chemin Non Vide Attendu",
"路径错误": "Erreur de Chemin",
"转换": "Conversion",
"辅参考音频(可选多个,或不选)": "Audio de référence secondaire (Facultatif, plusieurs possibles ou aucun)",
"输入待处理音频文件夹路径": "Entrez le chemin du dossier audio à traiter",
"输入文件夹路径": "Chemin du dossier à entrer",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "Les fichiers et dossiers commençant par 23456 devraient être présents dans le répertoire logs/nom de l'expérience",
@ -176,13 +205,19 @@
"输出文件夹路径": "Chemin du dossier de sortie",
"输出的语音": "Audio de sortie",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Choisissez le modèle entraîné stocké sous SoVITS_weights et GPT_weights. Par défaut, l'un d'eux est un modèle de base pour l'expérience de TTS Zero Shot de 5 secondes.",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Nombre détapes déchantillonnage : si le son est bruité, essayez daugmenter, si cest lent, essayez de réduire",
"重复惩罚": "Pénalité de répétition",
"降噪结果输出文件夹": "Dossier de sortie des résultats de réduction du bruit",
"降噪音频文件输入文件夹": "Dossier d'entrée des fichiers audio de réduction du bruit",
"随机种子": "Graine aléatoire",
"需要合成的切分前文本": "Texte avant segmentation pour la synthèse",
"需要合成的文本": "Texte à synthétiser",
"需要合成的文本的语种": "Langue du texte à synthétiser",
"需要合成的语种": "Langue de synthèse requise",
"韩文": "Coreano",
"韩英混合": "Mezcla Coreano-Inglés",
"音频加载失败": "Échec du Chargement de l'Audio",
"音频文件不存在,跳过:": "Fichier audio introuvable, passage : ",
"音频自动切分输入路径,可文件可文件夹": "Chemin d'entrée automatique de découpage audio, peut être un fichier ou un dossier",
"预训练的GPT模型路径": "Chemin du modèle GPT pré-entraîné",
"预训练的SSL模型路径": "Chemin du modèle SSL pré-entraîné",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5 è attivato",
"UVR5进程输出信息": "Informazioni sull'output del processo UVR5",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Quanta proporzione dell'audio normalizzato deve essere miscelata",
"batch_size": "Dimensione del batch",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: Come calcolare la curva del volume. Più piccolo è, maggiore è la precisione ma aumenta la complessità computazionale (non significa che una maggiore precisione dà risultati migliori)",
"max:归一化后最大值多少": "max: Massimo valore dopo la normalizzazione",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept: Massima durata del silenzio dopo il taglio",
@ -66,6 +67,9 @@
"中文": "Cinese",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Documentazione del tutorial in cinesehttps://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "Cinese e inglese misti",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio di riferimento principale (Si prega di caricare un audio di riferimento tra 3 e 10 secondi, superato questo limite si verificherà un errore!)",
"主参考音频的文本": "Testo dell'audio di riferimento principale",
"主参考音频的语种": "Lingua dell'audio di riferimento principale",
"也可批量输入音频文件, 二选一, 优先读文件夹": "È possibile anche inserire file audio in batch, una delle due opzioni, con priorità alla lettura della cartella",
"人声伴奏分离批量处理, 使用UVR5模型。": "Separazione voce-accompagnamento in batch, utilizza il modello UVR5.",
"人声提取激进程度": "Grado di aggressività dell'estrazione vocale",
@ -73,13 +77,24 @@
"以下模型不存在:": "Nessun Modello del Genere:",
"伴奏人声分离&去混响&去回声": "Separazione tra accompagnamento e voce & Rimozione dell'eco & Rimozione dell'eco",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "Si consiglia di utilizzare GPT fine-tuned quando si utilizza la modalità senza testo di riferimento. Se non si riesce a capire cosa dice l'audio di riferimento (e non si sa cosa scrivere), è possibile abilitare questa opzione, ignorando il testo di riferimento inserito.",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Si consiglia di usare un GPT fine-tuned quando si usa la modalità senza testo di riferimento. Se l'audio di riferimento non è chiaro o non sai cosa scrivere, abilita questa opzione per ignorare il testo di riferimento.",
"保存频率save_every_epoch": "Frequenza di salvataggio ogni epoca",
"保持随机": "Mantieni casuale",
"关闭TTS推理WebUI": "Chiudere TTS Inference WebUI",
"关闭UVR5-WebUI": "Chiudere UVR5-WebUI",
"关闭打标WebUI": "Chiudere Labeling WebUI",
"凑50字一切": "Riempire con 50 caratteri per tutto",
"凑四句一切": "Riempire con quattro frasi per tutto",
"分桶处理模式已关闭": "Modalità di elaborazione per bucket disabilitata",
"分桶处理模式已开启": "Modalità di elaborazione per bucket abilitata",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "La modalità di ritorno segmentato non supporta l'elaborazione per bucket, l'elaborazione per bucket è stata disabilitata automaticamente",
"分段返回模式已开启": "Modalità di ritorno segmentato abilitata",
"分段间隔(秒)": "Intervallo di segmentazione (secondi)",
"分段间隔过小已自动设置为0.01": "L'intervallo di segmentazione è troppo breve, impostato automaticamente a 0.01",
"切分": "Segmentazione",
"切分后文本": "Testo dopo segmentazione",
"切分后的子音频的输出根目录": "Directory radice di output per gli audio segmentati",
"切分文本": "Segmenta il testo",
"切割使用的进程数": "Numero di processi utilizzati per il taglio",
"刷新模型路径": "Aggiorna il percorso del modello",
"前端处理后的文本(每句):": "Testo elaborato dal front-end (per frase):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opzionale: Carica più file audio di riferimento trascinandoli (si consiglia dello stesso genere) e media il loro tono. Se questa opzione è lasciata vuota, il tono sarà controllato dal singolo file audio di riferimento a sinistra. Se si sta perfezionando il modello, è consigliato che tutti i file audio di riferimento abbiano toni presenti nel set di addestramento per il perfezionamento; il modello pre-addestrato può essere ignorato.",
"合成语音": "Sintesi vocale",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Formato di percorso della cartella valido: E:\\codes\\py39\\vits_vc_gpu\\Esempio di test di BaiLuShuangHua (copiare direttamente dalla barra degli indirizzi del gestore file).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "In futuro verrà aggiunto il supporto per la conversione dei fonemi, la modifica manuale dei fonemi e la sintesi vocale passo dopo passo.",
"启用并行推理版本": "Abilita versione di inferenza parallela",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "Inserisci la directory dell'audio segmentato! Il percorso completo del file audio letto = questa directory - unione del nome del file corrispondente alle forme d'onda nel file .list (non il percorso completo). Se lasciato vuoto, verrà utilizzato il percorso assoluto nel file .list.",
"多语种混合": "Mix multilingue",
"多语种混合(粤语)": "Misto Multilingue (Cantonese)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "Testo di destinazione effettivamente inserito (per frase):",
"实际输入的目标文本:": "Testo di destinazione effettivamente inserito:",
"导出文件格式": "Formato di esportazione del file",
"并行推理": "Inferenza parallela",
"并行推理模式已关闭": "Modalità di inferenza parallela disabilitata",
"并行推理模式已开启": "Modalità di inferenza parallela abilitata",
"开启GPT训练": "Attivare l'allenamento di GPT",
"开启SSL提取": "Attivare l'estrazione SSL",
"开启SoVITS训练": "Attivare l'allenamento di SoVITS",
@ -107,6 +127,7 @@
"开启打标WebUI": "Aprire Labeling WebUI",
"开启文本获取": "Attivare l'estrazione del testo",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Attivare la modalità senza testo di riferimento. Anche se non inserisci un testo di riferimento, la modalità verrà attivata.",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "Abilita modalità senza testo di riferimento. Non compilare il campo del testo di riferimento equivale ad abilitarla. V3 non supporta questa modalità, utilizzarla genererà un errore.",
"开启离线批量ASR": "Attivare ASR offline batch",
"开启语义token提取": "Attivare l'estrazione del token semantico",
"开启语音切割": "Attivare la segmentazione vocale",
@ -122,7 +143,11 @@
"按中文句号。切": "Taglia secondo il punto cinese.",
"按标点符号切": "Taglia secondo i segni di punteggiatura",
"按英文句号.切": "Taglia secondo il punto inglese",
"推理设置": "Impostazioni di inferenza",
"提取文本Bert特征": "Estrai caratteristiche Bert dal testo",
"数据分桶(并行推理时会降低一点计算量)": "Raggruppamento dei dati (Riduce il costo computazionale nell'inferenza parallela)",
"数据类型精度": "precisione del tipo di dati",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Strumento di segmentazione del testo. Testi troppo lunghi potrebbero non dare buoni risultati, quindi si consiglia di segmentarli. La sintesi verrà eseguita separando il testo in base ai ritorni a capo e poi concatenandolo.",
"文本模块学习率权重": "Peso del tasso di apprendimento del modulo di testo",
"文本进程输出信息": "Informazioni sull'output del processo di estrazione del testo",
"施工中,请静候佳音": "In costruzione, attendi pazientemente le buone notizie",
@ -146,6 +171,7 @@
"终止SSL提取进程": "Terminare il processo di estrazione SSL",
"终止SoVITS训练": "Terminare l'allenamento di SoVITS",
"终止一键三连": "Terminare la formattazione con tre passaggi",
"终止合成": "Termina sintesi",
"终止文本获取进程": "Terminare il processo di estrazione del testo",
"终止语义token提取进程": "Terminare il processo di estrazione del token semantico",
"终止语音切割": "Terminare la segmentazione vocale",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "Informazioni sull'output del processo di estrazione del token semantico",
"语速": "Velocità della voce",
"语速调整,高为更快": "Regolare la velocità della voce, più alta per più veloce",
"语速调节不支持分桶处理,已自动关闭分桶处理": "Il controllo della velocità del discorso non supporta l'elaborazione per bucket, l'elaborazione per bucket è stata disabilitata automaticamente",
"语音切割进程输出信息": "Informazioni sull'output del processo di segmentazione vocale",
"语音降噪进程输出信息": "Informazioni sull'output del processo di riduzione del rumore vocale",
"请上传3~10秒内参考音频超过会报错": "Carica un audio di riferimento della durata compresa tra 3 e 10 secondi. Superiore a questo, verrà generato un errore!",
@ -166,9 +193,11 @@
"请填入正确的List路径": "Si Prega di Inserire il Percorso Corretto della Lista",
"请填入正确的音频文件夹路径": "Si Prega di Inserire il Percorso Corretto della Cartella Audio",
"请输入有效文本": "Inserisci un testo valido",
"路径不存在,使用默认配置": "Percorso non trovato, utilizzo della configurazione predefinita",
"路径不能为空": "Percorso Vuoto Non Consentito",
"路径错误": "Errore di Percorso",
"转换": "Converti",
"辅参考音频(可选多个,或不选)": "Audio di riferimento secondario (Facoltativo, seleziona più o nessuno)",
"输入待处理音频文件夹路径": "Inserisci il percorso della cartella dei file audio da elaborare",
"输入文件夹路径": "Inserisci il percorso della cartella",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "Nella cartella logs/nome dell'esperimento dovrebbero esserci file e cartelle che iniziano con 23456",
@ -176,13 +205,19 @@
"输出文件夹路径": "Percorso della cartella di output",
"输出的语音": "Audio di output",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Scegli il modello salvato in SoVITS_weights e GPT_weights dopo l'addestramento. Uno di default è il modello di base, utilizzato per l'esperienza di Zero Shot TTS in 5 secondi.",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Passi di campionamento: se sembra rumoroso, prova a aumentarlo, se è lento, prova a diminuirlo",
"重复惩罚": "Penalità di ripetizione",
"降噪结果输出文件夹": "Cartella di output dei risultati di riduzione del rumore",
"降噪音频文件输入文件夹": "Cartella di input dei file audio per la riduzione del rumore",
"随机种子": "Seme casuale",
"需要合成的切分前文本": "Testo prima della segmentazione per la sintesi",
"需要合成的文本": "Testo da sintetizzare",
"需要合成的文本的语种": "Lingua del testo da sintetizzare",
"需要合成的语种": "Lingua da sintetizzare",
"韩文": "Coreano",
"韩英混合": "Misto Coreano-Inglese",
"音频加载失败": "Caricamento Audio Fallito",
"音频文件不存在,跳过:": "File audio non trovato, salto: ",
"音频自动切分输入路径,可文件可文件夹": "Percorso di input per la segmentazione automatica dell'audio, può essere un file o una cartella",
"预训练的GPT模型路径": "Percorso del modello preaddestrato GPT",
"预训练的SSL模型路径": "Percorso del modello SSL preaddestrato",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5がオンになっています",
"UVR5进程输出信息": "UVR5プロセスの出力情報",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix正規化後のオーディオが入る割合",
"batch_size": "バッチサイズ",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: 音量曲線の計算方法、小さいほど精度が高くなりますが、計算量が増加します(精度が高いほど必ずしも効果が良いわけではありません)",
"max:归一化后最大值多少": "max正規化後の最大値",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept切り終えた後、最大でどれだけ静かにするか",
@ -66,6 +67,9 @@
"中文": "中国語",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中国語チュートリアルドキュメントhttps://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "プライマリーFERエンス音声3〜10秒以内の参考音声をアップロードしてください。これを超えるとエラーが発生します",
"主参考音频的文本": "プライマリーFERエンス音声のテキスト",
"主参考音频的语种": "プライマリーFERエンス音声の言語",
"也可批量输入音频文件, 二选一, 优先读文件夹": "複数のオーディオファイルもインポートできます。フォルダパスが存在する場合、この入力は無視されます。",
"人声伴奏分离批量处理, 使用UVR5模型。": "人声と伴奏の分離をバッチ処理で行い、UVR5モデルを使用します。",
"人声提取激进程度": "人声抽出の積極性",
@ -73,13 +77,24 @@
"以下模型不存在:": "モデルが存在しません:",
"伴奏人声分离&去混响&去回声": "ボーカル/伴奏の分離と残響の除去",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "参考テキストなしモードを使用する場合は、微調整されたGPTの使用をお勧めします。参考音声が聞き取れない場合何を書けば良いかわからない場合は、有効にすると、入力した参考テキストを無視します。",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "参照テキストなしモードを使用する際は、微調整されたGPTを推奨めることをお勧びします。参照音声がはっきり、または何を書くかわからない場合は、このオプションを有効にして参照テキストを無視します。",
"保存频率save_every_epoch": "保存頻度save_every_epoch",
"保持随机": "ランダムを維持",
"关闭TTS推理WebUI": "TTS Inference WebUIを閉じる",
"关闭UVR5-WebUI": "UVR5-WebUIを閉じる",
"关闭打标WebUI": "ラベリングWebUIを閉じる",
"凑50字一切": "50文字ずつカット",
"凑四句一切": "4つの文で埋める",
"分桶处理模式已关闭": "バケット処理モードを無効化",
"分桶处理模式已开启": "バケット処理モードを有効化",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "セグメントされた返却モードはバケット処理をサポートしていないため、自動的にバケット処理を無効化",
"分段返回模式已开启": "セグメントされた返却モードを有効化",
"分段间隔(秒)": "セグメント間隔(秒)",
"分段间隔过小已自动设置为0.01": "セグメント間隔が短すぎていので、自動的に0.01に設定されました",
"切分": "セグメント",
"切分后文本": "セグメント後のテキスト",
"切分后的子音频的输出根目录": "分割後のサブオーディオの出力ルートディレクトリ",
"切分文本": "テキストをセグメント",
"切割使用的进程数": "分割に使用されるプロセス数",
"刷新模型路径": "モデルのパスを更新",
"前端处理后的文本(每句):": "フロントエンド処理後のテキスト(文ごと):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "オプション:複数の参照オーディオファイルをドラッグ&ドロップしてアップロードし、それらのトーンを平均化します(同性推奨)。このオプションを空白のままにした場合、トーンは左側の単一の参照オーディオによって制御されます。モデルを微調整する場合、すべての参照オーディオファイルが微調整のトレーニングセット内のトーンを持つことをお勧めします。プリトレーニングモデルは無視しても構いません。",
"合成语音": "推論を開始",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "適切なフォルダパスの例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华テストサンプル(ファイルマネージャのアドレスバーからコピーしてください)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "今後、フォンメ转换、手動フォンメ編集、音声合成のステップバイステップ実行をサポートします。",
"启用并行推理版本": "並列推論バージョンを有効化",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "切断後の音声ファイルが格納されているディレクトリを入力してください!読み取り対象の音声ファイルの完全パス = このディレクトリ - 結合 - listファイル内の波形に対応するファイル名完全パスではありません。空白の場合、.listファイル内の絶対完全パスを使用します。",
"多语种混合": "多言語混合",
"多语种混合(粤语)": "多言語混合(粤語)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "実際に入力された目標テキスト(文ごと):",
"实际输入的目标文本:": "実際に入力された目標テキスト:",
"导出文件格式": "エクスポートファイル形式",
"并行推理": "並列推論",
"并行推理模式已关闭": "並列推論モードを無効化",
"并行推理模式已开启": "並列推論モードを有効化",
"开启GPT训练": "GPTトレーニングを開始",
"开启SSL提取": "SSL抽出を開始",
"开启SoVITS训练": "SoVITSトレーニングを開始",
@ -107,6 +127,7 @@
"开启打标WebUI": "ラベリングWebUIを開く",
"开启文本获取": "テキストの取得を開始",
"开启无参考文本模式。不填参考文本亦相当于开启。": "参照テキストなしモードを有効にします。参照テキストを入力しない場合も同様に有効になります。",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "参照テキストなしモードを有効化。参照テキストフィールドを空にすると、このモードが有効になります。v3では現在このモードをサポートしておらず、使用するとエラーが発生します。",
"开启离线批量ASR": "オフラインバッチASRを開始",
"开启语义token提取": "セマンティックトークン抽出を開始",
"开启语音切割": "音声の分割を開始",
@ -122,7 +143,11 @@
"按中文句号。切": "中国語の句点でカット",
"按标点符号切": "句読点で分割",
"按英文句号.切": "英文のピリオドで切ってください",
"推理设置": "推論設定",
"提取文本Bert特征": "テキストBERT特徴を抽出",
"数据分桶(并行推理时会降低一点计算量)": "データバケット化(並列推論時に少し計算コストを減らす)",
"数据类型精度": "データ型の精度",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "テキストセグメントツール。非常に長いテキストは合成結果が良くない可能性があるため、長いテキストは事前にセグメントすることをお勧びします。合成は改行に基づいて分かれ、その後に結合されます。",
"文本模块学习率权重": "テキストモジュールの学習率の重み",
"文本进程输出信息": "テキストプロセスの出力情報",
"施工中,请静候佳音": "施工中、お待ちください",
@ -146,6 +171,7 @@
"终止SSL提取进程": "SSL抽出プロセスを停止",
"终止SoVITS训练": "SoVITSトレーニングを停止",
"终止一键三连": "ワンクリック三連を停止",
"终止合成": "合成を終了",
"终止文本获取进程": "テキスト取得プロセスを停止",
"终止语义token提取进程": "セマンティックトークン抽出プロセスを停止",
"终止语音切割": "音声の分割を停止",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "セマンティックトークン抽出プロセスの出力情報",
"语速": "話速",
"语速调整,高为更快": "話速調整、高いほど速く",
"语速调节不支持分桶处理,已自动关闭分桶处理": "音声調整はバケット処理をサポートしていないため、自動的にバケット処理を無効化",
"语音切割进程输出信息": "音声分割プロセスの出力情報",
"语音降噪进程输出信息": "音声ノイズ除去プロセスの出力情報",
"请上传3~10秒内参考音频超过会报错": "310秒以内の参照音声をアップロードしてください。それを超えるとエラーが発生します",
@ -166,9 +193,11 @@
"请填入正确的List路径": "正しいリストパスを入力してください",
"请填入正确的音频文件夹路径": "正しいオーディオフォルダパスを入力してください",
"请输入有效文本": "有効なテキストを入力してください",
"路径不存在,使用默认配置": "パスが見つからないため、デフォルト設定を使用",
"路径不能为空": "空のパスは予期されていません",
"路径错误": "パスエラー",
"转换": "変換",
"辅参考音频(可选多个,或不选)": "二次参考音声(複数可、またはなし)",
"输入待处理音频文件夹路径": "処理するオーディオフォルダのパスを入力してください:",
"输入文件夹路径": "入力フォルダのパス",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "logs/実験名ディレクトリには23456で始まるファイルとフォルダが含まれている必要があります",
@ -176,13 +205,19 @@
"输出文件夹路径": "出力フォルダのパス",
"输出的语音": "推論結果",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "SoVITS_weightsおよびGPT_weightsに保存されたモデルを選択します。デフォルトのものはプレトレインであり、ゼロショットTTSを体験できます。",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "サンプリングステップ数:ノイズが強い場合は増やし、遅い場合は減らしてみてください",
"重复惩罚": "繰り返しペナルティ",
"降噪结果输出文件夹": "ノイズ除去結果出力フォルダ",
"降噪音频文件输入文件夹": "ノイズ除去音声ファイル入力フォルダ",
"随机种子": "ランダムシード",
"需要合成的切分前文本": "セグメント前の推論テキスト",
"需要合成的文本": "推論テキスト",
"需要合成的语种": "推論テキストの言語",
"需要合成的文本的语种": "推論テキストの言語",
"需要合成的语种": "推論の言語",
"韩文": "韓国語",
"韩英混合": "韓英混合",
"音频加载失败": "音声の読み込みに失敗しました",
"音频文件不存在,跳过:": "オーディオファイルが見つからない。スキップ:",
"音频自动切分输入路径,可文件可文件夹": "オーディオの自動分割入力パス、ファイルまたはフォルダを指定できます",
"预训练的GPT模型路径": "事前にトレーニングされたGPTモデルのパス",
"预训练的SSL模型路径": "事前にトレーニングされたSSLモデルのパス",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5가 활성화되었습니다",
"UVR5进程输出信息": "UVR5 프로세스 출력 정보",
"alpha_mix:混多少比例归一化后音频进来": "알파 믹스: 정규화된 오디오가 들어오는 비율",
"batch_size": "배치 크기",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop 크기: 볼륨 곡선을 계산하는 방법. 작을수록 정확도가 높아지지만 계산량이 높아집니다 (정확도가 높다고 효과가 좋아지지 않음)",
"max:归一化后最大值多少": "최대 값 (정규화 후)",
"max_sil_kept:切完后静音最多留多长": "최대 유지되는 정적 길이 (분리 후)",
@ -66,6 +67,9 @@
"中文": "중국어",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "중국어 튜토리얼 문서https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "중영 혼합",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "주 참조 오디오 (3~10초 이내의 참조 오디오를 업로드하세요. 초과 시 오류 발생!)",
"主参考音频的文本": "주 참조 오디오의 텍스트",
"主参考音频的语种": "주 참조 오디오의 언어",
"也可批量输入音频文件, 二选一, 优先读文件夹": "오디오 파일을 일괄로 입력할 수도 있습니다. 둘 중 하나를 선택하고 폴더를 읽기를 우선합니다.",
"人声伴奏分离批量处理, 使用UVR5模型。": "보컬과 반주 분리 배치 처리, UVR5 모델 사용.",
"人声提取激进程度": "보컬 추출의 공격성",
@ -73,13 +77,24 @@
"以下模型不存在:": "해당 모델이 존재하지 않습니다:",
"伴奏人声分离&去混响&去回声": "반주 및 보컬 분리 & 리버브 제거 & 에코 제거",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "참고 텍스트가 없을 때는 미세 조정된 GPT를 사용하는 것이 좋습니다. 참고 오디오에서 무엇을 말하는지 잘 들리지 않으면 이 모드를 켜서 입력한 참고 텍스트를 무시할 수 있습니다.",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "참조 텍스트 없는 모드 사용 시 미세 조정된 GPT 사용을 권장합니다. 참조 오디오가 들리지 않거나 무엇을 쓸지 모를 경우 이 옵션을 활성화하여 참조 텍스트를 무시할 수 있습니다.",
"保存频率save_every_epoch": "저장 빈도 (각 라운드마다)",
"保持随机": "랜덤 유지",
"关闭TTS推理WebUI": "TTS Inference WebUI 닫기",
"关闭UVR5-WebUI": "UVR5-WebUI 닫기",
"关闭打标WebUI": "Labeling WebUI 닫기",
"凑50字一切": "50자를 채우십시오",
"凑四句一切": "네 문장의 세트를 완성하세요.",
"分桶处理模式已关闭": "버킷 처리 모드 비활성화됨",
"分桶处理模式已开启": "버킷 처리 모드 활성화됨",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "분할 반환 모드는 버킷 처리를 지원하지 않아 버킷 처리 자동 비활성화됨",
"分段返回模式已开启": "분할 반환 모드 활성화됨",
"分段间隔(秒)": "분할 간격(초)",
"分段间隔过小已自动设置为0.01": "분할 간격이 너무 짧아 자동으로 0.01로 설정됨",
"切分": "분할",
"切分后文本": "분할 후 추론 텍스트",
"切分后的子音频的输出根目录": "분리된 하위 오디오의 출력 기본 디렉터리",
"切分文本": "텍스트 분할",
"切割使用的进程数": "사용되는 프로세스 수로 자르기",
"刷新模型路径": "모델 경로 새로 고침",
"前端处理后的文本(每句):": "프론트엔드 처리 후 텍스트(문장별):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "선택 사항: 여러 참조 오디오 파일을 드래그하여 업로드하고 (동일한 성별을 권장), 그들의 톤을 평균화합니다. 이 옵션을 비워두면 톤은 왼쪽의 단일 참조 오디오로 제어됩니다. 모델을 미세 조정하는 경우 모든 참조 오디오 파일이 미세 조정 훈련 세트 내의 톤을 가지고 있는 것이 좋으며, 사전 훈련된 모델은 무시할 수 있습니다.",
"合成语音": "합성 음성",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "적절한 폴더 경로 형식 예: E:\\codes\\py39\\vits_vc_gpu\\백로서리 테스트 샘플 (파일 관리자 주소 표시줄에서 복사하면 됩니다).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "향후 음소 변환, 수동 음소 편집, 단계별 음성 합성 지원이 추가될 예정입니다.",
"启用并行推理版本": "병렬 추론 버전 활성화",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "분리된 오디오가 위치한 디렉터리를 입력하세요! 읽어들인 오디오 파일의 전체 경로 = 이 디렉터리 - list 파일에서 파형에 해당하는 파일명(전체 경로가 아님). 비워 두면 .list 파일의 절대 전체 경로를 사용합니다.",
"多语种混合": "다국어 혼합",
"多语种混合(粤语)": "다국어 혼합(粤語)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "실제 입력된 목표 텍스트(문장별):",
"实际输入的目标文本:": "실제 입력된 목표 텍스트:",
"导出文件格式": "내보내기 파일 형식",
"并行推理": "병렬 추론",
"并行推理模式已关闭": "병렬 추론 모드 비활성화됨",
"并行推理模式已开启": "병렬 추론 모드 활성화됨",
"开启GPT训练": "GPT 훈련 활성화",
"开启SSL提取": "SSL 추출 활성화",
"开启SoVITS训练": "SoVITS 훈련 활성화",
@ -107,6 +127,7 @@
"开启打标WebUI": "Labeling WebUI 열기",
"开启文本获取": "텍스트 추출 활성화",
"开启无参考文本模式。不填参考文本亦相当于开启。": "참고 텍스트 없이 모드를 활성화합니다. 참고 텍스트를 입력하지 않으면 자동으로 활성화됩니다.",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "참조 텍스트 없는 모드 활성화. 참조 텍스트를 입력하지 않으면 이 모드가 활성화됩니다. v3는 현재 이 모드를 지원하지 않으며, 사용 시 오류가 발생합니다.",
"开启离线批量ASR": "오프라인 대량 ASR 활성화",
"开启语义token提取": "의미 토큰 추출 활성화",
"开启语音切割": "음성 분리 활성화",
@ -122,7 +143,11 @@
"按中文句号。切": "중국어 문장으로 분리하십시오.",
"按标点符号切": "구두점을 기준으로 자르기",
"按英文句号.切": "영어 문장으로 분리하기",
"推理设置": "추론 설정",
"提取文本Bert特征": "텍스트 BERT 특징 추출",
"数据分桶(并行推理时会降低一点计算量)": "데이터 버킷팅 (병렬 추론 시 계산량 감소)",
"数据类型精度": "데이터 유형 정밀도",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "텍스트 분할 도구. 너무 긴 텍스트는 합성 결과가 좋지 않을 수 있으므로 분할을 권장합니다. 합성은 텍스트의 줄바꿈을 기준으로 분할된 후 결합됩니다.",
"文本模块学习率权重": "텍스트 모듈 학습률 가중치",
"文本进程输出信息": "텍스트 프로세스 출력 정보",
"施工中,请静候佳音": "공사 중입니다. 기다려주십시오.",
@ -146,6 +171,7 @@
"终止SSL提取进程": "SSL 추출 프로세스 종료",
"终止SoVITS训练": "SoVITS 훈련 종료",
"终止一键三连": "일괄 처리 종료",
"终止合成": "합성 종료",
"终止文本获取进程": "텍스트 추출 프로세스 종료",
"终止语义token提取进程": "의미 토큰 추출 프로세스 종료",
"终止语音切割": "음성 분리 종료",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "의미 토큰 추출 프로세스 출력 정보",
"语速": "언어 속도",
"语速调整,高为更快": "언어 속도 조정, 높을수록 빠름",
"语速调节不支持分桶处理,已自动关闭分桶处理": "음속 조절은 버킷 처리를 지원하지 않아 버킷 처리 자동 비활성화됨",
"语音切割进程输出信息": "음성 분리 프로세스 출력 정보",
"语音降噪进程输出信息": "음성 노이즈 제거 프로세스 출력 정보",
"请上传3~10秒内参考音频超过会报错": "3~10초 이내의 참고 오디오를 업로드하십시오. 초과하면 오류가 발생합니다!",
@ -166,9 +193,11 @@
"请填入正确的List路径": "올바른 리스트 경로를 입력하세요",
"请填入正确的音频文件夹路径": "올바른 오디오 폴더 경로를 입력하세요",
"请输入有效文本": "유효한 텍스트를 입력하세요",
"路径不存在,使用默认配置": "경로가 존재하지 않음, 기본 설정 사용",
"路径不能为空": "경로가 비어 있을 수 없습니다",
"路径错误": "경로 오류",
"转换": "변환",
"辅参考音频(可选多个,或不选)": "보조 참조 오디오 (여러 개 선택 가능 또는 선택 안 함)",
"输入待处理音频文件夹路径": "처리 대기 중인 오디오 폴더 경로 입력",
"输入文件夹路径": "폴더 경로 입력",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "logs/실험 이름 디렉터리에는 23456으로 시작하는 파일과 폴더가 있어야 함",
@ -176,13 +205,19 @@
"输出文件夹路径": "출력 폴더 경로",
"输出的语音": "출력 음성",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "SoVITS_weights 및 GPT_weights에 저장된 훈련 완료된 모델 중 선택. 기본적으로 하나는 기본 모델이며 5초 Zero Shot TTS를 체험할 수 있습니다.",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "샘플링 스텝: 노이즈가 느껴지면 증가, 느리다면 감소 시도",
"重复惩罚": "반복 패널티",
"降噪结果输出文件夹": "노이즈 제거 결과 출력 폴더",
"降噪音频文件输入文件夹": "노이즈 제거 오디오 파일 입력 폴더",
"随机种子": "랜덤 시드",
"需要合成的切分前文本": "분할 전 추론 텍스트",
"需要合成的文本": "합성해야 할 텍스트",
"需要合成的文本的语种": "추론 텍스트의 언어",
"需要合成的语种": "합성해야 할 언어",
"韩文": "한국어",
"韩英混合": "한영 혼합",
"音频加载失败": "오디오 로드 실패",
"音频文件不存在,跳过:": "오디오 파일이 존재하지 않음, 건너뜀: ",
"音频自动切分输入路径,可文件可文件夹": "오디오 자동 분리 입력 경로, 파일 또는 폴더 가능",
"预训练的GPT模型路径": "사전 훈련된 GPT 모델 경로",
"预训练的SSL模型路径": "사전 훈련된 SSL 모델 경로",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5 está ativado",
"UVR5进程输出信息": "Informações de saída do processo UVR5",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Em que proporção o áudio normalizado é misturado de volta",
"batch_size": "Tamanho do Lote",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "HOP_SIZE: Como calcular a curva de volume, quanto menor a precisão, maior a quantidade de cálculos (não significa que quanto maior a precisão, melhor o efeito)",
"max:归一化后最大值多少": "MAX: Qual é o valor máximo após a normalização?",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept: Depois de cortar, por quanto tempo no máximo o silêncio é mantido",
@ -66,6 +67,9 @@
"中文": "Chinês",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Documentação do tutorial em chinêshttps://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "Mistura de Chinês e Inglês",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Áudio de Referência Principal (Por favor, carregue um áudio de referência entre 3-10 segundos, exceder esse limite causará um erro!)",
"主参考音频的文本": "Texto do Áudio de Referência Principal",
"主参考音频的语种": "Idioma do Áudio de Referência Principal",
"也可批量输入音频文件, 二选一, 优先读文件夹": "Também é possível inserir arquivos de áudio em lote; escolha uma opção, preferencialmente leia a pasta.",
"人声伴奏分离批量处理, 使用UVR5模型。": "Processamento em lote de separação de voz e acompanhamento, usando o modelo UVR5.",
"人声提取激进程度": "Grau de agressividade da extração de voz",
@ -73,13 +77,24 @@
"以下模型不存在:": "Nenhum Modelo Tal:",
"伴奏人声分离&去混响&去回声": "Separação de acompanhamento e voz & remoção de reverberação & remoção de eco",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "Ao usar o modo sem texto de referência, recomenda-se usar um GPT ajustado. Se não conseguir ouvir claramente o áudio de referência (não sabe o que escrever), você pode ativar o modo e ignorar o texto de referência fornecido.",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Recomenda-se usar um GPT ajustado ao usar o Modo Sem Texto de Referência. Se o áudio de referência não estiver claro ou não souber o que escrever, ative esta opção para ignorar o texto de referência.",
"保存频率save_every_epoch": "Frequência de salvamento save_every_epoch",
"保持随机": "Manter Aleatório",
"关闭TTS推理WebUI": "Fechar TTS Inference WebUI",
"关闭UVR5-WebUI": "Fechar UVR5-WebUI",
"关闭打标WebUI": "Fechar Labeling WebUI",
"凑50字一切": "Complete com 50 caracteres",
"凑四句一切": "Complete com quatro frases",
"分桶处理模式已关闭": "Modo de Processamento em Balde Desativado",
"分桶处理模式已开启": "Modo de Processamento em Balde Ativado",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "Modo de Retorno Segmentado não suporta Processamento em Balde, Processamento em Balde Desativado Automaticamente",
"分段返回模式已开启": "Modo de Retorno Segmentado Ativado",
"分段间隔(秒)": "Intervalo de Segmentação (Segundos)",
"分段间隔过小已自动设置为0.01": "Intervalo de Segmentação muito curto, automaticamente definido como 0.01",
"切分": "Segmentação",
"切分后文本": "Texto de Inferência Após Segmentação",
"切分后的子音频的输出根目录": "Diretório raiz de saída do sub-áudio após o corte",
"切分文本": "Segmentar Texto",
"切割使用的进程数": "Número de processos para corte",
"刷新模型路径": "Atualizar caminho do modelo",
"前端处理后的文本(每句):": "Texto após processamento front-end (por frase):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Faça upload de vários arquivos de áudio de referência arrastando e soltando-os (recomendado que sejam do mesmo gênero) e faça uma média dos seus tons. Se essa opção for deixada em branco, o tom será controlado pelo único áudio de referência à esquerda. Se estiver ajustando o modelo, é recomendado que todos os arquivos de áudio de referência tenham tons dentro do conjunto de treinamento de ajuste; o modelo pré-treinado pode ser ignorado.",
"合成语音": "Voz sintetizada",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemplo de formato de caminho de pasta válido: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copie do endereço da barra do gerenciador de arquivos).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Suporte para Conversão de Fonemas, Edição Manual de Fonemas e Síntese de Fase por Fase será adicionado no futuro.",
"启用并行推理版本": "Ativar Versão de Inferência Paralela",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "Preencha o diretório onde os áudios cortados estão localizados! O caminho completo dos arquivos de áudio lidos = este diretório - concatenação com o nome do arquivo de forma correspondente no arquivo .list (não o caminho completo). Se deixar em branco, use o caminho absoluto no arquivo .list.",
"多语种混合": "Mistura de múltiplos idiomas",
"多语种混合(粤语)": "Mistura Multilíngue (Yue)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "Texto alvo realmente inserido (por frase):",
"实际输入的目标文本:": "Texto alvo realmente inserido:",
"导出文件格式": "Formato de arquivo de exportação",
"并行推理": "Inferência Paralela",
"并行推理模式已关闭": "Modo de Inferência Paralela Desativado",
"并行推理模式已开启": "Modo de Inferência Paralela Ativado",
"开启GPT训练": "Ativar treinamento GPT",
"开启SSL提取": "Ativar extração SSL",
"开启SoVITS训练": "Ativar treinamento SoVITS",
@ -107,6 +127,7 @@
"开启打标WebUI": "Abrir Labeling WebUI",
"开启文本获取": "Ativar obtenção de texto",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. Não preencher o texto de referência também equivale a ativar.",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "Ativar Modo Sem Texto de Referência. Deixar o campo de texto de referência em branco equivale a ativar este modo. v3 não suporta este modo atualmente, usar causará um erro.",
"开启离线批量ASR": "Ativar ASR offline em lote",
"开启语义token提取": "Ativar extração de token semântico",
"开启语音切割": "Ativar corte de voz",
@ -122,7 +143,11 @@
"按中文句号。切": "Dividir por ponto final chinês",
"按标点符号切": "Dividir por sinais de pontuação",
"按英文句号.切": "Dividir por ponto final em inglês",
"推理设置": "Configurações de Inferência",
"提取文本Bert特征": "Extrair Características BERT do Texto",
"数据分桶(并行推理时会降低一点计算量)": "Agrupamento de Dados (Reduz o Custo Computacional na Inferência Paralela)",
"数据类型精度": "precisão do tipo de dado",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Ferramenta de Segmentação de Texto. Textos muito longos podem não resultar em uma boa síntese, então é recomendado segmentar. A síntese será feita com base nas quebras de linha e depois concatenada.",
"文本模块学习率权重": "Weight da taxa de aprendizado do módulo de texto",
"文本进程输出信息": "Informações de saída do processo de texto",
"施工中,请静候佳音": "Em construção, por favor, aguarde por um bom som",
@ -146,6 +171,7 @@
"终止SSL提取进程": "Encerrar processo de extração SSL",
"终止SoVITS训练": "Encerrar treinamento SoVITS",
"终止一键三连": "Encerrar um clique",
"终止合成": "Terminar Síntese",
"终止文本获取进程": "Encerrar processo de obtenção de texto",
"终止语义token提取进程": "Encerrar processo de extração de token semântico",
"终止语音切割": "Encerrar corte de voz",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "Informações de saída do processo de extração de token semântico",
"语速": "Velocidade da fala",
"语速调整,高为更快": "Ajustar a velocidade da fala, mais alta para mais rápido",
"语速调节不支持分桶处理,已自动关闭分桶处理": "Ajuste de Velocidade de Fala não suporta Processamento em Balde, Processamento em Balde Desativado Automaticamente",
"语音切割进程输出信息": "Informações de saída do processo de corte de voz",
"语音降噪进程输出信息": "Informações de saída do processo de redução de ruído de voz",
"请上传3~10秒内参考音频超过会报错": "Por favor, faça upload de um áudio de referência com duração entre 3 e 10 segundos. Áudios fora dessa faixa causarão erro!",
@ -166,9 +193,11 @@
"请填入正确的List路径": "Por Favor, Insira o Caminho Correto da Lista",
"请填入正确的音频文件夹路径": "Por Favor, Insira o Caminho Correto da Pasta de Áudio",
"请输入有效文本": "Por favor, insira um texto válido",
"路径不存在,使用默认配置": "Caminhos Não Encontrados, Usando Configuração Padrão",
"路径不能为空": "Esperado Caminho Não Vazio",
"路径错误": "Erro de Caminho",
"转换": "Converter",
"辅参考音频(可选多个,或不选)": "Áudio de Referência Secundário (Múltiplos Opcionais ou Nenhum)",
"输入待处理音频文件夹路径": "Caminho da pasta de arquivos de áudio a ser processados",
"输入文件夹路径": "Caminho da pasta de entrada",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "Logs de saída/deve haver arquivos e pastas começando com 23456 no diretório do nome do experimento",
@ -176,13 +205,19 @@
"输出文件夹路径": "Caminho da pasta de saída",
"输出的语音": "Áudio de saída",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Selecione os modelos armazenados em Sovits_weights e GPT_WEIGHTS. O padrão é o modelo inferior, experiência para 5 segundos de Zero Shot TTS",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Passos de Amostragem: Se parecer ruidoso, tente aumentar; se parecer lento, tente diminuir",
"重复惩罚": "Penalidade de Repetição",
"降噪结果输出文件夹": "Pasta de saída dos resultados de redução de ruído",
"降噪音频文件输入文件夹": "Pasta de entrada dos arquivos de áudio para redução de ruído",
"随机种子": "Semente Aleatória",
"需要合成的切分前文本": "Texto de Inferência Antes da Segmentação",
"需要合成的文本": "Texto a ser sintetizado",
"需要合成的文本的语种": "Idioma do Texto de Inferência",
"需要合成的语种": "Idioma a ser sintetizado",
"韩文": "Coreano",
"韩英混合": "Mistura Coreano-Inglês",
"音频加载失败": "Falha ao Carregar o Áudio",
"音频文件不存在,跳过:": "Arquivo de Áudio Não Encontrado, Pulando: ",
"音频自动切分输入路径,可文件可文件夹": "Caminho de entrada automático de corte de áudio, pode ser um arquivo ou uma pasta",
"预训练的GPT模型路径": "Caminho do modelo GPT pre-train",
"预训练的SSL模型路径": "Caminho do modelo SSL pre-train",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5 включен",
"UVR5进程输出信息": "Вывод информации процесса UVR5",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Какая доля нормализованного аудио смешивается",
"batch_size": "размер пакета",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Как рассчитывается кривая громкости, чем меньше, тем выше точность и больше вычислительная нагрузка (большая точность не всегда означает лучший результат)",
"max:归一化后最大值多少": "max:Максимальное значение после нормализации",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept:Максимальная длительность тишины после разреза",
@ -66,6 +67,9 @@
"中文": "Китайский",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Документация на китайском языкеhttps://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "Китайский и английский",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Основной опорный аудиофайл (пожалуйста, загрузите опорный аудиофайл длительностью от 3 до 10 секунд, превышение этого лимита вызовет ошибку!)",
"主参考音频的文本": "Текст основного опорного аудиофайла",
"主参考音频的语种": "Язык основного опорного аудиофайла",
"也可批量输入音频文件, 二选一, 优先读文件夹": "Можно также импортировать несколько аудиофайлов. Если путь к папке существует, то этот ввод игнорируется.",
"人声伴奏分离批量处理, 使用UVR5模型。": "Обработка разделения вокала и аккомпанемента пакетно с использованием модели UVR5.",
"人声提取激进程度": "Степень агрессивности извлечения вокала",
@ -73,13 +77,24 @@
"以下模型不存在:": "Этот модель не существует",
"伴奏人声分离&去混响&去回声": "Разделение вокала/аккомпанемента и удаление эхо",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "При использовании режима без референсного текста рекомендуется использовать настроенную модель GPT. Если не удается разобрать, что говорит референсное аудио (не знаете, что писать), можете включить этот режим, и он проигнорирует введенный референсный текст.",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Рекомендуется использовать тонко настроенный GPT при использовании режима без опорного текста. Если опорный аудиофайл нечеткий или не уверены, что писать, включите эту опцию, чтобы игнорировать опорный текст.",
"保存频率save_every_epoch": "Частота сохранения save_every_epoch",
"保持随机": "Сохранить случайное значение",
"关闭TTS推理WebUI": "Закрыть TTS Inference WebUI",
"关闭UVR5-WebUI": "Закрыть UVR5-WebUI",
"关闭打标WebUI": "Закрыть Labeling WebUI",
"凑50字一切": "Соберите все в 50 символов",
"凑四句一切": "Собрать четыре предложения и разрезать",
"分桶处理模式已关闭": "Режим обработки пакетов отключен",
"分桶处理模式已开启": "Режим обработки пакетов включен",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "Режим сегментированного возврата не поддерживает обработку пакетов, обработка пакетов отключена автоматически",
"分段返回模式已开启": "Режим сегментированного возврата включен",
"分段间隔(秒)": "Интервал сегментации (секунды)",
"分段间隔过小已自动设置为0.01": "Интервал сегментации слишком короткий, автоматически установлен на 0.01",
"切分": "Сегментация",
"切分后文本": "Текст для синтеза после сегментации",
"切分后的子音频的输出根目录": "Корневой каталог вывода для подаудио после разделения",
"切分文本": "Сегментировать текст",
"切割使用的进程数": "Количество процессов, используемых для разрезания",
"刷新模型路径": "Обновить путь к модели",
"前端处理后的文本(每句):": "Текст после предварительной обработки (каждое предложение):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Необязательно: загрузите несколько файлов с эталонными аудиозаписями, перетащив их (рекомендуется одного пола), и усредните их тон. Если этот параметр не заполнен, тон будет контролироваться одной эталонной аудиозаписью слева. При тонкой настройке модели рекомендуется, чтобы все эталонные аудиозаписи имели тон в пределах обучающего набора для тонкой настройки; предварительно обученную модель можно игнорировать.",
"合成语音": "Синтезированный голос",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Пример допустимого формата пути к папке: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (просто скопируйте из адресной строки файлового менеджера).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "В будущем будет добавлена поддержка преобразования в фонемы, ручного редактирования фонемов и пошагового выполнения синтеза речи.",
"启用并行推理版本": "Включить параллельную версию вывода",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "Заполните каталог, где находятся аудиофайлы после разрезания! Полный путь к читаемым аудиофайлам = каталог - файл .list, имя файла соответствует волне (не полный путь). Если оставить пустым, будет использоваться абсолютный путь из файла .list.",
"多语种混合": "Смешанные языки",
"多语种混合(粤语)": "Многоязычная смесь (кантонский)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "Фактически введенный целевой текст (каждое предложение):",
"实际输入的目标文本:": "Фактически введенный целевой текст:",
"导出文件格式": "Формат выходных файлов",
"并行推理": "Параллельный вывод",
"并行推理模式已关闭": "Режим параллельного вывода отключен",
"并行推理模式已开启": "Режим параллельного вывода включен",
"开启GPT训练": "Включить обучение GPT",
"开启SSL提取": "Включить извлечение SSL",
"开启SoVITS训练": "Включить обучение SoVITS",
@ -107,6 +127,7 @@
"开启打标WebUI": "Открыть Labeling WebUI",
"开启文本获取": "Включить получение текста",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Включить режим без референтного текста. Не заполняя референтный текст, вы также включаете этот режим.",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "Включить режим без опорного текста. Не заполнять опорный текст также эквивалентно включению этого режима. v3 пока не поддерживает этот режим, его использование вызовет ошибку.",
"开启离线批量ASR": "Включить пакетную офлайн ASR",
"开启语义token提取": "Включить извлечение семантических токенов",
"开启语音切割": "Включить разрезание речи",
@ -122,7 +143,11 @@
"按中文句号。切": "Разделение по китайским точкам.",
"按标点符号切": "Разрезать по пунктуационным знакам",
"按英文句号.切": "Разрезать по английской точке.",
"推理设置": "Настройки вывода",
"提取文本Bert特征": "Извлечь текстовые признаки BERT",
"数据分桶(并行推理时会降低一点计算量)": "Разбиение данных на пакеты (уменьшает вычислительные затраты при параллельном выводе)",
"数据类型精度": "точность типа данных",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Инструмент для сегментации текста. Очень длинный текст может не дать хороших результатов синтеза, поэтому рекомендуется сначала разбить его на сегменты. Синтез будет выполняться на основе разрывов строк, а затем объединяться.",
"文本模块学习率权重": "Веса скорости обучения текстового модуля",
"文本进程输出信息": "Информация о процессе обработки текста",
"施工中,请静候佳音": "В разработке, ожидайте хороших новостей",
@ -146,6 +171,7 @@
"终止SSL提取进程": "Прекратить процесс извлечения SSL",
"终止SoVITS训练": "Прекратить обучение SoVITS",
"终止一键三连": "Прекратить одно нажатие",
"终止合成": "Завершить синтез",
"终止文本获取进程": "Прекратить процесс получения текста",
"终止语义token提取进程": "Прекратить процесс извлечения семантических токенов",
"终止语音切割": "Прекратить разрезание речи",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "Информация о процессе извлечения семантических токенов",
"语速": "Скорость речи",
"语速调整,高为更快": "Регулировка скорости речи, чем выше, тем быстрее",
"语速调节不支持分桶处理,已自动关闭分桶处理": "Регулировка скорости речи не поддерживает обработку пакетов, обработка пакетов отключена автоматически",
"语音切割进程输出信息": "Информация о процессе разрезания речи",
"语音降噪进程输出信息": "Информация о процессе шумоподавления",
"请上传3~10秒内参考音频超过会报错": "Пожалуйста, загрузите референтное аудио длительностью от 3 до 10 секунд, иначе будет ошибка!",
@ -166,9 +193,11 @@
"请填入正确的List路径": "Пожалуйста, укажите правильный путь к списку",
"请填入正确的音频文件夹路径": "Пожалуйста, укажите правильный путь к папке с аудио",
"请输入有效文本": "Введите действительный текст",
"路径不存在,使用默认配置": "Путь не найден, используется конфигурация по умолчанию",
"路径不能为空": "Ожидается, что путь не будет пустым",
"路径错误": "Ошибка пути",
"转换": "Преобразовать",
"辅参考音频(可选多个,或不选)": "Дополнительный опорный аудиофайл (несколько файлов по желанию или ни одного)",
"输入待处理音频文件夹路径": "Путь к папке с аудиофайлами для обработки:",
"输入文件夹路径": "Введите путь к папке",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "В директории logs/имя_эксперимента должны быть файлы и папки, начинающиеся с 23456",
@ -176,13 +205,19 @@
"输出文件夹路径": "Путь к папке для вывода",
"输出的语音": "Выводимый звук",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Выберите модель, сохраненную в SoVITS_weights и GPT_weights после обучения. По умолчанию используется базовая модель для 5-секундного Zero Shot TTS.",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Количество шагов выборки: если кажется, что шумно, попробуйте увеличить, если кажется, что медленно, попробуйте уменьшить",
"重复惩罚": "Штраф за повторение",
"降噪结果输出文件夹": "Папка для вывода результатов шумоподавления",
"降噪音频文件输入文件夹": "Папка для ввода аудиофайлов для шумоподавления",
"随机种子": "Случайное начальное значение",
"需要合成的切分前文本": "Текст для синтеза до сегментации",
"需要合成的文本": "Текст для синтеза",
"需要合成的文本的语种": "Язык текста для синтеза",
"需要合成的语种": "Язык для синтеза",
"韩文": "Корейский",
"韩英混合": "Корейско-английская смесь",
"音频加载失败": "Не удалось загрузить аудио",
"音频文件不存在,跳过:": "Аудиофайл не найден, пропускается: ",
"音频自动切分输入路径,可文件可文件夹": "Путь ввода для автоматического разделения аудио, может быть файлом или папкой",
"预训练的GPT模型路径": "Путь к предварительно обученной модели GPT",
"预训练的SSL模型路径": "Путь к предварительно обученной модели SSL",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5 açıldı",
"UVR5进程输出信息": "UVR5 işlem çıktı bilgisi",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Normalizasyondan sonraki sesin ne kadarlık bir oranı karıştırılsın",
"batch_size": "Toplu Boyut",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Ses seviyesi eğrisi nasıl hesaplanır, ne kadar küçükse hassasiyet o kadar yüksek ve hesaplama yükü o kadar artar (hassasiyet arttıkça etki mutlaka daha iyi olmaz)",
"max:归一化后最大值多少": "max:Normalizasyondan sonra maksimum değer ne kadar",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept:Kesimden sonra en fazla ne kadar sessizlik bırakılır",
@ -66,6 +67,9 @@
"中文": "Çince",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Çince öğretici belgehttps://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "Çince ve İngilizce karışık",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Ana Referans Sesi (Lütfen 3-10 saniye arasında referans sesi yükleyin, bu sınırı aşarsanız hata alırsınız!)",
"主参考音频的文本": "Ana Referans Sesinin Metni",
"主参考音频的语种": "Ana Referans Sesinin Dili",
"也可批量输入音频文件, 二选一, 优先读文件夹": "Ses dosyaları ayrıca toplu olarak, iki seçimle, öncelikli okuma klasörüyle içe aktarılabilir",
"人声伴奏分离批量处理, 使用UVR5模型。": "Vokal ve akor ayırma toplu işleme, UVR5 modelini kullanarak.",
"人声提取激进程度": "Vokal çıkarma agresiflik derecesi",
@ -73,13 +77,24 @@
"以下模型不存在:": "Böyle bir model yok:",
"伴奏人声分离&去混响&去回声": "Vokal/Müzik Ayrıştırma ve Yankı Giderme",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "Referans metin modu olmadan kullanıldığında, referans sesi net duyulmadığında (ne yazılacağı bilinmiyorsa) açık bırakılması önerilir, bu durumda girilen referans metni göz ardı edilir.",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Referanssız Metin Modu kullanırken ince ayarlanmış bir GPT kullanılması önerilir. Referans sesi net anlaşılamıyorsa veya ne yazacağınız belli değilse, bu seçeneği etkinleştirerek referans metni yazılmasını yoksayabilirsiniz.",
"保存频率save_every_epoch": "Kayıt sıklığı save_every_epoch",
"保持随机": "Rastgeleliği Koru",
"关闭TTS推理WebUI": "TTS Inference WebUI'yi Kapat",
"关闭UVR5-WebUI": "UVR5-WebUI'yi Kapat",
"关闭打标WebUI": "Labeling WebUI'yi Kapat",
"凑50字一切": "50 kelime birleştir ve kes",
"凑四句一切": "Dört cümleyi bir araya getirip kes",
"分桶处理模式已关闭": "Kovaya İşleme Modu Kapalı",
"分桶处理模式已开启": "Kovaya İşleme Modu Etkin",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "Bölümlü Dönüş Modu Kovaya İşlemeyi Desteklemediği İçin Kovaya İşleme Otomatik Olarak Kapatılmıştır",
"分段返回模式已开启": "Bölümlü Dönüş Modu Etkin",
"分段间隔(秒)": "Bölüm Aralığı (Saniye)",
"分段间隔过小已自动设置为0.01": "Bölüm Aralığı Çok Kısa, Otomatik Olarak 0.01 Olarak Ayarlanmıştır",
"切分": "Bölümleme",
"切分后文本": "Bölümlenmiş Metin",
"切分后的子音频的输出根目录": "Bölündükten sonra alt ses dosyalarının çıktı kök dizini",
"切分文本": "Metni Bölümle",
"切割使用的进程数": "Kesim için kullanılan işlem sayısı",
"刷新模型路径": "Model yolu yenile",
"前端处理后的文本(每句):": "Ön işleme tabi tutulan metin (her cümle):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "İsteğe bağlı: Birden fazla referans ses dosyasını sürükleyip bırakarak yükleyin (aynı cinsiyetten olmaları önerilir) ve tonlarını ortalayın. Bu seçenek boş bırakılırsa, ton soldaki tek referans ses dosyası tarafından kontrol edilir. Modeli ince ayar yapıyorsanız, tüm referans ses dosyalarının ince ayar eğitim seti içindeki tonlara sahip olması önerilir; önceden eğitilmiş model dikkate alınmayabilir.",
"合成语音": "Ses sentezi",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Geçerli klasör yolu formatı örneği: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (dosya yöneticisi adres çubuğundan kopyalayabilirsiniz).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Sonraki sürümlerde fonem dönüşümü, el ile fonem düzenleme ve adım adım konuşma sentezi desteği eklenecek.",
"启用并行推理版本": "Paralel Çıkarım Versiyonunu Etkinleştir",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "Kesmeye uygun ses dosyalarının bulunduğu dizini doldurun! Okunan ses dosyasının tam yolu = bu dizin + list dosyasındaki dalga biçimiyle eşleşen dosya adı (tam yol değil). Boş bırakılırsa, .list dosyasındaki tam yol kullanılır.",
"多语种混合": "Çok dilli karışım",
"多语种混合(粤语)": "Çok dilli karışık (Yue)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "Gerçekten girilen hedef metin (her cümle):",
"实际输入的目标文本:": "Gerçekten girilen hedef metin:",
"导出文件格式": "Dışa aktarma dosya formatı",
"并行推理": "Paralel Çıkarım",
"并行推理模式已关闭": "Paralel Çıkarım Modu Kapalı",
"并行推理模式已开启": "Paralel Çıkarım Modu Etkin",
"开启GPT训练": "GPT eğitimini başlat",
"开启SSL提取": "SSL çıkarmayı başlat",
"开启SoVITS训练": "SoVITS eğitimini başlat",
@ -107,6 +127,7 @@
"开启打标WebUI": "Labeling WebUI'yi Aç",
"开启文本获取": "Metin alma başlat",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Referans metni olmayan mod açık. Referans metni doldurulmazsa bu mod otomatik olarak açılır.",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "Referanssız Metin Modunu Etkinleştir. Referans metni boş bırakmak da bu modu etkinleştirmekle eşdeğerdir. v3 bu modu henüz desteklemiyor, kullanılırsa hata verecektir.",
"开启离线批量ASR": "Offline toplu ASR başlat",
"开启语义token提取": "Anlamsal token çıkarmayı başlat",
"开启语音切割": "Ses kesimi başlat",
@ -122,7 +143,11 @@
"按中文句号。切": "Çince dönem işaretine göre kes",
"按标点符号切": "Noktalama işaretlerine göre kes",
"按英文句号.切": "İngilizce nokta işaretine göre kes",
"推理设置": ıkarım Ayarları",
"提取文本Bert特征": "Metin BERT Özelliklerini Çıkar",
"数据分桶(并行推理时会降低一点计算量)": "Veri Kovaya Ayrılması (Paralel Çıkarımda Hesaplama Maliyetini Azaltır)",
"数据类型精度": "veri türü doğruluğu",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Metin Bölümleme Aracı. Çok uzun metinlerin sentez sonuçları iyi olmayabilir, bu nedenle uzun metinleri önce bölümlere ayırmanız önerilir. Sentez, metnin satır sonlarına göre ayrı ayrı yapılıp sonra birleştirilecektir.",
"文本模块学习率权重": "Metin modülü öğrenme oranıırlığı",
"文本进程输出信息": "Metin işlemi çıktı bilgisi",
"施工中,请静候佳音": "Yapım aşamasında, lütfen iyi haberler için bekleyin",
@ -146,6 +171,7 @@
"终止SSL提取进程": "SSL çıkarma işlemini durdur",
"终止SoVITS训练": "SoVITS eğitimini durdur",
"终止一键三连": "Tek tuşla üçlemeyi durdur",
"终止合成": "Sentezi Sonlandır",
"终止文本获取进程": "Metin alma işlemini durdur",
"终止语义token提取进程": "Anlamsal token çıkarma işlemini durdur",
"终止语音切割": "Ses kesimini durdur",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "Anlamsal token çıkarma işlemi çıktı bilgisi",
"语速": "Konuşma hızı",
"语速调整,高为更快": "Konuşma hızını ayarla, yüksek daha hızlı",
"语速调节不支持分桶处理,已自动关闭分桶处理": "Konuşma Hızı Ayarlaması Kovaya İşlemeyi Desteklemediği İçin Kovaya İşleme Otomatik Olarak Kapatılmıştır",
"语音切割进程输出信息": "Ses kesim işlemi çıktı bilgisi",
"语音降噪进程输出信息": "Gürültü azaltma işlemi çıktı bilgisi",
"请上传3~10秒内参考音频超过会报错": "Lütfen 3~10 saniye arasında bir referans ses dosyası yükleyin, aşım durumunda hata verilecektir!",
@ -166,9 +193,11 @@
"请填入正确的List路径": "Lütfen Doğru Liste Yolunu Girin",
"请填入正确的音频文件夹路径": "Lütfen Doğru Ses Klasörü Yolunu Girin",
"请输入有效文本": "Geçerli metin girin",
"路径不存在,使用默认配置": "Yol Bulunamadı, Varsayılan Yapılandırma Kullanılıyor",
"路径不能为空": "Boş Yol Beklenmiyor",
"路径错误": "Yol Hatası",
"转换": "Dönüştür",
"辅参考音频(可选多个,或不选)": "Yardımcı Referans Sesi (İsteğe bağlı birden çok seçilebilir veya hiç seçilmeyebilir)",
"输入待处理音频文件夹路径": "İşlenecek ses klasörünün yolunu girin:",
"输入文件夹路径": "Dosya klasörü yolu girin",
"输出logs/实验名目录下应有23456开头的文件和文件夹": ıktı logs/deney adı dizininde 23456 ile başlayan dosya ve klasörler olmalı",
@ -176,13 +205,19 @@
"输出文件夹路径": ıktı klasörü yolu",
"输出的语音": ıktı sesi",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Eğitimi tamamlanmış ve SoVITS_weights ile GPT_weights altına kaydedilmiş modeli seçin. Varsayılan bir temel modeldir, 5 saniyelik Zero Shot TTS deneyimi için kullanılır.",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Örnekleme Adımları: Eğer gürültülü görünüyorsa, adımları artırın; eğer yavaş görünüyorsa, adımları azaltın",
"重复惩罚": "Tekrarlama Cezası",
"降噪结果输出文件夹": "Gürültü azaltma sonuçları çıktı klasörü",
"降噪音频文件输入文件夹": "Gürültü azaltma ses dosyaları giriş klasörü",
"随机种子": "Rastgele Tohum",
"需要合成的切分前文本": "Sentez İçin Bölümlenmemiş Metin",
"需要合成的文本": "Sentezlenmesi gereken metin",
"需要合成的文本的语种": "Sentez Metninin Dili",
"需要合成的语种": "Sentezlenmesi gereken dil",
"韩文": "Korece",
"韩英混合": "Korece-İngilizce Karışık",
"音频加载失败": "Ses Yüklenemedi",
"音频文件不存在,跳过:": "Ses Dosyası Bulunamadı, Atlanıyor: ",
"音频自动切分输入路径,可文件可文件夹": "Ses otomatik bölme giriş yolu, dosya veya klasör olabilir",
"预训练的GPT模型路径": "Ön eğitilmiş GPT model yolu",
"预训练的SSL模型路径": "Ön eğitilmiş SSL model yolu",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5已开启",
"UVR5进程输出信息": "UVR5进程输出信息",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例归一化后音频进来",
"batch_size": "batch_size",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)",
"max:归一化后最大值多少": "max:归一化后最大值多少",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept:切完后静音最多留多长",
@ -66,6 +67,9 @@
"中文": "中文",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "主参考音频(请上传3~10秒内参考音频超过会报错)",
"主参考音频的文本": "主参考音频的文本",
"主参考音频的语种": "主参考音频的语种",
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
"人声伴奏分离批量处理, 使用UVR5模型。": "人声伴奏分离批量处理, 使用UVR5模型。",
"人声提取激进程度": "人声提取激进程度",
@ -73,13 +77,24 @@
"以下模型不存在:": "以下模型不存在:",
"伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。",
"保存频率save_every_epoch": "保存频率save_every_epoch",
"保持随机": "保持随机",
"关闭TTS推理WebUI": "关闭TTS推理WebUI",
"关闭UVR5-WebUI": "关闭UVR5-WebUI",
"关闭打标WebUI": "关闭打标WebUI",
"凑50字一切": "凑50字一切",
"凑四句一切": "凑四句一切",
"分桶处理模式已关闭": "分桶处理模式已关闭",
"分桶处理模式已开启": "分桶处理模式已开启",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "分段返回模式不支持分桶处理,已自动关闭分桶处理",
"分段返回模式已开启": "分段返回模式已开启",
"分段间隔(秒)": "分段间隔(秒)",
"分段间隔过小已自动设置为0.01": "分段间隔过小已自动设置为0.01",
"切分": "切分",
"切分后文本": "切分后文本",
"切分后的子音频的输出根目录": "切分后的子音频的输出根目录",
"切分文本": "切分文本",
"切割使用的进程数": "切割使用的进程数",
"刷新模型路径": "刷新模型路径",
"前端处理后的文本(每句):": "前端处理后的文本(每句):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。",
"合成语音": "合成语音",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "后续将支持转音素、手工修改音素、语音合成分步执行。",
"启用并行推理版本": "启用并行推理版本",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。",
"多语种混合": "多语种混合",
"多语种混合(粤语)": "多语种混合(粤语)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "实际输入的目标文本(每句):",
"实际输入的目标文本:": "实际输入的目标文本:",
"导出文件格式": "导出文件格式",
"并行推理": "并行推理",
"并行推理模式已关闭": "并行推理模式已关闭",
"并行推理模式已开启": "并行推理模式已开启",
"开启GPT训练": "开启GPT训练",
"开启SSL提取": "开启SSL提取",
"开启SoVITS训练": "开启SoVITS训练",
@ -107,6 +127,7 @@
"开启打标WebUI": "开启打标WebUI",
"开启文本获取": "开启文本获取",
"开启无参考文本模式。不填参考文本亦相当于开启。": "开启无参考文本模式。不填参考文本亦相当于开启。",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。",
"开启离线批量ASR": "开启离线批量ASR",
"开启语义token提取": "开启语义token提取",
"开启语音切割": "开启语音切割",
@ -122,7 +143,11 @@
"按中文句号。切": "按中文句号。切",
"按标点符号切": "按标点符号切",
"按英文句号.切": "按英文句号.切",
"推理设置": "推理设置",
"提取文本Bert特征": "提取文本Bert特征",
"数据分桶(并行推理时会降低一点计算量)": "数据分桶(并行推理时会降低一点计算量)",
"数据类型精度": "数据类型精度",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。",
"文本模块学习率权重": "文本模块学习率权重",
"文本进程输出信息": "文本进程输出信息",
"施工中,请静候佳音": "施工中,请静候佳音",
@ -146,6 +171,7 @@
"终止SSL提取进程": "终止SSL提取进程",
"终止SoVITS训练": "终止SoVITS训练",
"终止一键三连": "终止一键三连",
"终止合成": "终止合成",
"终止文本获取进程": "终止文本获取进程",
"终止语义token提取进程": "终止语义token提取进程",
"终止语音切割": "终止语音切割",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "语义token提取进程输出信息",
"语速": "语速",
"语速调整,高为更快": "语速调整,高为更快",
"语速调节不支持分桶处理,已自动关闭分桶处理": "语速调节不支持分桶处理,已自动关闭分桶处理",
"语音切割进程输出信息": "语音切割进程输出信息",
"语音降噪进程输出信息": "语音降噪进程输出信息",
"请上传3~10秒内参考音频超过会报错": "请上传3~10秒内参考音频超过会报错",
@ -166,9 +193,11 @@
"请填入正确的List路径": "请填入正确的List路径",
"请填入正确的音频文件夹路径": "请填入正确的音频文件夹路径",
"请输入有效文本": "请输入有效文本",
"路径不存在,使用默认配置": "路径不存在,使用默认配置",
"路径不能为空": "路径不能为空",
"路径错误": "路径错误",
"转换": "转换",
"辅参考音频(可选多个,或不选)": "辅参考音频(可选多个,或不选)",
"输入待处理音频文件夹路径": "输入待处理音频文件夹路径",
"输入文件夹路径": "输入文件夹路径",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "输出logs/实验名目录下应有23456开头的文件和文件夹",
@ -176,13 +205,19 @@
"输出文件夹路径": "输出文件夹路径",
"输出的语音": "输出的语音",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试",
"重复惩罚": "重复惩罚",
"降噪结果输出文件夹": "降噪结果输出文件夹",
"降噪音频文件输入文件夹": "降噪音频文件输入文件夹",
"随机种子": "随机种子",
"需要合成的切分前文本": "需要合成的切分前文本",
"需要合成的文本": "需要合成的文本",
"需要合成的文本的语种": "需要合成的文本的语种",
"需要合成的语种": "需要合成的语种",
"韩文": "韩文",
"韩英混合": "韩英混合",
"音频加载失败": "音频加载失败",
"音频文件不存在,跳过:": "音频文件不存在,跳过:",
"音频自动切分输入路径,可文件可文件夹": "音频自动切分输入路径,可文件可文件夹",
"预训练的GPT模型路径": "预训练的GPT模型路径",
"预训练的SSL模型路径": "预训练的SSL模型路径",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5已開啟",
"UVR5进程输出信息": "UVR5進程輸出信息",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
"batch_size": "批次大小",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
"max:归一化后最大值多少": "max:歸一化後最大值多少",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept:切完後靜音最多留多長",
@ -66,6 +67,9 @@
"中文": "中文",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中文教程文檔https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音頻請上傳3~10秒內參考音頻超過會報錯",
"主参考音频的文本": "主參考音頻的文本",
"主参考音频的语种": "主參考音頻的語種",
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
"人声伴奏分离批量处理, 使用UVR5模型。": "人聲伴奏分離批量處理, 使用UVR5模型。",
"人声提取激进程度": "人聲提取激進程度",
@ -73,13 +77,24 @@
"以下模型不存在:": "以下模型不存在:",
"伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "使用無參考文本模式時建議使用微調的GPT聽不清參考音頻說的是啥不知道寫啥可以開啟開啟後無視填寫的參考文本。",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "使用無參考文本模式時建議使用微調的GPT聽不清參考音頻說的啥不曉得寫啥可以開開啟後無視填寫的參考文本。",
"保存频率save_every_epoch": "保存頻率save_every_epoch",
"保持随机": "保持隨機",
"关闭TTS推理WebUI": "關閉 TTS Inference WebUI",
"关闭UVR5-WebUI": "關閉 UVR5-WebUI",
"关闭打标WebUI": "關閉 Labeling WebUI",
"凑50字一切": "湊50字一切",
"凑四句一切": "湊四句一切",
"分桶处理模式已关闭": "分桶處理模式已關閉",
"分桶处理模式已开启": "分桶處理模式已開啟",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "分段返回模式不支持分桶處理,已自動關閉分桶處理",
"分段返回模式已开启": "分段返回模式已開啟",
"分段间隔(秒)": "分段間隔(秒)",
"分段间隔过小已自动设置为0.01": "分段間隔過小已自動設置為0.01",
"切分": "切分",
"切分后文本": "切分後文本",
"切分后的子音频的输出根目录": "切分後的子音頻的輸出根目錄",
"切分文本": "切分文本",
"切割使用的进程数": "切割使用的進程數",
"刷新模型路径": "刷新模型路徑",
"前端处理后的文本(每句):": "前端處理後的文本(每句):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
"合成语音": "合成語音",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試樣例(去文件管理器地址欄拷就行了)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支持轉音素、手工修改音素、語音合成分步執行。",
"启用并行推理版本": "啟用並行推理版本",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "填切割後音頻所在目錄!讀取的音頻文件完整路徑=該目錄-拼接-list文件裡波形對應的文件名不是全路徑。如果留空則使用.list文件裡的絕對全路徑。",
"多语种混合": "多語種混合",
"多语种混合(粤语)": "多語種混合 (粵語)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
"实际输入的目标文本:": "實際輸入的目標文本:",
"导出文件格式": "導出檔格式",
"并行推理": "並行推理",
"并行推理模式已关闭": "並行推理模式已關閉",
"并行推理模式已开启": "並行推理模式已開啟",
"开启GPT训练": "開啟GPT訓練",
"开启SSL提取": "開啟SSL提取",
"开启SoVITS训练": "開啟SoVITS訓練",
@ -107,6 +127,7 @@
"开启打标WebUI": "開啟 Labeling WebUI",
"开启文本获取": "開啟文本獲取",
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "開啟無參考文本模式。不填參考文本亦相當於開啟。v3暫不支持該模式使用了會報錯。",
"开启离线批量ASR": "開啟離線批量ASR",
"开启语义token提取": "開啟語義token提取",
"开启语音切割": "開啟語音切割",
@ -122,7 +143,11 @@
"按中文句号。切": "按中文句號。切",
"按标点符号切": "按標點符號切",
"按英文句号.切": "按英文句號.切",
"推理设置": "推理設置",
"提取文本Bert特征": "提取文本BERT特徵",
"数据分桶(并行推理时会降低一点计算量)": "數據分桶(並行推理時會降低一點計算量)",
"数据类型精度": "數據類型精度",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "文本切分工具。太長的文本合成出來效果不一定好,所以太長建議先切。合成會根據文本的換行分開合成再拼起來。",
"文本模块学习率权重": "文本模塊學習率權重",
"文本进程输出信息": "文本進程輸出信息",
"施工中,请静候佳音": "施工中,請靜候佳音",
@ -146,6 +171,7 @@
"终止SSL提取进程": "終止SSL提取進程",
"终止SoVITS训练": "終止SoVITS訓練",
"终止一键三连": "終止一鍵三連",
"终止合成": "終止合成",
"终止文本获取进程": "終止文本獲取進程",
"终止语义token提取进程": "終止語義token提取進程",
"终止语音切割": "終止語音切割",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "語義token提取進程輸出信息",
"语速": "語速",
"语速调整,高为更快": "調整語速,高為更快",
"语速调节不支持分桶处理,已自动关闭分桶处理": "語速調節不支持分桶處理,已自動關閉分桶處理",
"语音切割进程输出信息": "語音切割進程輸出信息",
"语音降噪进程输出信息": "語音降噪進程輸出信息",
"请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯",
@ -166,9 +193,11 @@
"请填入正确的List路径": "請填寫正確的列表路徑",
"请填入正确的音频文件夹路径": "請填寫正確的音頻文件夾路徑",
"请输入有效文本": "請輸入有效文本",
"路径不存在,使用默认配置": "路徑不存在,使用默認配置",
"路径不能为空": "路徑不應該為空",
"路径错误": "路徑錯誤",
"转换": "轉換",
"辅参考音频(可选多个,或不选)": "輔參考音頻(可選多個,或不選)",
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
"输入文件夹路径": "輸入文件夾路徑",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "輸出logs/實驗名目錄下應有23456開頭的文件和文件夾",
@ -176,13 +205,19 @@
"输出文件夹路径": "輸出文件夾路徑",
"输出的语音": "輸出的語音",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模體驗5秒Zero Shot TTS用。",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
"重复惩罚": "重複懲罰",
"降噪结果输出文件夹": "降噪結果輸出文件夾",
"降噪音频文件输入文件夹": "降噪音頻文件輸入文件夾",
"随机种子": "隨機種子",
"需要合成的切分前文本": "需要合成的切分前文本",
"需要合成的文本": "需要合成的文本",
"需要合成的文本的语种": "需要合成的文本的語種",
"需要合成的语种": "需要合成的語種",
"韩文": "韓文",
"韩英混合": "韓英混合",
"音频加载失败": "無法加載音頻",
"音频文件不存在,跳过:": "音頻檔案不存在,跳過:",
"音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
"预训练的GPT模型路径": "預訓練的GPT模型路徑",
"预训练的SSL模型路径": "預訓練的SSL模型路徑",

View File

@ -1,6 +1,6 @@
{
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):對於雙通道混響是最好的選擇,不能去除單通道混響;",
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底DeReverb额外去除混响可去除单声道混响但是对高频重的板式混响去不干净。": "(234)DeEcho: Aggressive 比 Normal 去除得更徹底DeReverb 額外去除混響,可去除單聲道混響,但是對高頻重的板式混響去不乾淨。",
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb)對於雙通道混響是最好的選擇,不能去除單通道混響;",
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底DeReverb额外去除混响可去除单声道混响但是对高频重的板式混响去不干净。": "(234)DeEcho:去除延遲效果。Aggressive 比 Normal 去除得更徹底DeReverb 額外去除混響,可去除單聲道混響,但是對高頻重的板式混響去不乾淨。",
"*GPT模型列表": "*GPT模型列表",
"*SoVITS模型列表": "*SoVITS模型列表",
"*实验/模型名": "*實驗/模型名",
@ -52,6 +52,7 @@
"UVR5已开启": "UVR5已開啟",
"UVR5进程输出信息": "UVR5進程輸出資訊",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
"batch_size": "批次大小",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
"max:归一化后最大值多少": "max:歸一化後最大值多少",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept:切完後靜音最多留多長",
@ -66,6 +67,9 @@
"中文": "中文",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中文教程文檔https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音檔請上傳3~10秒內參考音檔超過會報錯",
"主参考音频的文本": "主參考音檔的文本",
"主参考音频的语种": "主參考音檔的語種",
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
"人声伴奏分离批量处理, 使用UVR5模型。": "人聲伴奏分離批量處理, 使用UVR5模型。",
"人声提取激进程度": "人聲提取激進程度",
@ -73,13 +77,24 @@
"以下模型不存在:": "以下模型不存在",
"伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "使用無參考文本模式時建議使用微調的GPT聽不清參考音頻說的啥(不曉得寫啥)可以開,開啟後無視填寫的參考文本。",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "使用無參考文本模式時建議使用微調的GPT聽不清參考音檔說的啥不曉得寫啥可以開開啟後無視填寫的參考文本。",
"保存频率save_every_epoch": "保存頻率save_every_epoch",
"保持随机": "保持隨機",
"关闭TTS推理WebUI": "關閉 TTS Inference WebUI",
"关闭UVR5-WebUI": "關閉 UVR5-WebUI",
"关闭打标WebUI": "關閉 Labeling WebUI",
"凑50字一切": "湊50字一切",
"凑四句一切": "湊四句一切",
"分桶处理模式已关闭": "分桶處理模式已關閉",
"分桶处理模式已开启": "分桶處理模式已開啟",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "分段返回模式不支援分桶處理,已自動關閉分桶處理",
"分段返回模式已开启": "分段返回模式已開啟",
"分段间隔(秒)": "分段間隔(秒)",
"分段间隔过小已自动设置为0.01": "分段間隔過小已自動設置為0.01",
"切分": "切分",
"切分后文本": "切分後文本",
"切分后的子音频的输出根目录": "切分後的子音頻的輸出根目錄",
"切分文本": "切分文本",
"切割使用的进程数": "切割使用的進程數",
"刷新模型路径": "刷新模型路徑",
"前端处理后的文本(每句):": "前端處理後的文本(每句):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
"合成语音": "合成語音",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支援轉音素、手工修改音素、語音合成分步執行。",
"启用并行推理版本": "啟用並行推理版本",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "填切割後音頻所在目錄!讀取的音頻檔案完整路徑=該目錄-拼接-list檔案裡波形對應的檔案名不是全路徑。如果留空則使用.list檔案裡的絕對全路徑。",
"多语种混合": "多語種混合",
"多语种混合(粤语)": "多語種混合 (粵語)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
"实际输入的目标文本:": "實際輸入的目標文本:",
"导出文件格式": "導出檔格式",
"并行推理": "並行推理",
"并行推理模式已关闭": "並行推理模式已關閉",
"并行推理模式已开启": "並行推理模式已開啟",
"开启GPT训练": "開啟GPT訓練",
"开启SSL提取": "開啟SSL提取",
"开启SoVITS训练": "開啟SoVITS訓練",
@ -107,6 +127,7 @@
"开启打标WebUI": "開啟 Labeling WebUI",
"开启文本获取": "開啟文本獲取",
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "開啟無參考文本模式。不填參考文本亦相當於開啟。v3暫不支援該模式使用了會報錯。",
"开启离线批量ASR": "開啟離線批量ASR",
"开启语义token提取": "開啟語義token提取",
"开启语音切割": "開啟語音切割",
@ -122,7 +143,11 @@
"按中文句号。切": "按中文句號。切",
"按标点符号切": "按標點符號切",
"按英文句号.切": "按英文句號.切",
"推理设置": "推理設定",
"提取文本Bert特征": "提取文本BERT特徵",
"数据分桶(并行推理时会降低一点计算量)": "資料分桶(並行推理時會降低一點計算量)",
"数据类型精度": "數據類型精度",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "文本切分工具。太長的文本合成出來效果不一定好,所以太長建議先切。合成會根據文本的換行分開合成再拼起來。",
"文本模块学习率权重": "文本模塊學習率權重",
"文本进程输出信息": "文本進程輸出資訊",
"施工中,请静候佳音": "施工中,請靜候佳音",
@ -146,6 +171,7 @@
"终止SSL提取进程": "終止SSL提取進程",
"终止SoVITS训练": "終止SoVITS訓練",
"终止一键三连": "終止一鍵三連",
"终止合成": "終止合成",
"终止文本获取进程": "終止文本獲取進程",
"终止语义token提取进程": "終止語義token提取進程",
"终止语音切割": "終止語音切割",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "語義token提取進程輸出資訊",
"语速": "語速",
"语速调整,高为更快": "調整語速,高為更快",
"语速调节不支持分桶处理,已自动关闭分桶处理": "語速調節不支援分桶處理,已自動關閉分桶處理",
"语音切割进程输出信息": "語音切割進程輸出資訊",
"语音降噪进程输出信息": "語音降噪進程輸出資訊",
"请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯",
@ -166,9 +193,11 @@
"请填入正确的List路径": "請填寫正確的列表路徑",
"请填入正确的音频文件夹路径": "請填寫正確的音頻文件夾路徑",
"请输入有效文本": "請輸入有效文本",
"路径不存在,使用默认配置": "路徑不存在,使用預設配置",
"路径不能为空": "路徑不應該為空",
"路径错误": "路徑錯誤",
"转换": "轉換",
"辅参考音频(可选多个,或不选)": "輔參考音檔(可選多個,或不選)",
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
"输入文件夹路径": "輸入文件夾路徑",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "輸出logs/實驗名目錄下應有23456開頭的文件和文件夾",
@ -176,13 +205,19 @@
"输出文件夹路径": "輸出文件夾路徑",
"输出的语音": "輸出的語音",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模體驗5秒Zero Shot TTS用。",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
"重复惩罚": "重複懲罰",
"降噪结果输出文件夹": "降噪結果輸出文件夾",
"降噪音频文件输入文件夹": "降噪音頻文件輸入文件夾",
"随机种子": "隨機種子",
"需要合成的切分前文本": "需要合成的切分前文本",
"需要合成的文本": "需要合成的文本",
"需要合成的文本的语种": "需要合成的文本的語種",
"需要合成的语种": "需要合成的語種",
"韩文": "韓文",
"韩英混合": "韓英混合",
"音频加载失败": "無法加載音頻",
"音频文件不存在,跳过:": "音檔不存在,跳過:",
"音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
"预训练的GPT模型路径": "預訓練的GPT模型路徑",
"预训练的SSL模型路径": "預訓練的SSL模型路徑",

View File

@ -52,6 +52,7 @@
"UVR5已开启": "UVR5已開啟",
"UVR5进程输出信息": "UVR5進程輸出資訊",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
"batch_size": "批次大小",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
"max:归一化后最大值多少": "max:歸一化後最大值多少",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept:切完後靜音最多留多長",
@ -66,6 +67,9 @@
"中文": "中文",
"中文教程文档https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中文教程文檔https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
"中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音檔請上傳3~10秒內參考音檔超過會報錯",
"主参考音频的文本": "主參考音檔的文字",
"主参考音频的语种": "主參考音檔的語種",
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
"人声伴奏分离批量处理, 使用UVR5模型。": "人聲伴奏分離批量處理, 使用UVR5模型。",
"人声提取激进程度": "人聲提取激進程度",
@ -73,13 +77,24 @@
"以下模型不存在:": "#以下模型不存在",
"伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "使用無參考文本模式時建議使用微調的GPT聽不清參考音頻說的啥(不曉得寫啥)可以開,開啟後無視填寫的參考文本。",
"使用无参考文本模式时建议使用微调的GPT听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "使用無參考文字模式時建議使用微調的GPT聽不清參考音檔說的啥不曉得寫啥可以開開啟後無視填寫的參考文字。",
"保存频率save_every_epoch": "保存頻率save_every_epoch",
"保持随机": "保持隨機",
"关闭TTS推理WebUI": "關閉 TTS Inference WebUI",
"关闭UVR5-WebUI": "關閉 UVR5-WebUI",
"关闭打标WebUI": "關閉 Labeling WebUI",
"凑50字一切": "湊50字一切",
"凑四句一切": "湊四句一切",
"分桶处理模式已关闭": "分桶處理模式已關閉",
"分桶处理模式已开启": "分桶處理模式已開啟",
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "分段返回模式不支援分桶處理,已自動關閉分桶處理",
"分段返回模式已开启": "分段返回模式已開啟",
"分段间隔(秒)": "分段間隔(秒)",
"分段间隔过小已自动设置为0.01": "分段間隔過小已自動設定為0.01",
"切分": "切分",
"切分后文本": "切分後文字",
"切分后的子音频的输出根目录": "切分後的子音頻的輸出根目錄",
"切分文本": "切分文字",
"切割使用的进程数": "切割使用的進程數",
"刷新模型路径": "刷新模型路徑",
"前端处理后的文本(每句):": "前端處理後的文本(每句):",
@ -90,6 +105,8 @@
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
"合成语音": "合成語音",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支援轉音素、手工修改音素、語音合成分步執行。",
"启用并行推理版本": "啟用並行推理版本",
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名不是全路径。如果留空则使用.list文件里的绝对全路径。": "填切割後音頻所在目錄!讀取的音頻檔案完整路徑=該目錄-拼接-list檔案裡波形對應的檔案名不是全路徑。如果留空則使用.list檔案裡的絕對全路徑。",
"多语种混合": "多語種混合",
"多语种混合(粤语)": "多語種混合 (粵語)",
@ -98,6 +115,9 @@
"实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
"实际输入的目标文本:": "實際輸入的目標文本:",
"导出文件格式": "導出檔格式",
"并行推理": "並行推理",
"并行推理模式已关闭": "並行推理模式已關閉",
"并行推理模式已开启": "並行推理模式已開啟",
"开启GPT训练": "開啟GPT訓練",
"开启SSL提取": "開啟SSL提取",
"开启SoVITS训练": "開啟SoVITS訓練",
@ -107,6 +127,7 @@
"开启打标WebUI": "開啟 Labeling WebUI",
"开启文本获取": "開啟文本獲取",
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
"开启无参考文本模式。不填参考文本亦相当于开启。v3暂不支持该模式使用了会报错。": "開啟無參考文字模式。不填參考文字亦相當於開啟。v3暫不支援該模式使用了會報錯。",
"开启离线批量ASR": "開啟離線批量ASR",
"开启语义token提取": "開啟語義token提取",
"开启语音切割": "開啟語音切割",
@ -122,7 +143,11 @@
"按中文句号。切": "按中文句號。切",
"按标点符号切": "按標點符號切",
"按英文句号.切": "按英文句號.切",
"推理设置": "推理設定",
"提取文本Bert特征": "提取文字BERT特徵",
"数据分桶(并行推理时会降低一点计算量)": "資料分桶(並行推理時會降低一點計算量)",
"数据类型精度": "數據類型精度",
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "文字切分工具。太長的文字合成出來效果不一定好,所以太長建議先切。合成會根據文字的換行分開合成再拼起來。",
"文本模块学习率权重": "文本模塊學習率權重",
"文本进程输出信息": "文本進程輸出資訊",
"施工中,请静候佳音": "施工中,請靜候佳音",
@ -146,6 +171,7 @@
"终止SSL提取进程": "終止SSL提取進程",
"终止SoVITS训练": "終止SoVITS訓練",
"终止一键三连": "終止一鍵三連",
"终止合成": "終止合成",
"终止文本获取进程": "終止文本獲取進程",
"终止语义token提取进程": "終止語義token提取進程",
"终止语音切割": "終止語音切割",
@ -158,6 +184,7 @@
"语义token提取进程输出信息": "語義token提取進程輸出資訊",
"语速": "語速",
"语速调整,高为更快": "調整語速,高為更快",
"语速调节不支持分桶处理,已自动关闭分桶处理": "語速調節不支援分桶處理,已自動關閉分桶處理",
"语音切割进程输出信息": "語音切割進程輸出資訊",
"语音降噪进程输出信息": "語音降噪進程輸出資訊",
"请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯",
@ -166,9 +193,11 @@
"请填入正确的List路径": "請填寫正確的列表路徑",
"请填入正确的音频文件夹路径": "請填寫正確的音頻文件夾路徑",
"请输入有效文本": "請輸入有效文本",
"路径不存在,使用默认配置": "路徑不存在,使用預設配置",
"路径不能为空": "路徑不應該為空",
"路径错误": "路徑錯誤",
"转换": "轉換",
"辅参考音频(可选多个,或不选)": "輔參考音檔(可選多個,或不選)",
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
"输入文件夹路径": "輸入文件夾路徑",
"输出logs/实验名目录下应有23456开头的文件和文件夹": "輸出logs/實驗名目錄下應有23456開頭的文件和文件夾",
@ -176,13 +205,19 @@
"输出文件夹路径": "輸出文件夾路徑",
"输出的语音": "輸出的語音",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模體驗5秒Zero Shot TTS用。",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
"重复惩罚": "重複懲罰",
"降噪结果输出文件夹": "降噪結果輸出文件夾",
"降噪音频文件输入文件夹": "降噪音頻文件輸入文件夾",
"随机种子": "隨機種子",
"需要合成的切分前文本": "需要合成的切分前文字",
"需要合成的文本": "需要合成的文本",
"需要合成的文本的语种": "需要合成的文字語種",
"需要合成的语种": "需要合成的語種",
"韩文": "韓文",
"韩英混合": "韓英混合",
"音频加载失败": "無法加載音頻",
"音频文件不存在,跳过:": "音檔不存在,跳過:",
"音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
"预训练的GPT模型路径": "預訓練的GPT模型路徑",
"预训练的SSL模型路径": "預訓練的SSL模型路徑",

View File

@ -37,16 +37,21 @@ def scan_i18n_strings():
strings = []
print(" Scanning Files and Extracting i18n Strings ".center(TITLE_LEN, "="))
for filename in glob.iglob("**/*.py", recursive=True):
with open(filename, "r", encoding="utf-8") as f:
code = f.read()
if "I18nAuto" in code:
tree = ast.parse(code)
i18n_strings = extract_i18n_strings(tree)
print(f"{filename.ljust(30)}: {len(i18n_strings)}")
strings.extend(i18n_strings)
try:
with open(filename, "r", encoding="utf-8") as f:
code = f.read()
if "I18nAuto" in code:
tree = ast.parse(code)
i18n_strings = extract_i18n_strings(tree)
print(f"{filename.ljust(KEY_LEN*3//2)}: {len(i18n_strings)}")
if SHOW_KEYS:
print("\n".join([s for s in i18n_strings]))
strings.extend(i18n_strings)
except Exception as e:
print(f"\033[31m[Failed] Error occur at {filename}: {e}\033[0m")
code_keys = set(strings)
print(f"{'Total Unique'.ljust(30)}: {len(code_keys)}")
print(f"{'Total Unique'.ljust(KEY_LEN*3//2)}: {len(code_keys)}")
return code_keys
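# Editorial sketch (not the repo's exact helper): extract_i18n_strings above is assumed to
# walk the parsed AST and collect the literal keys passed to i18n(...); a minimal version:
import ast

def extract_i18n_strings_sketch(tree: ast.AST) -> list:
    found = []
    for node in ast.walk(tree):
        if (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Name)
            and node.func.id == "i18n"
            and node.args
            and isinstance(node.args[0], ast.Constant)
            and isinstance(node.args[0].value, str)
        ):
            found.append(node.args[0].value)
    return found

# e.g. extract_i18n_strings_sketch(ast.parse('i18n("语速")')) == ["语速"]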
def update_i18n_json(json_file, standard_keys):
@ -74,7 +79,7 @@ def update_i18n_json(json_file, standard_keys):
# Identify redundant keys and delete them
diff_keys = set(json_data.keys()) - set(standard_keys)
if len(diff_keys) > 0:
print(f"{'Unused Keys (-)'.ljust(KEY_LEN)}: {len(diff_keys)}")
print(f"{'Unused Keys (-)'.ljust(KEY_LEN)}: {len(diff_keys)}")
for key in diff_keys:
del json_data[key]
if SHOW_KEYS:
@ -107,7 +112,7 @@ def update_i18n_json(json_file, standard_keys):
for value, keys in duplicate_items.items():
if len(keys) > 1:
print("\n".join([f"\033[31m{'[Failed] Duplicate Value'.ljust(KEY_LEN)}: {key} -> {value}\033[0m" for key in keys]))
if num_miss_translation > 0:
print(f"\033[31m{'[Failed] Missing Translation'.ljust(KEY_LEN)}: {num_miss_translation}\033[0m")
else:

120
webui.py
View File

@ -39,8 +39,9 @@ for site_packages_root in site_packages_roots:
try:
with open("%s/users.pth" % (site_packages_root), "w") as f:
f.write(
"%s\n%s/tools\n%s/tools/asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
% (now_dir, now_dir, now_dir, now_dir, now_dir)
# "%s\n%s/runtime\n%s/tools\n%s/tools/asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
"%s\n%s/GPT_SoVITS/BigVGAN\n%s/tools\n%s/tools/asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
% (now_dir, now_dir, now_dir, now_dir, now_dir, now_dir)
)
break
except PermissionError as e:
@ -72,7 +73,7 @@ mem = []
if_gpu_ok = False
# Check whether an NVIDIA GPU usable for training and accelerated inference is available
ok_gpu_keywords={"10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060","H","600"}
ok_gpu_keywords={"10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060","H","600","506","507","508","509"}
set_gpu_numbers=set()
if torch.cuda.is_available() or ngpu != 0:
for i in range(ngpu):
@ -89,14 +90,33 @@ if torch.cuda.is_available() or ngpu != 0:
# gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # in practice, using system memory as VRAM does not run out of memory
if if_gpu_ok and len(gpu_infos) > 0:
gpu_info = "\n".join(gpu_infos)
default_batch_size = min(mem) // 2
else:
gpu_info = ("%s\t%s" % ("0", "CPU"))
gpu_infos.append("%s\t%s" % ("0", "CPU"))
set_gpu_numbers.add(0)
default_batch_size = int(psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2)
def set_default():
global default_batch_size,default_max_batch_size,gpu_info,default_sovits_epoch,default_sovits_save_every_epoch,max_sovits_epoch,max_sovits_save_every_epoch,default_batch_size_s1
if if_gpu_ok and len(gpu_infos) > 0:
gpu_info = "\n".join(gpu_infos)
minmem = min(mem)
default_batch_size = minmem // 2 if version!="v3"else minmem//14
default_batch_size_s1=minmem // 2
else:
gpu_info = ("%s\t%s" % ("0", "CPU"))
gpu_infos.append("%s\t%s" % ("0", "CPU"))
set_gpu_numbers.add(0)
default_batch_size = default_batch_size_s1=int(psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2)
if version!="v3":
default_sovits_epoch=8
default_sovits_save_every_epoch=4
max_sovits_epoch=25
max_sovits_save_every_epoch=25
else:
default_sovits_epoch=2
default_sovits_save_every_epoch=1
max_sovits_epoch=6
max_sovits_save_every_epoch=6
default_max_batch_size=default_batch_size*3
set_default()
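# Editorial example of the defaults computed above, assuming a single 24 GB GPU (minmem = 24):
#   v1/v2: default_batch_size = 24 // 2  = 12,  default_max_batch_size = 36
#   v3:    default_batch_size = 24 // 14 = 1,   default_max_batch_size = 3
# i.e. v3 reserves far more VRAM per sample, and its SoVITS epoch limits are also much lower (6 vs 25).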
gpus = "-".join([i[0] for i in gpu_infos])
default_gpu_numbers=str(sorted(list(set_gpu_numbers))[0])
def fix_gpu_number(input):# force out-of-range GPU numbers back into the valid range
@ -112,10 +132,10 @@ def fix_gpu_numbers(inputs):
except:
return inputs
pretrained_sovits_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", "GPT_SoVITS/pretrained_models/s2G488k.pth"]
pretrained_gpt_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"]
pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth","GPT_SoVITS/pretrained_models/s2Gv3.pth"]
pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"]
pretrained_model_list = (pretrained_sovits_name[-int(version[-1])+2],pretrained_sovits_name[-int(version[-1])+2].replace("s2G","s2D"),pretrained_gpt_name[-int(version[-1])+2],"GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large","GPT_SoVITS/pretrained_models/chinese-hubert-base")
pretrained_model_list = (pretrained_sovits_name[int(version[-1])-1],pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D"),pretrained_gpt_name[int(version[-1])-1],"GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large","GPT_SoVITS/pretrained_models/chinese-hubert-base")
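# Editorial note: the pretrained-model lists above were re-ordered to [v1, v2, v3], so the
# lookup index changes from -int(version[-1])+2 (which indexed the old [v2, v1] ordering)
# to int(version[-1])-1. Quick check of the new mapping, assuming version is "v1"/"v2"/"v3":
for _v, _expected in (("v1", 0), ("v2", 1), ("v3", 2)):
    assert int(_v[-1]) - 1 == _expected  # "v1" -> 0, "v2" -> 1, "v3" -> 2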
_=''
for i in pretrained_model_list:
@ -125,15 +145,15 @@ if _:
print("warning:",i18n('以下模型不存在:')+_)
_ =[[],[]]
for i in range(2):
for i in range(3):
if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i])
else:_[0].append("")##没有下pretrained模型的说不定他们是想自己从零训底模呢
if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i])
else:_[-1].append("")
pretrained_gpt_name,pretrained_sovits_name = _
SoVITS_weight_root=["SoVITS_weights_v2","SoVITS_weights"]
GPT_weight_root=["GPT_weights_v2","GPT_weights"]
SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"]
GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"]
for root in SoVITS_weight_root+GPT_weight_root:
os.makedirs(root,exist_ok=True)
def get_weights_names():
@ -169,7 +189,7 @@ p_asr=None
p_denoise=None
p_tts_inference=None
def kill_proc_tree(pid, including_parent=True):
try:
parent = psutil.Process(pid)
except psutil.NoSuchProcess:
@ -195,7 +215,7 @@ def kill_process(pid):
os.system(cmd)
else:
kill_proc_tree(pid)
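# Editorial sketch: the hunk above only shows the start of kill_proc_tree. The usual psutil
# pattern it is assumed to follow (terminate children first, then optionally the parent):
import psutil

def kill_proc_tree_sketch(pid: int, including_parent: bool = True) -> None:
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    for child in parent.children(recursive=True):
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass
    if including_parent:
        try:
            parent.kill()
        except psutil.NoSuchProcess:
            pass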
def change_label(path_list):
global p_label
@ -229,6 +249,9 @@ def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits
cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language)
else:
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
##### v3 does not support accelerated inference yet
if version=="v3":
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
if(p_tts_inference==None):
os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
@ -311,7 +334,7 @@ def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_s
data=f.read()
data=json.loads(data)
s2_dir="%s/%s"%(exp_root,exp_name)
os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True)
os.makedirs("%s/logs_s2_%s"%(s2_dir,version),exist_ok=True)
if check_for_existance([s2_dir],is_train=True):
check_details([s2_dir],is_train=True)
if(is_half==False):
@ -328,13 +351,15 @@ def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_s
data["train"]["gpu_numbers"]=gpu_numbers1Ba
data["model"]["version"]=version
data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir
data["save_weight_dir"]=SoVITS_weight_root[-int(version[-1])+2]
data["save_weight_dir"]=SoVITS_weight_root[int(version[-1])-1]
data["name"]=exp_name
data["version"]=version
tmp_config_path="%s/tmp_s2.json"%tmp
with open(tmp_config_path,"w")as f:f.write(json.dumps(data))
cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
if version in ["v1","v2"]:
cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
else:
cmd = '"%s" GPT_SoVITS/s2_train_v3.py --config "%s"'%(python_exec,tmp_config_path)
yield "SoVITS训练开始%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True}
print(cmd)
p_train_SoVITS = Popen(cmd, shell=True)
@ -372,11 +397,11 @@ def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_
data["train"]["if_save_every_weights"]=if_save_every_weights
data["train"]["if_save_latest"]=if_save_latest
data["train"]["if_dpo"]=if_dpo
data["train"]["half_weights_save_dir"]=GPT_weight_root[-int(version[-1])+2]
data["train"]["half_weights_save_dir"]=GPT_weight_root[int(version[-1])-1]
data["train"]["exp_name"]=exp_name
data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
data["output_dir"]="%s/logs_s1"%s1_dir
data["output_dir"]="%s/logs_s1_%s"%(s1_dir,version)
# data["version"]=version
os.environ["_CUDA_VISIBLE_DEVICES"]=fix_gpu_numbers(gpu_numbers.replace("-",","))
@ -747,13 +772,14 @@ def close1abc():
return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
def switch_version(version_):
os.environ['version']=version_
os.environ["version"]=version_
global version
version = version_
if pretrained_sovits_name[-int(version[-1])+2] !='' and pretrained_gpt_name[-int(version[-1])+2] !='':...
else:
if pretrained_sovits_name[int(version[-1])-1] !='' and pretrained_gpt_name[int(version[-1])-1] !='':...
else:
gr.Warning(i18n(f'未下载{version.upper()}模型'))
return {'__type__':'update', 'value':pretrained_sovits_name[-int(version[-1])+2]}, {'__type__':'update', 'value':pretrained_sovits_name[-int(version[-1])+2].replace("s2G","s2D")}, {'__type__':'update', 'value':pretrained_gpt_name[-int(version[-1])+2]}, {'__type__':'update', 'value':pretrained_gpt_name[-int(version[-1])+2]}, {'__type__':'update', 'value':pretrained_sovits_name[-int(version[-1])+2]}
set_default()
return {'__type__':'update', 'value':pretrained_sovits_name[int(version[-1])-1]}, {'__type__':'update', 'value':pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D")}, {'__type__':'update', 'value':pretrained_gpt_name[int(version[-1])-1]}, {'__type__':'update', 'value':pretrained_gpt_name[int(version[-1])-1]}, {'__type__':'update', 'value':pretrained_sovits_name[int(version[-1])-1]},{'__type__':'update',"value":default_batch_size,"maximum":default_max_batch_size},{'__type__':'update',"value":default_sovits_epoch,"maximum":max_sovits_epoch},{'__type__':'update',"value":default_sovits_save_every_epoch,"maximum":max_sovits_save_every_epoch},{'__type__':'update',"interactive":True if version!="v3"else False}
if os.path.exists('GPT_SoVITS/text/G2PWModel'):...
else:
@ -796,7 +822,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500")
with gr.Row():
_max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True)
alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
with gr.Row():
n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
slicer_info = gr.Textbox(label=i18n("语音切割进程输出信息"))
@ -850,11 +876,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
choices = ["float32"],
interactive = True,
value="float32"
)
with gr.Row():
asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False)
def change_lang_choices(key): # adjust the selectable languages according to the chosen model
# return gr.Dropdown(choices=asr_dict[key]['lang'])
@ -878,7 +904,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
asr_model.change(change_size_choices, [asr_model], [asr_size])
asr_model.change(change_precision_choices, [asr_model], [asr_precision])
gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
with gr.Row():
with gr.Column(scale=3):
@ -889,7 +915,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
interactive=True,
)
label_info = gr.Textbox(label=i18n("打标工具进程输出信息"))
open_label = gr.Button(value=i18n("开启打标WebUI"),variant="primary",visible=True)
close_label = gr.Button(value=i18n("关闭打标WebUI"),variant="primary",visible=False)
open_label.click(change_label, [path_list], [label_info,open_label,close_label])
@ -902,11 +928,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Row():
exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
version_checkbox = gr.Radio(label=i18n("版本"),value=version,choices=['v1','v2'])
version_checkbox = gr.Radio(label=i18n("版本"),value=version,choices=['v1','v2','v3'])
with gr.Row():
pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value=pretrained_sovits_name[-int(version[-1])+2], interactive=True, lines=2, max_lines=3,scale=9)
pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value=pretrained_sovits_name[-int(version[-1])+2].replace("s2G","s2D"), interactive=True, lines=2, max_lines=3,scale=9)
pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value=pretrained_gpt_name[-int(version[-1])+2], interactive=True, lines=2, max_lines=3,scale=10)
pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value=pretrained_sovits_name[int(version[-1])-1], interactive=True, lines=2, max_lines=3,scale=9)
pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value=pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D"), interactive=True, lines=2, max_lines=3,scale=9)
pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value=pretrained_gpt_name[int(version[-1])-1], interactive=True, lines=2, max_lines=3,scale=10)
with gr.TabItem(i18n("1A-训练集格式化工具")):
gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹"))
with gr.Row():
@ -946,7 +972,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Row():
gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
with gr.Row():
pretrained_s2G_ = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value=pretrained_sovits_name[-int(version[-1])+2], interactive=False,lines=2)
pretrained_s2G_ = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value=pretrained_sovits_name[int(version[-1])-1], interactive=False,lines=2)
with gr.Row():
button1c_open = gr.Button(i18n("开启语义token提取"), variant="primary",visible=True)
button1c_close = gr.Button(i18n("终止语义token提取进程"), variant="primary",visible=False)
@ -981,11 +1007,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Row():
with gr.Column():
with gr.Row():
batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
total_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("总训练轮数total_epoch不建议太高"),value=8,interactive=True)
batch_size = gr.Slider(minimum=1,maximum=default_max_batch_size,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
total_epoch = gr.Slider(minimum=1,maximum=max_sovits_epoch,step=1,label=i18n("总训练轮数total_epoch不建议太高"),value=default_sovits_epoch,interactive=True)
with gr.Row():
text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True)
save_every_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True)
text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True if version!="v3"else False)#v3 not need
save_every_epoch = gr.Slider(minimum=1,maximum=max_sovits_save_every_epoch,step=1,label=i18n("保存频率save_every_epoch"),value=default_sovits_save_every_epoch,interactive=True)
with gr.Column():
with gr.Column():
if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
@ -1002,7 +1028,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Row():
with gr.Column():
with gr.Row():
batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size_s1,interactive=True)
total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True)
with gr.Row():
save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
@ -1035,7 +1061,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])
with gr.Row():
with gr.Row():
batched_infer_enabled = gr.Checkbox(label=i18n("启用并行推理版本(推理速度更快)"), value=False, interactive=True, show_label=True)
batched_infer_enabled = gr.Checkbox(label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True)
with gr.Row():
open_tts = gr.Button(value=i18n("开启TTS推理WebUI"),variant='primary',visible=True)
close_tts = gr.Button(value=i18n("关闭TTS推理WebUI"),variant='primary',visible=False)
@ -1043,7 +1069,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
tts_info = gr.Textbox(label=i18n("TTS推理WebUI进程输出信息"))
open_tts.click(change_tts_inference, [bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown, batched_infer_enabled], [tts_info,open_tts,close_tts])
close_tts.click(change_tts_inference, [bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown, batched_infer_enabled], [tts_info,open_tts,close_tts])
version_checkbox.change(switch_version,[version_checkbox],[pretrained_s2G,pretrained_s2D,pretrained_s1,GPT_dropdown,SoVITS_dropdown])
version_checkbox.change(switch_version,[version_checkbox],[pretrained_s2G,pretrained_s2D,pretrained_s1,GPT_dropdown,SoVITS_dropdown,batch_size,total_epoch,save_every_epoch,text_low_lr_rate])
with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("施工中,请静候佳音"))
app.queue().launch(#concurrency_count=511, max_size=1022
server_name="0.0.0.0",