feat: Migrate from CUDA to XPU for Intel GPU support

This commit migrates the project from using NVIDIA CUDA to Intel XPU for GPU acceleration, based on the PyTorch 2.9 release.

Key changes include:
- Replaced `torch.cuda` with `torch.xpu` for device checks, memory management, and distributed training.
- Updated device strings from "cuda" to "xpu" across the codebase.
- Switched the distributed training backend from "nccl" to "ccl" for Intel GPUs.
- Disabled custom CUDA kernels in the `BigVGAN` module by setting `use_cuda_kernel=False`.
- Updated `requirements.txt` to include `torch==2.9` and `intel-extension-for-pytorch`.
- Modified CI/CD pipelines and build scripts to remove CUDA dependencies and build for an XPU target.
This commit is contained in:
google-labs-jules[bot] 2025-11-10 13:09:27 +00:00
parent 11aa78bd9b
commit d3b8f7e09e
13 changed files with 1826 additions and 2393 deletions

View File

@ -15,11 +15,7 @@ on:
jobs:
build:
runs-on: windows-latest
strategy:
matrix:
torch_cuda: [cu124, cu128]
env:
TORCH_CUDA: ${{ matrix.torch_cuda }}
MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }}
MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }}
HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }}

View File

@ -18,68 +18,22 @@ jobs:
DATE=$(date +'%Y%m%d')
COMMIT=$(git rev-parse --short=6 HEAD)
echo "tag=${DATE}-${COMMIT}" >> $GITHUB_OUTPUT
build-amd64:
build-and-publish:
needs: generate-meta
runs-on: ubuntu-22.04
strategy:
matrix:
include:
- cuda_version: 12.6
lite: true
- lite: true
torch_base: lite
tag_prefix: cu126-lite
- cuda_version: 12.6
lite: false
tag_prefix: xpu-lite
- lite: false
torch_base: full
tag_prefix: cu126
- cuda_version: 12.8
lite: true
torch_base: lite
tag_prefix: cu128-lite
- cuda_version: 12.8
lite: false
torch_base: full
tag_prefix: cu128
tag_prefix: xpu
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Free up disk space
run: |
echo "Before cleanup:"
df -h
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /opt/hostedtoolcache/PyPy
sudo rm -rf /opt/hostedtoolcache/go
sudo rm -rf /opt/hostedtoolcache/node
sudo rm -rf /opt/hostedtoolcache/Ruby
sudo rm -rf /opt/microsoft
sudo rm -rf /opt/pipx
sudo rm -rf /opt/az
sudo rm -rf /opt/google
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/lib/google-cloud-sdk
sudo rm -rf /usr/lib/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /usr/local/julia1.11.5
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/local/share/chromium
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/share/miniconda
sudo rm -rf /usr/share/az_12.1.0
sudo rm -rf /usr/share/dotnet
echo "After cleanup:"
df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@ -89,188 +43,18 @@ jobs:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}
- name: Build and Push Docker Image (amd64)
- name: Build and Push Docker Image
uses: docker/build-push-action@v5
with:
context: .
file: ./Dockerfile
push: true
platforms: linux/amd64
platforms: linux/amd64,linux/arm64
build-args: |
LITE=${{ matrix.lite }}
TORCH_BASE=${{ matrix.torch_base }}
CUDA_VERSION=${{ matrix.cuda_version }}
WORKFLOW=true
tags: |
xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-amd64
xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-amd64
build-arm64:
needs: generate-meta
runs-on: ubuntu-22.04-arm
strategy:
matrix:
include:
- cuda_version: 12.6
lite: true
torch_base: lite
tag_prefix: cu126-lite
- cuda_version: 12.6
lite: false
torch_base: full
tag_prefix: cu126
- cuda_version: 12.8
lite: true
torch_base: lite
tag_prefix: cu128-lite
- cuda_version: 12.8
lite: false
torch_base: full
tag_prefix: cu128
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Free up disk space
run: |
echo "Before cleanup:"
df -h
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /opt/hostedtoolcache/PyPy
sudo rm -rf /opt/hostedtoolcache/go
sudo rm -rf /opt/hostedtoolcache/node
sudo rm -rf /opt/hostedtoolcache/Ruby
sudo rm -rf /opt/microsoft
sudo rm -rf /opt/pipx
sudo rm -rf /opt/az
sudo rm -rf /opt/google
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/lib/google-cloud-sdk
sudo rm -rf /usr/lib/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /usr/local/julia1.11.5
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/local/share/chromium
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/share/miniconda
sudo rm -rf /usr/share/az_12.1.0
sudo rm -rf /usr/share/dotnet
echo "After cleanup:"
df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}
- name: Build and Push Docker Image (arm64)
uses: docker/build-push-action@v5
with:
context: .
file: ./Dockerfile
push: true
platforms: linux/arm64
build-args: |
LITE=${{ matrix.lite }}
TORCH_BASE=${{ matrix.torch_base }}
CUDA_VERSION=${{ matrix.cuda_version }}
WORKFLOW=true
tags: |
xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-arm64
xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-arm64
merge-and-clean:
needs:
- build-amd64
- build-arm64
- generate-meta
runs-on: ubuntu-latest
strategy:
matrix:
include:
- tag_prefix: cu126-lite
- tag_prefix: cu126
- tag_prefix: cu128-lite
- tag_prefix: cu128
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}
- name: Merge amd64 and arm64 into multi-arch image
run: |
DATE_TAG=${{ needs.generate-meta.outputs.tag }}
TAG_PREFIX=${{ matrix.tag_prefix }}
docker buildx imagetools create \
--tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG} \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-amd64 \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-arm64
docker buildx imagetools create \
--tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX} \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-amd64 \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-arm64
- name: Delete old platform-specific tags via Docker Hub API
env:
DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }}
DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_PASSWORD }}
TAG_PREFIX: ${{ matrix.tag_prefix }}
DATE_TAG: ${{ needs.generate-meta.outputs.tag }}
run: |
sudo apt-get update && sudo apt-get install -y jq
TOKEN=$(curl -s -u $DOCKER_HUB_USERNAME:$DOCKER_HUB_TOKEN \
"https://auth.docker.io/token?service=registry.docker.io&scope=repository:$DOCKER_HUB_USERNAME/gpt-sovits:pull,push,delete" \
| jq -r .token)
for PLATFORM in amd64 arm64; do
SAFE_PLATFORM=$(echo $PLATFORM | sed 's/\//-/g')
TAG="${TAG_PREFIX}-${DATE_TAG}-${SAFE_PLATFORM}"
LATEST_TAG="latest-${TAG_PREFIX}-${SAFE_PLATFORM}"
for DEL_TAG in "$TAG" "$LATEST_TAG"; do
echo "Deleting tag: $DEL_TAG"
curl -X DELETE -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/$DOCKER_HUB_USERNAME/gpt-sovits/manifests/$DEL_TAG
done
done
create-default:
runs-on: ubuntu-latest
needs:
- merge-and-clean
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}
- name: Create Default Tag
run: |
docker buildx imagetools create \
--tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-cu126-lite
xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}
xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}

View File

@ -58,9 +58,11 @@ def main():
parser.add_argument("--input_wavs_dir", default="test_files")
parser.add_argument("--output_dir", default="generated_files")
parser.add_argument("--checkpoint_file", required=True)
parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
# --use_cuda_kernel argument is removed to disable custom CUDA kernels.
# parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
a = parser.parse_args()
a.use_cuda_kernel = False
config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json")
with open(config_file) as f:
@ -72,9 +74,9 @@ def main():
torch.manual_seed(h.seed)
global device
if torch.cuda.is_available():
torch.cuda.manual_seed(h.seed)
device = torch.device("cuda")
if torch.xpu.is_available():
torch.xpu.manual_seed(h.seed)
device = torch.device("xpu")
else:
device = torch.device("cpu")

View File

@ -73,9 +73,11 @@ def main():
parser.add_argument("--input_mels_dir", default="test_mel_files")
parser.add_argument("--output_dir", default="generated_files_from_mel")
parser.add_argument("--checkpoint_file", required=True)
parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
# --use_cuda_kernel argument is removed to disable custom CUDA kernels.
# parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
a = parser.parse_args()
a.use_cuda_kernel = False
config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json")
with open(config_file) as f:
@ -87,9 +89,9 @@ def main():
torch.manual_seed(h.seed)
global device
if torch.cuda.is_available():
torch.cuda.manual_seed(h.seed)
device = torch.device("cuda")
if torch.xpu.is_available():
torch.xpu.manual_seed(h.seed)
device = torch.device("xpu")
else:
device = torch.device("cpu")

View File

@ -1,215 +0,0 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
import os
import sys
# to import modules from parent_dir
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_dir)
import torch
import json
from env import AttrDict
from bigvgan import BigVGAN
from time import time
from tqdm import tqdm
from meldataset import mel_spectrogram, MAX_WAV_VALUE
from scipy.io.wavfile import write
import numpy as np
import argparse
torch.backends.cudnn.benchmark = True
# For easier debugging
torch.set_printoptions(linewidth=200, threshold=10_000)
def generate_soundwave(duration=5.0, sr=24000):
t = np.linspace(0, duration, int(sr * duration), False, dtype=np.float32)
modulation = np.sin(2 * np.pi * t / duration)
min_freq = 220
max_freq = 1760
frequencies = min_freq + (max_freq - min_freq) * (modulation + 1) / 2
soundwave = np.sin(2 * np.pi * frequencies * t)
soundwave = soundwave / np.max(np.abs(soundwave)) * 0.95
return soundwave, sr
def get_mel(x, h):
return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print(f"Loading '{filepath}'")
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test script to check CUDA kernel correctness.")
parser.add_argument(
"--checkpoint_file",
type=str,
required=True,
help="Path to the checkpoint file. Assumes config.json exists in the directory.",
)
args = parser.parse_args()
config_file = os.path.join(os.path.split(args.checkpoint_file)[0], "config.json")
with open(config_file) as f:
config = f.read()
json_config = json.loads(config)
h = AttrDict({**json_config})
print("loading plain Pytorch BigVGAN")
generator_original = BigVGAN(h).to("cuda")
print("loading CUDA kernel BigVGAN with auto-build")
generator_cuda_kernel = BigVGAN(h, use_cuda_kernel=True).to("cuda")
state_dict_g = load_checkpoint(args.checkpoint_file, "cuda")
generator_original.load_state_dict(state_dict_g["generator"])
generator_cuda_kernel.load_state_dict(state_dict_g["generator"])
generator_original.remove_weight_norm()
generator_original.eval()
generator_cuda_kernel.remove_weight_norm()
generator_cuda_kernel.eval()
# define number of samples and length of mel frame to benchmark
num_sample = 10
num_mel_frame = 16384
# CUDA kernel correctness check
diff = 0.0
for i in tqdm(range(num_sample)):
# Random mel
data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
with torch.inference_mode():
audio_original = generator_original(data)
with torch.inference_mode():
audio_cuda_kernel = generator_cuda_kernel(data)
# Both outputs should be (almost) the same
test_result = (audio_original - audio_cuda_kernel).abs()
diff += test_result.mean(dim=-1).item()
diff /= num_sample
if diff <= 2e-3: # We can expect a small difference (~1e-3) which does not affect perceptual quality
print(
f"\n[Success] test CUDA fused vs. plain torch BigVGAN inference"
f"\n > mean_difference={diff}"
f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}"
f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}"
)
else:
print(
f"\n[Fail] test CUDA fused vs. plain torch BigVGAN inference"
f"\n > mean_difference={diff}"
f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}, "
f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}"
)
del data, audio_original, audio_cuda_kernel
# Variables for tracking total time and VRAM usage
toc_total_original = 0
toc_total_cuda_kernel = 0
vram_used_original_total = 0
vram_used_cuda_kernel_total = 0
audio_length_total = 0
# Measure Original inference in isolation
for i in tqdm(range(num_sample)):
torch.cuda.reset_peak_memory_stats(device="cuda")
data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
torch.cuda.synchronize()
tic = time()
with torch.inference_mode():
audio_original = generator_original(data)
torch.cuda.synchronize()
toc = time() - tic
toc_total_original += toc
vram_used_original_total += torch.cuda.max_memory_allocated(device="cuda")
del data, audio_original
torch.cuda.empty_cache()
# Measure CUDA kernel inference in isolation
for i in tqdm(range(num_sample)):
torch.cuda.reset_peak_memory_stats(device="cuda")
data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
torch.cuda.synchronize()
tic = time()
with torch.inference_mode():
audio_cuda_kernel = generator_cuda_kernel(data)
torch.cuda.synchronize()
toc = time() - tic
toc_total_cuda_kernel += toc
audio_length_total += audio_cuda_kernel.shape[-1]
vram_used_cuda_kernel_total += torch.cuda.max_memory_allocated(device="cuda")
del data, audio_cuda_kernel
torch.cuda.empty_cache()
# Calculate metrics
audio_second = audio_length_total / h.sampling_rate
khz_original = audio_length_total / toc_total_original / 1000
khz_cuda_kernel = audio_length_total / toc_total_cuda_kernel / 1000
vram_used_original_gb = vram_used_original_total / num_sample / (1024**3)
vram_used_cuda_kernel_gb = vram_used_cuda_kernel_total / num_sample / (1024**3)
# Print results
print(
f"Original BigVGAN: took {toc_total_original:.2f} seconds to generate {audio_second:.2f} seconds of audio, {khz_original:.1f}kHz, {audio_second / toc_total_original:.1f} faster than realtime, VRAM used {vram_used_original_gb:.1f} GB"
)
print(
f"CUDA kernel BigVGAN: took {toc_total_cuda_kernel:.2f} seconds to generate {audio_second:.2f} seconds of audio, {khz_cuda_kernel:.1f}kHz, {audio_second / toc_total_cuda_kernel:.1f} faster than realtime, VRAM used {vram_used_cuda_kernel_gb:.1f} GB"
)
print(f"speedup of CUDA kernel: {khz_cuda_kernel / khz_original}")
print(f"VRAM saving of CUDA kernel: {vram_used_original_gb / vram_used_cuda_kernel_gb}")
# Use artificial sine waves for inference test
audio_real, sr = generate_soundwave(duration=5.0, sr=h.sampling_rate)
audio_real = torch.tensor(audio_real).to("cuda")
# Compute mel spectrogram from the ground truth audio
x = get_mel(audio_real.unsqueeze(0), h)
with torch.inference_mode():
y_g_hat_original = generator_original(x)
y_g_hat_cuda_kernel = generator_cuda_kernel(x)
audio_real = audio_real.squeeze()
audio_real = audio_real * MAX_WAV_VALUE
audio_real = audio_real.cpu().numpy().astype("int16")
audio_original = y_g_hat_original.squeeze()
audio_original = audio_original * MAX_WAV_VALUE
audio_original = audio_original.cpu().numpy().astype("int16")
audio_cuda_kernel = y_g_hat_cuda_kernel.squeeze()
audio_cuda_kernel = audio_cuda_kernel * MAX_WAV_VALUE
audio_cuda_kernel = audio_cuda_kernel.cpu().numpy().astype("int16")
os.makedirs("tmp", exist_ok=True)
output_file_real = os.path.join("tmp", "audio_real.wav")
output_file_original = os.path.join("tmp", "audio_generated_original.wav")
output_file_cuda_kernel = os.path.join("tmp", "audio_generated_cuda_kernel.wav")
write(output_file_real, h.sampling_rate, audio_real)
write(output_file_original, h.sampling_rate, audio_original)
write(output_file_cuda_kernel, h.sampling_rate, audio_cuda_kernel)
print("Example generated audios of original vs. fused CUDA kernel written to tmp!")
print("Done")

View File

@ -110,15 +110,15 @@ def main(args):
os.environ["USE_LIBUV"] = "0"
trainer: Trainer = Trainer(
max_epochs=config["train"]["epochs"],
accelerator="gpu" if torch.cuda.is_available() else "cpu",
accelerator="xpu" if torch.xpu.is_available() else "cpu",
# val_check_interval=9999999999999999999999,###不要验证
# check_val_every_n_epoch=None,
limit_val_batches=0,
devices=-1 if torch.cuda.is_available() else 1,
devices=-1 if torch.xpu.is_available() else 1,
benchmark=False,
fast_dev_run=False,
strategy=DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo")
if torch.cuda.is_available()
strategy=DDPStrategy(process_group_backend="ccl" if platform.system() != "Windows" else "gloo")
if torch.xpu.is_available()
else "auto",
precision=config["train"]["precision"],
logger=logger,

View File

@ -41,18 +41,18 @@ from process_ckpt import savee
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = False
###反正A100fp32更快那试试tf32吧
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# torch.backends.cuda.matmul.allow_tf32 = True # XPU does not support this
# torch.backends.cudnn.allow_tf32 = True # XPU does not support this
torch.set_float32_matmul_precision("medium") # 最低精度但最快(也就快一丁点),对于结果造成不了影响
# from config import pretrained_s2G,pretrained_s2D
global_step = 0
device = "cpu" # cuda以外的设备等mps优化后加入
device = "xpu" if torch.xpu.is_available() else "cpu"
def main():
if torch.cuda.is_available():
n_gpus = torch.cuda.device_count()
if torch.xpu.is_available():
n_gpus = torch.xpu.device_count()
else:
n_gpus = 1
os.environ["MASTER_ADDR"] = "localhost"
@ -78,14 +78,14 @@ def run(rank, n_gpus, hps):
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
dist.init_process_group(
backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
backend="gloo" if os.name == "nt" or not torch.xpu.is_available() else "ccl",
init_method="env://?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
torch.manual_seed(hps.train.seed)
if torch.cuda.is_available():
torch.cuda.set_device(rank)
if torch.xpu.is_available():
torch.xpu.set_device(rank)
train_dataset = TextAudioSpeakerLoader(hps.data, version=hps.model.version)
train_sampler = DistributedBucketSampler(
@ -132,27 +132,14 @@ def run(rank, n_gpus, hps):
# batch_size=1, pin_memory=True,
# drop_last=False, collate_fn=collate_fn)
net_g = (
SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model,
).cuda(rank)
if torch.cuda.is_available()
else SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model,
).to(device)
)
net_g = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model,
).to(device)
net_d = (
MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).cuda(rank)
if torch.cuda.is_available()
else MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).to(device)
)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).to(device)
for name, param in net_g.named_parameters():
if not param.requires_grad:
print(name, "not requires_grad")
@ -196,7 +183,7 @@ def run(rank, n_gpus, hps):
betas=hps.train.betas,
eps=hps.train.eps,
)
if torch.cuda.is_available():
if torch.xpu.is_available():
net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
else:
@ -238,7 +225,7 @@ def run(rank, n_gpus, hps):
torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
strict=False,
)
if torch.cuda.is_available()
if torch.xpu.is_available()
else net_g.load_state_dict(
torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
strict=False,
@ -256,7 +243,7 @@ def run(rank, n_gpus, hps):
net_d.module.load_state_dict(
torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"], strict=False
)
if torch.cuda.is_available()
if torch.xpu.is_available()
else net_d.load_state_dict(
torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"],
),
@ -333,42 +320,24 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths, sv_emb = data
else:
ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths = data
if torch.cuda.is_available():
if torch.xpu.is_available():
spec, spec_lengths = (
spec.cuda(
rank,
non_blocking=True,
),
spec_lengths.cuda(
rank,
non_blocking=True,
),
spec.to(device, non_blocking=True),
spec_lengths.to(device, non_blocking=True),
)
y, y_lengths = (
y.cuda(
rank,
non_blocking=True,
),
y_lengths.cuda(
rank,
non_blocking=True,
),
y.to(device, non_blocking=True),
y_lengths.to(device, non_blocking=True),
)
ssl = ssl.cuda(rank, non_blocking=True)
ssl = ssl.to(device, non_blocking=True)
ssl.requires_grad = False
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
# ssl_lengths = ssl_lengths.to(device, non_blocking=True)
text, text_lengths = (
text.cuda(
rank,
non_blocking=True,
),
text_lengths.cuda(
rank,
non_blocking=True,
),
text.to(device, non_blocking=True),
text_lengths.to(device, non_blocking=True),
)
if hps.model.version in {"v2Pro", "v2ProPlus"}:
sv_emb = sv_emb.cuda(rank, non_blocking=True)
sv_emb = sv_emb.to(device, non_blocking=True)
else:
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
y, y_lengths = y.to(device), y_lengths.to(device)
@ -596,11 +565,11 @@ def evaluate(hps, generator, eval_loader, writer_eval):
text_lengths,
) in enumerate(eval_loader):
print(111)
if torch.cuda.is_available():
spec, spec_lengths = spec.cuda(), spec_lengths.cuda()
y, y_lengths = y.cuda(), y_lengths.cuda()
ssl = ssl.cuda()
text, text_lengths = text.cuda(), text_lengths.cuda()
if torch.xpu.is_available():
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
y, y_lengths = y.to(device), y_lengths.to(device)
ssl = ssl.to(device)
text, text_lengths = text.to(device), text_lengths.to(device)
else:
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
y, y_lengths = y.to(device), y_lengths.to(device)

2793
api.py

File diff suppressed because it is too large Load Diff

442
config.py
View File

@ -1,218 +1,224 @@
import os
import re
import sys
import torch
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
pretrained_sovits_name = {
"v1": "GPT_SoVITS/pretrained_models/s2G488k.pth",
"v2": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"v3": "GPT_SoVITS/pretrained_models/s2Gv3.pth", ###v3v4还要检查vocoder算了。。。
"v4": "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
"v2Pro": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
"v2ProPlus": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
}
pretrained_gpt_name = {
"v1": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"v2": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"v3": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v4": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v2Pro": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v2ProPlus": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
name2sovits_path = {
# i18n("不训练直接推v1底模"): "GPT_SoVITS/pretrained_models/s2G488k.pth",
i18n("不训练直接推v2底模"): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
# i18n("不训练直接推v3底模"): "GPT_SoVITS/pretrained_models/s2Gv3.pth",
# i18n("不训练直接推v4底模"): "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
i18n("不训练直接推v2Pro底模"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
i18n("不训练直接推v2ProPlus底模"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
}
name2gpt_path = {
# i18n("不训练直接推v1底模"):"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
i18n(
"不训练直接推v2底模"
): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
i18n("不训练直接推v3底模"): "GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
SoVITS_weight_root = [
"SoVITS_weights",
"SoVITS_weights_v2",
"SoVITS_weights_v3",
"SoVITS_weights_v4",
"SoVITS_weights_v2Pro",
"SoVITS_weights_v2ProPlus",
]
GPT_weight_root = [
"GPT_weights",
"GPT_weights_v2",
"GPT_weights_v3",
"GPT_weights_v4",
"GPT_weights_v2Pro",
"GPT_weights_v2ProPlus",
]
SoVITS_weight_version2root = {
"v1": "SoVITS_weights",
"v2": "SoVITS_weights_v2",
"v3": "SoVITS_weights_v3",
"v4": "SoVITS_weights_v4",
"v2Pro": "SoVITS_weights_v2Pro",
"v2ProPlus": "SoVITS_weights_v2ProPlus",
}
GPT_weight_version2root = {
"v1": "GPT_weights",
"v2": "GPT_weights_v2",
"v3": "GPT_weights_v3",
"v4": "GPT_weights_v4",
"v2Pro": "GPT_weights_v2Pro",
"v2ProPlus": "GPT_weights_v2ProPlus",
}
def custom_sort_key(s):
# 使用正则表达式提取字符串中的数字部分和非数字部分
parts = re.split("(\d+)", s)
# 将数字部分转换为整数,非数字部分保持不变
parts = [int(part) if part.isdigit() else part for part in parts]
return parts
def get_weights_names():
SoVITS_names = []
for key in name2sovits_path:
if os.path.exists(name2sovits_path[key]):
SoVITS_names.append(key)
for path in SoVITS_weight_root:
if not os.path.exists(path):
continue
for name in os.listdir(path):
if name.endswith(".pth"):
SoVITS_names.append("%s/%s" % (path, name))
if not SoVITS_names:
SoVITS_names = [""]
GPT_names = []
for key in name2gpt_path:
if os.path.exists(name2gpt_path[key]):
GPT_names.append(key)
for path in GPT_weight_root:
if not os.path.exists(path):
continue
for name in os.listdir(path):
if name.endswith(".ckpt"):
GPT_names.append("%s/%s" % (path, name))
SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
GPT_names = sorted(GPT_names, key=custom_sort_key)
if not GPT_names:
GPT_names = [""]
return SoVITS_names, GPT_names
def change_choices():
SoVITS_names, GPT_names = get_weights_names()
return {"choices": SoVITS_names, "__type__": "update"}, {
"choices": GPT_names,
"__type__": "update",
}
# 推理用的指定模型
sovits_path = ""
gpt_path = ""
is_half_str = os.environ.get("is_half", "True")
is_half = True if is_half_str.lower() == "true" else False
is_share_str = os.environ.get("is_share", "False")
is_share = True if is_share_str.lower() == "true" else False
cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
exp_root = "logs"
python_exec = sys.executable or "python"
webui_port_main = 9874
webui_port_uvr5 = 9873
webui_port_infer_tts = 9872
webui_port_subfix = 9871
api_port = 9880
# Thanks to the contribution of @Karasukaigan and @XXXXRT666
def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]:
cpu = torch.device("cpu")
cuda = torch.device(f"cuda:{idx}")
if not torch.cuda.is_available():
return cpu, torch.float32, 0.0, 0.0
device_idx = idx
capability = torch.cuda.get_device_capability(device_idx)
name = torch.cuda.get_device_name(device_idx)
mem_bytes = torch.cuda.get_device_properties(device_idx).total_memory
mem_gb = mem_bytes / (1024**3) + 0.4
major, minor = capability
sm_version = major + minor / 10.0
is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
if mem_gb < 4 or sm_version < 5.3:
return cpu, torch.float32, 0.0, 0.0
if sm_version == 6.1 or is_16_series == True:
return cuda, torch.float32, sm_version, mem_gb
if sm_version > 6.1:
return cuda, torch.float16, sm_version, mem_gb
return cpu, torch.float32, 0.0, 0.0
IS_GPU = True
GPU_INFOS: list[str] = []
GPU_INDEX: set[int] = set()
GPU_COUNT = torch.cuda.device_count()
CPU_INFO: str = "0\tCPU " + i18n("CPU训练,较慢")
tmp: list[tuple[torch.device, torch.dtype, float, float]] = []
memset: set[float] = set()
for i in range(max(GPU_COUNT, 1)):
tmp.append(get_device_dtype_sm(i))
for j in tmp:
device = j[0]
memset.add(j[3])
if device.type != "cpu":
GPU_INFOS.append(f"{device.index}\t{torch.cuda.get_device_name(device.index)}")
GPU_INDEX.add(device.index)
if not GPU_INFOS:
IS_GPU = False
GPU_INFOS.append(CPU_INFO)
GPU_INDEX.add(0)
infer_device = max(tmp, key=lambda x: (x[2], x[3]))[0]
is_half = any(dtype == torch.float16 for _, dtype, _, _ in tmp)
class Config:
def __init__(self):
self.sovits_path = sovits_path
self.gpt_path = gpt_path
self.is_half = is_half
self.cnhubert_path = cnhubert_path
self.bert_path = bert_path
self.pretrained_sovits_path = pretrained_sovits_path
self.pretrained_gpt_path = pretrained_gpt_path
self.exp_root = exp_root
self.python_exec = python_exec
self.infer_device = infer_device
self.webui_port_main = webui_port_main
self.webui_port_uvr5 = webui_port_uvr5
self.webui_port_infer_tts = webui_port_infer_tts
self.webui_port_subfix = webui_port_subfix
self.api_port = api_port
import os
import re
import sys
import torch
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
pretrained_sovits_name = {
"v1": "GPT_SoVITS/pretrained_models/s2G488k.pth",
"v2": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"v3": "GPT_SoVITS/pretrained_models/s2Gv3.pth", ###v3v4还要检查vocoder算了。。。
"v4": "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
"v2Pro": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
"v2ProPlus": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
}
pretrained_gpt_name = {
"v1": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"v2": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"v3": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v4": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v2Pro": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v2ProPlus": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
name2sovits_path = {
# i18n("不训练直接推v1底模"): "GPT_SoVITS/pretrained_models/s2G488k.pth",
i18n("不训练直接推v2底模"): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
# i18n("不训练直接推v3底模"): "GPT_SoVITS/pretrained_models/s2Gv3.pth",
# i18n("不训练直接推v4底模"): "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
i18n("不训练直接推v2Pro底模"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
i18n("不训练直接推v2ProPlus底模"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
}
name2gpt_path = {
# i18n("不训练直接推v1底模"):"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
i18n(
"不训练直接推v2底模"
): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
i18n("不训练直接推v3底模"): "GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
SoVITS_weight_root = [
"SoVITS_weights",
"SoVITS_weights_v2",
"SoVITS_weights_v3",
"SoVITS_weights_v4",
"SoVITS_weights_v2Pro",
"SoVITS_weights_v2ProPlus",
]
GPT_weight_root = [
"GPT_weights",
"GPT_weights_v2",
"GPT_weights_v3",
"GPT_weights_v4",
"GPT_weights_v2Pro",
"GPT_weights_v2ProPlus",
]
SoVITS_weight_version2root = {
"v1": "SoVITS_weights",
"v2": "SoVITS_weights_v2",
"v3": "SoVITS_weights_v3",
"v4": "SoVITS_weights_v4",
"v2Pro": "SoVITS_weights_v2Pro",
"v2ProPlus": "SoVITS_weights_v2ProPlus",
}
GPT_weight_version2root = {
"v1": "GPT_weights",
"v2": "GPT_weights_v2",
"v3": "GPT_weights_v3",
"v4": "GPT_weights_v4",
"v2Pro": "GPT_weights_v2Pro",
"v2ProPlus": "GPT_weights_v2ProPlus",
}
def custom_sort_key(s):
# 使用正则表达式提取字符串中的数字部分和非数字部分
parts = re.split("(\d+)", s)
# 将数字部分转换为整数,非数字部分保持不变
parts = [int(part) if part.isdigit() else part for part in parts]
return parts
def get_weights_names():
SoVITS_names = []
for key in name2sovits_path:
if os.path.exists(name2sovits_path[key]):
SoVITS_names.append(key)
for path in SoVITS_weight_root:
if not os.path.exists(path):
continue
for name in os.listdir(path):
if name.endswith(".pth"):
SoVITS_names.append("%s/%s" % (path, name))
if not SoVITS_names:
SoVITS_names = [""]
GPT_names = []
for key in name2gpt_path:
if os.path.exists(name2gpt_path[key]):
GPT_names.append(key)
for path in GPT_weight_root:
if not os.path.exists(path):
continue
for name in os.listdir(path):
if name.endswith(".ckpt"):
GPT_names.append("%s/%s" % (path, name))
SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
GPT_names = sorted(GPT_names, key=custom_sort_key)
if not GPT_names:
GPT_names = [""]
return SoVITS_names, GPT_names
def change_choices():
SoVITS_names, GPT_names = get_weights_names()
return {"choices": SoVITS_names, "__type__": "update"}, {
"choices": GPT_names,
"__type__": "update",
}
# 推理用的指定模型
sovits_path = ""
gpt_path = ""
is_half_str = os.environ.get("is_half", "True")
is_half = True if is_half_str.lower() == "true" else False
is_share_str = os.environ.get("is_share", "False")
is_share = True if is_share_str.lower() == "true" else False
cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
exp_root = "logs"
python_exec = sys.executable or "python"
webui_port_main = 9874
webui_port_uvr5 = 9873
webui_port_infer_tts = 9872
webui_port_subfix = 9871
api_port = 9880
# Thanks to the contribution of @Karasukaigan and @XXXXRT666
# Modified for Intel GPU (XPU)
def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]:
cpu = torch.device("cpu")
try:
if not torch.xpu.is_available():
return cpu, torch.float32, 0.0, 0.0
except AttributeError:
return cpu, torch.float32, 0.0, 0.0
xpu_device = torch.device(f"xpu:{idx}")
properties = torch.xpu.get_device_properties(idx)
mem_bytes = properties.total_memory
mem_gb = mem_bytes / (1024**3)
# Simplified logic for XPU, assuming FP16/BF16 is generally supported.
# The complex SM version check is CUDA-specific.
if mem_gb < 4: # Example threshold
return cpu, torch.float32, 0.0, 0.0
# For Intel GPUs, we can generally assume float16 is available.
# The 'sm_version' equivalent is not straightforward, so we use a placeholder value (e.g., 1.0)
# for compatibility with the downstream logic that sorts devices.
return xpu_device, torch.float16, 1.0, mem_gb
IS_GPU = True
GPU_INFOS: list[str] = []
GPU_INDEX: set[int] = set()
try:
GPU_COUNT = torch.xpu.device_count() if torch.xpu.is_available() else 0
except AttributeError:
GPU_COUNT = 0
CPU_INFO: str = "0\tCPU " + i18n("CPU训练,较慢")
tmp: list[tuple[torch.device, torch.dtype, float, float]] = []
memset: set[float] = set()
for i in range(max(GPU_COUNT, 1)):
tmp.append(get_device_dtype_sm(i))
for j in tmp:
device = j[0]
memset.add(j[3])
if device.type == "xpu":
GPU_INFOS.append(f"{device.index}\t{torch.xpu.get_device_name(device.index)}")
GPU_INDEX.add(device.index)
if not GPU_INFOS:
IS_GPU = False
GPU_INFOS.append(CPU_INFO)
GPU_INDEX.add(0)
infer_device = max(tmp, key=lambda x: (x[2], x[3]))[0]
is_half = any(dtype == torch.float16 for _, dtype, _, _ in tmp)
class Config:
def __init__(self):
self.sovits_path = sovits_path
self.gpt_path = gpt_path
self.is_half = is_half
self.cnhubert_path = cnhubert_path
self.bert_path = bert_path
self.pretrained_sovits_path = pretrained_sovits_path
self.pretrained_gpt_path = pretrained_gpt_path
self.exp_root = exp_root
self.python_exec = python_exec
self.infer_device = infer_device
self.webui_port_main = webui_port_main
self.webui_port_uvr5 = webui_port_uvr5
self.webui_port_infer_tts = webui_port_infer_tts
self.webui_port_subfix = webui_port_subfix
self.api_port = api_port

View File

@ -14,49 +14,29 @@ fi
trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR
LITE=false
CUDA_VERSION=12.6
print_help() {
echo "Usage: bash docker_build.sh [OPTIONS]"
echo ""
echo "Options:"
echo " --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)"
echo " --lite Build a Lite Image"
echo " -h, --help Show this help message and exit"
echo ""
echo "Examples:"
echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper"
echo " bash docker_build.sh --lite"
}
# Show help if no arguments provided
if [[ $# -eq 0 ]]; then
print_help
exit 0
fi
# Parse arguments
while [[ $# -gt 0 ]]; do
case "$1" in
--cuda)
case "$2" in
12.6)
CUDA_VERSION=12.6
;;
12.8)
CUDA_VERSION=12.8
;;
*)
echo "Error: Invalid CUDA_VERSION: $2"
echo "Choose From: [12.6, 12.8]"
exit 1
;;
esac
shift 2
;;
--lite)
LITE=true
shift
;;
-h|--help)
print_help
exit 0
;;
*)
echo "Unknown Argument: $1"
echo "Use -h or --help to see available options."
@ -74,7 +54,6 @@ else
fi
docker build \
--build-arg CUDA_VERSION=$CUDA_VERSION \
--build-arg LITE=$LITE \
--build-arg TARGETPLATFORM="$TARGETPLATFORM" \
--build-arg TORCH_BASE=$TORCH_BASE \

View File

@ -5,6 +5,9 @@ tensorboard
librosa==0.10.2
numba
pytorch-lightning>=2.4
torch==2.9
intel-extension-for-pytorch
torchvision
gradio<5
ffmpeg-python
onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64"

View File

@ -85,7 +85,7 @@ def execute_asr(input_folder, output_folder, model_path, language, precision):
if language == "auto":
language = None # 不设置语种由模型自动输出概率最高的语种
print("loading faster whisper model:", model_path, model_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
device = "xpu" if torch.xpu.is_available() else "cpu"
model = WhisperModel(model_path, device=device, compute_type=precision)
input_file_names = os.listdir(input_folder)
@ -128,8 +128,6 @@ def execute_asr(input_folder, output_folder, model_path, language, precision):
return output_file_path
load_cudnn()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(

View File

@ -1,231 +1,137 @@
import ctypes
import os
import sys
from pathlib import Path
import ffmpeg
import gradio as gr
import numpy as np
import pandas as pd
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
def load_audio(file, sr):
try:
# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车
if os.path.exists(file) is False:
raise RuntimeError("You input a wrong audio path that does not exists, please fix it!")
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except Exception:
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True)
) # Expose the Error
raise RuntimeError(i18n("音频加载失败"))
return np.frombuffer(out, np.float32).flatten()
def clean_path(path_str: str):
if path_str.endswith(("\\", "/")):
return clean_path(path_str[0:-1])
path_str = path_str.replace("/", os.sep).replace("\\", os.sep)
return path_str.strip(
" '\n\"\u202a"
) # path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")
def check_for_existance(file_list: list = None, is_train=False, is_dataset_processing=False):
files_status = []
if is_train == True and file_list:
file_list.append(os.path.join(file_list[0], "2-name2text.txt"))
file_list.append(os.path.join(file_list[0], "3-bert"))
file_list.append(os.path.join(file_list[0], "4-cnhubert"))
file_list.append(os.path.join(file_list[0], "5-wav32k"))
file_list.append(os.path.join(file_list[0], "6-name2semantic.tsv"))
for file in file_list:
if os.path.exists(file):
files_status.append(True)
else:
files_status.append(False)
if sum(files_status) != len(files_status):
if is_train:
for file, status in zip(file_list, files_status):
if status:
pass
else:
gr.Warning(file)
gr.Warning(i18n("以下文件或文件夹不存在"))
return False
elif is_dataset_processing:
if files_status[0]:
return True
elif not files_status[0]:
gr.Warning(file_list[0])
elif not files_status[1] and file_list[1]:
gr.Warning(file_list[1])
gr.Warning(i18n("以下文件或文件夹不存在"))
return False
else:
if file_list[0]:
gr.Warning(file_list[0])
gr.Warning(i18n("以下文件或文件夹不存在"))
else:
gr.Warning(i18n("路径不能为空"))
return False
return True
def check_details(path_list=None, is_train=False, is_dataset_processing=False):
if is_dataset_processing:
list_path, audio_path = path_list
if not list_path.endswith(".list"):
gr.Warning(i18n("请填入正确的List路径"))
return
if audio_path:
if not os.path.isdir(audio_path):
gr.Warning(i18n("请填入正确的音频文件夹路径"))
return
with open(list_path, "r", encoding="utf8") as f:
line = f.readline().strip("\n").split("\n")
wav_name, _, __, ___ = line[0].split("|")
wav_name = clean_path(wav_name)
if audio_path != "" and audio_path != None:
wav_name = os.path.basename(wav_name)
wav_path = "%s/%s" % (audio_path, wav_name)
else:
wav_path = wav_name
if os.path.exists(wav_path):
...
else:
gr.Warning(wav_path + i18n("路径错误"))
return
if is_train:
path_list.append(os.path.join(path_list[0], "2-name2text.txt"))
path_list.append(os.path.join(path_list[0], "4-cnhubert"))
path_list.append(os.path.join(path_list[0], "5-wav32k"))
path_list.append(os.path.join(path_list[0], "6-name2semantic.tsv"))
phone_path, hubert_path, wav_path, semantic_path = path_list[1:]
with open(phone_path, "r", encoding="utf-8") as f:
if f.read(1):
...
else:
gr.Warning(i18n("缺少音素数据集"))
if os.listdir(hubert_path):
...
else:
gr.Warning(i18n("缺少Hubert数据集"))
if os.listdir(wav_path):
...
else:
gr.Warning(i18n("缺少音频数据集"))
df = pd.read_csv(semantic_path, delimiter="\t", encoding="utf-8")
if len(df) >= 1:
...
else:
gr.Warning(i18n("缺少语义数据集"))
def load_cudnn():
import torch
if not torch.cuda.is_available():
print("[INFO] CUDA is not available, skipping cuDNN setup.")
return
if sys.platform == "win32":
torch_lib_dir = Path(torch.__file__).parent / "lib"
if torch_lib_dir.exists():
os.add_dll_directory(str(torch_lib_dir))
print(f"[INFO] Added DLL directory: {torch_lib_dir}")
matching_files = sorted(torch_lib_dir.glob("cudnn_cnn*.dll"))
if not matching_files:
print(f"[ERROR] No cudnn_cnn*.dll found in {torch_lib_dir}")
return
for dll_path in matching_files:
dll_name = os.path.basename(dll_path)
try:
ctypes.CDLL(dll_name)
print(f"[INFO] Loaded: {dll_name}")
except OSError as e:
print(f"[WARNING] Failed to load {dll_name}: {e}")
else:
print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
elif sys.platform == "linux":
site_packages = Path(torch.__file__).resolve().parents[1]
cudnn_dir = site_packages / "nvidia" / "cudnn" / "lib"
if not cudnn_dir.exists():
print(f"[ERROR] cudnn dir not found: {cudnn_dir}")
return
matching_files = sorted(cudnn_dir.glob("libcudnn_cnn*.so*"))
if not matching_files:
print(f"[ERROR] No libcudnn_cnn*.so* found in {cudnn_dir}")
return
for so_path in matching_files:
try:
ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore
print(f"[INFO] Loaded: {so_path}")
except OSError as e:
print(f"[WARNING] Failed to load {so_path}: {e}")
def load_nvrtc():
import torch
if not torch.cuda.is_available():
print("[INFO] CUDA is not available, skipping nvrtc setup.")
return
if sys.platform == "win32":
torch_lib_dir = Path(torch.__file__).parent / "lib"
if torch_lib_dir.exists():
os.add_dll_directory(str(torch_lib_dir))
print(f"[INFO] Added DLL directory: {torch_lib_dir}")
matching_files = sorted(torch_lib_dir.glob("nvrtc*.dll"))
if not matching_files:
print(f"[ERROR] No nvrtc*.dll found in {torch_lib_dir}")
return
for dll_path in matching_files:
dll_name = os.path.basename(dll_path)
try:
ctypes.CDLL(dll_name)
print(f"[INFO] Loaded: {dll_name}")
except OSError as e:
print(f"[WARNING] Failed to load {dll_name}: {e}")
else:
print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
elif sys.platform == "linux":
site_packages = Path(torch.__file__).resolve().parents[1]
nvrtc_dir = site_packages / "nvidia" / "cuda_nvrtc" / "lib"
if not nvrtc_dir.exists():
print(f"[ERROR] nvrtc dir not found: {nvrtc_dir}")
return
matching_files = sorted(nvrtc_dir.glob("libnvrtc*.so*"))
if not matching_files:
print(f"[ERROR] No libnvrtc*.so* found in {nvrtc_dir}")
return
for so_path in matching_files:
try:
ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore
print(f"[INFO] Loaded: {so_path}")
except OSError as e:
print(f"[WARNING] Failed to load {so_path}: {e}")
import ctypes
import os
import sys
from pathlib import Path
import ffmpeg
import gradio as gr
import numpy as np
import pandas as pd
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
def load_audio(file, sr):
try:
# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车
if os.path.exists(file) is False:
raise RuntimeError("You input a wrong audio path that does not exists, please fix it!")
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except Exception:
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True)
) # Expose the Error
raise RuntimeError(i18n("音频加载失败"))
return np.frombuffer(out, np.float32).flatten()
def clean_path(path_str: str):
if path_str.endswith(("\\", "/")):
return clean_path(path_str[0:-1])
path_str = path_str.replace("/", os.sep).replace("\\", os.sep)
return path_str.strip(
" '\n\"\u202a"
) # path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")
def check_for_existance(file_list: list = None, is_train=False, is_dataset_processing=False):
files_status = []
if is_train == True and file_list:
file_list.append(os.path.join(file_list[0], "2-name2text.txt"))
file_list.append(os.path.join(file_list[0], "3-bert"))
file_list.append(os.path.join(file_list[0], "4-cnhubert"))
file_list.append(os.path.join(file_list[0], "5-wav32k"))
file_list.append(os.path.join(file_list[0], "6-name2semantic.tsv"))
for file in file_list:
if os.path.exists(file):
files_status.append(True)
else:
files_status.append(False)
if sum(files_status) != len(files_status):
if is_train:
for file, status in zip(file_list, files_status):
if status:
pass
else:
gr.Warning(file)
gr.Warning(i18n("以下文件或文件夹不存在"))
return False
elif is_dataset_processing:
if files_status[0]:
return True
elif not files_status[0]:
gr.Warning(file_list[0])
elif not files_status[1] and file_list[1]:
gr.Warning(file_list[1])
gr.Warning(i18n("以下文件或文件夹不存在"))
return False
else:
if file_list[0]:
gr.Warning(file_list[0])
gr.Warning(i18n("以下文件或文件夹不存在"))
else:
gr.Warning(i18n("路径不能为空"))
return False
return True
def check_details(path_list=None, is_train=False, is_dataset_processing=False):
if is_dataset_processing:
list_path, audio_path = path_list
if not list_path.endswith(".list"):
gr.Warning(i18n("请填入正确的List路径"))
return
if audio_path:
if not os.path.isdir(audio_path):
gr.Warning(i18n("请填入正确的音频文件夹路径"))
return
with open(list_path, "r", encoding="utf8") as f:
line = f.readline().strip("\n").split("\n")
wav_name, _, __, ___ = line[0].split("|")
wav_name = clean_path(wav_name)
if audio_path != "" and audio_path != None:
wav_name = os.path.basename(wav_name)
wav_path = "%s/%s" % (audio_path, wav_name)
else:
wav_path = wav_name
if os.path.exists(wav_path):
...
else:
gr.Warning(wav_path + i18n("路径错误"))
return
if is_train:
path_list.append(os.path.join(path_list[0], "2-name2text.txt"))
path_list.append(os.path.join(path_list[0], "4-cnhubert"))
path_list.append(os.path.join(path_list[0], "5-wav32k"))
path_list.append(os.path.join(path_list[0], "6-name2semantic.tsv"))
phone_path, hubert_path, wav_path, semantic_path = path_list[1:]
with open(phone_path, "r", encoding="utf-8") as f:
if f.read(1):
...
else:
gr.Warning(i18n("缺少音素数据集"))
if os.listdir(hubert_path):
...
else:
gr.Warning(i18n("缺少Hubert数据集"))
if os.listdir(wav_path):
...
else:
gr.Warning(i18n("缺少音频数据集"))
df = pd.read_csv(semantic_path, delimiter="\t", encoding="utf-8")
if len(df) >= 1:
...
else:
gr.Warning(i18n("缺少语义数据集"))