From 6c349541d3cbfae62eab6e533662bdfe4df68c38 Mon Sep 17 00:00:00 2001
From: Terrence Yang
Date: Thu, 12 Mar 2026 10:53:58 +0800
Subject: [PATCH] fix: add fallback for torchaudio/torchcodec loading and support PyTorch 2.6+ security policy

---
Review notes (everything between the "---" line above and the first
"diff --git" line is commentary and is ignored by `git am`):

 * Line structure and indentation of this patch were restored; indentation
   of context lines is reconstructed (4 spaces per level) and should be
   checked against the target tree, or applied with
   `git apply --ignore-whitespace`.
 * Dropped the unused function-local `import numpy as np` from the
   get_spepc fallback, the commented-out copies of the replaced statements,
   and the runs of blank added lines. Hunk sizes and the diffstat below are
   recomputed accordingly; the post-image blob ids on the `index` lines
   still describe the original revision of this patch.
 * NOTE(review): the function-local `import torch` in name2go makes `torch`
   a local name for the whole function. It is kept because the module-level
   imports of 2-get-sv.py are not visible in this hunk -- confirm and hoist
   it to the top of the file if torch is already imported there.
 * NOTE(review): in the name2go fallback librosa resamples to 32 kHz, so
   `assert sr0 == 32000` can no longer reject inputs on that path --
   confirm this is intended.

 GPT_SoVITS/inference_webui.py           | 12 +++++++++++-
 GPT_SoVITS/prepare_datasets/2-get-sv.py | 16 +++++++++++++++-
 GPT_SoVITS/s1_train.py                  | 10 ++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index a361ed58..d15cb6aa 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -521,7 +521,17 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
     # audio = torch.FloatTensor(audio)
 
     sr1 = int(hps.data.sampling_rate)
-    audio, sr0 = torchaudio.load(filename)
+    try:
+        # Attempt to load using the author's preferred torchaudio method
+        audio, sr0 = torchaudio.load(filename)
+    except Exception as e:
+        # Fallback for environments where torchcodec/torchaudio is broken (e.g., RTX 50-series)
+        print(f"Warning: torchaudio load failed, falling back to librosa. Error: {e}")
+        import librosa
+        # Load using librosa as a robust CPU-based alternative
+        audio_np, sr0 = librosa.load(filename, sr=None)
+        # Convert back to tensor and match the [1, N] shape expected by the pipeline
+        audio = torch.from_numpy(audio_np).unsqueeze(0)
     if sr0 != sr1:
         audio = audio.to(device)
         if audio.shape[0] == 2:
diff --git a/GPT_SoVITS/prepare_datasets/2-get-sv.py b/GPT_SoVITS/prepare_datasets/2-get-sv.py
index 80b0ad69..21c00017 100644
--- a/GPT_SoVITS/prepare_datasets/2-get-sv.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-sv.py
@@ -89,7 +89,21 @@ def name2go(wav_name, wav_path):
     if os.path.exists(sv_cn_path):
         return
     wav_path = "%s/%s" % (wav32dir, wav_name)
-    wav32k, sr0 = torchaudio.load(wav_path)
+    try:
+        # Attempt to load using torchaudio as originally intended by the author
+        wav32k, sr0 = torchaudio.load(wav_path)
+    except Exception as e:
+        # Fallback for environments with torchcodec/torchaudio issues (e.g., RTX 50-series/Python 3.12)
+        print(f"Warning: torchaudio load failed for {wav_path}, falling back to librosa. Error: {e}")
+        import librosa
+        import torch
+
+        # Load and resample to 32k in one step using librosa;
+        # sr=32000 satisfies the `sr0 == 32000` assertion that follows
+        wav32k_np, sr0 = librosa.load(wav_path, sr=32000)
+
+        # Convert numpy array back to torch Tensor and add the channel dimension [1, N]
+        wav32k = torch.from_numpy(wav32k_np).unsqueeze(0)
     assert sr0 == 32000
     wav32k = wav32k.to(device)
     emb = sv.compute_embedding3(wav32k).cpu()  # torch.Size([1, 20480])
diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py
index 1176f0bc..cef8e240 100644
--- a/GPT_SoVITS/s1_train.py
+++ b/GPT_SoVITS/s1_train.py
@@ -8,7 +8,17 @@ import logging
 import platform
 from pathlib import Path
 
 import torch
+import pathlib
+
+# Fix for PyTorch 2.6+ where weights_only=True is the default.
+# This allows the trainer to safely load PosixPath objects from checkpoints.
+if hasattr(torch.serialization, 'add_safe_globals'):
+    try:
+        torch.serialization.add_safe_globals([pathlib.PosixPath])
+    except Exception as e:
+        print(f"Note: Could not add pathlib.PosixPath to safe_globals: {e}")
+
 from AR.data.data_module import Text2SemanticDataModule
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 from AR.utils.io import load_yaml_config