From 6c349541d3cbfae62eab6e533662bdfe4df68c38 Mon Sep 17 00:00:00 2001
From: Terrence Yang <terrence.ye.yang@gmail.com>
Date: Thu, 12 Mar 2026 10:53:58 +0800
Subject: [PATCH] fix: add fallback for torchaudio/torchcodec loading and
 support PyTorch 2.6+ security policy

---
 GPT_SoVITS/inference_webui.py           | 18 +++++++++++++++++-
 GPT_SoVITS/prepare_datasets/2-get-sv.py | 23 ++++++++++++++++++++++-
 GPT_SoVITS/s1_train.py                  | 14 ++++++++++++++
 3 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index a361ed58..d15cb6aa 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -521,7 +521,23 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
     # audio = torch.FloatTensor(audio)
 
     sr1 = int(hps.data.sampling_rate)
-    audio, sr0 = torchaudio.load(filename)
+
+    #audio, sr0 = torchaudio.load(filename)
+    try:
+        # Attempt to load using the author's preferred torchaudio method
+        audio, sr0 = torchaudio.load(filename)
+    except Exception as e:
+        # Fallback for environments where torchcodec/torchaudio is broken (e.g., RTX 50-series)
+        print(f"Warning: torchaudio load failed, falling back to librosa. Error: {e}")
+        import librosa
+        import numpy as np
+        # Load with librosa instead: sr=None keeps the file's native rate; output is mono float32
+        audio_np, sr0 = librosa.load(filename, sr=None)
+        # Convert back to tensor and match the [1, N] shape expected by the pipeline
+        audio = torch.from_numpy(audio_np).unsqueeze(0)
+    
+    
+    
     if sr0 != sr1:
         audio = audio.to(device)
         if audio.shape[0] == 2:
diff --git a/GPT_SoVITS/prepare_datasets/2-get-sv.py b/GPT_SoVITS/prepare_datasets/2-get-sv.py
index 80b0ad69..21c00017 100644
--- a/GPT_SoVITS/prepare_datasets/2-get-sv.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-sv.py
@@ -89,7 +89,28 @@ def name2go(wav_name, wav_path):
     if os.path.exists(sv_cn_path):
         return
     wav_path = "%s/%s" % (wav32dir, wav_name)
-    wav32k, sr0 = torchaudio.load(wav_path)
+    
+    
+    #wav32k, sr0 = torchaudio.load(wav_path)
+    try:
+        # Attempt to load using torchaudio as originally intended by the author
+        wav32k, sr0 = torchaudio.load(wav_path)
+    except Exception as e:
+        # Fallback for environments with torchcodec/torchaudio issues (e.g., RTX 50-series/Python 3.12)
+        print(f"Warning: torchaudio load failed for {wav_path}, falling back to librosa. Error: {e}")
+        import librosa
+        import torch
+        
+        # Load and resample to 32 kHz in one step using librosa
+        # sr=32000 makes the returned rate satisfy the `sr0 == 32000` check below
+        wav32k_np, sr0 = librosa.load(wav_path, sr=32000)
+        
+        # Convert numpy array back to torch Tensor and add the channel dimension [1, N]
+        wav32k = torch.from_numpy(wav32k_np).unsqueeze(0)
+    
+    
+    
+    
     assert sr0 == 32000
     wav32k = wav32k.to(device)
     emb = sv.compute_embedding3(wav32k).cpu()  # torch.Size([1, 20480])
diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py
index 1176f0bc..cef8e240 100644
--- a/GPT_SoVITS/s1_train.py
+++ b/GPT_SoVITS/s1_train.py
@@ -8,7 +8,21 @@ import logging
 import platform
 from pathlib import Path
 
+#import torch
+
 import torch
+import pathlib
+
+# Workaround for PyTorch 2.6+, where torch.load() defaults to weights_only=True.
+# Allowlisting pathlib.PosixPath lets checkpoints that contain such objects load.
+if hasattr(torch.serialization, 'add_safe_globals'):
+    try:
+        torch.serialization.add_safe_globals([pathlib.PosixPath])
+    except Exception as e:
+        print(f"Note: Could not add pathlib.PosixPath to safe_globals: {e}")
+
+
+
 from AR.data.data_module import Text2SemanticDataModule
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 from AR.utils.io import load_yaml_config