From 551d3dc2812a9f8ba8ac91cfb5ed61b7806eb530 Mon Sep 17 00:00:00 2001
From: lsh <laishaoheng1996@gmail.com>
Date: Sat, 16 May 2026 19:10:20 -0700
Subject: [PATCH] Fix s1_train DDP crash on Windows single-GPU (sm_120 /
 Blackwell)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On Windows with a single GPU running CUDA 12.8 + PyTorch 2.7+ on Blackwell
(sm_120) hardware, s1_train.py crashes with an access violation (exit code
3221225477, i.e. 0xC0000005 / STATUS_ACCESS_VIOLATION) shortly after
pytorch_lightning's Trainer initialization, before the first batch runs.

Root cause: DDPStrategy with the gloo backend is forced on Windows even
when there's only one GPU. The gloo + sm_120 + CUDA 12.8 combination has a
known incompatibility (see PyTorch forum "[Solved] RTX 5090 sm_120 Training
Segfault - DDP Was the Cause") that produces a native crash inside the
Lightning training loop.

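Two changes. The s1_train.py change is gated on Windows + CUDA; the
bucket_sampler.py change is not platform-gated, but only alters behaviour
when no distributed process group is initialized: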

  * GPT_SoVITS/s1_train.py: on Windows, use Lightning's "auto" strategy,
    which picks `single_device` for one GPU and skips DDP entirely. Also
    pin devices=1 on Windows so that multi-GPU machines do not enable DDP
    either; as a consequence, training on Windows always uses a single
    GPU. Non-Windows behaviour is unchanged (NCCL DDP, all available GPUs).
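  * GPT_SoVITS/AR/data/bucket_sampler.py: when the distributed process
    group isn't initialized (i.e. running under single_device strategy),
    fall back to a single-replica configuration (num_replicas=1, rank=0)
    instead of crashing in dist.get_world_size(). The explicit
    RuntimeError previously raised when torch.distributed is unavailable
    is dropped; that case now takes the same fallback. Defensive change —
    behaviour is unchanged when DDP is properly initialized.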

Tested on:
  * Windows 11 + RTX 5090 (sm_120) + CUDA 12.8 + PyTorch 2.11+cu128:
    15-epoch s1 training completes cleanly, weights saved as expected.

Closes #2626.
---
 GPT_SoVITS/AR/data/bucket_sampler.py | 14 ++++++++------
 GPT_SoVITS/s1_train.py               | 14 ++++++++++----
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/GPT_SoVITS/AR/data/bucket_sampler.py b/GPT_SoVITS/AR/data/bucket_sampler.py
index d8457334..8f0ecefa 100644
--- a/GPT_SoVITS/AR/data/bucket_sampler.py
+++ b/GPT_SoVITS/AR/data/bucket_sampler.py
@@ -36,14 +36,16 @@ class DistributedBucketSampler(Sampler[T_co]):
         drop_last: bool = False,
         batch_size: int = 32,
     ) -> None:
+        # Patched: support non-DDP single-GPU runs (Lightning strategy='auto' on
+        # Windows). When the distributed group isn't initialized, fall back to
+        # a single-replica configuration.
+        _dist_ready = (
+            dist.is_available() and dist.is_initialized() and torch.cuda.is_available()
+        )
         if num_replicas is None:
-            if not dist.is_available():
-                raise RuntimeError("Requires distributed package to be available")
-            num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1
+            num_replicas = dist.get_world_size() if _dist_ready else 1
         if rank is None:
-            if not dist.is_available():
-                raise RuntimeError("Requires distributed package to be available")
-            rank = dist.get_rank() if torch.cuda.is_available() else 0
+            rank = dist.get_rank() if _dist_ready else 0
             if torch.cuda.is_available():
                 torch.cuda.set_device(rank)
         if rank >= num_replicas or rank < 0:
diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py
index 1176f0bc..b98de600 100644
--- a/GPT_SoVITS/s1_train.py
+++ b/GPT_SoVITS/s1_train.py
@@ -114,12 +114,18 @@ def main(args):
         # val_check_interval=9999999999999999999999,###不要验证
         # check_val_every_n_epoch=None,
         limit_val_batches=0,
-        devices=-1 if torch.cuda.is_available() else 1,
+        # On Windows, force single-device (no DDP) — see strategy comment below.
+        # Non-Windows preserves original "all GPUs" behaviour.
+        devices=(1 if platform.system() == "Windows" else -1) if torch.cuda.is_available() else 1,
         benchmark=False,
         fast_dev_run=False,
-        strategy=DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo")
-        if torch.cuda.is_available()
-        else "auto",
+        # On Windows, DDPStrategy with the gloo backend crashes with native
+        # access violations on Blackwell (sm_120) / CUDA 12.8. Lightning's
+        # "auto" strategy picks `single_device` for 1 GPU which avoids DDP
+        # entirely. Non-Windows behaviour is preserved (NCCL DDP).
+        strategy="auto"
+        if (platform.system() == "Windows" or not torch.cuda.is_available())
+        else DDPStrategy(process_group_backend="nccl"),
         precision=config["train"]["precision"],
         logger=logger,
         num_sanity_val_steps=0,