mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2026-06-03 20:40:30 +08:00
Fix s1_train DDP crash on Windows single-GPU (sm_120 / Blackwell)
On Windows with a single GPU running CUDA 12.8 + PyTorch 2.7+ on Blackwell
(sm_120) hardware, s1_train.py crashes with an access violation (exit code
3221225477) shortly after pytorch_lightning's Trainer initialization, before
the first batch runs.
Root cause: DDPStrategy with the gloo backend is forced on Windows even
when there's only one GPU. The gloo + sm_120 + CUDA 12.8 combination has a
known incompatibility (see PyTorch forum "[Solved] RTX 5090 sm_120 Training
Segfault - DDP Was the Cause") that produces a native crash inside the
Lightning training loop.
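Two changes. The s1_train.py change is scoped to Windows + CUDA only; the
bucket_sampler.py change has no platform check, but it only alters behaviour
when the distributed package is unavailable or no process group is initialized: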
* GPT_SoVITS/s1_train.py: on Windows, use Lightning's "auto" strategy,
which picks `single_device` for one GPU and skips DDP entirely. Also
pin devices=1 on Windows so multi-GPU users don't accidentally enable
DDP. Non-Windows behaviour is unchanged (NCCL DDP, all available GPUs).
* GPT_SoVITS/AR/data/bucket_sampler.py: when the distributed process
group isn't initialized (i.e. running under single_device strategy),
fall back to a single-replica configuration (num_replicas=1, rank=0)
instead of crashing in dist.get_world_size(). The previous
"Requires distributed package to be available" RuntimeError is replaced
by the same fallback. Defensive change — behaviour is unchanged when
DDP is properly initialized.
Tested on:
* Windows 11 + RTX 5090 (sm_120) + CUDA 12.8 + PyTorch 2.11+cu128:
15-epoch s1 training completes cleanly, and weights are saved as expected.
Closes #2626.
This commit is contained in:
parent
08d627c333
commit
551d3dc281
@ -36,14 +36,16 @@ class DistributedBucketSampler(Sampler[T_co]):
|
|||||||
drop_last: bool = False,
|
drop_last: bool = False,
|
||||||
batch_size: int = 32,
|
batch_size: int = 32,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
# Patched: support non-DDP single-GPU runs (Lightning strategy='auto' on
|
||||||
|
# Windows). When the distributed group isn't initialized, fall back to
|
||||||
|
# a single-replica configuration.
|
||||||
|
_dist_ready = (
|
||||||
|
dist.is_available() and dist.is_initialized() and torch.cuda.is_available()
|
||||||
|
)
|
||||||
if num_replicas is None:
|
if num_replicas is None:
|
||||||
if not dist.is_available():
|
num_replicas = dist.get_world_size() if _dist_ready else 1
|
||||||
raise RuntimeError("Requires distributed package to be available")
|
|
||||||
num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1
|
|
||||||
if rank is None:
|
if rank is None:
|
||||||
if not dist.is_available():
|
rank = dist.get_rank() if _dist_ready else 0
|
||||||
raise RuntimeError("Requires distributed package to be available")
|
|
||||||
rank = dist.get_rank() if torch.cuda.is_available() else 0
|
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.set_device(rank)
|
torch.cuda.set_device(rank)
|
||||||
if rank >= num_replicas or rank < 0:
|
if rank >= num_replicas or rank < 0:
|
||||||
|
|||||||
@ -114,12 +114,18 @@ def main(args):
|
|||||||
# val_check_interval=9999999999999999999999,###不要验证
|
# val_check_interval=9999999999999999999999,###不要验证
|
||||||
# check_val_every_n_epoch=None,
|
# check_val_every_n_epoch=None,
|
||||||
limit_val_batches=0,
|
limit_val_batches=0,
|
||||||
devices=-1 if torch.cuda.is_available() else 1,
|
# On Windows, force single-device (no DDP) — see strategy comment below.
|
||||||
|
# Non-Windows preserves original "all GPUs" behaviour.
|
||||||
|
devices=(1 if platform.system() == "Windows" else -1) if torch.cuda.is_available() else 1,
|
||||||
benchmark=False,
|
benchmark=False,
|
||||||
fast_dev_run=False,
|
fast_dev_run=False,
|
||||||
strategy=DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo")
|
# On Windows, DDPStrategy with the gloo backend crashes with native
|
||||||
if torch.cuda.is_available()
|
# access violations on Blackwell (sm_120) / CUDA 12.8. Lightning's
|
||||||
else "auto",
|
# "auto" strategy picks `single_device` for 1 GPU which avoids DDP
|
||||||
|
# entirely. Non-Windows behaviour is preserved (NCCL DDP).
|
||||||
|
strategy="auto"
|
||||||
|
if (platform.system() == "Windows" or not torch.cuda.is_available())
|
||||||
|
else DDPStrategy(process_group_backend="nccl"),
|
||||||
precision=config["train"]["precision"],
|
precision=config["train"]["precision"],
|
||||||
logger=logger,
|
logger=logger,
|
||||||
num_sanity_val_steps=0,
|
num_sanity_val_steps=0,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user