From 60f07ea36e3d306606bc03960b02dd0e16702629 Mon Sep 17 00:00:00 2001
From: csh <458761603@qq.com>
Date: Mon, 18 Aug 2025 16:25:38 +0800
Subject: [PATCH] =?UTF-8?q?stream=5Fv2pro:=20=E4=BF=AE=E5=A4=8D=E7=94=B1?=
 =?UTF-8?q?=E4=BA=8E=20spectrogram=5Ftorch=20=E8=BE=93=E5=85=A5=E6=98=AF?=
 =?UTF-8?q?=20half=20=E5=AF=BC=E8=87=B4=20spec=20=E6=BA=A2=E5=87=BA?=
 =?UTF-8?q?=E6=9C=80=E7=BB=88=E6=B2=A1=E6=9C=89=E5=A3=B0=E9=9F=B3=E7=9A=84?=
 =?UTF-8?q?=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/stream_v2pro.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPT_SoVITS/stream_v2pro.py b/GPT_SoVITS/stream_v2pro.py
index 0a7712ad..718ef9af 100644
--- a/GPT_SoVITS/stream_v2pro.py
+++ b/GPT_SoVITS/stream_v2pro.py
@@ -162,13 +162,14 @@ class StepVitsModel(nn.Module):
     def ref_handle(self, ref_audio_32k):
         refer = spectrogram_torch(
             self.hann_window,
-            ref_audio_32k,
+            ref_audio_32k.float(),  # upcast: half input overflowed the spectrogram (silent output)
             self.hps.data.filter_length,
             self.hps.data.sampling_rate,
             self.hps.data.hop_length,
             self.hps.data.win_length,
             center=False,
         )
+        refer = refer.to(ref_audio_32k.dtype)  # cast the spectrogram back to the input dtype
         ref_audio_16k = resamplex(ref_audio_32k, 32000, 16000).to(ref_audio_32k.dtype).to(ref_audio_32k.device)
         sv_emb = self.sv(ref_audio_16k)
         return refer, sv_emb