From 60f07ea36e3d306606bc03960b02dd0e16702629 Mon Sep 17 00:00:00 2001
From: csh <458761603@qq.com>
Date: Mon, 18 Aug 2025 16:25:38 +0800
Subject: [PATCH] =?UTF-8?q?stream=5Fv2pro:=20=E4=BF=AE=E5=A4=8D=E7=94=B1?=
 =?UTF-8?q?=E4=BA=8E=20spectrogram=5Ftorch=20=E8=BE=93=E5=85=A5=E6=98=AF?=
 =?UTF-8?q?=20half=20=E5=AF=BC=E8=87=B4=20spec=20=E6=BA=A2=E5=87=BA?=
 =?UTF-8?q?=E6=9C=80=E7=BB=88=E6=B2=A1=E6=9C=89=E5=A3=B0=E9=9F=B3=E7=9A=84?=
 =?UTF-8?q?=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/stream_v2pro.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPT_SoVITS/stream_v2pro.py b/GPT_SoVITS/stream_v2pro.py
index 0a7712ad..718ef9af 100644
--- a/GPT_SoVITS/stream_v2pro.py
+++ b/GPT_SoVITS/stream_v2pro.py
@@ -162,13 +162,14 @@ class StepVitsModel(nn.Module):
     def ref_handle(self, ref_audio_32k):
         refer = spectrogram_torch(
             self.hann_window,
-            ref_audio_32k,
+            ref_audio_32k.float(),
             self.hps.data.filter_length,
             self.hps.data.sampling_rate,
             self.hps.data.hop_length,
             self.hps.data.win_length,
             center=False,
         )
+        refer = refer.to(ref_audio_32k.dtype)
         ref_audio_16k = resamplex(ref_audio_32k, 32000, 16000).to(ref_audio_32k.dtype).to(ref_audio_32k.device)
         sv_emb = self.sv(ref_audio_16k)
         return refer, sv_emb