diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py
index 58a21ee..2dffac1 100644
--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@@ -223,7 +223,7 @@ class TextEncoder(nn.Module):
 
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
 
-    def forward(self, y, y_lengths, text, text_lengths, ge, test=None):
+    def forward(self, y, y_lengths, text, text_lengths, ge, speed=1, test=None):
         y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(
             y.dtype
         )
@@ -240,9 +240,10 @@ class TextEncoder(nn.Module):
         text = self.text_embedding(text).transpose(1, 2)
         text = self.encoder_text(text * text_mask, text_mask)
         y = self.mrte(y, y_mask, text, text_mask, ge)
-
         y = self.encoder2(y * y_mask, y_mask)
-
+        if speed != 1:
+            y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear")
+            y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest")
         stats = self.proj(y) * y_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
         return y, m, logs, y_mask
@@ -966,7 +967,7 @@ class SynthesizerTrn(nn.Module):
         return o, y_mask, (z, z_p, m_p, logs_p)
 
     @torch.no_grad()
-    def decode(self, codes, text, refer, noise_scale=0.5):
+    def decode(self, codes, text, refer, noise_scale=0.5, speed=1):
         ge = None
         if refer is not None:
             refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
@@ -983,9 +984,8 @@ class SynthesizerTrn(nn.Module):
             quantized = F.interpolate(
                 quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
             )
-
         x, m_p, logs_p, y_mask = self.enc_p(
-            quantized, y_lengths, text, text_lengths, ge
+            quantized, y_lengths, text, text_lengths, ge, speed
         )
         z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
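
Usage note: the patch threads a single speed factor from SynthesizerTrn.decode into TextEncoder.forward, where the encoded latent y and its mask are resampled to int(y.shape[-1] / speed) + 1 frames, so speed > 1 yields proportionally fewer frames and hence faster speech, while speed == 1 skips the interpolate branch and preserves the old behavior. Below is a minimal calling sketch, assuming a SynthesizerTrn instance already constructed and loaded elsewhere; the variable names (model, codes, text, refer) and the speed=1.25 value are illustrative, not part of the patch.

    import torch

    # Assumed: `model` is a loaded SynthesizerTrn; `codes` (semantic tokens),
    # `text` (phoneme ids), and `refer` (reference spectrogram) come from the
    # usual GPT-SoVITS inference pipeline.
    model.eval()

    # speed > 1 -> shorter latent sequence -> faster speech; speed < 1 slows
    # it down. decode() is already wrapped in @torch.no_grad(), so no extra
    # no_grad context is needed here.
    audio = model.decode(codes, text, refer, noise_scale=0.5, speed=1.25)

Resampling the latent before the projection to (m, logs), rather than time-stretching the output waveform, keeps pitch and timbre intact because the vocoder still synthesizes at the normal frame rate.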