diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index 8fd0a084..17d4a0f6 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -1330,22 +1330,12 @@ class TTS:
                     continue
 
                 _semantic_tokens = semantic_tokens
-                # if is_first_chunk:
-                #     _semantic_tokens = torch.cat([torch.ones((1,overlap_length), dtype=torch.long, device=self.configs.device)*self.configs.mute_tokens[self.configs.version], _semantic_tokens], dim=-1)
-                # else:
-                #     _semantic_tokens = torch.cat([last_tokens[:, -overlap_length:], _semantic_tokens], dim=-1)
-                # # _semantic_tokens = torch.cat(previous_tokens+[_semantic_tokens,], dim=-1)
                 previous_tokens.append(semantic_tokens)
                 _semantic_tokens = torch.cat(previous_tokens, dim=-1)
-                # last_tokens = semantic_tokens
-
-                # print(f"_semantic_tokens shape:{_semantic_tokens.shape}")
-
-
                 if not self.configs.use_vocoder:
                     audio_chunk = self.vits_model.decode(
                         _semantic_tokens.unsqueeze(0),
@@ -1360,13 +1350,7 @@ class TTS:
                         speed=speed_factor,
                         sample_steps=sample_steps,
                         result_length = semantic_tokens.shape[-1]+overlap_length if not is_first_chunk else None
                     )
-
-
-                # if is_first_chunk:
-                #     audio_chunk = audio_chunk[overlap_size:]
-                #     # is_first_chunk = False
-
                 audio_chunk_ = audio_chunk
                 if is_first_chunk and not is_final:
                     is_first_chunk = False
@@ -1379,7 +1363,7 @@ class TTS:
                         audio_chunk_[last_audio_chunk.shape[0]-overlap_size:-overlap_size] if not is_final \
                             else audio_chunk_[last_audio_chunk.shape[0]-overlap_size:]
                     )
-                    # audio_chunk_ = audio_chunk_[:-overlap_size] if not is_final else audio_chunk_
+
                 last_audio_chunk = audio_chunk
 
                 yield self.audio_postprocess(
@@ -1391,7 +1375,7 @@ class TTS:
                     0.0,
                     super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
                 )
-                print(f"first_package_delay: {time.perf_counter()-t0:.3f}")
+                # print(f"first_package_delay: {time.perf_counter()-t0:.3f}")
 
                 yield output_sr, np.zeros(int(output_sr*fragment_interval), dtype=np.int16)
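
Review note (not part of the patch): the slicing kept in the third hunk stitches the streamed chunks together. Each chunk is re-decoded from the full token history, the part already emitted is skipped, and non-final chunks hold back overlap_size samples so the next chunk can re-decode that region. Below is a minimal NumPy sketch of that idea under those assumptions; stitch_chunk is a hypothetical helper, not a function in the repository, and the first-chunk branch is an assumption since that part of the function falls outside the hunks shown.

import numpy as np

def stitch_chunk(audio_chunk, last_audio_chunk, overlap_size, is_final):
    """Hypothetical helper mirroring the slice kept in the third hunk.

    Each chunk is decoded from the concatenated token history, so its
    waveform re-synthesizes audio that was already emitted. Non-final
    chunks hold back `overlap_size` samples; the next chunk re-decodes
    that region, keeping the stream gapless without duplicate samples.
    """
    if last_audio_chunk is None:
        # First chunk: assumed to hold back the overlap tail as well
        # (this branch is not visible in the hunks above).
        return audio_chunk if is_final else audio_chunk[:-overlap_size]
    start = last_audio_chunk.shape[0] - overlap_size
    # Matches the kept slice: [last.shape[0]-overlap_size:-overlap_size]
    # for intermediate chunks, and [last.shape[0]-overlap_size:] at the end.
    return audio_chunk[start:] if is_final else audio_chunk[start:-overlap_size]

# Example: two chunks decoded with a 100-sample overlap.
chunk1 = np.zeros(1000)
out1 = stitch_chunk(chunk1, None, overlap_size=100, is_final=False)   # 900 samples emitted
chunk2 = np.zeros(1900)  # re-decodes chunk1's region plus new audio
out2 = stitch_chunk(chunk2, chunk1, overlap_size=100, is_final=True)  # picks up at sample 900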