Update api.py

2026-06-28 08:48:14 +08:00 · 2024-01-24 17:02:18 +08:00 · 2024-01-24 17:02:18 +08:00 · 93dd8334f4
commit 93dd8334f4
parent 30a5168637
1 changed file with 7 additions and 2 deletions
--- a/api.py
+++ b/api.py
@@ -192,13 +192,18 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
    t0 = ttime()
    prompt_text = prompt_text.strip("\n")
    prompt_language, text = prompt_language, text.strip("\n")
+    zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
    with torch.no_grad():
-        wav16k, sr = librosa.load(ref_wav_path, sr=16000)  # 派蒙
+        wav16k, sr = librosa.load(ref_wav_path, sr=16000)
        wav16k = torch.from_numpy(wav16k)
+        zero_wav_torch = torch.from_numpy(zero_wav)
        if (is_half == True):
            wav16k = wav16k.half().to(device)
+            zero_wav_torch = zero_wav_torch.half().to(device)
        else:
            wav16k = wav16k.to(device)
+            zero_wav_torch = zero_wav_torch.to(device)
+        wav16k=torch.cat([wav16k,zero_wav_torch])
        ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)  # .float()
        codes = vq_model.extract_latent(ssl_content)
        prompt_semantic = codes[0, 0]
@@ -209,7 +214,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
    phones1 = cleaned_text_to_sequence(phones1)
    texts = text.split("\n")
    audio_opt = []
-    zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
+
    for text in texts:
        phones2, word2ph2, norm_text2 = clean_text(text, text_language)
        phones2 = cleaned_text_to_sequence(phones2)