mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2026-05-19 00:48:12 +08:00
Introduce multiple new modules including unified_engine_api, unified_engine_audio, unified_engine_bridge, unified_engine_builder, unified_engine_components, unified_engine_delegates, and unified_engine_runtime. These additions provide a comprehensive framework for managing TTS requests, audio packing, and engine state management, significantly improving the architecture and maintainability of the TTS system. The new structure supports asynchronous operations and enhances overall performance through better request handling and processing capabilities.
107 lines
2.7 KiB
Python
107 lines
2.7 KiB
Python
from __future__ import annotations
|
|
|
|
import subprocess
|
|
import threading
|
|
import wave
|
|
from io import BytesIO
|
|
|
|
import numpy as np
|
|
import soundfile as sf
|
|
import torch
|
|
|
|
|
|
def set_scheduler_seed(seed: int):
    """Seed the NumPy and PyTorch random number generators.

    Nothing is seeded when ``seed`` is ``None``, an empty string, or converts
    to a negative integer. Any other value is converted with ``int()`` and
    applied to NumPy's global generator, to PyTorch, and (when CUDA is
    available) to the CUDA generators.
    """
    if seed in ("", None):
        return
    seed_value = int(seed)
    if seed_value >= 0:
        np.random.seed(seed_value)
        torch.manual_seed(seed_value)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed_value)
            torch.cuda.manual_seed_all(seed_value)
|
|
|
|
|
|
def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode ``data`` as single-channel OGG into ``io_buffer`` and return it.

    The encoding runs in a dedicated thread created after requesting a
    16 MiB thread stack (``4096 * 4096`` bytes). If the stack size cannot be
    set or the thread cannot be started (``RuntimeError`` / ``ValueError``),
    the encoding runs directly in the calling thread instead.

    Any exception raised while encoding is re-raised in the caller, whether
    the work ran in the helper thread or in the fallback path. Without this,
    a failure inside the thread would be lost and the caller would receive an
    empty or partially written buffer.

    Args:
        io_buffer: Writable buffer that receives the OGG stream.
        data: Audio samples handed to ``SoundFile.write``.
        rate: Sample rate in Hz.

    Returns:
        The same ``io_buffer`` object that was passed in.
    """
    worker_errors = []

    def handle_pack_ogg():
        with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file:
            audio_file.write(data)

    def run_in_thread():
        # Exceptions do not cross thread boundaries on their own; stash the
        # error so the caller can re-raise it after join().
        try:
            handle_pack_ogg()
        except Exception as exc:
            worker_errors.append(exc)

    # NOTE(review): presumably the OGG encoder needs more stack than the
    # default thread stack provides -- confirm. threading.stack_size() applies
    # to every thread created afterwards and is not restored here.
    stack_size = 4096 * 4096
    try:
        threading.stack_size(stack_size)
        pack_ogg_thread = threading.Thread(target=run_in_thread)
        pack_ogg_thread.start()
        pack_ogg_thread.join()
    except (RuntimeError, ValueError):
        # The thread never ran, so encode in the current thread.
        handle_pack_ogg()

    # Re-raise outside the try block so an encoder error of type RuntimeError
    # or ValueError is not mistaken for a thread-setup failure.
    if worker_errors:
        raise worker_errors[0]
    return io_buffer
|
|
|
|
|
|
def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Append the raw bytes of ``data`` to ``io_buffer`` and return the buffer.

    ``rate`` is accepted so the signature matches the other ``pack_*``
    helpers; it is not used.
    """
    raw_bytes = data.tobytes()
    io_buffer.write(raw_bytes)
    return io_buffer
|
|
|
|
|
|
def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Write ``data`` as a WAV file into a new ``BytesIO`` and return it.

    Note that the ``io_buffer`` argument is not written to: a fresh buffer is
    created and returned, so callers must use the return value.
    """
    wav_buffer = BytesIO()
    sf.write(wav_buffer, data, rate, format="wav")
    return wav_buffer
|
|
|
|
|
|
def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode ``data`` to AAC (ADTS container) with ffmpeg and write it to ``io_buffer``.

    The bytes of ``data`` are piped to an ``ffmpeg`` subprocess, which is told
    to read them as mono signed 16-bit little-endian PCM at ``rate`` Hz and to
    emit 192 kbit/s AAC on stdout.

    Args:
        io_buffer: Writable buffer that receives the encoded stream.
        data: Audio samples; ``data.tobytes()`` is sent to ffmpeg unchanged.
            NOTE(review): presumably an int16 array, since ffmpeg is told the
            input is ``s16le`` -- confirm against callers.
        rate: Sample rate in Hz.

    Returns:
        The same ``io_buffer`` object that was passed in.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. The message
            includes ffmpeg's stderr output.
        OSError: If the ``ffmpeg`` executable cannot be launched.
    """
    command = [
        "ffmpeg",
        "-f",
        "s16le",
        "-ar",
        str(rate),
        "-ac",
        "1",
        "-i",
        "pipe:0",
        "-c:a",
        "aac",
        "-b:a",
        "192k",
        "-vn",
        "-f",
        "adts",
        "pipe:1",
    ]
    # subprocess.run feeds ``input`` through a stdin pipe and waits for exit.
    result = subprocess.run(
        command,
        input=data.tobytes(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        # Surface encoder failures instead of silently returning empty or
        # truncated audio.
        detail = result.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg exited with code {result.returncode} while encoding AAC: {detail}"
        )
    io_buffer.write(result.stdout)
    return io_buffer
|
|
|
|
|
|
def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str):
    """Pack ``data`` in the format named by ``media_type`` and rewind the result.

    ``"ogg"``, ``"aac"`` and ``"wav"`` select the matching ``pack_*`` helper;
    any other value falls back to ``pack_raw``. The buffer returned by the
    helper is positioned at offset 0 before being returned.
    """
    packers = {
        "ogg": pack_ogg,
        "aac": pack_aac,
        "wav": pack_wav,
    }
    packer = packers.get(media_type, pack_raw)
    packed = packer(io_buffer, data, rate)
    packed.seek(0)
    return packed
|
|
|
|
|
|
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
    """Return the bytes of a WAV file holding ``frame_input`` as its frames.

    With the default empty ``frame_input`` the result is just a WAV header
    for the given channel count, sample width (in bytes) and sample rate.
    """
    buffer = BytesIO()
    writer = wave.open(buffer, "wb")
    with writer:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(sample_rate)
        writer.writeframes(frame_input)
    # The header is finalized when the writer closes; the BytesIO stays open.
    return buffer.getvalue()
|
|
|
|
|