Compare commits

...

8 Commits

Author SHA1 Message Date
Ray
acb8018fec
Merge 62ee3c2aa063bc2361127f9aa418eea49a132dae into 08d627c3338173c3229286d8787060d6559fe0f8 2026-05-14 01:28:52 +08:00
RVC-Boss
08d627c333
增加cuda graph支持,普通推理模式推理速度原地翻倍,效果不变。2
增加cuda graph支持,普通推理模式推理速度原地翻倍,效果不变。2
2026-04-30 15:01:45 +08:00
RVC-Boss
6d95b559e8
增加cuda graph支持,普通推理模式推理速度原地翻倍,效果不变。1
增加cuda graph支持,普通推理模式推理速度原地翻倍,效果不变。1
2026-04-30 15:01:11 +08:00
RVC-Boss
ea2d2a8166
Update README.md 2026-04-19 21:02:57 +08:00
SapphireLab
d9f03dad3e
Update Documentation (#2768)
* 调整日志格式

* docs: Update other languages' changelogs
2026-04-18 22:33:55 +08:00
RVC-Boss
647935357a
Update Changelog_CN.md 2026-04-18 19:01:11 +08:00
RVC-Boss
02425ea256
Fixed issues such as missing imports for types like Optional.
Fixed issues such as missing imports for types like `Optional`.
2026-04-18 17:33:53 +08:00
Ray
62ee3c2aa0 feat: improve WebUI usability with presets and native dialogs 2026-01-16 18:12:05 +11:00
15 changed files with 4049 additions and 2335 deletions

View File

@ -0,0 +1,76 @@
import math
import torch
from torch import nn
class TokenEmbedding(nn.Module):
    """Embedding table followed by dropout.

    Args:
        embedding_dim: Width of each embedding vector.
        vocab_size: Number of rows in the embedding table.
        dropout: Dropout probability applied to the looked-up embeddings.
    """

    def __init__(self, embedding_dim: int, vocab_size: int, dropout: float = 0.0):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dropout = nn.Dropout(p=dropout)
        self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)

    @property
    def weight(self) -> torch.Tensor:
        """The underlying embedding matrix of shape ``(vocab_size, embedding_dim)``."""
        return self.word_embeddings.weight

    def embedding(self, index: int) -> torch.Tensor:
        """Return row ``index`` of the table, keeping a leading dimension of 1."""
        table = self.weight
        return table[index : index + 1]

    def forward(self, x: torch.Tensor):
        """Look up the embeddings for token ids ``x`` and apply dropout."""
        return self.dropout(self.word_embeddings(x))
class SinePositionalEmbeddingNested(nn.Module):
    """Sinusoidal positional embedding with a pre-computed per-batch table.

    The table ``pe`` has shape ``(max_batch_size, max_seq_len, embedding_dim)``
    and is registered as a non-persistent buffer, so it is not part of the
    state dict.

    Args:
        embedding_dim: Width of the embedding vectors.
        dropout: Probability for the ``dropout`` submodule (constructed here but
            not applied by ``forward`` or ``prefill``).
        scale: When true, inputs are multiplied by ``sqrt(embedding_dim)``.
        alpha: Whether the scalar ``alpha`` parameter requires gradients.
        max_batch_size: Number of batch rows in the table.
        max_seq_len: Number of positions in the table.
    """

    def __init__(
        self,
        embedding_dim: int,
        dropout: float = 0.0,
        scale: bool = False,
        alpha: bool = False,
        max_batch_size: int = 20,
        max_seq_len: int = 2500,
    ):
        super().__init__()
        self.embedding_dim = embedding_dim
        if scale:
            self.x_scale = math.sqrt(embedding_dim)
        else:
            self.x_scale = 1.0
        self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
        self.dropout = nn.Dropout(p=dropout)
        self.max_batch_size = max_batch_size
        self.max_seq_len = max_seq_len
        self.reverse = False
        table = torch.zeros(max_batch_size, max_seq_len, embedding_dim)
        self.register_buffer("pe", table, persistent=False)
        self.pe: torch.Tensor
        self.compute_pe()

    def compute_pe(self):
        """Fill ``self.pe`` in place with sine (even dims) and cosine (odd dims) values."""
        if self.reverse:
            steps = torch.arange(self.max_seq_len - 1, -1, -1.0, dtype=torch.float32)
        else:
            steps = torch.arange(self.max_seq_len, dtype=torch.float32)
        position = steps.unsqueeze(1)
        frequency = torch.exp(
            torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.embedding_dim)
        )
        # The same (seq_len, dim/2) values broadcast across every batch row.
        self.pe[:, :, 0::2] = torch.sin(position * frequency)
        self.pe[:, :, 1::2] = torch.cos(position * frequency)

    def forward(self, input_pos: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        """Add the embedding for 1-based position ``input_pos`` to each batch row of ``x``."""
        rows = torch.arange(x.shape[0])
        # input_pos is 1-based, hence the -1 when indexing the table.
        selected = self.pe[rows, input_pos - 1]
        return x * self.x_scale + self.alpha * selected.unsqueeze(1)

    def prefill(self, x: torch.Tensor) -> torch.Tensor:
        """Add positions ``0..len-1`` to every sequence of the nested tensor ``x``."""
        lengths = [seq.shape[0] for seq in x.unbind()]
        pe_values = torch.nested.nested_tensor(
            [self.pe[row, :length, :] for row, length in enumerate(lengths)]
        )
        return x * self.x_scale + self.alpha.item() * pe_values

View File

@ -0,0 +1,78 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Literal, Optional
import torch
Tensor = torch.Tensor
@dataclass
class T2SResult:
    """Outcome of one text-to-semantic generation request."""

    # Generated token tensors, one per returned sequence; None when no result was produced.
    result: List[Tensor] | None = None
    # Decoding throughput in tokens per second; stays 0.0 when it was not measured.
    infer_speed: float = 0.0
    # "Success" on normal completion, "Error" when an exception was captured.
    status: Literal["Success", "Error"] = "Success"
    # The captured exception object, if any.
    exception: Optional[Exception] = None
    # Formatted traceback string for the captured exception, if any.
    traceback: Optional[str] = None
@dataclass
class T2SRequest:
    """Inputs and sampling options for one text-to-semantic generation request."""

    # One tensor per batch item; presumably phoneme ids (fed to the text embedding) — confirm with caller.
    x: List[torch.Tensor]
    # Per-item lengths of ``x``.
    x_lens: Tensor
    # Prompt tokens used as the initial ``y`` sequence; its last dimension is the prompt length.
    prompts: torch.Tensor
    # One feature tensor per batch item; presumably BERT features of width 1024 — confirm with caller.
    bert_feature: List[Tensor]
    # Number of leading results returned to the caller.
    valid_length: int
    # Sampling options forwarded to the sampler.
    top_k: int = 5
    top_p: float = 1
    # Stop once this many tokens were generated; -1 disables the limit.
    early_stop_num: int = -1
    temperature: float = 1.0
    repetition_penalty: float = 1.35
    # Capture and replay the decode step as a CUDA graph when CUDA is available.
    use_cuda_graph: bool = False
    debug: bool = False
class T2SSession:
    """Per-request decoding state: inputs, position counters, results and prefill masks."""

    def __init__(self, decoder, request: T2SRequest, device: torch.device, dtype: torch.dtype):
        with device:
            self.decoder = decoder
            self.request = request
            self.device = device
            self.dtype = dtype

            self.bsz = bsz = len(request.x)
            self.y_len = y_len = request.prompts.size(-1)

            # Imported here rather than at module level (the model module imports this one).
            from AR.models.t2s_model_cudagraph import Sampler

            self.sampler = Sampler(bsz, decoder.vocab_size)

            self.x = request.x
            self.x_lens = request.x_lens.to(torch.int32)
            self.y = request.prompts
            self.bert_feature = request.bert_feature

            # Text length plus prompt length for each item; decoding resumes from here.
            self.prefill_len = self.x_lens + self.y.size(1)
            self.input_pos = self.prefill_len.clone()

            batch = len(self.x)
            self.completed = torch.zeros(batch, dtype=torch.bool, device=device)
            self.y_results: List[Tensor] = [None for _ in range(batch)]  # type: ignore

            self.xy_pos = decoder.embed(self.x, self.y, self.bert_feature)

            self.attn_mask_nested = torch.nested.nested_tensor(
                [
                    self._prefill_mask(int(self.x_lens[row].item()), y_len)
                    for row in range(bsz)
                ]
            )

    @staticmethod
    def _prefill_mask(x_len: int, y_len: int) -> Tensor:
        """Boolean mask: every row sees all text columns; prompt rows see prompt columns causally."""
        total = x_len + y_len
        mask = torch.zeros(total, total, dtype=torch.bool)
        mask[:, :x_len].fill_(True)
        if y_len > 0:
            upper = torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1)
            mask[-y_len:, -y_len:] = ~upper
        return mask

View File

@ -0,0 +1,602 @@
"""
CUDA Graph accelerated T2S decoder.
Uses PyTorch native scaled_dot_product_attention (no flash_attn dependency).
Adapted from gsvpp/AR/models/t2s_model_abc.py and t2s_model_flash_attn.py.
"""
from __future__ import annotations
import os
import time
import traceback
from typing import Dict, List, MutableSequence, Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.graphs import CUDAGraph
from tqdm import tqdm
from AR.models.embedding_cudagraph import (
SinePositionalEmbeddingNested as SinePositionalEmbedding,
)
from AR.models.embedding_cudagraph import TokenEmbedding
from AR.models.structs_cudagraph import T2SRequest, T2SResult, T2SSession
Tensor = torch.Tensor
class Sampler(nn.Module):
    """Token sampler applying repetition penalty, top-p, temperature and top-k.

    Args:
        batch_size: Stored on the instance; not used by ``sample``.
        vocab_size: Accepted for interface compatibility; not used.
    """

    def __init__(self, batch_size: int, vocab_size: int) -> None:
        super().__init__()
        self.batch_size = batch_size

    def sample(
        self,
        logits: Tensor,
        previous_tokens: Tensor,
        temperature: float,
        top_k: int,
        top_p: float,
        repetition_penalty: float,
    ) -> Tensor:
        """Draw one token per batch row.

        Args:
            logits: ``(batch, vocab)`` scores. NOTE: modified in place by the
                repetition penalty, so the caller sees the penalised values.
            previous_tokens: ``(batch, n)`` token ids already present in the sequence.
            temperature: Softmax temperature; values below ``1e-5`` are clamped.
            top_k: Number of highest-scoring tokens kept. Values larger than the
                vocabulary size are clamped to the vocabulary size.
            top_p: Nucleus threshold on cumulative probability.
            repetition_penalty: Factor that lowers the score of previous tokens.

        Returns:
            ``(batch, 1)`` int32 tensor of sampled token ids.
        """
        previous_tokens = previous_tokens.long()
        # Repetition penalty: push scores of already-generated tokens toward "less likely".
        score = torch.gather(logits, dim=1, index=previous_tokens)
        score = torch.where(
            score < 0, score * repetition_penalty, score / repetition_penalty
        )
        logits.scatter_(dim=1, index=previous_tokens, src=score)

        # Top-p: drop the tail whose cumulative probability exceeds top_p,
        # always keeping the single most likely token.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(
            torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1
        )
        sorted_indices_to_remove = cum_probs > top_p
        sorted_indices_to_remove[:, 0] = False
        indices_to_remove = sorted_indices_to_remove.scatter(
            dim=1, index=sorted_indices, src=sorted_indices_to_remove
        )
        logits = logits.masked_fill(indices_to_remove, -float("Inf"))

        logits = logits / max(temperature, 1e-5)

        # Top-k: torch.topk raises when k exceeds the dimension size, so clamp it.
        k = min(top_k, logits.size(-1))
        v, _ = torch.topk(logits, k)
        pivot = v[:, -1].unsqueeze(-1)
        logits = torch.where(logits < pivot, -float("Inf"), logits)

        probs = torch.nn.functional.softmax(logits, dim=-1)
        # Sampling via exponential noise: argmax(p / Exp(1)) draws from p without
        # a host synchronisation.
        q = torch.empty_like(probs).exponential_(1.0)
        idx_next = torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int32)
        return idx_next
# ─── KV Cache ────────────────────────────────────────────────────────────────
class KVCacheNHD(nn.Module):
    """Pre-allocated key/value cache laid out as ``(batch, seq, heads, head_dim)``.

    Both caches are non-persistent buffers, so they are excluded from the state dict.
    """

    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
        super().__init__()
        assert batch_size > 0
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length
        self.n_head = n_heads
        self.head_dim = head_dim
        shape = (batch_size, max_seq_length, n_heads, head_dim)
        for buffer_name in ("k_cache", "v_cache"):
            self.register_buffer(buffer_name, torch.zeros(size=shape), persistent=False)

    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
        """Write one key/value step per batch row at slot ``input_pos - 1``.

        Returns the full key and value caches (the buffers themselves, not copies).
        """
        # input_pos is 1-based; broadcast the slot index over heads and head_dim
        # so scatter_ along the sequence axis writes the whole (heads, head_dim) slice.
        slot = (input_pos - 1)[..., None, None, None]
        index = slot.expand(-1, -1, self.n_head, self.head_dim).long()
        keys, values = self.k_cache, self.v_cache
        keys.scatter_(1, index, k_val)
        values.scatter_(1, index, v_val)
        return keys, values

    def empty(self):
        """Zero both caches in place."""
        for cache in (self.k_cache, self.v_cache):
            cache.zero_()

    def prefill_kv(self, k_val: Tensor, v_val: Tensor, bs: int):
        """Copy a whole prefix of keys/values into batch row ``bs`` starting at slot 0."""
        k_len = k_val.shape[1]
        v_len = v_val.shape[1]
        self.k_cache[[bs], :k_len] = k_val
        self.v_cache[[bs], :v_len] = v_val
# ─── Attention (PyTorch native SDPA, no flash_attn) ─────────────────────────
class Attention(nn.Module):
    """Multi-head self-attention built on ``F.scaled_dot_product_attention``.

    ``forward`` handles incremental decoding against a pre-allocated KV cache;
    ``prefill`` processes whole (nested, variable-length) sequences and seeds
    the cache.

    Args:
        n_head: Number of attention heads; must divide ``hidden_dim``.
        hidden_dim: Model width.
    """

    def __init__(self, n_head: int, hidden_dim: int):
        super().__init__()
        self.n_head = n_head
        self.hidden_dim = hidden_dim
        assert hidden_dim % n_head == 0
        self.head_dim = hidden_dim // n_head
        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.dropout = nn.Dropout(0.1)
        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict, prefix, *args):
        """Rename ``in_proj_weight``/``in_proj_bias`` style keys to ``in_proj.weight``/``in_proj.bias``."""
        keys_to_modify = [key for key in state_dict if "in_proj_" in key]
        for key in keys_to_modify:
            new_key = key.replace("in_proj_", "in_proj.")
            state_dict[new_key] = state_dict.pop(key)

    def forward(
        self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheNHD
    ) -> Tensor:
        """Run one decode step.

        Args:
            x: ``(batch, seqlen, hidden_dim)`` input for the new step.
            input_pos: Per-row 1-based position of the new step; after the cache
                update, slots ``0 .. input_pos - 1`` hold valid entries.
            kv_cache: Cache whose ``update`` stores the new key/value and returns
                the full key and value caches.

        Returns:
            ``(batch, seqlen, hidden_dim)`` attention output.
        """
        bsz, seqlen, _ = x.shape
        q, k, v = self.in_proj.forward(x).chunk(3, dim=-1)
        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
        k = k.view(bsz, seqlen, self.n_head, self.head_dim)
        v = v.view(bsz, seqlen, self.n_head, self.head_dim)
        k_cache, v_cache = kv_cache.update(input_pos, k, v)
        q = q.transpose(1, 2)  # [B, H, seqlen, D]
        k_out = k_cache.transpose(1, 2)  # [B, H, max_seq, D]
        v_out = v_cache.transpose(1, 2)  # [B, H, max_seq, D]
        # The cache is allocated for the maximum length and zero-filled beyond the
        # written prefix. Unmasked, those empty slots would each receive softmax
        # weight exp(0) and dilute the result, so only slots < input_pos are attended.
        # The mask is computed from the input_pos tensor on device, so it stays
        # valid when this step is replayed with an updated input_pos.
        cache_positions = torch.arange(k_cache.size(1), device=input_pos.device)
        valid = cache_positions.unsqueeze(0) < input_pos.reshape(-1, 1)  # [B, max_seq]
        attn_mask = valid[:, None, None, :]  # broadcast over heads and query steps
        attn = F.scaled_dot_product_attention(q, k_out, v_out, attn_mask=attn_mask)
        attn = self.dropout.forward(attn)
        attn = attn.transpose(1, 2).reshape(bsz, seqlen, self.hidden_dim)
        attn = self.out_proj.forward(attn)
        return attn

    def prefill(self, x: Tensor, mask: Tensor, kv_cache: KVCacheNHD) -> Tensor:
        """Process full sequences one batch row at a time and seed the KV cache.

        Args:
            x: Nested tensor; row ``i`` is ``(len_i, hidden_dim)``.
            mask: Nested boolean tensor; row ``i`` is the ``(len_i, len_i)``
                attention mask (True = may attend).
            kv_cache: Cache receiving each row's keys and values via ``prefill_kv``.

        Returns:
            Nested tensor of per-row outputs, each ``(len_i, hidden_dim)``.
        """
        bsz = x.size(0)
        outputs = []
        for bs in range(bsz):
            x_b = x[bs].unsqueeze(0)  # [1, len, hidden]
            q, k, v = self.in_proj.forward(x_b).chunk(3, dim=-1)
            q = q.contiguous().view(1, -1, self.n_head, self.head_dim)
            k = k.contiguous().view(1, -1, self.n_head, self.head_dim)
            v = v.contiguous().view(1, -1, self.n_head, self.head_dim)
            kv_cache.prefill_kv(k, v, bs)
            q, k, v = map(lambda t: t.transpose(1, 2), (q, k, v))
            attn_mask = (
                mask[bs].unsqueeze(0).unsqueeze(0).expand(1, self.n_head, -1, -1)
            )
            attn = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
            attn = self.dropout.forward(attn)
            attn = attn.transpose(1, 2).contiguous().view(1, -1, self.hidden_dim)
            output = self.out_proj.forward(attn)
            outputs.append(output.squeeze(0))
        return torch.nested.nested_tensor(outputs)
# ─── Feed Forward ────────────────────────────────────────────────────────────
class FeedForward(nn.Module):
    """Two-layer ReLU MLP with dropout after each linear stage.

    Args:
        dim: Input and output width.
        hidden_dim: Width of the intermediate layer.
    """

    def __init__(self, dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.linear1 = nn.Linear(dim, hidden_dim, bias=True)
        self.linear2 = nn.Linear(hidden_dim, dim, bias=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: Tensor) -> Tensor:
        """Apply ``linear1 -> ReLU -> dropout -> linear2 -> dropout``."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout.forward(hidden)
        projected = self.linear2(hidden)
        return self.dropout.forward(projected)
# ─── Transformer Block ──────────────────────────────────────────────────────
class TransformerBlock(nn.Module):
    """Post-norm transformer layer: attention and feed-forward, each with residual + LayerNorm."""

    # (old substring, new substring) pairs applied in this order to checkpoint keys.
    _KEY_RENAMES = (
        ("self_attn", "attention"),
        ("linear", "feed_forward.linear"),
        ("norm1", "attention_norm"),
        ("norm2", "ffn_norm"),
    )

    def __init__(self, n_head, ffn_dim, hidden_dim) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.attention = Attention(n_head, hidden_dim)
        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
        self.attention_norm = nn.LayerNorm([hidden_dim])
        self.ffn_norm = nn.LayerNorm([hidden_dim])
        self.dropout = nn.Dropout(0.1)
        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
        """Rewrite checkpoint keys in place to this module's attribute names."""
        for old_key in list(state_dict.keys()):
            renamed = old_key
            for old, new in TransformerBlock._KEY_RENAMES:
                renamed = renamed.replace(old, new)
            state_dict[renamed] = state_dict.pop(old_key)

    def _finish(self, x: Tensor, attn_out: Tensor) -> Tensor:
        """Residual + norm around the attention output, then around the feed-forward."""
        h = self.attention_norm.forward(x + self.dropout.forward(attn_out))
        return self.ffn_norm.forward(h + self.feed_forward.forward(h))

    def forward(
        self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheNHD
    ) -> Tensor:
        """Incremental decode step using the KV cache."""
        attn_out = self.attention.forward(x, input_pos, kv_cache)
        return self._finish(x, attn_out)

    def prefill(self, x: Tensor, mask: Tensor, kv_cache: KVCacheNHD) -> Tensor:
        """Full-sequence pass that also seeds the KV cache."""
        attn_out = self.attention.prefill(x, mask, kv_cache)
        return self._finish(x, attn_out)
# ─── Transformer Decoder ────────────────────────────────────────────────────
class TransformerDecoder(nn.Module):
    """Stack of ``TransformerBlock`` layers, each paired with its own KV cache."""

    def __init__(
        self,
        hidden_dim,
        n_layer,
        n_head,
        ffn_dim,
        vocab_size,
        max_seq_length,
        max_batch_size,
    ) -> None:
        super().__init__()
        assert hidden_dim % n_head == 0
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        self.head_dim = hidden_dim // n_head
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        blocks = [TransformerBlock(n_head, ffn_dim, hidden_dim) for _ in range(n_layer)]
        self.layers = nn.ModuleList(blocks)
        self.max_seq_length: int = max_seq_length
        self.max_batch_size: int = max_batch_size

    def forward(
        self,
        input_pos: Tensor,
        x: Tensor,
        kv_caches: MutableSequence[KVCacheNHD],
    ):
        """Run one decode step through every layer, threading the hidden state."""
        hidden = x
        for block, cache in zip(self.layers, kv_caches):
            hidden = block.forward(hidden, input_pos, cache)
        return hidden

    def prefill(
        self,
        x: Tensor,
        mask: Tensor,
        kv_caches: MutableSequence[KVCacheNHD],
    ):
        """Run the full-sequence pass through every layer, seeding each cache."""
        hidden = x
        for block, cache in zip(self.layers, kv_caches):
            hidden = block.prefill(hidden, mask, cache)
        return hidden
# ─── T2S Decoder ─────────────────────────────────────────────────────────────
class T2SDecoder(nn.Module):
    """Text-to-semantic decoder: embeddings, transformer stack and output projection.

    Args:
        config: Mapping whose ``"model"`` entry supplies ``hidden_dim``,
            ``embedding_dim``, ``head``, ``n_layer``, ``vocab_size``,
            ``phoneme_vocab_size``, ``dropout`` and ``EOS``.
        *args: Accepted and ignored.
        norm_first: Accepted and ignored.
        max_seq_length: Sequence capacity of the KV caches and position tables.
        max_batch_size: Batch capacity of the KV caches and position tables.
        **kwds: Accepted and ignored.
    """

    def __init__(
        self,
        config,
        *args,
        norm_first=False,
        max_seq_length=2500,
        max_batch_size=10,
        **kwds,
    ) -> None:
        super().__init__()
        hidden_dim = config["model"]["hidden_dim"]
        embedding_dim = config["model"]["embedding_dim"]
        n_head = config["model"]["head"]
        n_layer = config["model"]["n_layer"]
        vocab_size = config["model"]["vocab_size"]
        phoneme_vocab_size = config["model"]["phoneme_vocab_size"]
        p_dropout = config["model"]["dropout"]
        EOS = config["model"]["EOS"]
        ffn_dim = hidden_dim * 4
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        assert hidden_dim % n_head == 0
        self.head_dim = hidden_dim // n_head
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.phoneme_vocab_size = phoneme_vocab_size
        self.p_dropout = p_dropout
        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size
        self.EOS = EOS
        # The EOS token must be the last vocabulary entry.
        assert self.EOS == self.vocab_size - 1
        self.bert_proj = nn.Linear(1024, self.embedding_dim)
        self.ar_text_embedding = TokenEmbedding(
            self.embedding_dim, self.phoneme_vocab_size, self.p_dropout
        )
        self.ar_text_position = SinePositionalEmbedding(
            self.embedding_dim,
            dropout=0.1,
            scale=False,
            alpha=True,
            max_batch_size=max_batch_size,
            max_seq_len=max_seq_length,
        )
        self.ar_audio_embedding = TokenEmbedding(
            self.embedding_dim, self.vocab_size, self.p_dropout
        )
        self.ar_audio_position = SinePositionalEmbedding(
            self.embedding_dim,
            dropout=0.1,
            scale=False,
            alpha=True,
            max_batch_size=max_batch_size,
            max_seq_len=max_seq_length,
        )
        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
        self.h = TransformerDecoder(
            hidden_dim,
            n_layer,
            n_head,
            ffn_dim,
            vocab_size,
            max_seq_length,
            max_batch_size,
        )
        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict, prefix, *args):
        """Strip a leading ``"model."`` from checkpoint keys, in place."""
        model_keys = [key for key in state_dict if key.startswith("model.")]
        for key in model_keys:
            new_key = key[len("model.") :]
            state_dict[new_key] = state_dict.pop(key)

    def init_cache(self, bsz: int = 0) -> nn.ModuleList:
        """Create one ``KVCacheNHD`` per layer.

        Args:
            bsz: Batch size of the caches; 0 means ``max_batch_size``.

        Returns:
            ``nn.ModuleList`` of caches moved to the device and dtype of ``bert_proj.bias``.
        """
        bsz = bsz or self.h.max_batch_size
        assert bsz <= self.h.max_batch_size
        seq_lens = self.h.max_seq_length
        # bert_proj.bias serves as the reference for where the model currently lives.
        device = self.bert_proj.bias.device
        dtype = self.bert_proj.bias.dtype
        return nn.ModuleList(
            [
                KVCacheNHD(bsz, seq_lens, self.n_head, self.head_dim)
                for _ in range(self.n_layer)
            ],
        ).to(device, dtype)

    def embed(
        self,
        x: List[torch.Tensor],
        y: torch.Tensor,
        bert_features: List[torch.Tensor],
    ):
        """Build the prefill input: positioned text embeddings followed by positioned audio embeddings.

        Args:
            x: One token-id tensor per batch item for the text embedding.
            y: Batched token ids for the audio embedding; split along dim 0.
            bert_features: One feature tensor per batch item; each is transposed
                (dims 0 and 1 swapped) before the 1024-wide projection.

        Returns:
            Nested tensor whose row ``i`` is the concatenation of item ``i``'s
            text part and audio part.
        """
        x_nested = torch.nested.nested_tensor(x)
        assert x_nested.size(0) <= self.max_batch_size
        bert_features_nested = torch.nested.nested_tensor(
            list(map(lambda t: t.transpose(0, 1), bert_features))
        )
        x_emb = self.ar_text_embedding.forward(x_nested)
        bert = self.bert_proj.forward(bert_features_nested)
        x_emb = x_emb + bert
        x_pos = self.ar_text_position.prefill(x_emb)
        y_nested = torch.nested.nested_tensor(list(y.unbind(0)))
        y_emb = self.ar_audio_embedding.forward(y_nested)
        y_pos = self.ar_audio_position.prefill(y_emb)
        xy_pos = torch.nested.nested_tensor(
            [torch.cat([x_pos[i], y_pos[i]]) for i in range(len(x))]
        )
        return xy_pos

    def capture(
        self,
        input_pos: Tensor,
        x: Tensor,
        x_dec: Tensor,
        kv_caches,
    ) -> CUDAGraph:
        """Capture one decode step of ``self.h`` as a CUDA graph.

        Args:
            input_pos: Static position tensor read by the captured step.
            x: Static input buffer read by the captured step.
            x_dec: Static output buffer the captured step writes into.
            kv_caches: KV caches used (and updated) by the captured step.

        Returns:
            The captured graph; replaying it recomputes ``x_dec`` from the
            current contents of ``input_pos``, ``x`` and ``kv_caches``.
        """
        # Warm up on a side stream before capturing.
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.stream(s):
            for _ in range(5):
                self.h.forward(input_pos, x, kv_caches)
        torch.cuda.current_stream().wait_stream(s)
        with torch.cuda.graph(graph):
            x_dec.copy_(self.h.forward(input_pos, x, kv_caches))
        torch.cuda.synchronize()
        return graph
# ─── CUDA Graph Runner ───────────────────────────────────────────────────────
class CUDAGraphRunner:
    """Drives autoregressive decoding, optionally replaying the decode step as a CUDA graph.

    The runner owns static buffers (input, output, position, KV caches) sized
    for batch size 1; a captured graph reads and writes those same buffers, so
    it stays valid across requests.

    Args:
        decoder_model: Decoder to run; moved to ``device``/``dtype``.
        device: Device for the model and all static buffers.
        dtype: Floating-point dtype for the model and static buffers.
    """

    # Upper bound on loop iterations per request (iteration 0 is the prefill).
    MAX_DECODE_STEPS = 1500

    def __init__(
        self,
        decoder_model: T2SDecoder,
        device: torch.device = torch.device("cpu"),
        dtype: torch.dtype = torch.float32,
    ) -> None:
        assert device.type in {"cpu", "cuda", "mps", "xpu", "mtia"}
        assert dtype in {torch.float16, torch.bfloat16, torch.float32}
        self.device = device
        self.dtype = dtype
        self.decoder_model: T2SDecoder = decoder_model.to(self.device, self.dtype)
        self.graph: Optional[CUDAGraph] = None
        self.xy_pos_ = torch.rand(
            (1, 1, decoder_model.embedding_dim), device=device
        ).to(dtype)
        self.xy_dec_ = torch.rand(
            (1, 1, decoder_model.embedding_dim), device=device
        ).to(dtype)
        self.kv_cache = decoder_model.init_cache(1)
        # Placeholder position; overwritten per request. Created on `device`
        # (not unconditionally on CUDA) so CPU/MPS/XPU runners can be constructed.
        self.input_pos = torch.tensor([10], dtype=torch.int32, device=device)

    def _handle_request(self, request: T2SRequest):
        """Decode one request.

        Returns:
            ``(results, infer_speed)`` where ``results`` is the first
            ``request.valid_length`` generated token tensors and ``infer_speed``
            is tokens per second (0.0 when it could not be measured).
        """
        last_step = self.MAX_DECODE_STEPS - 1
        with self.device:
            for i in self.kv_cache:
                i.empty()
            decoder = self.decoder_model
            session = T2SSession(decoder, request, device=self.device, dtype=self.dtype)
            self.input_pos.copy_(session.input_pos)
            t1 = 0.0
            infer_speed = 0.0
            y = session.y
            bsz = y.size(0)
            for idx in tqdm(range(self.MAX_DECODE_STEPS)):
                if idx == 0:
                    # Prefill: process text + prompt at once, keep only the last position.
                    xy_dec = decoder.h.prefill(
                        session.xy_pos, session.attn_mask_nested, self.kv_cache
                    )
                    xy_dec = torch.stack([t[[-1]] for t in xy_dec.unbind()])
                else:
                    if (
                        request.use_cuda_graph
                        and self.graph is None
                        and torch.cuda.is_available()
                    ):
                        # Lazy one-time capture against the runner's static buffers.
                        self.xy_pos_.copy_(session.xy_pos)
                        self.graph = decoder.capture(
                            self.input_pos,
                            self.xy_pos_,
                            self.xy_dec_,
                            kv_caches=self.kv_cache,
                        )
                    if self.graph:
                        self.xy_pos_.copy_(session.xy_pos)
                        self.graph.replay()
                        # Clone: the static output buffer is overwritten by the next replay.
                        xy_dec = self.xy_dec_.clone()
                    else:
                        xy_dec = decoder.h.forward(
                            self.input_pos,
                            session.xy_pos,
                            self.kv_cache,
                        )
                logits = decoder.ar_predict_layer(xy_dec[:, -1])
                self.input_pos.add_(1)
                if idx == 0:
                    # Never emit EOS (the last vocabulary entry) as the very first token.
                    logits[:, -1] = float("-inf")
                samples = session.sampler.sample(
                    logits=logits,
                    previous_tokens=session.y,
                    top_k=request.top_k,
                    top_p=request.top_p,
                    repetition_penalty=request.repetition_penalty,
                    temperature=request.temperature,
                )
                session.y = torch.cat([session.y, samples], dim=1)
                argmax_token = torch.argmax(logits, dim=-1)
                sample_token = samples.squeeze(1)
                EOS_mask = (argmax_token == decoder.EOS) | (
                    sample_token == decoder.EOS
                )
                newly_done_mask = EOS_mask & (~session.completed)
                newly_done_indices = newly_done_mask.nonzero()
                if newly_done_indices.numel() > 0:
                    # Store the generated part, excluding the prompt and the EOS token.
                    session.y_results[newly_done_indices[0]] = session.y[
                        newly_done_indices[0], session.y_len : -1
                    ].squeeze(0)
                    session.completed[newly_done_indices] = True
                if torch.all(session.completed).item():
                    if session.y.size(1) == 0:
                        session.y = torch.cat(
                            [session.y, torch.zeros_like(samples)], dim=1
                        )
                        tqdm.write("Bad Zero Prediction")
                    else:
                        tqdm.write(
                            f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> \n"
                            f"{[i.size(0) for i in session.y_results].__str__().strip('[]')}"
                        )
                        # The timer starts at the end of step 2; if decoding ended
                        # earlier there is nothing meaningful to report, so keep 0.0.
                        if t1 > 0.0:
                            infer_speed = (idx - 1) / (time.perf_counter() - t1)
                        tqdm.write(f"Infer Speed: {infer_speed:.2f} token/s")
                    break
                if (
                    request.early_stop_num != -1
                    and (session.y.size(1) - session.y_len) > request.early_stop_num
                ) or idx == last_step:
                    # Forced stop: return whatever each unfinished item produced so far.
                    for i in range(bsz):
                        if not session.completed[i].item():
                            session.y_results[i] = session.y[i, session.y_len :]
                            session.completed[i] = True
                    break
                y_emb = decoder.ar_audio_embedding(session.y[:, -1:])
                # Audio positions are counted from the start of the prompt, so the
                # text length is subtracted from the absolute position.
                session.xy_pos = decoder.ar_audio_position.forward(
                    self.input_pos - session.x_lens, y_emb
                )
                if idx == 2:
                    t1 = time.perf_counter()
                if idx % 100 == 0 and self.device.type == "cuda":
                    torch.cuda.empty_cache()
            if self.device.type == "cuda":
                torch.cuda.empty_cache()
            return session.y_results[: request.valid_length], infer_speed

    def generate(self, request: T2SRequest) -> T2SResult:
        """Run ``request`` and wrap the outcome; exceptions are captured, not raised."""
        try:
            result, infer_speed = self._handle_request(request)
            t2s_result = T2SResult(
                result=result, infer_speed=infer_speed, status="Success"
            )
        except Exception as e:
            # Boundary handler: the error and its traceback are returned to the caller.
            t2s_result = T2SResult(
                status="Error", exception=e, traceback=traceback.format_exc()
            )
        return t2s_result

    @staticmethod
    def load_decoder(weights_path, max_batch_size=1) -> T2SDecoder:
        """Load a checkpoint with ``"config"`` and ``"weight"`` entries and return the decoder in eval mode."""
        print(
            f"Loading Text2Semantic Weights from {weights_path} with CUDA Graph (SDPA) Implement"
        )
        # SECURITY: weights_only=False unpickles arbitrary objects; only load
        # checkpoints from trusted sources.
        dict_s1 = torch.load(
            weights_path, map_location="cpu", weights_only=False
        )
        config = dict_s1["config"]
        decoder = T2SDecoder(config, max_batch_size=max_batch_size)
        state_dict = dict_s1["weight"]
        decoder.load_state_dict(state_dict)
        return decoder.eval()

View File

@ -8,30 +8,30 @@ def multi_head_attention_forward_patched(
query,
key,
value,
embed_dim_to_check: int,
num_heads: int,
embed_dim_to_check,
num_heads,
in_proj_weight,
in_proj_bias: Optional[Tensor],
bias_k: Optional[Tensor],
bias_v: Optional[Tensor],
add_zero_attn: bool,
dropout_p: float,
out_proj_weight: Tensor,
out_proj_bias: Optional[Tensor],
training: bool = True,
key_padding_mask: Optional[Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[Tensor] = None,
use_separate_proj_weight: bool = False,
q_proj_weight: Optional[Tensor] = None,
k_proj_weight: Optional[Tensor] = None,
v_proj_weight: Optional[Tensor] = None,
static_k: Optional[Tensor] = None,
static_v: Optional[Tensor] = None,
average_attn_weights: bool = True,
is_causal: bool = False,
in_proj_bias,
bias_k,
bias_v,
add_zero_attn,
dropout_p,
out_proj_weight,
out_proj_bias,
training=True,
key_padding_mask=None,
need_weights=True,
attn_mask=None,
use_separate_proj_weight=False,
q_proj_weight=None,
k_proj_weight=None,
v_proj_weight=None,
static_k=None,
static_v=None,
average_attn_weights=True,
is_causal=False,
cache=None,
) -> Tuple[Tensor, Optional[Tensor]]:
):
# set up shape vars
_, _, embed_dim = query.shape
attn_mask = _canonical_mask(

File diff suppressed because it is too large Load Diff

View File

@ -6,20 +6,7 @@
全部按英文识别
全部按日文识别
"""
import psutil
import os
def set_high_priority():
    """Raise the current Python process to HIGH_PRIORITY_CLASS (Windows only)."""
    if os.name == "nt":
        current = psutil.Process(os.getpid())
        try:
            current.nice(psutil.HIGH_PRIORITY_CLASS)
        except psutil.AccessDenied:
            # Changing the priority class needs elevated rights.
            print("权限不足,无法修改优先级(请用管理员运行)")
        else:
            print("已将进程优先级设为 High")
set_high_priority()
import json
import logging
import os
@ -60,7 +47,6 @@ import gradio as gr
from TTS_infer_pack.text_segmentation_method import get_method
from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config
from tools.assets import css, js, top_html
from tools.i18n.i18n import I18nAuto, scan_language_list
language = os.environ.get("language", "Auto")
@ -112,28 +98,66 @@ cut_method = {
i18n("按标点符号切"): "cut5",
}
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
# 推理参数预设系统
INFERENCE_PRESETS = {
i18n("快速合成"): {
"batch_size": 1,
"sample_steps": 8,
"top_k": 5,
"top_p": 1,
"temperature": 1,
"repetition_penalty": 1.35,
"parallel_infer": True,
"split_bucket": True,
},
i18n("高质量"): {
"batch_size": 1,
"sample_steps": 64,
"top_k": 15,
"top_p": 0.8,
"temperature": 0.8,
"repetition_penalty": 1.35,
"parallel_infer": False,
"split_bucket": False,
},
i18n("平衡"): {
"batch_size": 20,
"sample_steps": 32,
"top_k": 5,
"top_p": 1,
"temperature": 1,
"repetition_penalty": 1.35,
"parallel_infer": True,
"split_bucket": True,
},
i18n("自定义"): None, # 不应用任何预设
}
SoVITS_names, GPT_names = get_weights_names()
from config import pretrained_sovits_name
path_sovits_v3 = pretrained_sovits_name["v3"]
path_sovits_v4 = pretrained_sovits_name["v4"]
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
def apply_preset(preset_name):
    """Apply inference preset and return updated parameter values.

    Returns eight ``gr.update`` objects, in the order batch_size, sample_steps,
    top_k, top_p, temperature, repetition_penalty, parallel_infer, split_bucket.
    For the custom preset or an unknown name the updates carry no value, so the
    current settings are left untouched.
    """
    fields = (
        "batch_size",
        "sample_steps",
        "top_k",
        "top_p",
        "temperature",
        "repetition_penalty",
        "parallel_infer",
        "split_bucket",
    )
    if preset_name == i18n("自定义") or preset_name not in INFERENCE_PRESETS:
        # Return current values without changes
        return [gr.update() for _ in fields]
    chosen = INFERENCE_PRESETS[preset_name]
    return [gr.update(value=chosen[field]) for field in fields]
tts_config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml")
tts_config.device = device
tts_config.is_half = is_half
# tts_config.version = version
tts_config.update_version(version)
tts_config.version = version
if gpt_path is not None:
if "" in gpt_path or "!" in gpt_path:
gpt_path = name2gpt_path[gpt_path]
tts_config.t2s_weights_path = gpt_path
if sovits_path is not None:
if "" in sovits_path or "!" in sovits_path:
sovits_path = name2sovits_path[sovits_path]
tts_config.vits_weights_path = sovits_path
if cnhubert_base_path is not None:
tts_config.cnhuhbert_base_path = cnhubert_base_path
@ -209,6 +233,40 @@ def custom_sort_key(s):
return parts
def change_choices():
    """Rescan the weight folders and return two dropdown updates (SoVITS first, then GPT)."""
    sovits_found, gpt_found = get_weights_names(GPT_weight_root, SoVITS_weight_root)
    sovits_update = {
        "choices": sorted(sovits_found, key=custom_sort_key),
        "__type__": "update",
    }
    gpt_update = {
        "choices": sorted(gpt_found, key=custom_sort_key),
        "__type__": "update",
    }
    return sovits_update, gpt_update
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth"
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
pretrained_sovits_name = [
"GPT_SoVITS/pretrained_models/s2G488k.pth",
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"GPT_SoVITS/pretrained_models/s2Gv3.pth",
"GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
]
pretrained_gpt_name = [
"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
]
_ = [[], []]
for i in range(4):
if os.path.exists(pretrained_gpt_name[i]):
_[0].append(pretrained_gpt_name[i])
if os.path.exists(pretrained_sovits_name[i]):
_[-1].append(pretrained_sovits_name[i])
pretrained_gpt_name, pretrained_sovits_name = _
if os.path.exists("./weight.json"):
pass
else:
@ -218,28 +276,50 @@ else:
with open("./weight.json", "r", encoding="utf-8") as file:
weight_data = file.read()
weight_data = json.loads(weight_data)
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, SoVITS_names[0]))
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name))
sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name))
if isinstance(gpt_path, list):
gpt_path = gpt_path[0]
if isinstance(sovits_path, list):
sovits_path = sovits_path[0]
SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"]
GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"]
for path in SoVITS_weight_root + GPT_weight_root:
os.makedirs(path, exist_ok=True)
def get_weights_names(GPT_weight_root, SoVITS_weight_root):
    """List available weight files.

    Each list starts with the pretrained entries and is followed by every file
    with the matching suffix found directly inside the given folders, written
    as ``"<folder>/<file>"``.

    Returns:
        ``(SoVITS_names, GPT_names)``.
    """

    def _scan(seed, folders, suffix):
        # Pretrained names first, then folder contents in os.listdir order.
        names = list(seed)
        for folder in folders:
            names.extend(
                f"{folder}/{entry}"
                for entry in os.listdir(folder)
                if entry.endswith(suffix)
            )
        return names

    sovits_names = _scan(pretrained_sovits_name, SoVITS_weight_root, ".pth")
    gpt_names = _scan(pretrained_gpt_name, GPT_weight_root, ".ckpt")
    return sovits_names, gpt_names
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
from process_ckpt import get_sovits_version_from_path_fast
v3v4set = {"v3", "v4"}
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
if "" in sovits_path or "!" in sovits_path:
sovits_path = name2sovits_path[sovits_path]
global version, model_version, dict_language, if_lora_v3
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
# print(sovits_path,version, model_version, if_lora_v3)
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
if if_lora_v3 == True and is_exist == False:
info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失,无法加载相应 LoRA 权重")
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
gr.Warning(info)
raise FileExistsError(info)
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
@ -297,19 +377,11 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
f.write(json.dumps(data))
def change_gpt_weights(gpt_path):
    """Resolve a display name to a weights path if needed, then load the GPT weights."""
    # NOTE(review): `"" in gpt_path` is True for every string, so the lookup in
    # name2gpt_path always runs. The first literal presumably held a marker
    # character that was lost — confirm against the original file.
    if "" in gpt_path or "!" in gpt_path:
        gpt_path = name2gpt_path[gpt_path]
    tts_pipeline.init_t2s_weights(gpt_path)
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css) as app:
gr.HTML(
top_html.format(
i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+ i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
),
elem_classes="markdown",
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
gr.Markdown(
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+ "<br>"
+ i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
)
with gr.Column():
@ -368,6 +440,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
with gr.Group():
gr.Markdown(value=i18n("推理设置"))
with gr.Row():
preset_dropdown = gr.Dropdown(
label=i18n("参数预设"),
choices=list(INFERENCE_PRESETS.keys()),
value=i18n("平衡"),
interactive=True,
info=i18n("选择预设可快速配置推理参数")
)
with gr.Row():
with gr.Column():
with gr.Row():
@ -382,10 +462,10 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
minimum=0.01, maximum=1, step=0.01, label=i18n("分段间隔(秒)"), value=0.3, interactive=True
)
speed_factor = gr.Slider(
minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True
minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1.0, interactive=True
)
with gr.Row():
top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True)
top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
with gr.Row():
temperature = gr.Slider(
@ -477,7 +557,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
inference_button,
],
) #
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])
# 预设选择事件绑定
preset_dropdown.change(
apply_preset,
[preset_dropdown],
[batch_size, sample_steps, top_k, top_p, temperature, repetition_penalty, parallel_infer, split_bucket]
)
with gr.Group():
gr.Markdown(

View File

@ -48,6 +48,8 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
请不要尬黑GPT-SoVITS推理速度慢,谢谢!
CPU-Optimized Inference Version: https://github.com/baicai-1145/GPT-SoVITS-CPUFast
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
## Installation

454
api.py
View File

@ -163,7 +163,7 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np
from feature_extractor import cnhubert
from io import BytesIO
from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
from module.models import SynthesizerTrn, SynthesizerTrnV3
from peft import LoraConfig, get_peft_model
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from text import cleaned_text_to_sequence
@ -198,44 +198,39 @@ def is_full(*items): # 任意一项为空返回False
return True
bigvgan_model = hifigan_model = sv_cn_model = None
def clean_hifigan_model():
    """Release the module-level HiFi-GAN vocoder, if one is loaded.

    The model is moved to the CPU, the global ``hifigan_model`` reference is
    cleared, and the CUDA caching allocator is asked to release unused memory.
    Nothing happens when no model is loaded.
    """
    global hifigan_model
    if hifigan_model:
        # Move the weights off the device before dropping the reference.
        hifigan_model = hifigan_model.cpu()
        hifigan_model = None
        try:
            torch.cuda.empty_cache()
        except Exception:
            # Best effort only; unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit propagate.
            pass
def clean_bigvgan_model():
    """Release the module-level BigVGAN vocoder, if one is loaded.

    The model is moved to the CPU, the global ``bigvgan_model`` reference is
    cleared, and the CUDA caching allocator is asked to release unused memory.
    Nothing happens when no model is loaded.
    """
    global bigvgan_model
    if bigvgan_model:
        # Move the weights off the device before dropping the reference.
        bigvgan_model = bigvgan_model.cpu()
        bigvgan_model = None
        try:
            torch.cuda.empty_cache()
        except Exception:
            # Best effort only; unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit propagate.
            pass
def clean_sv_cn_model():
    """Release the module-level speaker-verification model, if one is loaded.

    The wrapped ``embedding_model`` is moved to the CPU, the global
    ``sv_cn_model`` reference is cleared, and the CUDA caching allocator is
    asked to release unused memory. Nothing happens when no model is loaded.
    """
    global sv_cn_model
    if sv_cn_model:
        # Only the inner embedding model is moved; the wrapper itself is dropped.
        sv_cn_model.embedding_model = sv_cn_model.embedding_model.cpu()
        sv_cn_model = None
        try:
            torch.cuda.empty_cache()
        except Exception:
            # Best effort only; unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit propagate.
            pass
def normalize_api_params(params: dict) -> dict:
    """Normalize request parameters to the v1 naming convention.

    Both the v1 and the v2 API spellings are accepted so that clients written
    against either version keep working.

    Mapping (v2 -> v1):
        ref_audio_path      -> refer_wav_path
        text_lang           -> text_language
        prompt_lang         -> prompt_language
        speed_factor        -> speed
        aux_ref_audio_paths -> inp_refs

    Args:
        params: Raw request parameters. A falsy value (``None`` or an empty
            dict) yields an empty result.

    Returns:
        A new dict keyed by v1 names; ``params`` itself is not modified. Keys
        that are not v2 aliases are copied through unchanged. When a request
        carries both spellings of the same parameter, the v1 value always
        wins, regardless of the order of the keys in ``params``.
    """
    param_mappings = {
        # v2 style -> v1 style
        "ref_audio_path": "refer_wav_path",
        "text_lang": "text_language",
        "prompt_lang": "prompt_language",
        "speed_factor": "speed",
        "aux_ref_audio_paths": "inp_refs",
    }
    if not params:
        return {}

    # Pass 1: copy everything that is already a v1 (or unrelated) key, so an
    # explicit v1 value can never be shadowed by an alias seen earlier.
    normalized = {key: value for key, value in params.items() if key not in param_mappings}

    # Pass 2: fill in values supplied under a v2 alias only where the v1 key
    # is still missing.
    for v2_key, v1_key in param_mappings.items():
        if v2_key in params:
            normalized.setdefault(v1_key, params[v2_key])
    return normalized
def init_bigvgan():
global bigvgan_model, hifigan_model, sv_cn_model
global bigvgan_model
from BigVGAN import bigvgan
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
@ -245,57 +240,20 @@ def init_bigvgan():
# remove weight norm in the model and set to eval mode
bigvgan_model.remove_weight_norm()
bigvgan_model = bigvgan_model.eval()
if is_half == True:
bigvgan_model = bigvgan_model.half().to(device)
else:
bigvgan_model = bigvgan_model.to(device)
def init_hifigan():
    """Create the HiFi-GAN vocoder and store it in the global ``hifigan_model``.

    A ``Generator`` is built with fixed hyper-parameters, switched to eval
    mode with weight norm removed, loaded from the ``gsv-v4-pretrained``
    vocoder checkpoint under ``now_dir``, and finally moved to ``device``
    (cast to half precision first when ``is_half`` is set).
    """
    global hifigan_model, bigvgan_model, sv_cn_model

    generator_kwargs = dict(
        initial_channel=100,
        resblock="1",
        resblock_kernel_sizes=[3, 7, 11],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=[10, 6, 2, 2, 2],
        upsample_initial_channel=512,
        upsample_kernel_sizes=[20, 12, 4, 4, 4],
        gin_channels=0,
        is_bias=True,
    )
    hifigan_model = Generator(**generator_kwargs)
    hifigan_model.eval()
    hifigan_model.remove_weight_norm()

    # The checkpoint is read onto the CPU first; placement on the target
    # device happens after the weights are loaded.
    checkpoint_path = "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,)
    state_dict_g = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    load_result = hifigan_model.load_state_dict(state_dict_g)
    print("loading vocoder", load_result)

    if is_half == True:
        hifigan_model = hifigan_model.half()
    hifigan_model = hifigan_model.to(device)
from sv import SV
def init_sv_cn():
    """Instantiate ``SV`` with ``device`` and ``is_half`` and store it in the global ``sv_cn_model``."""
    global sv_cn_model, hifigan_model, bigvgan_model
    speaker_verifier = SV(device, is_half)
    sv_cn_model = speaker_verifier
resample_transform_dict = {}
def resample(audio_tensor, sr0, sr1, device):
def resample(audio_tensor, sr0):
global resample_transform_dict
key = "%s-%s-%s" % (sr0, sr1, str(device))
if key not in resample_transform_dict:
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
return resample_transform_dict[key](audio_tensor)
if sr0 not in resample_transform_dict:
resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device)
return resample_transform_dict[sr0](audio_tensor)
from module.mel_processing import mel_spectrogram_torch
@ -325,19 +283,6 @@ mel_fn = lambda x: mel_spectrogram_torch(
"center": False,
},
)
mel_fn_v4 = lambda x: mel_spectrogram_torch(
x,
**{
"n_fft": 1280,
"win_size": 1280,
"hop_size": 320,
"num_mels": 100,
"sampling_rate": 32000,
"fmin": 0,
"fmax": None,
"center": False,
},
)
sr_model = None
@ -379,19 +324,12 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
def get_sovits_weights(sovits_path):
from config import pretrained_sovits_name
path_sovits_v3 = pretrained_sovits_name["v3"]
path_sovits_v4 = pretrained_sovits_name["v4"]
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
if if_lora_v3 == True and is_exist == False:
logger.info("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
if if_lora_v3 == True and is_exist_s2gv3 == False:
logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
dict_s2 = load_sovits_new(sovits_path)
hps = dict_s2["config"]
@ -404,13 +342,11 @@ def get_sovits_weights(sovits_path):
else:
hps.model.version = "v2"
model_params_dict = vars(hps.model)
if model_version not in {"v3", "v4"}:
if "Pro" in model_version:
hps.model.version = model_version
if sv_cn_model == None:
init_sv_cn()
if model_version == "v3":
hps.model.version = "v3"
model_params_dict = vars(hps.model)
if model_version != "v3":
vq_model = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
@ -418,18 +354,13 @@ def get_sovits_weights(sovits_path):
**model_params_dict,
)
else:
hps.model.version = model_version
vq_model = SynthesizerTrnV3(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**model_params_dict,
)
if model_version == "v3":
init_bigvgan()
if model_version == "v4":
init_hifigan()
init_bigvgan()
model_version = hps.model.version
logger.info(f"模型版本: {model_version}")
if "pretrained" not in sovits_path:
@ -445,8 +376,7 @@ def get_sovits_weights(sovits_path):
if if_lora_v3 == False:
vq_model.load_state_dict(dict_s2["weight"], strict=False)
else:
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False)
vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False)
lora_rank = dict_s2["lora_rank"]
lora_config = LoraConfig(
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
@ -475,7 +405,7 @@ hz = 50
def get_gpt_weights(gpt_path):
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
dict_s1 = torch.load(gpt_path, map_location="cpu")
config = dict_s1["config"]
max_sec = config["data"]["max_sec"]
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
@ -543,65 +473,62 @@ from text import chinese
def get_phones_and_bert(text, language, version, final=False):
text = re.sub(r' {2,}', ' ', text)
textlist = []
langlist = []
if language == "all_zh":
for tmp in LangSegmenter.getTexts(text,"zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
langlist.append("en")
textlist.append(text)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "auto_yue":
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
langlist.append(tmp["lang"])
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
formattext = text
while " " in formattext:
formattext = formattext.replace(" ", " ")
if language == "all_zh":
if re.search(r"[A-Za-z]", formattext):
formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
formattext = chinese.mix_text_normalize(formattext)
return get_phones_and_bert(formattext, "zh", version)
else:
# 因无法区别中日韩文汉字,以用户输入为准
langlist.append(language)
textlist.append(tmp["text"])
phones_list = []
bert_list = []
norm_text_list = []
for i in range(len(textlist)):
lang = langlist[i]
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
bert = get_bert_inf(phones, word2ph, norm_text, lang)
phones_list.append(phones)
norm_text_list.append(norm_text)
bert_list.append(bert)
bert = torch.cat(bert_list, dim=1)
phones = sum(phones_list, [])
norm_text = "".join(norm_text_list)
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
bert = get_bert_feature(norm_text, word2ph).to(device)
elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
formattext = chinese.mix_text_normalize(formattext)
return get_phones_and_bert(formattext, "yue", version)
else:
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
bert = torch.zeros(
(1024, len(phones)),
dtype=torch.float16 if is_half == True else torch.float32,
).to(device)
elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
textlist = []
langlist = []
if language == "auto":
for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "auto_yue":
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
else:
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "en":
langlist.append(tmp["lang"])
else:
# 因无法区别中日韩文汉字,以用户输入为准
langlist.append(language)
textlist.append(tmp["text"])
phones_list = []
bert_list = []
norm_text_list = []
for i in range(len(textlist)):
lang = langlist[i]
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
bert = get_bert_inf(phones, word2ph, norm_text, lang)
phones_list.append(phones)
norm_text_list.append(norm_text)
bert_list.append(bert)
bert = torch.cat(bert_list, dim=1)
phones = sum(phones_list, [])
norm_text = "".join(norm_text_list)
if not final and len(phones) < 6:
return get_phones_and_bert("." + text, language, version, final=True)
@ -637,34 +564,23 @@ class DictToAttrRecursive(dict):
raise AttributeError(f"Attribute {item} not found")
def get_spepc(hps, filename, dtype, device, is_v2pro=False):
sr1 = int(hps.data.sampling_rate)
audio, sr0 = torchaudio.load(filename)
if sr0 != sr1:
audio = audio.to(device)
if audio.shape[0] == 2:
audio = audio.mean(0).unsqueeze(0)
audio = resample(audio, sr0, sr1, device)
else:
audio = audio.to(device)
if audio.shape[0] == 2:
audio = audio.mean(0).unsqueeze(0)
def get_spepc(hps, filename):
audio, _ = librosa.load(filename, sr=int(hps.data.sampling_rate))
audio = torch.FloatTensor(audio)
maxx = audio.abs().max()
if maxx > 1:
audio /= min(2, maxx)
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(
audio,
audio_norm,
hps.data.filter_length,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
center=False,
)
spec = spec.to(dtype)
if is_v2pro == True:
audio = resample(audio, sr1, 16000, device).to(dtype)
return spec, audio
return spec
def pack_audio(audio_bytes, data, rate):
@ -851,16 +767,6 @@ def get_tts_wav(
t2s_model = infer_gpt.t2s_model
max_sec = infer_gpt.max_sec
if version == "v3":
if sample_steps not in [4, 8, 16, 32, 64, 128]:
sample_steps = 32
elif version == "v4":
if sample_steps not in [4, 8, 16, 32]:
sample_steps = 8
if if_sr and version != "v3":
if_sr = False
t0 = ttime()
prompt_text = prompt_text.strip("\n")
if prompt_text[-1] not in splits:
@ -884,29 +790,19 @@ def get_tts_wav(
prompt_semantic = codes[0, 0]
prompt = prompt_semantic.unsqueeze(0).to(device)
is_v2pro = version in {"v2Pro", "v2ProPlus"}
if version not in {"v3", "v4"}:
if version != "v3":
refers = []
if is_v2pro:
sv_emb = []
if sv_cn_model == None:
init_sv_cn()
if inp_refs:
for path in inp_refs:
try: #####这里加上提取sv的逻辑要么一堆sv一堆refer要么单个sv单个refer
refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro)
try:
refer = get_spepc(hps, path).to(dtype).to(device)
refers.append(refer)
if is_v2pro:
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
except Exception as e:
logger.error(e)
if len(refers) == 0:
refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro)
refers = [refers]
if is_v2pro:
sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
else:
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
t1 = ttime()
# os.environ['version'] = version
@ -946,56 +842,41 @@ def get_tts_wav(
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
t3 = ttime()
if version not in {"v3", "v4"}:
if is_v2pro:
audio = (
vq_model.decode(
pred_semantic,
torch.LongTensor(phones2).to(device).unsqueeze(0),
refers,
speed=speed,
sv_emb=sv_emb,
)
.detach()
.cpu()
.numpy()[0, 0]
)
else:
audio = (
vq_model.decode(
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
)
.detach()
.cpu()
.numpy()[0, 0]
)
if version != "v3":
audio = (
vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)
.detach()
.cpu()
.numpy()[0, 0]
) ###试试重建不带上prompt部分
else:
phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0)
# print(11111111, phoneme_ids0, phoneme_ids1)
fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer)
ref_audio, sr = torchaudio.load(ref_wav_path)
ref_audio = ref_audio.to(device).float()
if ref_audio.shape[0] == 2:
ref_audio = ref_audio.mean(0).unsqueeze(0)
tgt_sr = 24000 if version == "v3" else 32000
if sr != tgt_sr:
ref_audio = resample(ref_audio, sr, tgt_sr, device)
mel2 = mel_fn(ref_audio) if version == "v3" else mel_fn_v4(ref_audio)
if sr != 24000:
ref_audio = resample(ref_audio, sr)
# print("ref_audio",ref_audio.abs().mean())
mel2 = mel_fn(ref_audio)
mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2])
mel2 = mel2[:, :, :T_min]
fea_ref = fea_ref[:, :, :T_min]
Tref = 468 if version == "v3" else 500
Tchunk = 934 if version == "v3" else 1000
if T_min > Tref:
mel2 = mel2[:, :, -Tref:]
fea_ref = fea_ref[:, :, -Tref:]
T_min = Tref
chunk_len = Tchunk - T_min
if T_min > 468:
mel2 = mel2[:, :, -468:]
fea_ref = fea_ref[:, :, -468:]
T_min = 468
chunk_len = 934 - T_min
# print("fea_ref",fea_ref,fea_ref.shape)
# print("mel2",mel2)
mel2 = mel2.to(dtype)
fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed)
# print("fea_todo",fea_todo)
# print("ge",ge.abs().mean())
cfm_resss = []
idx = 0
while 1:
@ -1004,24 +885,22 @@ def get_tts_wav(
break
idx += chunk_len
fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
# set_seed(123)
cfm_res = vq_model.cfm.inference(
fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0
)
cfm_res = cfm_res[:, :, mel2.shape[2] :]
mel2 = cfm_res[:, :, -T_min:]
# print("fea", fea)
# print("mel2in", mel2)
fea_ref = fea_todo_chunk[:, :, -T_min:]
cfm_resss.append(cfm_res)
cfm_res = torch.cat(cfm_resss, 2)
cfm_res = denorm_spec(cfm_res)
if version == "v3":
if bigvgan_model == None:
init_bigvgan()
else: # v4
if hifigan_model == None:
init_hifigan()
vocoder_model = bigvgan_model if version == "v3" else hifigan_model
cmf_res = torch.cat(cfm_resss, 2)
cmf_res = denorm_spec(cmf_res)
if bigvgan_model == None:
init_bigvgan()
with torch.inference_mode():
wav_gen = vocoder_model(cfm_res)
wav_gen = bigvgan_model(cmf_res)
audio = wav_gen[0][0].cpu().detach().numpy()
max_audio = np.abs(audio).max()
@ -1032,13 +911,7 @@ def get_tts_wav(
audio_opt = np.concatenate(audio_opt, 0)
t4 = ttime()
if version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
sr = 32000
elif version == "v3":
sr = 24000
else:
sr = 48000 # v4
sr = hps.data.sampling_rate if version != "v3" else 24000
if if_sr and sr == 24000:
audio_opt = torch.from_numpy(audio_opt).float().to(device)
audio_opt, sr = audio_sr(audio_opt.unsqueeze(0), sr)
@ -1058,12 +931,8 @@ def get_tts_wav(
if not stream_mode == "normal":
if media_type == "wav":
if version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
sr = 32000
elif version == "v3":
sr = 48000 if if_sr else 24000
else:
sr = 48000 # v4
sr = 48000 if if_sr else 24000
sr = hps.data.sampling_rate if version != "v3" else sr
audio_bytes = pack_wav(audio_bytes, sr)
yield audio_bytes.getvalue()
@ -1128,6 +997,9 @@ def handle(
if not default_refer.is_ready():
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
if sample_steps not in [4, 8, 16, 32]:
sample_steps = 32
if cut_punc == None:
text = cut_text(text, default_cut_punc)
else:
@ -1230,10 +1102,10 @@ default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, a
# 模型路径检查
if sovits_path == "":
sovits_path = g_config.pretrained_sovits_path
logger.warning(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}")
logger.warn(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}")
if gpt_path == "":
gpt_path = g_config.pretrained_gpt_path
logger.warning(f"未指定GPT模型路径, fallback后当前值: {gpt_path}")
logger.warn(f"未指定GPT模型路径, fallback后当前值: {gpt_path}")
# 指定默认参考音频, 调用方 未提供/未给全 参考音频参数时使用
if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
@ -1341,20 +1213,22 @@ async def change_refer(refer_wav_path: str = None, prompt_text: str = None, prom
@app.post("/")
async def tts_endpoint(request: Request):
json_post_raw = await request.json()
# Normalize parameters to support both v1 and v2 naming conventions
params = normalize_api_params(json_post_raw)
return handle(
json_post_raw.get("refer_wav_path"),
json_post_raw.get("prompt_text"),
json_post_raw.get("prompt_language"),
json_post_raw.get("text"),
json_post_raw.get("text_language"),
json_post_raw.get("cut_punc"),
json_post_raw.get("top_k", 15),
json_post_raw.get("top_p", 1.0),
json_post_raw.get("temperature", 1.0),
json_post_raw.get("speed", 1.0),
json_post_raw.get("inp_refs", []),
json_post_raw.get("sample_steps", 32),
json_post_raw.get("if_sr", False),
params.get("refer_wav_path"),
params.get("prompt_text"),
params.get("prompt_language"),
params.get("text"),
params.get("text_language"),
params.get("cut_punc"),
params.get("top_k", 15),
params.get("top_p", 1.0),
params.get("temperature", 1.0),
params.get("speed", 1.0),
params.get("inp_refs", []),
params.get("sample_steps", 32),
params.get("if_sr", False),
)

View File

@ -594,11 +594,11 @@
- 内容: 修复实验名结尾出现空格在win中路径不正确的问题
- 类型: 修复
- 提交: RVC-Boss
- 2025.06.10 [Commit#746cb536](https://github.com/RVC-Boss/GPT-SoVITS/commit/746cb536c68b1fe6ce3ca7e882235375b8a8dd89)
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- 内容: 语种分割优化
- 类型: 优化
- 提交: KamioRinn
- 2025.06.11 [Commit#dd2b9253](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2b9253aabb09db32db7a3344570ed9df043351)
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- 内容: 修复并行推理对v2pro支持bug
- 类型: 修复
- 提交: YYuX-1145
@ -606,21 +606,132 @@
- 内容: v2pro对ge提取时会出现数值溢出的问题修复
- 类型: 修复
- 提交: RVC-Boss
- 2025.06.11 [Commit#37f5abfc](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fdc67ca83418306f11e90b9139278313ac5c3e9)[Commit#6fdc67ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/37f5abfcb4a6553652235909db2e124b6f8ff3a5)
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- 内容: install.sh逻辑优化
- 类型: 优化
- 提交: XXXXRT666
- 2025.06.27 [Commit#90ebefa7](https://github.com/RVC-Boss/GPT-SoVITS/commit/90ebefa78fd544da36eebe0b2003620879c921b0)
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- 内容: onnxruntime加载逻辑优化(对gpu/cpu的判断)
- 类型: 优化
- 提交: KamioRinn
- 2025.06.27 [Commit#6df61f58](https://github.com/RVC-Boss/GPT-SoVITS/commit/6df61f58e4d18d4c2ad9d1eddd6a1bd690034c23)
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- 内容: 语言分割及格式化优化
- 类型: 优化
- 提交: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- 内容: 提升推理进程优先级(修复win11下可能GPU利用率受限的问题)
- 类型: 修复
- 类型: 优化
- 提交: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- 内容: 解决 TTS.py 无法识别真正支持版本 v2Pro、v2ProPlus 的问题, 同时更新一版默认配置。
- 类型: 修复
- 提交: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- 内容: 修复并行推理模式下v2pro模型识别问题
- 类型: 修复
- 提交: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- 内容: whisper asr支持性价比更高的distill模型
- 类型: 优化
- 提交: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- 内容: 优化TTS_Config的代码逻辑
- 类型: 优化
- 提交: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- 内容: 修复gpt的loss计算问题
- 类型: 修复
- 提交: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- 内容: WSL Rocm
- 类型: 修复
- 提交: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- 内容: 修复环境变量可能不为str的问题
- 类型: 修复
- 提交: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- 内容: 流式推理
- 类型: 新功能
- 提交: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- 内容: 数学计算文本前端逻辑优化
- 类型: 优化
- 提交: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- 内容: 流式推理
- 类型: 新功能
- 提交: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- 内容: 支持vq分布式训练
- 类型: 优化
- 提交: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- 内容: ASR模型下载逻辑优化
- 类型: 优化
- 提交: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- 内容: default batch size bug 修复
- 类型: 修复
- 提交: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- 内容: 修复采样错误
- 类型: 修复
- 提交: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- 内容: 修复 Conda 条款未同意导致的构建失败
- 类型: 修复
- 提交: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- 内容: 环境自动构建优化
- 类型: 优化
- 提交: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- 内容: 优化 G2PW 的推理输入构造与多音字处理流程,减少重复计算,降低长句场景下的推理开销
- 类型: 优化
- 提交: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- 内容: 改进 Windows 单卡 v3 LoRA 训练流程
- 类型: 优化
- 提交: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- 内容: 修复多个模块中的独立 bug
- 类型: 修复
- 提交: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- 内容: 添加数据集的错误处理提示
- 类型: 优化
- 提交: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- 内容: 并行推理部分bug修复
- 类型: 修复
- 提交: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- 内容: bug修复:DPO 训练不支持漏字模拟
- 类型: 修复
- 提交: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- 内容: 修复onnx脚本未导入Optional等的问题
- 类型: 修复
- 提交: RVC-Boss

View File

@ -578,3 +578,160 @@
- Content: Optimized automatic precision detection logic; added collapsible functionality to WebUI frontend modules.
- Type: New Feature
- Contributors: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- Content: Fix polyphone detection for "X一X" pattern
- Type: Fix
- Contributor: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- Content: Config fix; fix SoVITS model loading
- Type: Fix
- Contributor: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- Content: Fix possible numerical explosion of `ge.sum` causing silent inference
- Type: Fix
- Contributor: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- Content: Fix incorrect Windows path when experiment name ends with a space
- Type: Fix
- Contributor: RVC-Boss
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- Content: Optimize language segmentation
- Type: Optimization
- Contributor: KamioRinn
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- Content: Fix bug in parallel inference support for v2pro
- Type: Fix
- Contributor: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- Content: Fix numerical overflow issue when extracting `ge` for v2pro
- Type: Fix
- Contributor: RVC-Boss
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- Content: Optimize `install.sh` logic
- Type: Optimization
- Contributor: XXXXRT666
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- Content: Optimize onnxruntime loading logic (GPU/CPU detection)
- Type: Optimization
- Contributor: KamioRinn
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- Content: Optimize language segmentation and formatting
- Type: Optimization
- Contributor: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- Content: Increase inference process priority (fix possible GPU utilization limitation on Win11)
- Type: Optimization
- Contributor: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- Content: Fix TTS.py not recognizing actually supported versions v2Pro and v2ProPlus, and update default configuration
- Type: Fix
- Contributor: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- Content: Fix v2pro model recognition issue in parallel inference mode
- Type: Fix
- Contributor: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- Content: Whisper ASR supports more cost-effective distill models
- Type: Optimization
- Contributor: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- Content: Optimize `TTS_Config` code logic
- Type: Optimization
- Contributor: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- Content: Fix GPT loss calculation issue
- Type: Fix
- Contributor: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- Content: WSL Rocm
- Type: Fix
- Contributor: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- Content: Fix issue where environment variable may not be a string
- Type: Fix
- Contributor: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- Content: Streaming inference
- Type: New Feature
- Contributor: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- Content: Optimize text frontend logic for mathematical expression text
- Type: Optimization
- Contributor: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- Content: Streaming inference
- Type: New Feature
- Contributor: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- Content: Support VQ distributed training
- Type: Optimization
- Contributor: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- Content: Optimize ASR model download logic
- Type: Optimization
- Contributor: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- Content: Fix default batch size bug
- Type: Fix
- Contributor: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- Content: Fix sampling error
- Type: Fix
- Contributor: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- Content: Fix build failure caused by unaccepted Conda terms
- Type: Fix
- Contributor: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- Content: Optimize automatic environment setup
- Type: Optimization
- Contributor: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- Content: Optimize G2PW inference input construction and polyphone handling to reduce redundant computation and inference overhead for long sentences
- Type: Optimization
- Contributor: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- Content: Improve the LoRA training flow for GPT-SoVITS v3 on a single card under Windows
- Type: Optimization
- Contributor: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- Content: Fix miscellaneous bugs in multiple modules
- Type: Fix
- Contributor: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- Content: Add error handling hints for dataset processing
- Type: Optimization
- Contributor: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- Content: Fix some bugs in parallel inference
- Type: Fix
- Contributor: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- Content: Fix bug where DPO training does not support missing word simulation
- Type: Fix
- Contributor: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- Content: Fix missing imports (e.g., Optional) in ONNX script
- Type: Fix
- Contributor: RVC-Boss

View File

@ -578,3 +578,160 @@
- 内容: 自動精度検出ロジックを最適化し、WebUI フロントエンドモジュールに折り畳み（Collapsible）機能を追加
- タイプ: 新機能
- 貢献者: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- 内容: 「X一X」パターンの多音字検出を修正
- タイプ: 修正
- 貢献者: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- 内容: 設定の修正；SoVITSモデル読み込みの修正
- タイプ: 修正
- 貢献者: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- 内容: `ge.sum`の数値爆発による推論の無音化を修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- 内容: 実験名がスペースで終わる場合のWindowsパスの誤りを修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- 内容: 言語分割の最適化
- タイプ: 最適化
- 貢献者: KamioRinn
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- 内容: v2proの並列推論対応におけるバグを修正
- タイプ: 修正
- 貢献者: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- 内容: v2proの`ge`抽出時の数値オーバーフロー問題を修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- 内容: `install.sh`のロジックを最適化
- タイプ: 最適化
- 貢献者: XXXXRT666
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- 内容: onnxruntime読み込みロジックを最適化（GPU/CPU検出）
- タイプ: 最適化
- 貢献者: KamioRinn
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- 内容: 言語分割と書式を最適化
- タイプ: 最適化
- 貢献者: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- 内容: 推論プロセスの優先度を上げる（Win11でのGPU利用制限の可能性を修正）
- タイプ: 最適化
- 貢献者: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- 内容: TTS.pyが実際にサポートされているバージョンv2Proおよびv2ProPlusを認識しない問題を修正し、デフォルト設定を更新
- タイプ: 修正
- 貢献者: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- 内容: 並列推論モードでのv2proモデル認識問題を修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- 内容: Whisper ASRがよりコスト効率の高い蒸留モデルをサポート
- タイプ: 最適化
- 貢献者: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- 内容: `TTS_Config`のコードロジックを最適化
- タイプ: 最適化
- 貢献者: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- 内容: GPT損失計算の問題を修正
- タイプ: 修正
- 貢献者: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- 内容: WSL Rocm対応
- タイプ: 修正
- 貢献者: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- 内容: 環境変数が文字列でない可能性がある問題を修正
- タイプ: 修正
- 貢献者: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- 内容: ストリーミング推論
- タイプ: 新機能
- 貢献者: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- 内容: 数式テキストに対するテキスト前処理ロジックを最適化
- タイプ: 最適化
- 貢献者: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- 内容: ストリーミング推論
- タイプ: 新機能
- 貢献者: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- 内容: VQ分散学習をサポート
- タイプ: 最適化
- 貢献者: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- 内容: ASRモデルダウンロードロジックを最適化
- タイプ: 最適化
- 貢献者: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- 内容: デフォルトのバッチサイズのバグを修正
- タイプ: 修正
- 貢献者: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- 内容: サンプリングエラーを修正
- タイプ: 修正
- 貢献者: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- 内容: 受け入れられなかったConda利用規約によるビルド失敗を修正
- タイプ: 修正
- 貢献者: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- 内容: 自動環境セットアップを最適化
- タイプ: 最適化
- 貢献者: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- 内容: G2PW推論入力の構築と多音字処理を最適化し、長文における冗長な計算と推論オーバーヘッドを削減
- タイプ: 最適化
- 貢献者: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- 内容: WindowsでのシングルカードにおけるGPT-SoVITS v3のLoRAトレーニングフローを改善
- タイプ: 最適化
- 貢献者: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- 内容: 複数モジュールの雑多なバグを修正
- タイプ: 修正
- 貢献者: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- 内容: データセット処理時のエラーハンドリングヒントを追加
- タイプ: 最適化
- 貢献者: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- 内容: 並列推論の一部バグを修正
- タイプ: 修正
- 貢献者: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- 内容: DPOトレーニングが欠落単語シミュレーションをサポートしないバグを修正
- タイプ: 修正
- 貢献者: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- 内容: ONNXスクリプトでのOptionalなどの不足インポートを修正
- タイプ: 修正
- 貢献者: RVC-Boss

View File

@ -578,3 +578,160 @@
- 내용: 자동 정밀도 감지 로직 최적화; WebUI 프론트엔드 모듈에 접기 기능 추가
- 유형: 신규 기능
- 기여자: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- 내용: "X一X" 패턴의 다중 발음 감지 오류 수정
- 유형: 수정
- 기여자: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- 내용: 설정 오류 수정; SoVITS 모델 로딩 오류 수정
- 유형: 수정
- 기여자: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- 내용: `ge.sum`의 수치 폭발 가능성으로 인한 추론 무음 현상 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- 내용: 실험 이름이 공백으로 끝날 때 발생하는 잘못된 Windows 경로 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- 내용: 언어 분할 최적화
- 유형: 최적화
- 기여자: KamioRinn
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- 내용: v2pro 병렬 추론 지원 버그 수정
- 유형: 수정
- 기여자: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- 내용: v2pro의 `ge` 추출 시 수치 오버플로우 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- 내용: `install.sh` 로직 최적화
- 유형: 최적화
- 기여자: XXXXRT666
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- 내용: onnxruntime 로딩 로직 최적화 (GPU/CPU 감지)
- 유형: 최적화
- 기여자: KamioRinn
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- 내용: 언어 분할 및 형식 최적화
- 유형: 최적화
- 기여자: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- 내용: 추론 프로세스 우선순위 증가 (Win11에서 GPU 활용 제한 가능성 수정)
- 유형: 최적화
- 기여자: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- 내용: TTS.py가 실제 지원되는 버전 v2Pro 및 v2ProPlus를 인식하지 못하는 문제 수정 및 기본 설정 업데이트
- 유형: 수정
- 기여자: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- 내용: 병렬 추론 모드에서 v2pro 모델 인식 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- 내용: Whisper ASR이 더 비용 효율적인 distill 모델 지원
- 유형: 최적화
- 기여자: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- 내용: `TTS_Config` 코드 로직 최적화
- 유형: 최적화
- 기여자: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- 내용: GPT 손실(loss) 계산 문제 수정
- 유형: 수정
- 기여자: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- 내용: WSL Rocm
- 유형: 수정
- 기여자: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- 내용: 환경 변수가 문자열이 아닐 수 있는 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- 내용: 스트리밍 추론
- 유형: 신규 기능
- 기여자: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- 내용: 수학 표현식 텍스트에 대한 텍스트 전처리 로직 최적화
- 유형: 최적화
- 기여자: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- 내용: 스트리밍 추론
- 유형: 신규 기능
- 기여자: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- 내용: VQ 분산 학습 지원
- 유형: 최적화
- 기여자: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- 내용: ASR 모델 다운로드 로직 최적화
- 유형: 최적화
- 기여자: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- 내용: 기본 배치 크기 버그 수정
- 유형: 수정
- 기여자: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- 내용: 샘플링 오류 수정
- 유형: 수정
- 기여자: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- 내용: Conda 약관 미동의로 인한 빌드 실패 수정
- 유형: 수정
- 기여자: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- 내용: 자동 환경 설정 최적화
- 유형: 최적화
- 기여자: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- 내용: G2PW 추론 입력 구성 및 다중 발음 처리를 최적화하여 긴 문장에 대한 중복 계산 및 추론 오버헤드 감소
- 유형: 최적화
- 기여자: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- 내용: Windows 환경 단일 GPU에서 GPT-SoVITS v3의 LoRA 학습 흐름 개선
- 유형: 최적화
- 기여자: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- 내용: 여러 모듈의 잡다한 버그 수정
- 유형: 수정
- 기여자: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- 내용: 데이터셋 처리를 위한 오류 처리 힌트 추가
- 유형: 최적화
- 기여자: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- 내용: 병렬 추론의 일부 버그 수정
- 유형: 수정
- 기여자: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- 내용: DPO 학습이 누락 단어 시뮬레이션을 지원하지 않는 버그 수정
- 유형: 수정
- 기여자: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- 내용: ONNX 스크립트에서 Optional 등 누락된 임포트 문제 수정
- 유형: 수정
- 기여자: RVC-Boss

View File

@ -2,8 +2,6 @@
## 202401
## 202401
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)
- İçerik: WebUI'ya İngilizce sistem çeviri desteği eklendi.
- Tür: Dokümantasyon
@ -332,6 +330,8 @@
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss, GoHomeToMacDonal
- İlgili: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- Gelecek güncellemeler, `fast_inference` dalındaki değişikliklerin tutarlılığını doğrulamaya devam edecek.
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298)
- İçerik: i18n taraması yeniden düzenlendi ve çok dilli yapılandırma dosyaları güncellendi
- Tür: Dokümantasyon
@ -578,3 +578,160 @@
- İçerik: Otomatik hassasiyet algılama mantığı optimize edildi; WebUI önyüz modüllerine katlanabilir özellik eklendi
- Tür: Yeni Özellik
- Katkıda Bulunanlar: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- İçerik: "X一X" kalıbı için çok sesli harf tespitini düzelt
- Tür: Düzeltme
- Katkıda Bulunan: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- İçerik: Yapılandırma düzeltmesi; SoVITS model yüklemesini düzelt
- Tür: Düzeltme
- Katkıda Bulunan: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- İçerik: `ge.sum` kaynaklı olası sayısal patlamayı (sessiz çıkarıma yol açan) düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- İçerik: Deney adı boşlukla bittiğinde oluşan hatalı Windows yolunu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- İçerik: Dil bölütlemeyi optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- İçerik: v2pro için paralel çıkarım desteğindeki hatayı düzelt
- Tür: Düzeltme
- Katkıda Bulunan: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- İçerik: v2pro için `ge` çıkarımındaki sayısal taşma sorununu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- İçerik: `install.sh` mantığını optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- İçerik: onnxruntime yükleme mantığını optimize et (GPU/CPU algılama)
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- İçerik: Dil bölütleme ve biçimlendirmeyi optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- İçerik: Çıkarım işlem önceliğini artır (Win11'de olası GPU kullanım sınırlamasını düzelt)
- Tür: Optimizasyon
- Katkıda Bulunan: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- İçerik: TTS.py'nin gerçekte desteklenen sürümler olan v2Pro ve v2ProPlus'ı tanımaması sorununu düzelt ve varsayılan yapılandırmayı güncelle
- Tür: Düzeltme
- Katkıda Bulunan: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- İçerik: Paralel çıkarım modunda v2pro model tanıma sorununu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- İçerik: Whisper ASR daha uygun maliyetli distill modellerini destekler
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- İçerik: `TTS_Config` kod mantığını optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- İçerik: GPT kayıp (loss) hesaplama sorununu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- İçerik: WSL Rocm
- Tür: Düzeltme
- Katkıda Bulunan: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- İçerik: Ortam değişkeninin dize (string) olmaması sorununu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- İçerik: Akışlı çıkarım (streaming inference)
- Tür: Yeni Özellik
- Katkıda Bulunan: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- İçerik: Matematiksel ifade metinleri için metin ön uç (frontend) mantığını optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- İçerik: Akışlı çıkarım (streaming inference)
- Tür: Yeni Özellik
- Katkıda Bulunan: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- İçerik: VQ dağıtılmış eğitimi destekle
- Tür: Optimizasyon
- Katkıda Bulunan: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- İçerik: ASR model indirme mantığını optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- İçerik: Varsayılan parti boyutu (batch size) hatasını düzelt
- Tür: Düzeltme
- Katkıda Bulunan: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- İçerik: Örnekleme (sampling) hatasını düzelt
- Tür: Düzeltme
- Katkıda Bulunan: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- İçerik: Kabul edilmeyen Conda koşullarının neden olduğu derleme hatasını düzelt
- Tür: Düzeltme
- Katkıda Bulunan: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- İçerik: Otomatik ortam kurulumunu optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- İçerik: Uzun cümlelerde gereksiz hesaplama ve çıkarım yükünü azaltmak için G2PW çıkarım girdi oluşturmayı ve çok sesli harf işlemeyi optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- İçerik: Windows altında tek kartta GPT-SoVITS v3 için LoRA eğitim akışını iyileştir
- Tür: Optimizasyon
- Katkıda Bulunan: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- İçerik: Birden çok modüldeki çeşitli hataları düzelt
- Tür: Düzeltme
- Katkıda Bulunan: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- İçerik: Veri kümesi işleme için hata işleme ipuçları ekle
- Tür: Optimizasyon
- Katkıda Bulunan: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- İçerik: Paralel çıkarımdaki bazı hataları düzelt
- Tür: Düzeltme
- Katkıda Bulunan: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- İçerik: DPO eğitiminin eksik kelime simülasyonunu desteklememe hatasını düzelt
- Tür: Düzeltme
- Katkıda Bulunan: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- İçerik: ONNX betiğinde (Optional vb.) eksik içe aktarmaları düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss

View File

@ -18,7 +18,6 @@
"ASR 模型": "ASR model",
"ASR 模型尺寸": "ASR model size",
"ASR 语言设置": "ASR language",
"CPU训练,较慢": "Training on CPU (slower)",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT Training: Model Weights saved in GPT_weights/",
"GPT模型列表": "GPT weight list",
"GPT训练": "GPT Training",
@ -26,19 +25,18 @@
"GPU卡号,只能填1个整数": "GPU number, can only input ONE integer",
"GPU卡号以-分割,每个卡号一个进程": "GPU number is separated by -, each GPU will run one process ",
"LoRA秩": "LoRA Rank",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained SoVITS V3 Model, Cannot Load LoRA Weights",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Training: Model Weights saved in SoVITS_weights/",
"SoVITS模型列表": "SoVITS weight list",
"SoVITS训练": "SoVITS Training",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Submit Text: Manually save all text box contents on the current page to memory and file (If you don't click this button before switching pages or exiting the labeling page, the data will be rolled back when you return, which would be a waste of work.)",
"TTS推理WebUI": "TTS Inference WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 WebUI (Vocal Separation/Deecho/Dereverb)",
"V3不支持无参考文本模式请填写参考文本": "V3 does not support the no-reference-text mode. Please provide reference text!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset",
"batch_size": "Batch Size",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy",
"max:归一化后最大值多少": "Loudness multiplier after normalized",
"max_sil_kept:切完后静音最多留多长": "Maximum length for silence to be kept",
"min_interval:最短切割间隔": "Minimum interval for audio cutting",
"min_interval:最短切割间隔": "Minimum interval for audio cutting",
"min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: the minimum length of each segment. If the first segment is too short, it will be concatenated with the next segment until it exceeds this value",
"temperature": "temperature",
"threshold:音量小于这个值视作静音的备选切割点": "Noise gate threshold (loudness below this value will be treated as noise)",
@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3 does not support this mode currently, using it will cause an error.",
"v3输出如果觉得闷可以试试开超分": "For V3 model, if generated audio sounds somewhat muffled, try enable audio super-resolution.",
"不切": "No slice",
"不训练直接推v2ProPlus底模": "Use v2ProPlus base model directly without training!",
"不训练直接推v2Pro底模": "Use v2Pro base model directly without training!",
"不训练直接推v2底模": "Use v2 base model directly without training!",
"不训练直接推v3底模": "Use v3 base model directly without training!",
"中文": "Chinese",
"中文教程文档": "Chinese Tutorial",
"中英混合": "Chinese-English Mixed",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Primary Reference Audio (Please upload reference audio within 3-10 seconds, exceeding this limit will cause an error!)",
"主参考音频的文本": "Text of Primary Reference Audio",
@ -92,7 +87,6 @@
"句间停顿秒数": "Pause Duration between Sentences (Seconds)",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optional: Upload multiple reference audio files by dragging and dropping them (recommended to be of the same gender), and average their tone. If this option is left blank, the tone will be controlled by the single reference audio on the left. If fine-tuning the model, it is recommended that all reference audio files have tones within the fine-tuning training set; the pretrained model can be ignored.",
"合成语音": "Start inference",
"合成音频": "Synthesize Audio",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "An example of a valid folder path format: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simply copy the address from the file manager's address bar).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Support for Phoneme Conversion, Manual Phoneme Editing, and Step-by-Step Speech Synthesis will be added in the future.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "If reference audio is not clear or unsure what to write, enable this option to ignore the reference text.",
@ -110,15 +104,11 @@
"已关闭": " is Closed",
"已完成": " Finished",
"已开启": " is Opened",
"并行合成中": "Parallel Synthesis in Progress",
"并行推理": "Parallel Inference",
"并行推理模式已关闭": "Parallel Inference Mode Disabled",
"并行推理模式已开启": "Parallel Inference Mode Enabled",
"底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained Model, Cannot Load LoRA Weights",
"开启": "Open ",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Enable no reference mode. If you don't fill 'Text for reference audio', no reference mode will be enabled.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "When parallel inference mode is enabled, SoVITS V3/4 models do not support bucket processing; bucket processing has been automatically disabled.",
"微调模型信息": "Fine-tuned Model Information",
"微调训练": "Fine-Tuning",
"怎么切": "How to slice the sentence",
"总训练轮数total_epoch": "Total training epochs (total_epoch):",
@ -150,8 +140,8 @@
"模型": "Model",
"模型分为三类:": "Models are categorized into three types:",
"模型切换": "Model switch",
"模型加载中,请等待": "Model is loading, please wait...",
"每张显卡的batch_size": "Batch size per GPU:",
"版本": "Version",
"粤英混合": "Yue-English Mixed",
"粤语": "Yue",
"终止合成": "Terminate Synthesis",
@ -160,7 +150,6 @@
"缺少音素数据集": "Missing Phoneme Dataset",
"缺少音频数据集": "Missing Audio Dataset",
"英文": "English",
"训练模型的版本": "Version of the trained model",
"训练集格式化一键三连": "Training Set One-Click Formatting",
"训练集格式化工具": "Dataset Formatting Tool",
"语义Token提取": "Semantics Token Extraction",
@ -174,9 +163,10 @@
"语音识别": "Speech Recognition",
"语音识别工具": "Speech Recognition Tool",
"语音降噪": "Speech Denoising",
"语音降噪工具": "Speech Denoising Tool",
"请上传3~10秒内参考音频超过会报错": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.",
"请上传参考音频": "Please Upload the Reference Audio",
"请填入推理文本": "Please Fill in the Target Text",
"请填入推理文本": "Please Fill in the Target Text",
"请填入正确的List路径": "Please Fill in the Correct List Path",
"请填入正确的音频文件夹路径": "Please Fill in the Correct Audio Folder Path",
"请输入有效文本": "Please enter valid text.",
@ -197,8 +187,7 @@
"进度": "Progress",
"进程已终止": " Process Terminated",
"进程输出信息": " Process Output Information",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "Select the model from SoVITS_weights and GPT_weights. The default models are pretrained models for experiencing 5-second Zero-Shot TTS without training.",
"采样步数(仅对V3/4生效)": "Sampling Steps (V3/V4 Only)",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Choose the models from SoVITS_weights and GPT_weights. The default one is a pretrain, so you can experience zero shot TTS.",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Sampling Steps: If feel noisy, try increasing, if feel slow, try decreasing",
"重复惩罚": "Repetition Penalty",
"随机种子": "Random Seed",
@ -214,13 +203,29 @@
"音频标注WebUI": "Audio Labeling WebUI",
"音频自动切分输入路径,可文件可文件夹": "Audio slicer input (file or folder)",
"音频超分中": "Running Audio Super-Resolution",
"音频超采样": "Audio Upsampling",
"音频超采样(仅对V3生效))": "Audio Upsampling (V3 Only)",
"预测语义Token": "Predict Semantic Token",
"预训练GPT模型路径": "Pretrained GPT Model Path",
"预训练SSL模型路径": "Pretrained SSL Model Path",
"预训练SoVITS-D模型路径": "Pretrained SoVITS-D Model Path",
"预训练SoVITS-G模型路径": "Pretrained SoVITS-G Model Path",
"预训练中文BERT模型路径": "Pretrained Chinese BERT Model Path",
"预训练模型路径": "Pretrained Model Path"
}
"参数预设": "Preset",
"选择预设可快速配置推理参数": "Select a preset to quickly configure inference parameters",
"快速合成": "Fast Synthesis",
"高质量": "High Quality",
"平衡": "Balanced",
"自定义": "Custom",
"请输入包含音频文件的文件夹路径": "Please enter the folder path containing audio files",
"请输入 .list 标注文件的完整路径": "Please enter the full path to the .list annotation file",
"音频超采样(仅对V3生效))": "Audio Super-Sampling (V3 Only)",
"采样步数(仅对V3/4生效)": "Sampling Steps (V3/V4 Only)",
"选择文件/文件夹": "Select File/Folder",
"选择输出目录(选择其中任意文件)": "Select Output Directory (Select any file inside)",
"选择输入目录": "Select Input Directory",
"选择输出目录": "Select Output Directory",
"选择标注文件": "Select Annotation File",
"选择音频目录": "Select Audio Directory",
"选择文件夹": "Select Folder",
"选择文件": "Select File",
"📁 选择文件夹": "📁 Select Folder",
"📄 选择文件": "📄 Select File"
}

1351
webui.py

File diff suppressed because it is too large Load Diff