mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2026-06-06 14:38:14 +08:00
Compare commits
90 Commits
fe34958fc8
...
bd3dfa685c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bd3dfa685c | ||
|
|
08d627c333 | ||
|
|
6d95b559e8 | ||
|
|
ea2d2a8166 | ||
|
|
d9f03dad3e | ||
|
|
647935357a | ||
|
|
02425ea256 | ||
|
|
938f05fce8 | ||
|
|
445d18ccce | ||
|
|
00ce973412 | ||
|
|
14191901cd | ||
|
|
780383d5bd | ||
|
|
ba8de9b760 | ||
|
|
50a88a596d | ||
|
|
86e5b67448 | ||
|
|
2faf74beaa | ||
|
|
16b3c2a131 | ||
|
|
9f418af1dd | ||
|
|
5ffb193bcd | ||
|
|
56d6ae6b3b | ||
|
|
7c3c778b17 | ||
|
|
61b21e1fca | ||
|
|
18002ad809 | ||
|
|
12fa7d875f | ||
|
|
48cc70a7de | ||
|
|
3ac7aad4d0 | ||
|
|
036d828a7e | ||
|
|
7e1c40ef9f | ||
|
|
fdffd50066 | ||
|
|
8a10c528e3 | ||
|
|
02fabe807f | ||
|
|
4ebcb3bf1b | ||
|
|
5843d56c4e | ||
|
|
7660f1c8fb | ||
|
|
fa45c5ac4f | ||
|
|
2dc36d3d60 | ||
|
|
f70fd8ff87 | ||
|
|
ed8d276ac9 | ||
|
|
1de89feb7b | ||
|
|
b8356880dc | ||
|
|
8182908f7d | ||
|
|
5081168918 | ||
|
|
c26fa983a4 | ||
|
|
5280d17d2f | ||
|
|
371a2d7138 | ||
|
|
fe969ab9a2 | ||
|
|
61db7f05dc | ||
|
|
536c226b1a | ||
|
|
01468158d3 | ||
|
|
c9547ab669 | ||
|
|
b1ad8b5dcd | ||
|
|
d6e255a071 | ||
|
|
e89f986e3f | ||
|
|
af0bd9f414 | ||
|
|
13567362d9 | ||
|
|
27325f4cf9 | ||
|
|
6cb3c15448 | ||
|
|
9264f7e38e | ||
|
|
25b65cdfd0 | ||
|
|
1a7cf580e0 | ||
|
|
c36d0a93fe | ||
|
|
2a23f95f61 | ||
|
|
d1e92edc7c | ||
|
|
d8d551d4d2 | ||
|
|
1d434e1a0a | ||
|
|
9fe20c14d6 | ||
|
|
64cc2fd9d1 | ||
|
|
a291629438 | ||
|
|
e3e47d2c06 | ||
|
|
ca9ffbf98e | ||
|
|
684e1cfd2f | ||
|
|
878fef248a | ||
|
|
2880e3a6f8 | ||
|
|
1da23aa259 | ||
|
|
c8be484c0e | ||
|
|
d855eecc7b | ||
|
|
d20bd37965 | ||
|
|
926dd6b34a | ||
|
|
f61a723bab | ||
|
|
441ab54889 | ||
|
|
ecbc7d0b1e | ||
|
|
b6f0bb36ef | ||
|
|
4daa9ad53c | ||
|
|
2c8f6bd4c9 | ||
|
|
4cbbe2a258 | ||
|
|
a1fc00a9d8 | ||
|
|
8c9627bb30 | ||
|
|
e69e449599 | ||
|
|
29b8370c45 | ||
|
|
7efdf31113 |
@ -67,8 +67,10 @@ class Text2SemanticDataset(Dataset):
|
|||||||
)
|
)
|
||||||
) # "%s/3-bert"%exp_dir#bert_dir
|
) # "%s/3-bert"%exp_dir#bert_dir
|
||||||
self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
|
self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
|
||||||
assert os.path.exists(self.path2)
|
if not os.path.exists(self.path2):
|
||||||
assert os.path.exists(self.path6)
|
raise FileNotFoundError(f"Phoneme data file not found: {self.path2}")
|
||||||
|
if not os.path.exists(self.path6):
|
||||||
|
raise FileNotFoundError(f"Semantic data file not found: {self.path6}")
|
||||||
self.phoneme_data = {}
|
self.phoneme_data = {}
|
||||||
with open(self.path2, "r", encoding="utf8") as f:
|
with open(self.path2, "r", encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
@ -131,7 +133,7 @@ class Text2SemanticDataset(Dataset):
|
|||||||
phoneme, word2ph, text = self.phoneme_data[item_name]
|
phoneme, word2ph, text = self.phoneme_data[item_name]
|
||||||
except Exception:
|
except Exception:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
# print(f"{item_name} not in self.phoneme_data !")
|
print(f"Warning: File \"{item_name}\" not in self.phoneme_data! Skipped. ")
|
||||||
num_not_in += 1
|
num_not_in += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -152,7 +154,7 @@ class Text2SemanticDataset(Dataset):
|
|||||||
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
|
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
# print(f"{item_name} not in self.phoneme_data !")
|
print(f"Warning: Failed to convert phonemes to sequence for file \"{item_name}\"! Skipped. ")
|
||||||
num_not_in += 1
|
num_not_in += 1
|
||||||
continue
|
continue
|
||||||
# if len(phoneme_ids) >400:###########2:改为恒定限制为semantic/2.5就行
|
# if len(phoneme_ids) >400:###########2:改为恒定限制为semantic/2.5就行
|
||||||
@ -228,7 +230,11 @@ class Text2SemanticDataset(Dataset):
|
|||||||
# bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32)
|
# bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32)
|
||||||
bert_feature = None
|
bert_feature = None
|
||||||
else:
|
else:
|
||||||
assert bert_feature.shape[-1] == len(phoneme_ids)
|
try:
|
||||||
|
assert bert_feature.shape[-1] == len(phoneme_ids)
|
||||||
|
except AssertionError:
|
||||||
|
print(f"AssertionError: The BERT feature dimension ({bert_feature.shape[-1]}) of the file '{item_name}' does not match the length of the phoneme sequence ({len(phoneme_ids)}).")
|
||||||
|
raise
|
||||||
return {
|
return {
|
||||||
"idx": idx,
|
"idx": idx,
|
||||||
"phoneme_ids": phoneme_ids,
|
"phoneme_ids": phoneme_ids,
|
||||||
|
|||||||
76
GPT_SoVITS/AR/models/embedding_cudagraph.py
Normal file
76
GPT_SoVITS/AR/models/embedding_cudagraph.py
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
import math
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
|
class TokenEmbedding(nn.Module):
    """Token lookup table followed by dropout."""

    def __init__(self, embedding_dim: int, vocab_size: int, dropout: float = 0.0):
        super().__init__()
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.dropout = nn.Dropout(p=dropout)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

    @property
    def weight(self) -> torch.Tensor:
        """The full embedding matrix of the underlying ``nn.Embedding``."""
        table = self.word_embeddings
        return table.weight

    def embedding(self, index: int) -> torch.Tensor:
        """Return row ``index`` of the table, keeping a leading axis of size 1."""
        return self.weight[index : index + 1]

    def forward(self, x: torch.Tensor):
        """Embed the token ids in ``x`` and apply dropout."""
        return self.dropout(self.word_embeddings(x))
|
||||||
|
|
||||||
|
|
||||||
|
class SinePositionalEmbeddingNested(nn.Module):
    """Sinusoidal positional embedding backed by a precomputed table.

    The table ``pe`` has shape ``(max_batch_size, max_seq_len, embedding_dim)``
    and is registered as a non-persistent buffer. ``forward`` adds the entry
    for a single position per batch row; ``prefill`` adds a whole prefix per
    row and returns a nested tensor.
    """

    def __init__(
        self,
        embedding_dim: int,
        dropout: float = 0.0,
        scale: bool = False,
        alpha: bool = False,
        max_batch_size: int = 20,
        max_seq_len: int = 2500,
    ):
        super().__init__()
        self.embedding_dim = embedding_dim
        # Inputs are multiplied by sqrt(embedding_dim) only when `scale` is set.
        self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
        # `alpha` weights the positional term; the flag only controls whether it is trainable.
        self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
        self.dropout = nn.Dropout(p=dropout)
        self.max_batch_size = max_batch_size
        self.max_seq_len = max_seq_len

        self.reverse = False
        table = torch.zeros(max_batch_size, max_seq_len, embedding_dim)
        self.register_buffer("pe", table, persistent=False)
        self.pe: torch.Tensor
        self.compute_pe()

    def compute_pe(self):
        """Fill ``self.pe`` in place with sine (even channels) and cosine (odd channels)."""
        steps = torch.arange(self.max_seq_len, dtype=torch.float32)
        if self.reverse:
            # Positions count down from max_seq_len - 1 to 0.
            steps = steps.flip(0)
        position = steps.unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.embedding_dim)
        )
        angles = position * div_term
        table = self.pe
        table[:, :, 0::2] = torch.sin(angles)
        table[:, :, 1::2] = torch.cos(angles)

    def forward(self, input_pos: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        """Add the embedding of 1-based position ``input_pos[b]`` to row ``b`` of ``x``."""
        rows = torch.arange(x.shape[0])
        pe_values = self.pe[rows, input_pos - 1]
        positional = self.alpha * pe_values.unsqueeze(1)
        return x * self.x_scale + positional

    def prefill(self, x: torch.Tensor) -> torch.Tensor:
        """Add positions ``0..len-1`` to every component of the nested tensor ``x``."""
        lengths = [component.shape[0] for component in x.unbind()]
        pe_values = torch.nested.nested_tensor(
            [self.pe[row, :length, :] for row, length in enumerate(lengths)]
        )
        return x * self.x_scale + self.alpha.item() * pe_values
|
||||||
78
GPT_SoVITS/AR/models/structs_cudagraph.py
Normal file
78
GPT_SoVITS/AR/models/structs_cudagraph.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Literal, Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
Tensor = torch.Tensor
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class T2SResult:
    """Outcome of one text-to-semantic generation call."""

    # Generated sequences, one tensor per batch item; None when nothing was produced.
    result: Optional[List[Tensor]] = None
    # Speed figure reported by the runner (printed there as token/s).
    infer_speed: float = 0.0
    status: Literal["Success", "Error"] = "Success"
    # Populated together with `traceback` when status is "Error".
    exception: Optional[Exception] = None
    traceback: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class T2SRequest:
    """Inputs and sampling options for one text-to-semantic generation call."""

    # Per-item inputs; the session treats len(x) as the batch size.
    x: List[Tensor]
    x_lens: Tensor
    prompts: Tensor
    bert_feature: List[Tensor]
    # Number of leading results the runner returns to the caller.
    valid_length: int

    # Sampling options.
    top_k: int = 5
    top_p: float = 1
    # -1 disables the early-stop length check.
    early_stop_num: int = -1
    temperature: float = 1.0
    repetition_penalty: float = 1.35

    # Runtime switches.
    use_cuda_graph: bool = False
    debug: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class T2SSession:
    """Mutable decoding state built from a ``T2SRequest`` for one generation run."""

    def __init__(self, decoder, request: "T2SRequest", device: torch.device, dtype: torch.dtype):
        # Everything below runs with `device` as the default device for new tensors.
        with device:
            self.decoder, self.request = decoder, request
            self.device, self.dtype = device, dtype

            self.bsz = bsz = len(request.x)
            self.y_len = y_len = request.prompts.size(-1)

            # Imported here rather than at module top: t2s_model_cudagraph itself
            # imports this module.
            from AR.models.t2s_model_cudagraph import Sampler

            self.sampler = Sampler(bsz, decoder.vocab_size)

            self.x = request.x
            self.x_lens = request.x_lens.to(torch.int32)
            self.y = request.prompts
            self.bert_feature = request.bert_feature

            # Length of text plus prompt per item; decoding positions start from here.
            self.prefill_len = self.x_lens + self.y.size(1)
            self.input_pos = self.prefill_len.clone()

            n_items = len(self.x)
            self.completed = torch.zeros(n_items, dtype=torch.bool, device=device)
            self.y_results: List[Tensor] = [None] * n_items  # type: ignore

            self.xy_pos = decoder.embed(self.x, self.y, self.bert_feature)

            # One square boolean mask per item: every row may see all text columns,
            # and the prompt-vs-prompt corner is lower-triangular (causal).
            masks = []
            for row in range(bsz):
                text_len = int(self.x_lens[row].item())
                total = text_len + y_len
                mask = torch.zeros(total, total, dtype=torch.bool)
                mask[:, :text_len] = True
                if y_len > 0:
                    future = torch.triu(
                        torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1
                    )
                    mask[-y_len:, -y_len:] = ~future
                masks.append(mask)
            self.attn_mask_nested = torch.nested.nested_tensor(masks)
|
||||||
602
GPT_SoVITS/AR/models/t2s_model_cudagraph.py
Normal file
602
GPT_SoVITS/AR/models/t2s_model_cudagraph.py
Normal file
@ -0,0 +1,602 @@
|
|||||||
|
"""
|
||||||
|
CUDA Graph accelerated T2S decoder.
|
||||||
|
Uses PyTorch native scaled_dot_product_attention (no flash_attn dependency).
|
||||||
|
Adapted from gsvpp/AR/models/t2s_model_abc.py and t2s_model_flash_attn.py.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
from typing import Dict, List, MutableSequence, Optional, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch.cuda.graphs import CUDAGraph
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from AR.models.embedding_cudagraph import (
|
||||||
|
SinePositionalEmbeddingNested as SinePositionalEmbedding,
|
||||||
|
)
|
||||||
|
from AR.models.embedding_cudagraph import TokenEmbedding
|
||||||
|
from AR.models.structs_cudagraph import T2SRequest, T2SResult, T2SSession
|
||||||
|
|
||||||
|
Tensor = torch.Tensor
|
||||||
|
|
||||||
|
|
||||||
|
class Sampler(nn.Module):
    """Token sampler: repetition penalty, top-p, temperature, then top-k."""

    def __init__(self, batch_size: int, vocab_size: int) -> None:
        super().__init__()
        # `vocab_size` is accepted for interface compatibility; it is not stored.
        self.batch_size = batch_size

    def sample(
        self,
        logits: Tensor,
        previous_tokens: Tensor,
        temperature: float,
        top_k: int,
        top_p: float,
        repetition_penalty: float,
    ) -> Tensor:
        """Draw one token id per row of ``logits``.

        Args:
            logits: ``(batch, vocab)`` scores. NOTE: modified in place by the
                repetition penalty (``scatter_``), so the caller sees the
                penalised values afterwards.
            previous_tokens: ``(batch, n)`` ids already generated; their logits
                are penalised.
            temperature: Divisor applied to the logits (floored at 1e-5).
            top_k: Number of highest logits kept. Clamped to ``[1, vocab]`` so
                an out-of-range value no longer makes ``torch.topk`` raise.
            top_p: Cumulative-probability threshold for nucleus filtering.
            repetition_penalty: Factor applied to logits of previous tokens
                (negative scores are multiplied, others divided).

        Returns:
            ``(batch, 1)`` int32 tensor of sampled token ids.
        """
        previous_tokens = previous_tokens.long()
        score = torch.gather(logits, dim=1, index=previous_tokens)
        score = torch.where(
            score < 0, score * repetition_penalty, score / repetition_penalty
        )
        logits.scatter_(dim=1, index=previous_tokens, src=score)

        # Nucleus (top-p) filtering; the best-scoring token is always kept.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(
            torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1
        )
        sorted_indices_to_remove = cum_probs > top_p
        sorted_indices_to_remove[:, 0] = False
        indices_to_remove = sorted_indices_to_remove.scatter(
            dim=1, index=sorted_indices, src=sorted_indices_to_remove
        )
        logits = logits.masked_fill(indices_to_remove, -float("Inf"))

        logits = logits / max(temperature, 1e-5)

        # torch.topk requires 1 <= k <= size of the last dimension.
        k = max(1, min(int(top_k), logits.size(-1)))
        v, _ = torch.topk(logits, k)
        pivot = v[:, -1].unsqueeze(-1)
        logits = torch.where(logits < pivot, -float("Inf"), logits)

        probs = torch.nn.functional.softmax(logits, dim=-1)
        # argmax(probs / Exp(1) noise) draws from the categorical distribution
        # `probs` without calling torch.multinomial.
        q = torch.empty_like(probs).exponential_(1.0)
        idx_next = torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int32)

        return idx_next
|
||||||
|
|
||||||
|
|
||||||
|
# ─── KV Cache ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class KVCacheNHD(nn.Module):
    """Key/value cache with buffers laid out as (batch, seq, heads, head_dim)."""

    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
        super().__init__()
        assert batch_size > 0
        self.n_head = n_heads
        self.head_dim = head_dim
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length
        shape = (batch_size, max_seq_length, n_heads, head_dim)
        # Non-persistent: the caches are scratch state, not part of state_dict.
        for buffer_name in ("k_cache", "v_cache"):
            self.register_buffer(buffer_name, torch.zeros(size=shape), persistent=False)

    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
        """Write one step of keys/values at 1-based ``input_pos`` and return both caches."""
        # Broadcast each row's 0-based slot over heads and channels for scatter_.
        slot = (input_pos - 1)[:, None, None, None]
        index = slot.expand(-1, -1, self.n_head, self.head_dim).to(torch.int64)
        keys, values = self.k_cache, self.v_cache
        keys.scatter_(1, index, k_val)
        values.scatter_(1, index, v_val)
        return keys, values

    def empty(self):
        """Zero both caches in place."""
        for cache in (self.k_cache, self.v_cache):
            cache.zero_()

    def prefill_kv(self, k_val: Tensor, v_val: Tensor, bs: int):
        """Copy a prefix of keys/values into the leading slots of batch row ``bs``."""
        k_len = k_val.shape[1]
        v_len = v_val.shape[1]
        self.k_cache[[bs], :k_len] = k_val
        self.v_cache[[bs], :v_len] = v_val
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Attention (PyTorch native SDPA, no flash_attn) ─────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class Attention(nn.Module):
    """Multi-head self-attention with a fused QKV projection.

    ``forward`` is the single-token decode step that reads/writes a KV cache;
    ``prefill`` processes each batch element of a nested tensor separately and
    seeds the cache.
    """

    def __init__(self, n_head: int, hidden_dim: int):
        super().__init__()
        self.n_head = n_head
        self.hidden_dim = hidden_dim
        assert hidden_dim % n_head == 0
        self.head_dim = hidden_dim // n_head
        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.dropout = nn.Dropout(0.1)

        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict, prefix, *args):
        """Rename ``in_proj_weight`` / ``in_proj_bias`` style keys to ``in_proj.*``."""
        keys_to_modify = [key for key in state_dict if "in_proj_" in key]
        for key in keys_to_modify:
            new_key = key.replace("in_proj_", "in_proj.")
            state_dict[new_key] = state_dict.pop(key)

    def forward(
        self, x: Tensor, input_pos: Tensor, kv_cache: "KVCacheNHD"
    ) -> Tensor:
        """Decode step: attend from the new token(s) in ``x`` to the cached prefix.

        Args:
            x: ``(batch, seqlen, hidden_dim)`` input.
            input_pos: ``(batch,)`` 1-based position of the token being written;
                cache slots ``[0, input_pos)`` are treated as valid.
            kv_cache: Cache whose ``update`` stores the new K/V and returns the
                full key and value buffers.
        """
        bsz, seqlen, _ = x.shape

        q, k, v = self.in_proj.forward(x).chunk(3, dim=-1)

        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
        k = k.view(bsz, seqlen, self.n_head, self.head_dim)
        v = v.view(bsz, seqlen, self.n_head, self.head_dim)

        k_cache, v_cache = kv_cache.update(input_pos, k, v)

        q = q.transpose(1, 2)  # [B, H, 1, D]
        k_out = k_cache.transpose(1, 2)  # [B, H, max_seq, D]
        v_out = v_cache.transpose(1, 2)  # [B, H, max_seq, D]

        # The cache is a fixed-size zero-initialised buffer. Without a mask the
        # unwritten slots (key == 0 -> logit 0) still receive softmax weight and
        # dilute the result, so only slots below input_pos may be attended.
        # The mask has a static shape, so it stays CUDA-graph friendly.
        slots = torch.arange(k_cache.size(1), device=x.device)
        valid = slots.unsqueeze(0) < input_pos.reshape(-1, 1)  # [B, max_seq]
        attn_mask = valid[:, None, None, :]  # broadcast over heads and queries

        attn = F.scaled_dot_product_attention(q, k_out, v_out, attn_mask=attn_mask)

        attn = self.dropout.forward(attn)
        attn = attn.transpose(1, 2).reshape(bsz, seqlen, self.hidden_dim)
        attn = self.out_proj.forward(attn)
        return attn

    def prefill(self, x: Tensor, mask: Tensor, kv_cache: "KVCacheNHD") -> Tensor:
        """Run masked attention per batch element and seed ``kv_cache``.

        ``x`` and ``mask`` are indexed per batch element (the callers in this
        file pass nested tensors); the result is a nested tensor with one
        ``(len, hidden_dim)`` component per element.
        """
        bsz = x.size(0)
        outputs = []
        for bs in range(bsz):
            x_b = x[bs].unsqueeze(0)  # [1, T, hidden]
            q, k, v = self.in_proj.forward(x_b).chunk(3, dim=-1)
            q = q.contiguous().view(1, -1, self.n_head, self.head_dim)
            k = k.contiguous().view(1, -1, self.n_head, self.head_dim)
            v = v.contiguous().view(1, -1, self.n_head, self.head_dim)
            kv_cache.prefill_kv(k, v, bs)
            q, k, v = (t.transpose(1, 2) for t in (q, k, v))
            attn_mask = (
                mask[bs].unsqueeze(0).unsqueeze(0).expand(1, self.n_head, -1, -1)
            )
            attn = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
            attn = self.dropout.forward(attn)
            attn = attn.transpose(1, 2).contiguous().view(1, -1, self.hidden_dim)
            output = self.out_proj.forward(attn)
            outputs.append(output.squeeze(0))
        return torch.nested.nested_tensor(outputs)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Feed Forward ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class FeedForward(nn.Module):
    """Two-layer ReLU MLP with dropout after each linear layer."""

    def __init__(self, dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.linear1 = nn.Linear(dim, hidden_dim, bias=True)
        self.linear2 = nn.Linear(hidden_dim, dim, bias=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: Tensor) -> Tensor:
        """Compute ``dropout(linear2(dropout(relu(linear1(x)))))``."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout.forward(hidden)
        projected = self.linear2(hidden)
        return self.dropout.forward(projected)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Transformer Block ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerBlock(nn.Module):
    """Post-norm transformer layer: attention and feed-forward, each followed by LayerNorm."""

    def __init__(self, n_head, ffn_dim, hidden_dim) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.attention = Attention(n_head, hidden_dim)
        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
        self.attention_norm = nn.LayerNorm([hidden_dim])
        self.ffn_norm = nn.LayerNorm([hidden_dim])
        self.dropout = nn.Dropout(0.1)

        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
        """Rewrite checkpoint key names in place to this module's attribute names."""
        # Applied in this order to every key; each key is popped and re-inserted.
        renames = (
            ("self_attn", "attention"),
            ("linear", "feed_forward.linear"),
            ("norm1", "attention_norm"),
            ("norm2", "ffn_norm"),
        )
        for old_key in tuple(state_dict):
            new_key = old_key
            for source, target in renames:
                new_key = new_key.replace(source, target)
            state_dict[new_key] = state_dict.pop(old_key)

    def forward(
        self, x: Tensor, input_pos: Tensor, kv_cache: "KVCacheNHD"
    ) -> Tensor:
        """Decode step through attention (with KV cache) and the feed-forward net."""
        attended = self.attention.forward(x, input_pos, kv_cache)
        h = self.attention_norm.forward(x + self.dropout.forward(attended))
        mlp_out = self.feed_forward.forward(h)
        return self.ffn_norm.forward(h + mlp_out)

    def prefill(self, x: Tensor, mask: Tensor, kv_cache: "KVCacheNHD") -> Tensor:
        """Same layer computation as ``forward`` but using the attention prefill path."""
        attended = self.attention.prefill(x, mask, kv_cache)
        h = self.attention_norm.forward(x + self.dropout.forward(attended))
        mlp_out = self.feed_forward.forward(h)
        return self.ffn_norm.forward(h + mlp_out)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Transformer Decoder ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerDecoder(nn.Module):
    """Stack of ``TransformerBlock`` layers, each paired with its own KV cache."""

    def __init__(
        self,
        hidden_dim,
        n_layer,
        n_head,
        ffn_dim,
        vocab_size,
        max_seq_length,
        max_batch_size,
    ) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        assert hidden_dim % n_head == 0
        self.head_dim = hidden_dim // n_head
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        blocks = [TransformerBlock(n_head, ffn_dim, hidden_dim) for _ in range(n_layer)]
        self.layers = nn.ModuleList(blocks)
        self.max_seq_length: int = max_seq_length
        self.max_batch_size: int = max_batch_size

    def forward(
        self,
        input_pos: Tensor,
        x: Tensor,
        kv_caches: "MutableSequence[KVCacheNHD]",
    ):
        """Run one decode step through every layer, pairing layers with caches by index."""
        hidden = x
        for block, cache in zip(self.layers, kv_caches):
            hidden = block.forward(hidden, input_pos, cache)
        return hidden

    def prefill(
        self,
        x: Tensor,
        mask: Tensor,
        kv_caches: "MutableSequence[KVCacheNHD]",
    ):
        """Run the prefill path through every layer, pairing layers with caches by index."""
        hidden = x
        for block, cache in zip(self.layers, kv_caches):
            hidden = block.prefill(hidden, mask, cache)
        return hidden
|
||||||
|
|
||||||
|
|
||||||
|
# ─── T2S Decoder ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class T2SDecoder(nn.Module):
    """Text-to-semantic decoder: embeddings, transformer stack and prediction head."""

    def __init__(
        self,
        config,
        *args,
        norm_first=False,
        max_seq_length=2500,
        max_batch_size=10,
        **kwds,
    ) -> None:
        super().__init__()
        # Hyper-parameters are read from config["model"], one lookup per key.
        hidden_dim = config["model"]["hidden_dim"]
        embedding_dim = config["model"]["embedding_dim"]
        n_head = config["model"]["head"]
        n_layer = config["model"]["n_layer"]
        vocab_size = config["model"]["vocab_size"]
        phoneme_vocab_size = config["model"]["phoneme_vocab_size"]
        p_dropout = config["model"]["dropout"]
        EOS = config["model"]["EOS"]
        ffn_dim = hidden_dim * 4

        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        assert hidden_dim % n_head == 0
        self.head_dim = hidden_dim // n_head
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.phoneme_vocab_size = phoneme_vocab_size
        self.p_dropout = p_dropout
        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size
        self.EOS = EOS
        # The EOS id must be the last entry of the vocabulary.
        assert self.EOS == self.vocab_size - 1

        # Both positional embeddings share the same settings.
        position_options = dict(
            dropout=0.1,
            scale=False,
            alpha=True,
            max_batch_size=max_batch_size,
            max_seq_len=max_seq_length,
        )

        self.bert_proj = nn.Linear(1024, self.embedding_dim)
        self.ar_text_embedding = TokenEmbedding(
            self.embedding_dim, self.phoneme_vocab_size, self.p_dropout
        )
        self.ar_text_position = SinePositionalEmbedding(
            self.embedding_dim, **position_options
        )
        self.ar_audio_embedding = TokenEmbedding(
            self.embedding_dim, self.vocab_size, self.p_dropout
        )
        self.ar_audio_position = SinePositionalEmbedding(
            self.embedding_dim, **position_options
        )
        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
        self.h = TransformerDecoder(
            hidden_dim,
            n_layer,
            n_head,
            ffn_dim,
            vocab_size,
            max_seq_length,
            max_batch_size,
        )

        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict, prefix, *args):
        """Strip a leading ``model.`` from checkpoint keys, in place."""
        marker = "model."
        for key in [name for name in state_dict if name.startswith(marker)]:
            state_dict[key[len(marker) :]] = state_dict.pop(key)

    def init_cache(self, bsz: int = 0) -> nn.ModuleList:
        """Create one ``KVCacheNHD`` per layer on the model's device and dtype.

        A ``bsz`` of 0 means "use the decoder's maximum batch size".
        """
        bsz = bsz or self.h.max_batch_size
        assert bsz <= self.h.max_batch_size
        seq_lens = self.h.max_seq_length
        # bert_proj's bias is used as the reference for device and dtype.
        reference = self.bert_proj.bias
        caches = nn.ModuleList()
        for _ in range(self.n_layer):
            caches.append(KVCacheNHD(bsz, seq_lens, self.n_head, self.head_dim))
        return caches.to(reference.device, reference.dtype)

    def embed(
        self,
        x: List[torch.Tensor],
        y: torch.Tensor,
        bert_features: List[torch.Tensor],
    ):
        """Build the prefill input: per item, text embeddings followed by prompt embeddings.

        Returns a nested tensor with one component per item of ``x``.
        """
        x_nested = torch.nested.nested_tensor(x)
        assert x_nested.size(0) <= self.max_batch_size
        # Each BERT feature is transposed before projection.
        bert_features_nested = torch.nested.nested_tensor(
            [feature.transpose(0, 1) for feature in bert_features]
        )
        text_emb = self.ar_text_embedding.forward(x_nested)
        text_emb = text_emb + self.bert_proj.forward(bert_features_nested)
        x_pos = self.ar_text_position.prefill(text_emb)

        y_nested = torch.nested.nested_tensor(list(y.unbind(0)))
        audio_emb = self.ar_audio_embedding.forward(y_nested)
        y_pos = self.ar_audio_position.prefill(audio_emb)

        joined = [torch.cat([x_pos[i], y_pos[i]]) for i in range(len(x))]
        return torch.nested.nested_tensor(joined)

    def capture(
        self,
        input_pos: Tensor,
        x: Tensor,
        x_dec: Tensor,
        kv_caches,
    ) -> CUDAGraph:
        """Capture one decode step of ``self.h`` into a CUDA graph.

        The graph reads ``input_pos`` / ``x`` and writes its output into
        ``x_dec``; replaying it re-runs the step on whatever those tensors
        currently hold.
        """
        side_stream = torch.cuda.Stream()
        side_stream.wait_stream(torch.cuda.current_stream())

        graph = torch.cuda.CUDAGraph()

        # Warm-up iterations on a side stream before capturing.
        with torch.cuda.stream(side_stream):
            for _ in range(5):
                self.h.forward(input_pos, x, kv_caches)
        torch.cuda.current_stream().wait_stream(side_stream)

        with torch.cuda.graph(graph):
            x_dec.copy_(self.h.forward(input_pos, x, kv_caches))
        torch.cuda.synchronize()

        return graph
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CUDA Graph Runner ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class CUDAGraphRunner:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
decoder_model: T2SDecoder,
|
||||||
|
device: torch.device = torch.device("cpu"),
|
||||||
|
dtype: torch.dtype = torch.float32,
|
||||||
|
) -> None:
|
||||||
|
assert device.type in {"cpu", "cuda", "mps", "xpu", "mtia"}
|
||||||
|
assert dtype in {torch.float16, torch.bfloat16, torch.float32}
|
||||||
|
self.device = device
|
||||||
|
self.dtype = dtype
|
||||||
|
self.decoder_model: T2SDecoder = decoder_model.to(self.device, self.dtype)
|
||||||
|
self.graph: Optional[CUDAGraph] = None
|
||||||
|
self.xy_pos_ = torch.rand(
|
||||||
|
(1, 1, decoder_model.embedding_dim), device=device
|
||||||
|
).to(dtype)
|
||||||
|
self.xy_dec_ = torch.rand(
|
||||||
|
(1, 1, decoder_model.embedding_dim), device=device
|
||||||
|
).to(dtype)
|
||||||
|
self.kv_cache = decoder_model.init_cache(1)
|
||||||
|
self.input_pos = torch.tensor([10]).int().cuda()
|
||||||
|
|
||||||
|
def _handle_request(self, request: T2SRequest):
|
||||||
|
with self.device:
|
||||||
|
for i in self.kv_cache:
|
||||||
|
i.empty()
|
||||||
|
|
||||||
|
decoder = self.decoder_model
|
||||||
|
session = T2SSession(decoder, request, device=self.device, dtype=self.dtype)
|
||||||
|
self.input_pos.copy_(session.input_pos)
|
||||||
|
|
||||||
|
t1 = 0.0
|
||||||
|
infer_speed = 0.0
|
||||||
|
y = session.y
|
||||||
|
bsz = y.size(0)
|
||||||
|
|
||||||
|
for idx in tqdm(range(1500)):
|
||||||
|
if idx == 0:
|
||||||
|
xy_dec = decoder.h.prefill(
|
||||||
|
session.xy_pos, session.attn_mask_nested, self.kv_cache
|
||||||
|
)
|
||||||
|
xy_dec = torch.stack([t[[-1]] for t in xy_dec.unbind()])
|
||||||
|
else:
|
||||||
|
if (
|
||||||
|
request.use_cuda_graph
|
||||||
|
and self.graph is None
|
||||||
|
and torch.cuda.is_available()
|
||||||
|
):
|
||||||
|
self.xy_pos_.copy_(session.xy_pos)
|
||||||
|
self.graph = decoder.capture(
|
||||||
|
self.input_pos,
|
||||||
|
self.xy_pos_,
|
||||||
|
self.xy_dec_,
|
||||||
|
kv_caches=self.kv_cache,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.graph:
|
||||||
|
self.xy_pos_.copy_(session.xy_pos)
|
||||||
|
self.graph.replay()
|
||||||
|
xy_dec = self.xy_dec_.clone()
|
||||||
|
else:
|
||||||
|
xy_dec = decoder.h.forward(
|
||||||
|
self.input_pos,
|
||||||
|
session.xy_pos,
|
||||||
|
self.kv_cache,
|
||||||
|
)
|
||||||
|
|
||||||
|
logits = decoder.ar_predict_layer(xy_dec[:, -1])
|
||||||
|
self.input_pos.add_(1)
|
||||||
|
|
||||||
|
if idx == 0:
|
||||||
|
logits[:, -1] = float("-inf")
|
||||||
|
|
||||||
|
samples = session.sampler.sample(
|
||||||
|
logits=logits,
|
||||||
|
previous_tokens=session.y,
|
||||||
|
top_k=request.top_k,
|
||||||
|
top_p=request.top_p,
|
||||||
|
repetition_penalty=request.repetition_penalty,
|
||||||
|
temperature=request.temperature,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.y = torch.cat([session.y, samples], dim=1)
|
||||||
|
|
||||||
|
argmax_token = torch.argmax(logits, dim=-1)
|
||||||
|
sample_token = samples.squeeze(1)
|
||||||
|
EOS_mask = (argmax_token == decoder.EOS) | (
|
||||||
|
sample_token == decoder.EOS
|
||||||
|
)
|
||||||
|
|
||||||
|
newly_done_mask = EOS_mask & (~session.completed)
|
||||||
|
newly_done_indices = newly_done_mask.nonzero()
|
||||||
|
|
||||||
|
if newly_done_indices.numel() > 0:
|
||||||
|
session.y_results[newly_done_indices[0]] = session.y[
|
||||||
|
newly_done_indices[0], session.y_len : -1
|
||||||
|
].squeeze(0)
|
||||||
|
session.completed[newly_done_indices] = True
|
||||||
|
|
||||||
|
if torch.all(session.completed).item():
|
||||||
|
if session.y.size(1) == 0:
|
||||||
|
session.y = torch.cat(
|
||||||
|
[session.y, torch.zeros_like(samples)], dim=1
|
||||||
|
)
|
||||||
|
tqdm.write("Bad Zero Prediction")
|
||||||
|
else:
|
||||||
|
tqdm.write(
|
||||||
|
f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> \n"
|
||||||
|
f"{[i.size(0) for i in session.y_results].__str__().strip('[]')}"
|
||||||
|
)
|
||||||
|
tqdm.write(
|
||||||
|
f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s"
|
||||||
|
)
|
||||||
|
infer_speed = (idx - 1) / (time.perf_counter() - t1)
|
||||||
|
break
|
||||||
|
|
||||||
|
if (
|
||||||
|
request.early_stop_num != -1
|
||||||
|
and (session.y.size(1) - session.y_len) > request.early_stop_num
|
||||||
|
) or idx == 1499:
|
||||||
|
for i in range(bsz):
|
||||||
|
if not session.completed[i].item():
|
||||||
|
session.y_results[i] = session.y[i, session.y_len :]
|
||||||
|
session.completed[i] = True
|
||||||
|
break
|
||||||
|
|
||||||
|
y_emb = decoder.ar_audio_embedding(session.y[:, -1:])
|
||||||
|
session.xy_pos = decoder.ar_audio_position.forward(
|
||||||
|
self.input_pos - session.x_lens, y_emb
|
||||||
|
)
|
||||||
|
|
||||||
|
if idx == 2:
|
||||||
|
t1 = time.perf_counter()
|
||||||
|
|
||||||
|
if idx % 100 == 0 and self.device.type == "cuda":
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
if self.device.type == "cuda":
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
return session.y_results[: request.valid_length], infer_speed
|
||||||
|
|
||||||
|
def generate(self, request: T2SRequest) -> T2SResult:
    """Run one decoding request and wrap the outcome in a ``T2SResult``.

    Delegates the work to ``self._handle_request``. Exceptions are not
    propagated to the caller: any ``Exception`` raised while handling the
    request is captured and returned inside the result object.

    Args:
        request: The request forwarded unchanged to ``_handle_request``.

    Returns:
        A ``T2SResult`` with ``status="Success"`` carrying ``result`` and
        ``infer_speed`` on success, or ``status="Error"`` carrying the
        caught exception and its formatted traceback on failure.
    """
    try:
        decoded, speed = self._handle_request(request)
        return T2SResult(result=decoded, infer_speed=speed, status="Success")
    except Exception as err:
        # Report the failure through the result object instead of raising,
        # so the caller always receives a T2SResult.
        return T2SResult(status="Error", exception=err, traceback=traceback.format_exc())
|
||||||
|
|
||||||
|
@staticmethod
def load_decoder(weights_path, max_batch_size=1) -> T2SDecoder:
    """Build a ``T2SDecoder`` from a checkpoint file and return it in eval mode.

    The checkpoint is loaded onto the CPU and must contain a ``"config"``
    entry (passed to the ``T2SDecoder`` constructor) and a ``"weight"``
    entry (passed to ``load_state_dict``).

    Args:
        weights_path: Path of the checkpoint handed to ``torch.load``.
        max_batch_size: Forwarded to the ``T2SDecoder`` constructor.

    Returns:
        The decoder after ``load_state_dict`` and ``.eval()``.
    """
    print(
        f"Loading Text2Semantic Weights from {weights_path} with CUDA Graph (SDPA) Implement"
    )
    # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
    # objects; only load checkpoints from trusted sources.
    checkpoint = torch.load(weights_path, map_location="cpu", weights_only=False)
    decoder = T2SDecoder(checkpoint["config"], max_batch_size=max_batch_size)
    decoder.load_state_dict(checkpoint["weight"])
    return decoder.eval()
|
||||||
@ -262,7 +262,7 @@ def make_reject_y(y_o, y_lens):
|
|||||||
reject_y = []
|
reject_y = []
|
||||||
reject_y_lens = []
|
reject_y_lens = []
|
||||||
for b in range(bs):
|
for b in range(bs):
|
||||||
process_item_idx = torch.randint(0, 1, size=(1,))[0]
|
process_item_idx = torch.randint(0, 2, size=(1,))[0]
|
||||||
if process_item_idx == 0:
|
if process_item_idx == 0:
|
||||||
new_y = repeat_P(y_o[b])
|
new_y = repeat_P(y_o[b])
|
||||||
reject_y.append(new_y)
|
reject_y.append(new_y)
|
||||||
|
|||||||
@ -8,30 +8,30 @@ def multi_head_attention_forward_patched(
|
|||||||
query,
|
query,
|
||||||
key,
|
key,
|
||||||
value,
|
value,
|
||||||
embed_dim_to_check: int,
|
embed_dim_to_check,
|
||||||
num_heads: int,
|
num_heads,
|
||||||
in_proj_weight,
|
in_proj_weight,
|
||||||
in_proj_bias: Optional[Tensor],
|
in_proj_bias,
|
||||||
bias_k: Optional[Tensor],
|
bias_k,
|
||||||
bias_v: Optional[Tensor],
|
bias_v,
|
||||||
add_zero_attn: bool,
|
add_zero_attn,
|
||||||
dropout_p: float,
|
dropout_p,
|
||||||
out_proj_weight: Tensor,
|
out_proj_weight,
|
||||||
out_proj_bias: Optional[Tensor],
|
out_proj_bias,
|
||||||
training: bool = True,
|
training=True,
|
||||||
key_padding_mask: Optional[Tensor] = None,
|
key_padding_mask=None,
|
||||||
need_weights: bool = True,
|
need_weights=True,
|
||||||
attn_mask: Optional[Tensor] = None,
|
attn_mask=None,
|
||||||
use_separate_proj_weight: bool = False,
|
use_separate_proj_weight=False,
|
||||||
q_proj_weight: Optional[Tensor] = None,
|
q_proj_weight=None,
|
||||||
k_proj_weight: Optional[Tensor] = None,
|
k_proj_weight=None,
|
||||||
v_proj_weight: Optional[Tensor] = None,
|
v_proj_weight=None,
|
||||||
static_k: Optional[Tensor] = None,
|
static_k=None,
|
||||||
static_v: Optional[Tensor] = None,
|
static_v=None,
|
||||||
average_attn_weights: bool = True,
|
average_attn_weights=True,
|
||||||
is_causal: bool = False,
|
is_causal=False,
|
||||||
cache=None,
|
cache=None,
|
||||||
) -> Tuple[Tensor, Optional[Tensor]]:
|
):
|
||||||
# set up shape vars
|
# set up shape vars
|
||||||
_, _, embed_dim = query.shape
|
_, _, embed_dim = query.shape
|
||||||
attn_mask = _canonical_mask(
|
attn_mask = _canonical_mask(
|
||||||
|
|||||||
@ -499,7 +499,7 @@ class TTS:
|
|||||||
|
|
||||||
if if_lora_v3 == True and os.path.exists(path_sovits) == False:
|
if if_lora_v3 == True and os.path.exists(path_sovits) == False:
|
||||||
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
|
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
|
||||||
raise FileExistsError(info)
|
raise FileNotFoundError(info)
|
||||||
|
|
||||||
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
|
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
|
||||||
dict_s2 = load_sovits_new(weights_path)
|
dict_s2 = load_sovits_new(weights_path)
|
||||||
@ -1578,16 +1578,15 @@ class TTS:
|
|||||||
max_audio = np.abs(audio).max()
|
max_audio = np.abs(audio).max()
|
||||||
if max_audio > 1:
|
if max_audio > 1:
|
||||||
audio /= max_audio
|
audio /= max_audio
|
||||||
audio = (audio * 32768).astype(np.int16)
|
audio = (audio * 32768).astype(np.int16)
|
||||||
|
else:
|
||||||
|
audio = audio.cpu().numpy()
|
||||||
|
audio = (audio * 32768).astype(np.int16)
|
||||||
t2 = time.perf_counter()
|
t2 = time.perf_counter()
|
||||||
print(f"超采样用时:{t2 - t1:.3f}s")
|
print(f"超采样用时:{t2 - t1:.3f}s")
|
||||||
else:
|
else:
|
||||||
# audio = audio.float() * 32768
|
|
||||||
# audio = audio.to(dtype=torch.int16).clamp(-32768, 32767).cpu().numpy()
|
|
||||||
|
|
||||||
audio = audio.cpu().numpy()
|
audio = audio.cpu().numpy()
|
||||||
|
audio = (audio * 32768).astype(np.int16)
|
||||||
audio = (audio * 32768).astype(np.int16)
|
|
||||||
|
|
||||||
|
|
||||||
# try:
|
# try:
|
||||||
@ -1768,7 +1767,10 @@ class TTS:
|
|||||||
pos += chunk_len * upsample_rate
|
pos += chunk_len * upsample_rate
|
||||||
|
|
||||||
audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
|
audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
|
||||||
audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
|
if padding_len > 0:
|
||||||
|
audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
|
||||||
|
else:
|
||||||
|
audio = audio[overlapped_len * upsample_rate :]
|
||||||
|
|
||||||
audio_fragments = []
|
audio_fragments = []
|
||||||
for feat_len in feat_lens:
|
for feat_len in feat_lens:
|
||||||
|
|||||||
@ -92,7 +92,7 @@ def cut0(inp):
|
|||||||
if not set(inp).issubset(punctuation):
|
if not set(inp).issubset(punctuation):
|
||||||
return inp
|
return inp
|
||||||
else:
|
else:
|
||||||
return "/n"
|
return "\n"
|
||||||
|
|
||||||
|
|
||||||
# 凑四句一切
|
# 凑四句一切
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -87,7 +87,7 @@ def sync_buffer(buffers, average=True):
|
|||||||
for buffer, handle in handles:
|
for buffer, handle in handles:
|
||||||
handle.wait()
|
handle.wait()
|
||||||
if average:
|
if average:
|
||||||
buffer.data /= world_size
|
buffer.data /= world_size()
|
||||||
|
|
||||||
|
|
||||||
def sync_grad(params):
|
def sync_grad(params):
|
||||||
|
|||||||
@ -55,6 +55,10 @@ def main():
|
|||||||
n_gpus = torch.cuda.device_count()
|
n_gpus = torch.cuda.device_count()
|
||||||
else:
|
else:
|
||||||
n_gpus = 1
|
n_gpus = 1
|
||||||
|
if n_gpus <= 1:
|
||||||
|
run(0, n_gpus, hps)
|
||||||
|
return
|
||||||
|
|
||||||
os.environ["MASTER_ADDR"] = "localhost"
|
os.environ["MASTER_ADDR"] = "localhost"
|
||||||
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
|
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
|
||||||
|
|
||||||
@ -77,12 +81,14 @@ def run(rank, n_gpus, hps):
|
|||||||
writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
|
writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
|
||||||
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
|
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
|
||||||
|
|
||||||
dist.init_process_group(
|
use_ddp = n_gpus > 1
|
||||||
backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
|
if use_ddp:
|
||||||
init_method="env://?use_libuv=False",
|
dist.init_process_group(
|
||||||
world_size=n_gpus,
|
backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
|
||||||
rank=rank,
|
init_method="env://?use_libuv=False",
|
||||||
)
|
world_size=n_gpus,
|
||||||
|
rank=rank,
|
||||||
|
)
|
||||||
torch.manual_seed(hps.train.seed)
|
torch.manual_seed(hps.train.seed)
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.set_device(rank)
|
torch.cuda.set_device(rank)
|
||||||
@ -118,15 +124,20 @@ def run(rank, n_gpus, hps):
|
|||||||
shuffle=True,
|
shuffle=True,
|
||||||
)
|
)
|
||||||
collate_fn = TextAudioSpeakerCollate()
|
collate_fn = TextAudioSpeakerCollate()
|
||||||
train_loader = DataLoader(
|
worker_count = 0 if os.name == "nt" and n_gpus <= 1 else min(2 if os.name == "nt" else 5, os.cpu_count() or 1)
|
||||||
train_dataset,
|
loader_kwargs = dict(
|
||||||
num_workers=5,
|
num_workers=worker_count,
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
pin_memory=True,
|
pin_memory=torch.cuda.is_available(),
|
||||||
collate_fn=collate_fn,
|
collate_fn=collate_fn,
|
||||||
batch_sampler=train_sampler,
|
batch_sampler=train_sampler,
|
||||||
persistent_workers=True,
|
)
|
||||||
prefetch_factor=3,
|
if worker_count > 0:
|
||||||
|
loader_kwargs["persistent_workers"] = True
|
||||||
|
loader_kwargs["prefetch_factor"] = 2 if os.name == "nt" else 3
|
||||||
|
train_loader = DataLoader(
|
||||||
|
train_dataset,
|
||||||
|
**loader_kwargs,
|
||||||
)
|
)
|
||||||
save_root = "%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir, hps.model.version, hps.train.lora_rank)
|
save_root = "%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir, hps.model.version, hps.train.lora_rank)
|
||||||
os.makedirs(save_root, exist_ok=True)
|
os.makedirs(save_root, exist_ok=True)
|
||||||
@ -156,7 +167,9 @@ def run(rank, n_gpus, hps):
|
|||||||
|
|
||||||
def model2cuda(net_g, rank):
|
def model2cuda(net_g, rank):
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True)
|
net_g = net_g.cuda(rank)
|
||||||
|
if use_ddp:
|
||||||
|
net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
|
||||||
else:
|
else:
|
||||||
net_g = net_g.to(device)
|
net_g = net_g.to(device)
|
||||||
return net_g
|
return net_g
|
||||||
@ -242,6 +255,8 @@ def run(rank, n_gpus, hps):
|
|||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
scheduler_g.step()
|
scheduler_g.step()
|
||||||
|
if use_ddp and dist.is_initialized():
|
||||||
|
dist.destroy_process_group()
|
||||||
print("training done")
|
print("training done")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -180,10 +180,15 @@ def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) ->
|
|||||||
def _g2p(segments):
|
def _g2p(segments):
|
||||||
phones_list = []
|
phones_list = []
|
||||||
word2ph = []
|
word2ph = []
|
||||||
for seg in segments:
|
g2pw_batch_results = []
|
||||||
|
g2pw_batch_cursor = 0
|
||||||
|
processed_segments = [re.sub("[a-zA-Z]+", "", seg) for seg in segments]
|
||||||
|
if is_g2pw:
|
||||||
|
batch_inputs = [seg for seg in processed_segments if seg]
|
||||||
|
g2pw_batch_results = g2pw._g2pw(batch_inputs) if batch_inputs else []
|
||||||
|
|
||||||
|
for seg in processed_segments:
|
||||||
pinyins = []
|
pinyins = []
|
||||||
# Replace all English words in the sentence
|
|
||||||
seg = re.sub("[a-zA-Z]+", "", seg)
|
|
||||||
seg_cut = psg.lcut(seg)
|
seg_cut = psg.lcut(seg)
|
||||||
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
|
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
|
||||||
initials = []
|
initials = []
|
||||||
@ -204,8 +209,10 @@ def _g2p(segments):
|
|||||||
finals = sum(finals, [])
|
finals = sum(finals, [])
|
||||||
print("pypinyin结果", initials, finals)
|
print("pypinyin结果", initials, finals)
|
||||||
else:
|
else:
|
||||||
# g2pw采用整句推理
|
# g2pw采用整句推理(批量推理,逐句取结果)
|
||||||
pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)
|
if seg:
|
||||||
|
pinyins = g2pw_batch_results[g2pw_batch_cursor]
|
||||||
|
g2pw_batch_cursor += 1
|
||||||
|
|
||||||
pre_word_length = 0
|
pre_word_length = 0
|
||||||
for word, pos in seg_cut:
|
for word, pos in seg_cut:
|
||||||
|
|||||||
@ -18,6 +18,7 @@ Credits
|
|||||||
|
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from typing import List
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -37,6 +38,8 @@ def prepare_onnx_input(
|
|||||||
use_mask: bool = False,
|
use_mask: bool = False,
|
||||||
window_size: int = None,
|
window_size: int = None,
|
||||||
max_len: int = 512,
|
max_len: int = 512,
|
||||||
|
char2id: Optional[Dict[str, int]] = None,
|
||||||
|
char_phoneme_masks: Optional[Dict[str, List[int]]] = None,
|
||||||
) -> Dict[str, np.array]:
|
) -> Dict[str, np.array]:
|
||||||
if window_size is not None:
|
if window_size is not None:
|
||||||
truncated_texts, truncated_query_ids = _truncate_texts(
|
truncated_texts, truncated_query_ids = _truncate_texts(
|
||||||
@ -48,33 +51,88 @@ def prepare_onnx_input(
|
|||||||
phoneme_masks = []
|
phoneme_masks = []
|
||||||
char_ids = []
|
char_ids = []
|
||||||
position_ids = []
|
position_ids = []
|
||||||
|
tokenized_cache = {}
|
||||||
|
|
||||||
|
if char2id is None:
|
||||||
|
char2id = {char: idx for idx, char in enumerate(chars)}
|
||||||
|
if use_mask:
|
||||||
|
if char_phoneme_masks is None:
|
||||||
|
char_phoneme_masks = {
|
||||||
|
char: [1 if i in char2phonemes[char] else 0 for i in range(len(labels))]
|
||||||
|
for char in char2phonemes
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
full_phoneme_mask = [1] * len(labels)
|
||||||
|
|
||||||
for idx in range(len(texts)):
|
for idx in range(len(texts)):
|
||||||
text = (truncated_texts if window_size else texts)[idx].lower()
|
text = (truncated_texts if window_size else texts)[idx].lower()
|
||||||
query_id = (truncated_query_ids if window_size else query_ids)[idx]
|
query_id = (truncated_query_ids if window_size else query_ids)[idx]
|
||||||
|
|
||||||
try:
|
cached = tokenized_cache.get(text)
|
||||||
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
|
if cached is None:
|
||||||
except Exception:
|
try:
|
||||||
print(f'warning: text "{text}" is invalid')
|
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
|
||||||
return {}
|
except Exception:
|
||||||
|
print(f'warning: text "{text}" is invalid')
|
||||||
|
return {}
|
||||||
|
|
||||||
text, query_id, tokens, text2token, token2text = _truncate(
|
if len(tokens) <= max_len - 2:
|
||||||
max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text
|
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
|
||||||
)
|
shared_input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
|
||||||
|
shared_token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
|
||||||
|
shared_attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
|
||||||
|
cached = {
|
||||||
|
"is_short": True,
|
||||||
|
"tokens": tokens,
|
||||||
|
"text2token": text2token,
|
||||||
|
"token2text": token2text,
|
||||||
|
"input_id": shared_input_id,
|
||||||
|
"token_type_id": shared_token_type_id,
|
||||||
|
"attention_mask": shared_attention_mask,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
cached = {
|
||||||
|
"is_short": False,
|
||||||
|
"tokens": tokens,
|
||||||
|
"text2token": text2token,
|
||||||
|
"token2text": token2text,
|
||||||
|
}
|
||||||
|
tokenized_cache[text] = cached
|
||||||
|
|
||||||
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
|
if cached["is_short"]:
|
||||||
|
text_for_query = text
|
||||||
|
query_id_for_query = query_id
|
||||||
|
text2token_for_query = cached["text2token"]
|
||||||
|
input_id = cached["input_id"]
|
||||||
|
token_type_id = cached["token_type_id"]
|
||||||
|
attention_mask = cached["attention_mask"]
|
||||||
|
else:
|
||||||
|
(
|
||||||
|
text_for_query,
|
||||||
|
query_id_for_query,
|
||||||
|
tokens_for_query,
|
||||||
|
text2token_for_query,
|
||||||
|
_token2text_for_query,
|
||||||
|
) = _truncate(
|
||||||
|
max_len=max_len,
|
||||||
|
text=text,
|
||||||
|
query_id=query_id,
|
||||||
|
tokens=cached["tokens"],
|
||||||
|
text2token=cached["text2token"],
|
||||||
|
token2text=cached["token2text"],
|
||||||
|
)
|
||||||
|
processed_tokens = ["[CLS]"] + tokens_for_query + ["[SEP]"]
|
||||||
|
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
|
||||||
|
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
|
||||||
|
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
|
||||||
|
|
||||||
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
|
query_char = text_for_query[query_id_for_query]
|
||||||
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
|
if use_mask:
|
||||||
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
|
phoneme_mask = char_phoneme_masks[query_char]
|
||||||
|
else:
|
||||||
query_char = text[query_id]
|
phoneme_mask = full_phoneme_mask
|
||||||
phoneme_mask = (
|
char_id = char2id[query_char]
|
||||||
[1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels)
|
position_id = text2token_for_query[query_id_for_query] + 1 # [CLS] token locate at first place
|
||||||
)
|
|
||||||
char_id = chars.index(query_char)
|
|
||||||
position_id = text2token[query_id] + 1 # [CLS] token locate at first place
|
|
||||||
|
|
||||||
input_ids.append(input_id)
|
input_ids.append(input_id)
|
||||||
token_type_ids.append(token_type_id)
|
token_type_ids.append(token_type_id)
|
||||||
@ -83,10 +141,15 @@ def prepare_onnx_input(
|
|||||||
char_ids.append(char_id)
|
char_ids.append(char_id)
|
||||||
position_ids.append(position_id)
|
position_ids.append(position_id)
|
||||||
|
|
||||||
|
max_token_length = max(len(seq) for seq in input_ids)
|
||||||
|
|
||||||
|
def _pad_sequences(sequences, pad_value=0):
|
||||||
|
return [seq + [pad_value] * (max_token_length - len(seq)) for seq in sequences]
|
||||||
|
|
||||||
outputs = {
|
outputs = {
|
||||||
"input_ids": np.array(input_ids).astype(np.int64),
|
"input_ids": np.array(_pad_sequences(input_ids, pad_value=0)).astype(np.int64),
|
||||||
"token_type_ids": np.array(token_type_ids).astype(np.int64),
|
"token_type_ids": np.array(_pad_sequences(token_type_ids, pad_value=0)).astype(np.int64),
|
||||||
"attention_masks": np.array(attention_masks).astype(np.int64),
|
"attention_masks": np.array(_pad_sequences(attention_masks, pad_value=0)).astype(np.int64),
|
||||||
"phoneme_masks": np.array(phoneme_masks).astype(np.float32),
|
"phoneme_masks": np.array(phoneme_masks).astype(np.float32),
|
||||||
"char_ids": np.array(char_ids).astype(np.int64),
|
"char_ids": np.array(char_ids).astype(np.int64),
|
||||||
"position_ids": np.array(position_ids).astype(np.int64),
|
"position_ids": np.array(position_ids).astype(np.int64),
|
||||||
|
|||||||
@ -10,7 +10,6 @@ from typing import Any, Dict, List, Tuple
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import onnxruntime
|
import onnxruntime
|
||||||
import requests
|
import requests
|
||||||
import torch
|
|
||||||
from opencc import OpenCC
|
from opencc import OpenCC
|
||||||
from pypinyin import Style, pinyin
|
from pypinyin import Style, pinyin
|
||||||
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
||||||
@ -22,9 +21,8 @@ from .utils import load_config
|
|||||||
onnxruntime.set_default_logger_severity(3)
|
onnxruntime.set_default_logger_severity(3)
|
||||||
try:
|
try:
|
||||||
onnxruntime.preload_dlls()
|
onnxruntime.preload_dlls()
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# traceback.print_exc()
|
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
|
|
||||||
model_version = "1.1"
|
model_version = "1.1"
|
||||||
@ -55,6 +53,24 @@ def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[Lis
|
|||||||
return all_preds, all_confidences
|
return all_preds, all_confidences
|
||||||
|
|
||||||
|
|
||||||
|
def _load_json_from_candidates(filename: str, candidate_dirs: List[str]) -> Dict[str, Any]:
    """Load ``filename`` as JSON from the first candidate directory that has it.

    Directories are tried in the given order; falsy entries (``""`` or
    ``None``) are skipped.

    Args:
        filename: Name of the JSON file to look for inside each directory.
        candidate_dirs: Directories to search, in priority order.

    Returns:
        The parsed JSON content of the first matching file.

    Raises:
        FileNotFoundError: If no candidate directory contains ``filename``.
    """
    # Lazy generator: paths are only probed until the first hit.
    existing_paths = (
        path
        for path in (os.path.join(directory, filename) for directory in candidate_dirs if directory)
        if os.path.exists(path)
    )
    json_path = next(existing_paths, None)
    if json_path is None:
        raise FileNotFoundError(f"Cannot locate {filename} in candidate dirs: {candidate_dirs}")
    with open(json_path, "r", encoding="utf-8") as fr:
        return json.load(fr)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_first_existing_file(*paths: str) -> str:
    """Return the first of ``paths`` that exists on disk.

    Falsy entries (``""`` or ``None``) are skipped. Existence is checked
    with ``os.path.exists``, so an existing directory also counts.

    Args:
        *paths: Candidate paths, in priority order.

    Returns:
        The first path for which ``os.path.exists`` is true.

    Raises:
        FileNotFoundError: If none of the given paths exists.
    """
    found = next((candidate for candidate in paths if candidate and os.path.exists(candidate)), None)
    if found is None:
        raise FileNotFoundError(f"Files not found: {paths}")
    return found
|
||||||
|
|
||||||
|
|
||||||
def download_and_decompress(model_dir: str = "G2PWModel/"):
|
def download_and_decompress(model_dir: str = "G2PWModel/"):
|
||||||
if not os.path.exists(model_dir):
|
if not os.path.exists(model_dir):
|
||||||
parent_directory = os.path.dirname(model_dir)
|
parent_directory = os.path.dirname(model_dir)
|
||||||
@ -62,7 +78,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
|
|||||||
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
|
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
|
||||||
extract_dir_new = os.path.join(parent_directory, "G2PWModel")
|
extract_dir_new = os.path.join(parent_directory, "G2PWModel")
|
||||||
print("Downloading g2pw model...")
|
print("Downloading g2pw model...")
|
||||||
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip" # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
|
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"
|
||||||
with requests.get(modelscope_url, stream=True) as r:
|
with requests.get(modelscope_url, stream=True) as r:
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
with open(zip_dir, "wb") as f:
|
with open(zip_dir, "wb") as f:
|
||||||
@ -79,7 +95,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
|
|||||||
return model_dir
|
return model_dir
|
||||||
|
|
||||||
|
|
||||||
class G2PWOnnxConverter:
|
class _G2PWBaseOnnxConverter:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_dir: str = "G2PWModel/",
|
model_dir: str = "G2PWModel/",
|
||||||
@ -87,33 +103,16 @@ class G2PWOnnxConverter:
|
|||||||
model_source: str = None,
|
model_source: str = None,
|
||||||
enable_non_tradional_chinese: bool = False,
|
enable_non_tradional_chinese: bool = False,
|
||||||
):
|
):
|
||||||
uncompress_path = download_and_decompress(model_dir)
|
self.model_dir = download_and_decompress(model_dir)
|
||||||
|
self.config = load_config(config_path=os.path.join(self.model_dir, "config.py"), use_default=True)
|
||||||
sess_options = onnxruntime.SessionOptions()
|
|
||||||
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
|
||||||
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
|
|
||||||
sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0
|
|
||||||
if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
|
|
||||||
self.session_g2pW = onnxruntime.InferenceSession(
|
|
||||||
os.path.join(uncompress_path, "g2pW.onnx"),
|
|
||||||
sess_options=sess_options,
|
|
||||||
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.session_g2pW = onnxruntime.InferenceSession(
|
|
||||||
os.path.join(uncompress_path, "g2pW.onnx"),
|
|
||||||
sess_options=sess_options,
|
|
||||||
providers=["CPUExecutionProvider"],
|
|
||||||
)
|
|
||||||
self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True)
|
|
||||||
|
|
||||||
self.model_source = model_source if model_source else self.config.model_source
|
self.model_source = model_source if model_source else self.config.model_source
|
||||||
self.enable_opencc = enable_non_tradional_chinese
|
self.enable_opencc = enable_non_tradional_chinese
|
||||||
|
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
|
self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
|
||||||
|
|
||||||
polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt")
|
polyphonic_chars_path = os.path.join(self.model_dir, "POLYPHONIC_CHARS.txt")
|
||||||
monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt")
|
monophonic_chars_path = os.path.join(self.model_dir, "MONOPHONIC_CHARS.txt")
|
||||||
|
|
||||||
self.polyphonic_chars = [
|
self.polyphonic_chars = [
|
||||||
line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n")
|
line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n")
|
||||||
]
|
]
|
||||||
@ -149,31 +148,47 @@ class G2PWOnnxConverter:
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.chars = sorted(list(self.char2phonemes.keys()))
|
self.chars = sorted(list(self.char2phonemes.keys()))
|
||||||
|
self.char2id = {char: idx for idx, char in enumerate(self.chars)}
|
||||||
|
self.char_phoneme_masks = (
|
||||||
|
{
|
||||||
|
char: [1 if i in self.char2phonemes[char] else 0 for i in range(len(self.labels))]
|
||||||
|
for char in self.char2phonemes
|
||||||
|
}
|
||||||
|
if self.config.use_mask
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
self.polyphonic_chars_new = set(self.chars)
|
self.polyphonic_chars_new = set(self.chars)
|
||||||
for char in self.non_polyphonic:
|
for char in self.non_polyphonic:
|
||||||
if char in self.polyphonic_chars_new:
|
self.polyphonic_chars_new.discard(char)
|
||||||
self.polyphonic_chars_new.remove(char)
|
|
||||||
|
|
||||||
self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars}
|
self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars}
|
||||||
for char in self.non_monophonic:
|
for char in self.non_monophonic:
|
||||||
if char in self.monophonic_chars_dict:
|
self.monophonic_chars_dict.pop(char, None)
|
||||||
self.monophonic_chars_dict.pop(char)
|
|
||||||
|
|
||||||
self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]
|
default_asset_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "G2PWModel"))
|
||||||
|
candidate_asset_dirs = [self.model_dir, default_asset_dir]
|
||||||
|
self.bopomofo_convert_dict = _load_json_from_candidates(
|
||||||
|
"bopomofo_to_pinyin_wo_tune_dict.json", candidate_asset_dirs
|
||||||
|
)
|
||||||
|
self.char_bopomofo_dict = _load_json_from_candidates("char_bopomofo_dict.json", candidate_asset_dirs)
|
||||||
|
|
||||||
with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr:
|
|
||||||
self.bopomofo_convert_dict = json.load(fr)
|
|
||||||
self.style_convert_func = {
|
self.style_convert_func = {
|
||||||
"bopomofo": lambda x: x,
|
"bopomofo": lambda x: x,
|
||||||
"pinyin": self._convert_bopomofo_to_pinyin,
|
"pinyin": self._convert_bopomofo_to_pinyin,
|
||||||
}[style]
|
}[style]
|
||||||
|
|
||||||
with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr:
|
|
||||||
self.char_bopomofo_dict = json.load(fr)
|
|
||||||
|
|
||||||
if self.enable_opencc:
|
if self.enable_opencc:
|
||||||
self.cc = OpenCC("s2tw")
|
self.cc = OpenCC("s2tw")
|
||||||
|
self.enable_sentence_dedup = os.getenv("g2pw_sentence_dedup", "true").strip().lower() in {
|
||||||
|
"1",
|
||||||
|
"true",
|
||||||
|
"yes",
|
||||||
|
"y",
|
||||||
|
"on",
|
||||||
|
}
|
||||||
|
# 聚焦到多音字附近上下文,默认左右各16字;设为0表示关闭裁剪(整句)。
|
||||||
|
self.polyphonic_context_chars = max(0, int(os.getenv("g2pw_polyphonic_context_chars", "16")))
|
||||||
|
|
||||||
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
|
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
|
||||||
tone = bopomofo[-1]
|
tone = bopomofo[-1]
|
||||||
@ -181,9 +196,8 @@ class G2PWOnnxConverter:
|
|||||||
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
|
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
|
||||||
if component:
|
if component:
|
||||||
return component + tone
|
return component + tone
|
||||||
else:
|
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
|
||||||
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
|
return None
|
||||||
return None
|
|
||||||
|
|
||||||
def __call__(self, sentences: List[str]) -> List[List[str]]:
|
def __call__(self, sentences: List[str]) -> List[List[str]]:
|
||||||
if isinstance(sentences, str):
|
if isinstance(sentences, str):
|
||||||
@ -197,51 +211,147 @@ class G2PWOnnxConverter:
|
|||||||
translated_sentences.append(translated_sent)
|
translated_sentences.append(translated_sent)
|
||||||
sentences = translated_sentences
|
sentences = translated_sentences
|
||||||
|
|
||||||
texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
|
texts, model_query_ids, result_query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
|
||||||
if len(texts) == 0:
|
if len(texts) == 0:
|
||||||
# sentences no polyphonic words
|
|
||||||
return partial_results
|
return partial_results
|
||||||
|
|
||||||
onnx_input = prepare_onnx_input(
|
model_input = prepare_onnx_input(
|
||||||
tokenizer=self.tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
labels=self.labels,
|
labels=self.labels,
|
||||||
char2phonemes=self.char2phonemes,
|
char2phonemes=self.char2phonemes,
|
||||||
chars=self.chars,
|
chars=self.chars,
|
||||||
texts=texts,
|
texts=texts,
|
||||||
query_ids=query_ids,
|
query_ids=model_query_ids,
|
||||||
use_mask=self.config.use_mask,
|
use_mask=self.config.use_mask,
|
||||||
window_size=None,
|
window_size=None,
|
||||||
|
char2id=self.char2id,
|
||||||
|
char_phoneme_masks=self.char_phoneme_masks,
|
||||||
)
|
)
|
||||||
|
|
||||||
preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels)
|
if not model_input:
|
||||||
|
return partial_results
|
||||||
|
|
||||||
|
if self.enable_sentence_dedup:
|
||||||
|
preds, _confidences = self._predict_with_sentence_dedup(model_input=model_input, texts=texts)
|
||||||
|
else:
|
||||||
|
preds, _confidences = self._predict(model_input=model_input)
|
||||||
|
|
||||||
if self.config.use_char_phoneme:
|
if self.config.use_char_phoneme:
|
||||||
preds = [pred.split(" ")[1] for pred in preds]
|
preds = [pred.split(" ")[1] for pred in preds]
|
||||||
|
|
||||||
results = partial_results
|
results = partial_results
|
||||||
for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
|
for sent_id, query_id, pred in zip(sent_ids, result_query_ids, preds):
|
||||||
results[sent_id][query_id] = self.style_convert_func(pred)
|
results[sent_id][query_id] = self.style_convert_func(pred)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
|
def _prepare_data(
|
||||||
texts, query_ids, sent_ids, partial_results = [], [], [], []
|
self, sentences: List[str]
|
||||||
|
) -> Tuple[List[str], List[int], List[int], List[int], List[List[str]]]:
|
||||||
|
texts, model_query_ids, result_query_ids, sent_ids, partial_results = [], [], [], [], []
|
||||||
for sent_id, sent in enumerate(sentences):
|
for sent_id, sent in enumerate(sentences):
|
||||||
# pypinyin works well for Simplified Chinese than Traditional Chinese
|
|
||||||
sent_s = tranditional_to_simplified(sent)
|
sent_s = tranditional_to_simplified(sent)
|
||||||
pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3)
|
pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3)
|
||||||
partial_result = [None] * len(sent)
|
partial_result = [None] * len(sent)
|
||||||
|
polyphonic_indices: List[int] = []
|
||||||
for i, char in enumerate(sent):
|
for i, char in enumerate(sent):
|
||||||
if char in self.polyphonic_chars_new:
|
if char in self.polyphonic_chars_new:
|
||||||
texts.append(sent)
|
polyphonic_indices.append(i)
|
||||||
query_ids.append(i)
|
|
||||||
sent_ids.append(sent_id)
|
|
||||||
elif char in self.monophonic_chars_dict:
|
elif char in self.monophonic_chars_dict:
|
||||||
partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char])
|
partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char])
|
||||||
elif char in self.char_bopomofo_dict:
|
elif char in self.char_bopomofo_dict:
|
||||||
partial_result[i] = pypinyin_result[i][0]
|
partial_result[i] = pypinyin_result[i][0]
|
||||||
# partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
|
|
||||||
else:
|
else:
|
||||||
partial_result[i] = pypinyin_result[i][0]
|
partial_result[i] = pypinyin_result[i][0]
|
||||||
|
|
||||||
|
if polyphonic_indices:
|
||||||
|
if self.polyphonic_context_chars > 0:
|
||||||
|
left = max(0, polyphonic_indices[0] - self.polyphonic_context_chars)
|
||||||
|
right = min(len(sent), polyphonic_indices[-1] + self.polyphonic_context_chars + 1)
|
||||||
|
sent_for_predict = sent[left:right]
|
||||||
|
query_offset = left
|
||||||
|
else:
|
||||||
|
sent_for_predict = sent
|
||||||
|
query_offset = 0
|
||||||
|
|
||||||
|
for index in polyphonic_indices:
|
||||||
|
texts.append(sent_for_predict)
|
||||||
|
model_query_ids.append(index - query_offset)
|
||||||
|
result_query_ids.append(index)
|
||||||
|
sent_ids.append(sent_id)
|
||||||
|
|
||||||
partial_results.append(partial_result)
|
partial_results.append(partial_result)
|
||||||
return texts, query_ids, sent_ids, partial_results
|
return texts, model_query_ids, result_query_ids, sent_ids, partial_results
|
||||||
|
|
||||||
|
def _predict(self, model_input: Dict[str, Any]) -> Tuple[List[str], List[float]]:
    """Run inference on ``model_input`` and return ``(predictions, confidences)``.

    Abstract hook: this implementation always raises ``NotImplementedError``;
    a subclass must override it with backend-specific inference.
    """
    raise NotImplementedError
|
||||||
|
|
||||||
|
def _predict_with_sentence_dedup(
    self, model_input: Dict[str, Any], texts: List[str]
) -> Tuple[List[str], List[float]]:
    """Predict every query, issuing one ``_predict`` call per distinct text.

    Queries are grouped by their entry in ``texts``. For a group with more
    than one query, the ``input_ids``, ``token_type_ids`` and
    ``attention_masks`` entries are cut down to their first row, while every
    other entry of ``model_input`` keeps one row per query. Results are
    written back at each query's original position.

    Args:
        model_input: Mapping of input name to a per-query indexable value.
        texts: The text associated with each query, in query order.

    Returns:
        ``(preds, confidences)``, each with ``len(texts)`` entries.
    """
    # With zero or one query there is nothing to deduplicate.
    if len(texts) <= 1:
        return self._predict(model_input=model_input)

    # Map each distinct text to the positions of the queries that use it.
    grouped_indices: Dict[str, List[int]] = {}
    for idx, text in enumerate(texts):
        grouped_indices.setdefault(text, []).append(idx)

    # No text is shared between queries: fall back to a single call.
    if all(len(indices) == 1 for indices in grouped_indices.values()):
        return self._predict(model_input=model_input)

    # Placeholders, overwritten by original query position below.
    # NOTE(review): zip() below stops at the shortest sequence, so a
    # placeholder survives if _predict returns fewer items than the group has.
    preds: List[str] = [""] * len(texts)
    confidences: List[float] = [0.0] * len(texts)
    for indices in grouped_indices.values():
        # Select this group's rows from every input.
        # NOTE(review): assumes each value supports indexing by a list of
        # ints (e.g. numpy arrays) - confirm against prepare_onnx_input.
        group_input = {name: value[indices] for name, value in model_input.items()}
        if len(indices) > 1:
            # Keep a single row of the sentence-level inputs for the group.
            # NOTE(review): presumably the backend pairs that one row with
            # every remaining per-query row - confirm against predict().
            for name in ("input_ids", "token_type_ids", "attention_masks"):
                group_input[name] = group_input[name][:1]

        group_preds, group_confidences = self._predict(model_input=group_input)
        for output_idx, pred, confidence in zip(indices, group_preds, group_confidences):
            preds[output_idx] = pred
            confidences[output_idx] = confidence

    return preds, confidences
|
||||||
|
|
||||||
|
|
||||||
|
class G2PWOnnxConverter(_G2PWBaseOnnxConverter):
    """Converter whose ``_predict`` runs through an ONNX Runtime session."""

    def __init__(
        self,
        model_dir: str = "G2PWModel/",
        style: str = "bopomofo",
        model_source: str = None,
        enable_non_tradional_chinese: bool = False,
    ):
        """Initialise the base converter, then open the ONNX inference session.

        All four arguments are forwarded unchanged to the base-class
        constructor. The model file is the first of ``g2pW.onnx`` /
        ``g2pw.onnx`` found under ``self.model_dir``.
        """
        super().__init__(
            model_dir=model_dir,
            style=style,
            model_source=model_source,
            enable_non_tradional_chinese=enable_non_tradional_chinese,
        )

        options = onnxruntime.SessionOptions()
        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
        options.intra_op_num_threads = 2

        model_file = _find_first_existing_file(
            os.path.join(self.model_dir, "g2pW.onnx"),
            os.path.join(self.model_dir, "g2pw.onnx"),
        )

        # CPU is always listed; CUDA is put in front of it when available.
        providers = ["CPUExecutionProvider"]
        if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
            providers.insert(0, "CUDAExecutionProvider")

        self.session_g2pw = onnxruntime.InferenceSession(
            model_file,
            sess_options=options,
            providers=providers,
        )

    def _predict(self, model_input: Dict[str, Any]) -> Tuple[List[str], List[float]]:
        """Delegate to ``predict`` using this instance's session and labels."""
        return predict(session=self.session_g2pw, onnx_input=model_input, labels=self.labels)
|
||||||
|
|||||||
@ -396,4 +396,5 @@ arpa = {
|
|||||||
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
|
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
|
||||||
symbols = sorted(set(symbols))
|
symbols = sorted(set(symbols))
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
print(symbols)
|
||||||
print(len(symbols))
|
print(len(symbols))
|
||||||
|
|||||||
@ -48,6 +48,8 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
|||||||
|
|
||||||
请不要尬黑GPT-SoVITS推理速度慢,谢谢!
|
请不要尬黑GPT-SoVITS推理速度慢,谢谢!
|
||||||
|
|
||||||
|
CPU-optimized inference version: https://github.com/baicai-1145/GPT-SoVITS-CPUFast
|
||||||
|
|
||||||
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|||||||
0
Ref_Audio_Selector/__init__.py
Normal file
0
Ref_Audio_Selector/__init__.py
Normal file
0
Ref_Audio_Selector/common/__init__.py
Normal file
0
Ref_Audio_Selector/common/__init__.py
Normal file
156
Ref_Audio_Selector/common/common.py
Normal file
156
Ref_Audio_Selector/common/common.py
Normal file
@ -0,0 +1,156 @@
|
|||||||
|
from tools import my_utils
|
||||||
|
from config import python_exec, is_half
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class RefAudioListManager:
    """Index of the ``.wav`` files under a root directory, grouped by category.

    Files directly under the root belong to the ``'default'`` category; files
    in a nested directory belong to a category named after the directory's
    path relative to the root with all path separators removed.
    """

    def __init__(self, root_dir):
        """Walk ``root_dir`` and record the absolute path of every ``.wav`` file."""
        self.audio_dict = {'default': []}
        root = os.path.abspath(root_dir)

        for current_dir, _subdirs, filenames in os.walk(root):
            rel = os.path.relpath(current_dir, root)
            category = 'default' if rel == '.' else rel.replace(os.sep, '')

            wav_paths = [
                os.path.join(current_dir, filename)
                for filename in filenames
                if filename.endswith('.wav')
            ]
            # A category only appears once it has at least one audio file.
            if wav_paths:
                self.audio_dict.setdefault(category, []).extend(wav_paths)

    def get_audio_list(self):
        """Return the category -> list-of-paths mapping (the internal dict itself)."""
        return self.audio_dict

    def get_flattened_audio_list(self):
        """Return every recorded audio path as one flat list."""
        return [path for paths in self.audio_dict.values() for path in paths]

    def get_ref_audio_list(self):
        """Return one dict per audio file with ``emotion``, ``ref_path`` and ``ref_text``.

        ``ref_text`` is the file name without its extension and ``emotion`` is
        ``"<category>-<ref_text>"``.
        """
        entries = []
        for category, paths in self.audio_dict.items():
            for path in paths:
                stem = os.path.splitext(os.path.basename(path))[0]
                entries.append({
                    'emotion': f"{category}-{stem}",
                    'ref_path': path,
                    'ref_text': stem,
                })
        return entries
|
||||||
|
|
||||||
|
|
||||||
|
def batch_clean_paths(paths):
    """Apply ``my_utils.clean_path`` to every path in ``paths``.

    Args:
        paths (list[str]): Paths to clean.

    Returns:
        list[str]: The cleaned paths, in the same order.
    """
    return [my_utils.clean_path(raw_path) for raw_path in paths]
|
||||||
|
|
||||||
|
|
||||||
|
def read_text_file_to_list(file_path):
    """Return the lines of the UTF-8 text file at ``file_path``, without line endings."""
    # UTF-8 is forced so that Chinese text is decoded correctly.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return content.splitlines()
|
||||||
|
|
||||||
|
|
||||||
|
def get_filename_without_extension(file_path):
    """Return the final component of ``file_path`` with its last extension removed.

    Parameters:
        file_path (str): The full path to the file.

    Returns:
        str: The file name without its extension.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem
|
||||||
|
|
||||||
|
|
||||||
|
def read_file(file_path):
    """Return the whole content of the UTF-8 text file at ``file_path`` as one string."""
    # The context manager closes the file as soon as the content is read.
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
|
||||||
|
|
||||||
|
|
||||||
|
def write_text_to_file(text, output_file_path):
    """Write ``text`` to ``output_file_path`` as UTF-8, replacing any existing file.

    I/O failures are not raised: they are reported on stdout, as is success.
    """
    try:
        with open(output_file_path, 'w', encoding='utf-8') as handle:
            handle.write(text)
    except OSError as error:  # IOError is an alias of OSError in Python 3
        print(f"Error occurred while writing to the file: {error}")
        return
    print(f"Text successfully written to file: {output_file_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_path_existence_and_return(path):
    """Return ``path`` if it exists (file or directory), otherwise an empty string.

    :param path: File or directory path to check (string).
    :return: The original path when it exists; ``""`` otherwise.
    """
    return path if os.path.exists(path) else ""
|
||||||
|
|
||||||
|
|
||||||
|
def open_file(filepath):
    """Open ``filepath`` with the platform's default handler.

    Uses ``open`` on macOS, ``os.startfile`` on Windows and ``xdg-open`` on
    other POSIX systems; does nothing on any other platform.
    """
    if sys.platform.startswith('darwin'):  # macOS
        launcher = 'open'
    elif os.name == 'nt':  # Windows has a native API, no external launcher
        os.startfile(filepath)
        return
    elif os.name == 'posix':  # Linux, Unix, etc.
        launcher = 'xdg-open'
    else:
        return
    subprocess.run([launcher, filepath])
|
||||||
|
|
||||||
|
|
||||||
|
def start_new_service(script_path):
    """Start ``script_path`` with ``python_exec`` in a new terminal and return the process.

    Windows uses ``start cmd /k``; every other platform uses ``xterm -e``.
    The returned ``subprocess.Popen`` handle lets the caller stop the
    service later with ``terminate()`` or, to force it, ``kill()``.
    """
    # NOTE(review): the command is a shell string (shell=True); paths that
    # contain spaces or shell metacharacters are passed through unquoted.
    command = (
        f'start cmd /k {python_exec} {script_path}'
        if sys.platform.startswith('win')
        else f'xterm -e {python_exec} {script_path}'
    )
    return subprocess.Popen(command, shell=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke test for batch_clean_paths: one path with mixed
    # separators and one wrapped in literal double quotes.
    # Locals are named so they do not shadow the builtin ``dir``.
    raw_path = r'C:\Users\Administrator\Desktop/test'
    quoted_path = r'"C:\Users\Administrator\Desktop\test2"'
    cleaned_path, cleaned_quoted_path = batch_clean_paths([raw_path, quoted_path])
    print(cleaned_path, cleaned_quoted_path)
|
||||||
46
Ref_Audio_Selector/common/model_manager.py
Normal file
46
Ref_Audio_Selector/common/model_manager.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Paths of the pretrained checkpoints; each is the first entry returned by
# the corresponding get_*_model_names() function below.
pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
# Directories scanned for additional weights (".pth" / ".ckpt" files).
SoVITS_weight_root = "SoVITS_weights"
GPT_weight_root = "GPT_weights"
# Created at import time so that the os.listdir() calls below cannot fail
# on a missing directory.
os.makedirs(SoVITS_weight_root, exist_ok=True)
os.makedirs(GPT_weight_root, exist_ok=True)

# Speaker-verification model descriptors, keyed by model name. Each entry
# holds a task name, a model path and a model revision.
# NOTE(review): the key names look like pipeline keyword arguments - confirm
# against the code that consumes this dict.
speaker_verification_models = {
    'speech_campplus_sv_zh-cn_16k-common': {
        'task': 'speaker-verification',
        'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_campplus_sv_zh-cn_16k-common',
        'model_revision': 'v1.0.0'
    },
    'speech_eres2net_sv_zh-cn_16k-common': {
        'task': 'speaker-verification',
        'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_eres2net_sv_zh-cn_16k-common',
        'model_revision': 'v1.0.5'
    }
}
|
||||||
|
|
||||||
|
def custom_sort_key(s):
    """Return a natural-sort key for ``s``.

    The string is split into alternating non-digit and digit runs; digit
    runs become ``int`` so that e.g. ``"e2"`` sorts before ``"e10"``.

    Args:
        s (str): The string to build a key for.

    Returns:
        list: ``str`` pieces at even positions and ``int`` pieces at odd
        positions, which keeps keys of different strings comparable.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    parts = re.split(r'(\d+)', s)
    # isdecimal() matches exactly what \d captures. isdigit() also accepts
    # characters such as '²' that int() cannot parse.
    return [int(part) if part.isdecimal() else part for part in parts]
|
||||||
|
|
||||||
|
|
||||||
|
def get_gpt_model_names():
    """Return the pretrained GPT checkpoint plus every ``.ckpt`` in ``GPT_weight_root``.

    Returns:
        list[str]: Model paths in natural order (see ``custom_sort_key``).
    """
    gpt_names = [pretrained_gpt_name]
    gpt_names.extend(
        "%s/%s" % (GPT_weight_root, name)
        for name in os.listdir(GPT_weight_root)
        if name.endswith(".ckpt")
    )
    # sorted() returns a new list; its result was previously discarded, so
    # the names came back in os.listdir() order.
    return sorted(gpt_names, key=custom_sort_key)
|
||||||
|
|
||||||
|
|
||||||
|
def get_sovits_model_names():
    """Return the pretrained SoVITS checkpoint plus every ``.pth`` in ``SoVITS_weight_root``.

    Returns:
        list[str]: Model paths in natural order (see ``custom_sort_key``).
    """
    sovits_names = [pretrained_sovits_name]
    sovits_names.extend(
        "%s/%s" % (SoVITS_weight_root, name)
        for name in os.listdir(SoVITS_weight_root)
        if name.endswith(".pth")
    )
    # sorted() returns a new list; its result was previously discarded, so
    # the names came back in os.listdir() order.
    return sorted(sovits_names, key=custom_sort_key)
|
||||||
|
|
||||||
72
Ref_Audio_Selector/common/time_util.py
Normal file
72
Ref_Audio_Selector/common/time_util.py
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import time
|
||||||
|
import os
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import p_logger
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
|
||||||
|
|
||||||
|
def timeit_decorator(func):
    """Decorator that logs the execution time of the decorated function.

    Timing is only performed when ``params.time_log_print_type`` is
    ``'file'``; otherwise the call is passed straight through.

    Args:
        func (function): The function to time.

    Returns:
        function: A wrapper with the same call signature and return value
        that keeps ``func``'s name and docstring.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if params.time_log_print_type != 'file':
            return func(*args, **kwargs)

        start_time = time.perf_counter()  # high-resolution start point
        func_result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time

        log_message = f"进程ID: {os.getpid()}, {func.__name__} 执行耗时: {elapsed_time:.6f} 秒"
        p_logger.info(log_message)

        return func_result

    return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
def time_monitor(func):
    """Decorator that returns the elapsed time together with the result.

    Args:
        func (function): The function to time.

    Returns:
        function: A wrapper returning ``(elapsed_seconds, result)``, where
        ``result`` is whatever ``func`` returned. The wrapper keeps
        ``func``'s name and docstring.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()  # high-resolution start point
        func_result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        return elapsed_time, func_result

    return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
# Example use of the decorator
@timeit_decorator
def example_function(n):
    """Demo target for ``timeit_decorator``: sleep ``n`` seconds, return ``n * 2``."""
    time.sleep(n)  # stands in for the slow operation being timed
    return n * 2
|
||||||
|
|
||||||
|
|
||||||
|
def example_function2(n):
    """Undecorated demo function: sleep ``n`` seconds, return ``n * 2``."""
    time.sleep(n)  # stands in for the slow operation being timed
    return n * 2
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Call the decorated function
    # result = example_function(2)
    # Prints the (elapsed_seconds, result) tuple returned by the
    # time_monitor wrapper.
    print(time_monitor(example_function2)(2))
|
||||||
57
Ref_Audio_Selector/config.ini
Normal file
57
Ref_Audio_Selector/config.ini
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
# config.ini
|
||||||
|
|
||||||
|
[Base]
|
||||||
|
# 服务端口号
|
||||||
|
server_port = 9423
|
||||||
|
# 参考音频目录
|
||||||
|
reference_audio_dir = refer_audio
|
||||||
|
# 临时文件目录
|
||||||
|
temp_dir = Ref_Audio_Selector/temp
|
||||||
|
|
||||||
|
[Log]
|
||||||
|
# 日志保存目录路径
|
||||||
|
log_dir = Ref_Audio_Selector/log/general
|
||||||
|
# 日志级别 CRITICAL、FATAL、ERROR、WARNING、WARN、INFO、DEBUG、NOTSET
|
||||||
|
log_level = INFO
|
||||||
|
# 函数时间消耗日志打印类型 file 打印到文件; close 关闭
|
||||||
|
time_log_print_type = file
|
||||||
|
# 函数时间消耗日志保存目录路径
|
||||||
|
time_log_print_dir = Ref_Audio_Selector/log/performance
|
||||||
|
|
||||||
|
[AudioSample]
|
||||||
|
# list转换待选参考音频目录
|
||||||
|
list_to_convert_reference_audio_dir = refer_audio_all
|
||||||
|
# 音频相似度目录
|
||||||
|
audio_similarity_dir = similarity
|
||||||
|
# 是否开启基准音频预采样 true false
|
||||||
|
enable_pre_sample = true
|
||||||
|
|
||||||
|
[Inference]
|
||||||
|
# 默认测试文本位置
|
||||||
|
default_test_text_path = Ref_Audio_Selector/file/test_content/test_content.txt
|
||||||
|
# 推理音频目录
|
||||||
|
inference_audio_dir = inference_audio
|
||||||
|
# 推理音频文本聚合目录
|
||||||
|
inference_audio_text_aggregation_dir = text
|
||||||
|
# 推理音频情绪聚合目录
|
||||||
|
inference_audio_emotion_aggregation_dir = emotion
|
||||||
|
|
||||||
|
[ResultCheck]
|
||||||
|
# asr输出文件
|
||||||
|
asr_filename = asr
|
||||||
|
# 文本相似度输出目录
|
||||||
|
text_similarity_output_dir = text_similarity
|
||||||
|
# 文本情绪平均相似度报告文件名
|
||||||
|
text_emotion_average_similarity_report_filename = average_similarity
|
||||||
|
# 文本相似度按情绪聚合明细文件名
|
||||||
|
text_similarity_by_emotion_detail_filename = emotion_group_detail
|
||||||
|
# 文本相似度按文本聚合明细文件名
|
||||||
|
text_similarity_by_text_detail_filename = text_group_detail
|
||||||
|
|
||||||
|
[AudioConfig]
|
||||||
|
# 默认模板文件位置
|
||||||
|
default_template_path = Ref_Audio_Selector/file/config_template/ref_audio_template.txt
|
||||||
|
# 参考音频配置文件名
|
||||||
|
reference_audio_config_filename = refer_audio
|
||||||
|
|
||||||
|
[Other]
|
||||||
0
Ref_Audio_Selector/config_param/__init__.py
Normal file
0
Ref_Audio_Selector/config_param/__init__.py
Normal file
111
Ref_Audio_Selector/config_param/config_manager.py
Normal file
111
Ref_Audio_Selector/config_param/config_manager.py
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
import configparser
|
||||||
|
import os
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
|
||||||
|
|
||||||
|
class ParamReadWriteManager:
    """Persist individual parameters as ``<key>.txt`` files under ``base_dir``.

    Every attribute other than ``base_dir`` is a key name whose value equals
    the attribute name; pass it to ``read`` / ``write``.
    """

    def __init__(self):
        """Create the storage directory if needed and define the known keys."""
        self.base_dir = 'Ref_Audio_Selector/file/base_info'
        os.makedirs(self.base_dir, exist_ok=True)

        # Basic information
        self.work_dir = 'work_dir'
        self.role = 'role'

        # Step 1
        self.subsection_num = 'subsection_num'
        self.sample_num = 'sample_num'

        # Step 2
        self.api_set_model_base_url = 'api_set_model_base_url'
        self.api_gpt_param = 'api_gpt_param'
        self.api_sovits_param = 'api_sovits_param'

        self.api_v2_set_gpt_model_base_url = 'api_v2_set_gpt_model_base_url'
        self.api_v2_gpt_model_param = 'api_v2_gpt_model_param'
        self.api_v2_set_sovits_model_base_url = 'api_v2_set_sovits_model_base_url'
        self.api_v2_sovits_model_param = 'api_v2_sovits_model_param'

        self.text_url = 'text_url'
        self.text_param = 'text_param'
        self.refer_type_param = 'refer_type_param'
        self.ref_path_param = 'ref_path_param'
        self.ref_text_param = 'ref_text_param'
        self.emotion_param = 'emotion_param'

        self.test_content_path = 'test_content_path'
        self.request_concurrency_num = 'request_concurrency_num'

        # Step 3
        self.text_similarity_amplification_boundary = 'text_similarity_amplification_boundary'

        # Steps 4 and 5
        self.text_template = 'text_template'

    def read(self, key):
        """Return the stripped content stored for ``key``, or ``''`` if no file exists."""
        stored_file = os.path.join(self.base_dir, key + '.txt')
        if not os.path.exists(stored_file):
            return ''
        return common.read_file(stored_file).strip()

    def write(self, key, content):
        """Store ``content`` for ``key``, stripped of surrounding whitespace.

        Non-string content is converted with ``str()`` first.
        """
        stored_file = os.path.join(self.base_dir, key + '.txt')
        text = content if isinstance(content, str) else str(content)
        common.write_text_to_file(text.strip(), stored_file)
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigManager:
    """Accessor for ``Ref_Audio_Selector/config.ini`` with one getter per section."""

    def __init__(self):
        """Parse the config file as UTF-8."""
        self.config_path = 'Ref_Audio_Selector/config.ini'
        self.config = configparser.ConfigParser()
        self.config.read(self.config_path, encoding='utf-8')

    def _lookup(self, section, key):
        """Return the raw string stored for ``key`` in ``section``."""
        return self.config.get(section, key)

    def get_base(self, key):
        """Return ``key`` from the ``[Base]`` section."""
        return self._lookup('Base', key)

    def get_log(self, key):
        """Return ``key`` from the ``[Log]`` section."""
        return self._lookup('Log', key)

    def get_audio_sample(self, key):
        """Return ``key`` from the ``[AudioSample]`` section."""
        return self._lookup('AudioSample', key)

    def get_inference(self, key):
        """Return ``key`` from the ``[Inference]`` section."""
        return self._lookup('Inference', key)

    def get_result_check(self, key):
        """Return ``key`` from the ``[ResultCheck]`` section."""
        return self._lookup('ResultCheck', key)

    def get_audio_config(self, key):
        """Return ``key`` from the ``[AudioConfig]`` section."""
        return self._lookup('AudioConfig', key)

    def get_other(self, key):
        """Return ``key`` from the ``[Other]`` section."""
        return self._lookup('Other', key)

    def print(self):
        """Write every section and its key/value pairs to stdout; returns ``None``."""
        for section_name in self.config.sections():
            print('[{}]'.format(section_name))
            for option, value in self.config[section_name].items():
                print('{} = {}'.format(option, value))
            print()
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level instances, created once when this module is first imported
# and handed out by get_config() / get_rw_param().
_config = ConfigManager()
_param_read_write_manager = ParamReadWriteManager()
|
||||||
|
|
||||||
|
|
||||||
|
def get_config():
    """Return the module-level ``ConfigManager`` instance."""
    return _config
|
||||||
|
|
||||||
|
|
||||||
|
def get_rw_param():
    """Return the module-level ``ParamReadWriteManager`` instance."""
    return _param_read_write_manager
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # ConfigManager.print() writes to stdout itself and returns None, so it
    # is called directly; passing its result to print() would add a stray
    # "None" line to the output.
    _config.print()
|
||||||
58
Ref_Audio_Selector/config_param/config_params.py
Normal file
58
Ref_Audio_Selector/config_param/config_params.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
import Ref_Audio_Selector.config_param.config_manager as config_manager
|
||||||
|
|
||||||
|
# Every value below is read once, at import time, from the shared
# ConfigManager. Apart from server_port, each is the raw string stored in
# config.ini.
config = config_manager.get_config()

# [Base]
# Server port number
server_port = int(config.get_base('server_port'))
# Reference audio directory
reference_audio_dir = config.get_base('reference_audio_dir')
# Temporary file directory
temp_dir = config.get_base('temp_dir')

# [Log]
# Directory where log files are saved
log_dir = config.get_log('log_dir')
# Log level: CRITICAL, FATAL, ERROR, WARNING, WARN, INFO, DEBUG, NOTSET
log_level = config.get_log('log_level')
# How function-timing logs are emitted: "file" writes to a file, "close" disables them
time_log_print_type = config.get_log('time_log_print_type')
# Directory where function-timing logs are saved
time_log_print_dir = config.get_log('time_log_print_dir')

# [AudioSample]
# Directory of candidate reference audio produced by the list conversion
list_to_convert_reference_audio_dir = config.get_audio_sample('list_to_convert_reference_audio_dir')
# Audio similarity directory
audio_similarity_dir = config.get_audio_sample('audio_similarity_dir')
# Whether baseline-audio pre-sampling is enabled: "true" / "false"
# NOTE(review): this is the raw string, not a bool - the string "false" is
# truthy, so confirm callers compare against the text.
enable_pre_sample = config.get_audio_sample('enable_pre_sample')

# [Inference]
# Location of the default test text
default_test_text_path = config.get_inference('default_test_text_path')
# Inference audio directory
inference_audio_dir = config.get_inference('inference_audio_dir')
# Directory of inference audio aggregated by text
inference_audio_text_aggregation_dir = config.get_inference('inference_audio_text_aggregation_dir')
# Directory of inference audio aggregated by emotion
inference_audio_emotion_aggregation_dir = config.get_inference('inference_audio_emotion_aggregation_dir')

# [ResultCheck]
# ASR output file
asr_filename = config.get_result_check('asr_filename')
# Text similarity output directory
text_similarity_output_dir = config.get_result_check('text_similarity_output_dir')
# File name of the per-emotion average text similarity report
text_emotion_average_similarity_report_filename = config.get_result_check('text_emotion_average_similarity_report_filename')
# File name of the text similarity detail grouped by emotion
text_similarity_by_emotion_detail_filename = config.get_result_check('text_similarity_by_emotion_detail_filename')
# File name of the text similarity detail grouped by text
text_similarity_by_text_detail_filename = config.get_result_check('text_similarity_by_text_detail_filename')

# [AudioConfig]
# Location of the default template file
default_template_path = config.get_audio_config('default_template_path')
# File name of the reference audio configuration
reference_audio_config_filename = config.get_audio_config('reference_audio_config_filename')
|
||||||
|
|
||||||
65
Ref_Audio_Selector/config_param/log_config.py
Normal file
65
Ref_Audio_Selector/config_param/log_config.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import datetime
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
|
||||||
|
|
||||||
|
def create_general_logger():
    """Configure and return the ``'general'`` logger.

    The logger gets a console handler and a file handler writing to
    ``<params.log_dir>/<YYYY-MM-DD>.log``; its level is taken from
    ``params.log_level``. The root logger is also configured (WARNING level)
    to write to the same file handler.

    Returns:
        logging.Logger: The ``'general'`` logger.
    """
    # Current date, used for the log file name
    current_date = datetime.datetime.now().strftime('%Y-%m-%d')

    # Handler for console output
    console_handler = logging.StreamHandler()
    # console_handler.setLevel(logging.INFO)
    # Format of the console output
    console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(console_formatter)
    # Intended to set the character encoding to utf-8.
    # NOTE(review): confirm StreamHandler actually consults this attribute.
    console_handler.encoding = 'utf-8'

    os.makedirs(params.log_dir, exist_ok=True)

    # Handler for the general log file (appended to, one file per day)
    general_handler = logging.FileHandler(f"{params.log_dir}/{current_date}.log", mode='a', encoding='utf-8')
    # general_handler.setLevel(logging.INFO)
    general_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    general_handler.setFormatter(general_formatter)

    # Configure the general logger
    general_logger = logging.getLogger('general')
    # Translate the configured level name (e.g. "INFO") for setLevel().
    level = logging.getLevelName(params.log_level)
    general_logger.setLevel(level)
    general_logger.addHandler(console_handler)
    general_logger.addHandler(general_handler)

    # Configure the root logger as well, just in case
    logging.basicConfig(level=logging.WARNING, handlers=[general_handler])

    return general_logger
|
||||||
|
|
||||||
|
|
||||||
|
def create_performance_logger():
    """Configure and return the ``'performance'`` logger.

    The logger runs at INFO level and appends to
    ``<params.time_log_print_dir>/<YYYY-MM-DD>.log``.

    Returns:
        logging.Logger: The ``'performance'`` logger.
    """
    # Current date, used for the log file name
    current_date = datetime.datetime.now().strftime('%Y-%m-%d')

    os.makedirs(params.time_log_print_dir, exist_ok=True)

    # Handler dedicated to the performance-monitoring log
    performance_handler = logging.FileHandler(
        f"{params.time_log_print_dir}/{current_date}.log", mode='a', encoding='utf-8')
    # performance_handler.setLevel(logging.INFO)
    performance_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    performance_handler.setFormatter(performance_formatter)

    # Configure the logger dedicated to performance monitoring
    performance_logger = logging.getLogger('performance')
    performance_logger.setLevel(logging.INFO)
    performance_logger.addHandler(performance_handler)

    return performance_logger
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logging():
    """Create both loggers and return them as ``(general, performance)``."""
    return create_general_logger(), create_performance_logger()
|
||||||
|
|
||||||
|
|
||||||
|
# Built once, when this module is first imported: `logger` is the general
# logger, `p_logger` the performance logger.
logger, p_logger = setup_logging()
|
||||||
0
Ref_Audio_Selector/file/base_info/role.txt
Normal file
0
Ref_Audio_Selector/file/base_info/role.txt
Normal file
0
Ref_Audio_Selector/file/base_info/work_dir.txt
Normal file
0
Ref_Audio_Selector/file/base_info/work_dir.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
"${emotion}": {
|
||||||
|
"ref_wav_path": "${ref_path}",
|
||||||
|
"prompt_text": "${ref_text}",
|
||||||
|
"prompt_language": "中文"
|
||||||
|
}
|
||||||
4
Ref_Audio_Selector/file/test_content/test_content.txt
Normal file
4
Ref_Audio_Selector/file/test_content/test_content.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
也是只有一次。”白蓉简单地回答,然后迅速转移话锋,搂住罗辑的脖子说,“算了,我不要那生日礼物了,你也回到正常的生活中来,好吗?”
|
||||||
|
云天明看到那是一条丑陋的虫子,软乎乎湿漉漉的,在她白皙的手指间蠕动着,旁边一个女生尖叫道:恶心死了,你碰它干吗?!程心把虫子轻轻放到旁边的草丛中,说,它在这里会给踩死的。
|
||||||
|
“那么多的星星,像雾似的。”云天明感叹道。程心把目光从银河收回,转头看着他,指着下面的校园和城市说:“你看下面也很漂亮啊,我们的生活是在这儿,可不是在那么远的银河里。”
|
||||||
|
“可我们的专业,不就是为了到地球之外去吗?”“那是为了这里的生活更好,可不是为了逃离地球啊。”云天明当然知道程心的话是委婉地指向他的孤僻和自闭,他也只有默然以对。
|
||||||
1066
Ref_Audio_Selector/ref_audio_selector_webui.py
Normal file
1066
Ref_Audio_Selector/ref_audio_selector_webui.py
Normal file
File diff suppressed because it is too large
Load Diff
5
Ref_Audio_Selector/start_ref_audio_selector_webui.bat
Normal file
5
Ref_Audio_Selector/start_ref_audio_selector_webui.bat
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
CHCP 65001
@echo off
rem Code page 65001 (UTF-8) was selected above, before echo was turned off.
rem Move up one directory, then start the web UI with runtime\python.exe.
cd ../
runtime\python.exe ./Ref_Audio_Selector/ref_audio_selector_webui.py
rem Wait for a key press before the window closes.
pause
|
||||||
0
Ref_Audio_Selector/tool/__init__.py
Normal file
0
Ref_Audio_Selector/tool/__init__.py
Normal file
0
Ref_Audio_Selector/tool/asr/__init__.py
Normal file
0
Ref_Audio_Selector/tool/asr/__init__.py
Normal file
120
Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py
Normal file
120
Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
|
||||||
|
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
|
||||||
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from tools.asr.config import check_fw_local_models
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
# Language codes known to this script. The final entry, "auto", is not a
# language: execute_asr_multi_level_dir replaces it with None so that the
# model picks the most probable language itself.
# NOTE(review): presumably used to validate a command-line language
# argument - confirm where the list is consumed.
language_code_list = [
    "af", "am", "ar", "as", "az",
    "ba", "be", "bg", "bn", "bo",
    "br", "bs", "ca", "cs", "cy",
    "da", "de", "el", "en", "es",
    "et", "eu", "fa", "fi", "fo",
    "fr", "gl", "gu", "ha", "haw",
    "he", "hi", "hr", "ht", "hu",
    "hy", "id", "is", "it", "ja",
    "jw", "ka", "kk", "km", "kn",
    "ko", "la", "lb", "ln", "lo",
    "lt", "lv", "mg", "mi", "mk",
    "ml", "mn", "mr", "ms", "mt",
    "my", "ne", "nl", "nn", "no",
    "oc", "pa", "pl", "ps", "pt",
    "ro", "ru", "sa", "sd", "si",
    "sk", "sl", "sn", "so", "sq",
    "sr", "su", "sv", "sw", "ta",
    "te", "tg", "th", "tk", "tl",
    "tr", "tt", "uk", "ur", "uz",
    "vi", "yi", "yo", "zh", "yue",
    "auto"]
|
||||||
|
|
||||||
|
|
||||||
|
def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language, precision):
    """Transcribe every ``.wav`` file below ``input_folder`` with Faster Whisper.

    The directory tree is walked recursively. For each audio file one line of
    the form ``<wav path>|<parent dir name>|<LANGUAGE>|<text>`` is collected, and
    all lines are written to ``<output_folder>/<params.asr_filename>.list``.

    Args:
        input_folder: Root directory searched for ``.wav`` files.
        output_folder: Directory for the resulting list file (created if missing).
        model_size: Faster Whisper model name. A trailing ``-local`` selects the
            copy stored under ``tools/asr/models/faster-whisper-<name>``.
        language: Language code passed to the model, or ``'auto'`` to let the
            model choose the most probable language.
        precision: Value forwarded to ``WhisperModel`` as ``compute_type``.

    Returns:
        The absolute path of the written list file, or ``None`` when the model
        cannot be loaded or transcribing a file fails (the traceback is logged).
    """
    local_suffix = '-local'
    if model_size.endswith(local_suffix):
        model_size = model_size[:-len(local_suffix)]
        model_path = f'tools/asr/models/faster-whisper-{model_size}'
    else:
        model_path = model_size

    if language == 'auto':
        # Leaving the language unset lets the model output the most probable one.
        language = None

    # Lazy %-style arguments: passing extra positional values without
    # placeholders makes the logging module fail to format the record.
    logger.info("loading faster whisper model: %s %s", model_size, model_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model = WhisperModel(model_path, device=device, compute_type=precision)
    except Exception:
        logger.error(traceback.format_exc())
        return None

    output = []

    # Walk the input directory and all of its sub-directories.
    for root, _dirs, files in os.walk(input_folder):
        for file_name in sorted(files):
            # Only files with a .wav suffix are processed.
            if not file_name.endswith(".wav"):
                continue
            try:
                file_path = os.path.join(root, file_name)
                # The name of the containing directory is recorded as the original text.
                original_text = os.path.basename(root)
                segments, info = model.transcribe(
                    audio=file_path,
                    beam_size=5,
                    vad_filter=True,
                    vad_parameters=dict(min_silence_duration_ms=700),
                    language=language)
                text = ''

                if info.language == "zh":
                    logger.info("检测为中文文本, 转 FunASR 处理")
                    # Imported lazily so that non-Chinese runs never load the
                    # FunASR models.
                    from Ref_Audio_Selector.tool.asr.funasr_asr_multi_level_dir import only_asr
                    text = only_asr(file_path)

                if text == '':
                    # Not Chinese, or FunASR produced nothing: use Whisper's segments.
                    text = ''.join(segment.text for segment in segments)

                line = f"{file_path}|{original_text}|{info.language.upper()}|{text}"
                output.append(line)
                print(line)
            except Exception:
                # A failure on any file aborts the whole run, as before.
                logger.error(traceback.format_exc())
                return None

    os.makedirs(output_folder, exist_ok=True)
    output_file_path = os.path.abspath(f'{output_folder}/{params.asr_filename}.list')

    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
        logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
    return output_file_path
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the transcription.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-i", "--input_folder", type=str, required=True,
        help="Path to the folder containing WAV files.",
    )
    cli.add_argument(
        "-o", "--output_folder", type=str, required=True,
        help="Output folder to store transcriptions.",
    )
    cli.add_argument(
        "-s", "--model_size", type=str, default='large-v3',
        choices=check_fw_local_models(),
        help="Model Size of Faster Whisper",
    )
    cli.add_argument(
        "-l", "--language", type=str, default='ja',
        choices=language_code_list,
        help="Language of the audio files.",
    )
    cli.add_argument(
        "-p", "--precision", type=str, default='float16',
        choices=['float16', 'float32'],
        help="fp16 or fp32",
    )

    cmd = cli.parse_args()
    # The option names match the function's keyword parameters one to one.
    output_file_path = execute_asr_multi_level_dir(**vars(cmd))
|
||||||
94
Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py
Normal file
94
Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
# -*- coding:utf-8 -*-
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
from Ref_Audio_Selector.common.time_util import timeit_decorator
|
||||||
|
from tqdm import tqdm
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
# Use the model copies under tools/asr/models when they exist on disk;
# otherwise fall back to the corresponding "iic/..." model identifiers.
path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
if not os.path.exists(path_asr):
    path_asr = "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
if not os.path.exists(path_vad):
    path_vad = "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"

path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
if not os.path.exists(path_punc):
    path_punc = "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"

# Module-level FunASR model shared by only_asr() and the batch function below.
model = AutoModel(
    model=path_asr,
    model_revision="v2.0.4",
    vad_model=path_vad,
    vad_model_revision="v2.0.4",
    punc_model=path_punc,
    punc_model_revision="v2.0.4",
)
|
||||||
|
|
||||||
|
|
||||||
|
def only_asr(input_file):
    """Transcribe a single audio file with the module-level FunASR model.

    Args:
        input_file: Path of the audio file to transcribe.

    Returns:
        The recognised text, or an empty string when recognition fails (the
        traceback is logged).
    """
    try:
        return model.generate(input=input_file)[0]["text"]
    except Exception:
        # Best effort: callers treat '' as "no result". ``Exception`` rather
        # than a bare ``except`` so Ctrl-C / SystemExit still propagate.
        logger.error(traceback.format_exc())
        return ''
|
||||||
|
|
||||||
|
|
||||||
|
@timeit_decorator
def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language):
    """Transcribe every ``.wav`` file below ``input_folder`` with FunASR.

    For each audio file one line of the form
    ``<wav path>|<parent dir name>|<LANGUAGE>|<text>`` is collected; files that
    fail are logged and skipped. The lines are written to
    ``<output_folder>/<params.asr_filename>.list``.

    Args:
        input_folder: Root directory searched recursively for ``.wav`` files.
        output_folder: Directory for the resulting list file (created if missing).
        model_size: Accepted for signature compatibility; not used here.
        language: Language code; only its upper-cased form is written to the
            output lines.

    Returns:
        The absolute path of the written list file.
    """
    output = []

    # Walk the input directory and all of its sub-directories.
    for root, _dirs, files in os.walk(input_folder):
        for name in sorted(files):
            # Only files with a .wav suffix are processed.
            if not name.endswith(".wav"):
                continue
            try:
                # The name of the containing directory is recorded as the original text.
                original_text = os.path.basename(root)
                # Normalise first, in case the path mixes slash styles.
                input_file_path = os.path.normpath(os.path.join(root, name))
                asr_text = model.generate(input=input_file_path)[0]["text"]
                output.append(f"{input_file_path}|{original_text}|{language.upper()}|{asr_text}")
            except Exception:
                # Log and continue with the next file. ``Exception`` rather than
                # a bare ``except`` so Ctrl-C / SystemExit still propagate.
                logger.error(traceback.format_exc())

    # Create the output directory if needed.
    output_dir_abs = os.path.abspath(output_folder)
    os.makedirs(output_dir_abs, exist_ok=True)

    output_file_path = os.path.join(output_dir_abs, f'{params.asr_filename}.list')

    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
        logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")

    return output_file_path
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point for the FunASR batch transcription.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-i", "--input_folder", type=str, required=True,
        help="Path to the folder containing WAV files.",
    )
    cli.add_argument(
        "-o", "--output_folder", type=str, required=True,
        help="Output folder to store transcriptions.",
    )
    cli.add_argument(
        "-s", "--model_size", type=str, default='large',
        help="Model Size of FunASR is Large",
    )
    cli.add_argument(
        "-l", "--language", type=str, default='zh', choices=['zh'],
        help="Language of the audio files.",
    )
    # Parsed for CLI compatibility, but not wired into the run yet.
    cli.add_argument(
        "-p", "--precision", type=str, default='float16',
        choices=['float16', 'float32'],
        help="fp16 or fp32",
    )

    cmd = cli.parse_args()
    execute_asr_multi_level_dir(
        input_folder=cmd.input_folder,
        output_folder=cmd.output_folder,
        model_size=cmd.model_size,
        language=cmd.language,
    )
|
||||||
54
Ref_Audio_Selector/tool/audio_check.py
Normal file
54
Ref_Audio_Selector/tool/audio_check.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
def remove_matching_audio_files_in_text_dir(text_dir, emotions_list):
    """Delete ``.wav`` files whose stem is not a known emotion name.

    Args:
        text_dir: Directory that is walked recursively.
        emotions_list: Iterable of dicts; each dict's ``'emotion'`` value is a
            name to keep.

    Returns:
        The number of files that were actually deleted. Files that cannot be
        removed are logged and not counted.
    """
    # Built once so the per-file membership test is O(1).
    known_emotions = {item['emotion'] for item in emotions_list}
    count = 0
    for root, _dirs, files in os.walk(text_dir):
        for file in files:
            if not file.endswith(".wav"):
                continue
            emotion_tag = os.path.splitext(file)[0]
            if emotion_tag in known_emotions:
                continue
            file_path = os.path.join(root, file)
            logger.info(f"Deleting file: {file_path}")
            try:
                os.remove(file_path)
                count += 1
            except Exception as e:
                logger.error(f"Error deleting file {file_path}: {e}")

    return count
|
||||||
|
|
||||||
|
|
||||||
|
def delete_emotion_subdirectories(emotion_dir, emotions_list):
    """Delete direct sub-directories of ``emotion_dir`` not named after a known emotion.

    Args:
        emotion_dir: Directory whose immediate children are inspected.
        emotions_list: Iterable of dicts; each dict's ``'emotion'`` value is a
            directory name to keep.

    Returns:
        The number of sub-directories that were actually removed. Directories
        that cannot be removed are logged and not counted.
    """
    # Built once so the per-entry membership test is O(1).
    known_emotions = {item['emotion'] for item in emotions_list}
    count = 0

    for entry in os.listdir(emotion_dir):
        entry_path = os.path.join(emotion_dir, entry)
        if not os.path.isdir(entry_path) or entry in known_emotions:
            continue
        logger.info(f"Deleting directory: {entry_path}")
        try:
            # shutil.rmtree removes the sub-directory together with its contents.
            shutil.rmtree(entry_path)
            count += 1
        except Exception as e:
            logger.error(f"Error deleting directory {entry_path}: {e}")

    return count
|
||||||
|
|
||||||
|
|
||||||
|
def sync_ref_audio(ref_audio_dir, inference_audio_dir):
    """Remove inference audio that no longer matches the reference audio list.

    Args:
        ref_audio_dir: Directory handed to ``common.RefAudioListManager``.
        inference_audio_dir: Root directory holding the text-aggregated and
            emotion-aggregated inference results.

    Returns:
        A ``(deleted_wav_count, deleted_directory_count)`` tuple.
    """
    ref_list = common.RefAudioListManager(ref_audio_dir).get_ref_audio_list()

    text_dir = os.path.join(inference_audio_dir, params.inference_audio_text_aggregation_dir)
    emotion_dir = os.path.join(inference_audio_dir, params.inference_audio_emotion_aggregation_dir)

    deleted_wavs = remove_matching_audio_files_in_text_dir(text_dir, ref_list)
    deleted_dirs = delete_emotion_subdirectories(emotion_dir, ref_list)
    return deleted_wavs, deleted_dirs
|
||||||
31
Ref_Audio_Selector/tool/audio_config.py
Normal file
31
Ref_Audio_Selector/tool/audio_config.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import os
|
||||||
|
import platform
|
||||||
|
|
||||||
|
|
||||||
|
def generate_audio_config(work_space_dir, template_str, audio_list, output_file_path):
    """Render one template line per reference audio and write them to a file.

    In ``template_str`` the placeholders ``${emotion}``, ``${ref_path}`` and
    ``${ref_text}`` are replaced for every entry of ``audio_list``. The
    reference path is made relative to ``work_space_dir`` and, on Windows,
    converted to forward slashes. Rendered lines are separated by ``",\\n"``
    with no trailing separator.

    Args:
        work_space_dir: Base directory for the relative reference paths.
        template_str: Line template containing the placeholders.
        audio_list: Iterable of dicts with ``'emotion'``, ``'ref_path'`` and
            ``'ref_text'`` keys.
        output_file_path: File to write (UTF-8, overwritten).
    """
    on_windows = platform.system() == 'Windows'
    rendered_lines = []

    for audio_info in audio_list:
        emotion = audio_info['emotion']
        ref_text = audio_info['ref_text']

        relative_path = os.path.relpath(audio_info['ref_path'], work_space_dir)
        if on_windows:
            relative_path = relative_path.replace('\\', '/')

        line = template_str.replace('${emotion}', emotion)
        line = line.replace('${ref_path}', relative_path)
        line = line.replace('${ref_text}', ref_text)
        rendered_lines.append(line)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(",\n".join(rendered_lines))
|
||||||
238
Ref_Audio_Selector/tool/audio_inference.py
Normal file
238
Ref_Audio_Selector/tool/audio_inference.py
Normal file
@ -0,0 +1,238 @@
|
|||||||
|
import time
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import itertools
|
||||||
|
import multiprocessing
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
import numpy as np
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, quote
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger, p_logger
|
||||||
|
|
||||||
|
|
||||||
|
class SetModelURLComposer:
    """Compose the request used to switch GPT and/or SoVITS weights on an API.

    ``type`` selects which weights are addressed: ``'gpt'``, ``'sovits'`` or
    ``'all'`` (both).
    """

    def __init__(self, type, base_url, gpt_param_name, sovits_param_name):
        # ``type`` shadows the builtin, but it is part of the public signature.
        self.type = type
        self.base_url = base_url
        self.gpt_param_name = gpt_param_name
        self.sovits_param_name = sovits_param_name

    @staticmethod
    def _is_blank(value):
        """Return True when ``value`` is ``None`` or the empty string."""
        return value is None or value == ''

    def _model_params(self, value_array):
        """Map the configured parameter names to the values in ``value_array``.

        For ``'gpt'`` and ``'sovits'`` the first value is used; for ``'all'``
        the first value is the GPT weight and the second the SoVITS weight.
        Any other ``type`` yields an empty dict.
        """
        if self.type == 'gpt':
            return {self.gpt_param_name: value_array[0]}
        if self.type == 'sovits':
            return {self.sovits_param_name: value_array[0]}
        if self.type == 'all':
            return {
                self.gpt_param_name: value_array[0],
                self.sovits_param_name: value_array[1],
            }
        return {}

    def is_valid(self):
        """Validate the configuration.

        Returns ``None`` when everything needed for ``type`` is present.

        Raises:
            Exception: If the base URL is missing, or the parameter name
                required for the selected ``type`` is missing.
        """
        if self._is_blank(self.base_url):
            raise Exception("请求地址不能为空")
        if self.type in ('gpt', 'all') and self._is_blank(self.gpt_param_name):
            raise Exception("GPT参数名不能为空")
        if self.type in ('sovits', 'all') and self._is_blank(self.sovits_param_name):
            raise Exception("Sovits参数名不能为空")

    def build_get_url(self, value_array, need_url_encode=True):
        """Return the base URL with the model parameters appended as a query string."""
        return append_params_to_url(self.base_url, self._model_params(value_array), need_url_encode)

    def build_post_url(self, value_array, need_url_encode=True):
        """Return ``(url, body)``: the base URL plus the model parameters as a dict."""
        url = append_params_to_url(self.base_url, {}, need_url_encode)
        return url, self._model_params(value_array)
|
||||||
|
|
||||||
|
|
||||||
|
class TTSURLComposer:
    """Compose text-to-speech request URLs for an inference API.

    A request references either a named emotion or a reference audio (path
    plus its transcript); ``refer_type_param`` records which mode is used.
    """

    def __init__(self, base_url, refer_type_param, emotion_param_name, text_param_name, ref_path_param_name, ref_text_param_name):
        self.base_url = base_url
        # Either "角色情绪" (character emotion) or the reference-audio mode.
        self.refer_type_param = refer_type_param
        self.emotion_param_name = emotion_param_name
        self.text_param_name = text_param_name
        self.ref_path_param_name = ref_path_param_name
        self.ref_text_param_name = ref_text_param_name

    def is_valid(self):
        """Validate the configuration.

        Returns ``None`` when the configuration is usable.

        Raises:
            ValueError: If the URL or the text parameter name is missing, or
                if none of the emotion / reference parameter names is given.
        """
        if self.base_url is None or self.base_url == '':
            raise ValueError("请输入url")

        if self.text_param_name is None or self.text_param_name == '':
            raise ValueError("请输入text参数名")

        # Empty strings count as "not provided", consistent with the two
        # checks above; previously only ``None`` was detected here.
        reference_names = (
            self.emotion_param_name,
            self.ref_path_param_name,
            self.ref_text_param_name,
        )
        if all(name is None or name == '' for name in reference_names):
            raise ValueError("请输入至少一个参考or情绪的参数")

    def is_emotion(self):
        """Return True when requests are addressed by emotion name."""
        return self.refer_type_param == '角色情绪'

    def build_url_with_emotion(self, text_value, emotion_value, need_url_encode=True):
        """Return the request URL carrying the text and an emotion name."""
        params = {
            self.text_param_name: text_value,
            self.emotion_param_name: emotion_value,
        }
        return append_params_to_url(self.base_url, params, need_url_encode)

    def build_url_with_ref(self, text_value, ref_path_value, ref_text_value, need_url_encode=True):
        """Return the request URL carrying the text and a reference audio."""
        params = {
            self.text_param_name: text_value,
            self.ref_path_param_name: ref_path_value,
            self.ref_text_param_name: ref_text_value,
        }
        return append_params_to_url(self.base_url, params, need_url_encode)
|
||||||
|
|
||||||
|
|
||||||
|
def append_params_to_url(url_with_params, params, need_url_encode):
    """Append ``params`` to a URL as ``key=value`` pairs.

    The pairs are joined with ``&`` and attached with ``?`` or, when the URL
    already contains a ``?``, with ``&``. Keys and values are inserted as-is;
    when ``need_url_encode`` is true the resulting URL is passed through
    ``safe_encode_query_params``.
    """
    if params:
        query = '&'.join(f"{key}={value}" for key, value in params.items())
        separator = '&' if '?' in url_with_params else '?'
        url_with_params = url_with_params + separator + query

    if need_url_encode:
        return safe_encode_query_params(url_with_params)
    return url_with_params
|
||||||
|
|
||||||
|
|
||||||
|
def safe_encode_query_params(original_url):
    """Return ``original_url`` with its query string percent-encoded.

    The query is parsed and re-encoded exactly once. Previously each value was
    passed through ``quote`` and then through ``urlencode``, which encoded the
    ``%`` of the first pass again (``"a b"`` became ``a%2520b``). Parameters
    with blank values and repeated keys are now preserved rather than dropped.

    Args:
        original_url: URL whose query part may contain unencoded characters.

    Returns:
        The URL with the encoded query string; it is also logged.
    """
    parsed_url = urlparse(original_url)
    query_params = parse_qs(parsed_url.query, keep_blank_values=True)

    # quote_via=quote encodes spaces as %20, as the earlier quote() call intended.
    new_query_string = urlencode(query_params, doseq=True, quote_via=quote)

    encoded_url = urlunparse(parsed_url._replace(query=new_query_string))

    logger.info(encoded_url)
    return encoded_url
|
||||||
|
|
||||||
|
|
||||||
|
def generate_audio_files_parallel(url_composer, text_list, emotion_list, output_dir_path, num_processes=1):
    """Generate inference audio with several worker processes.

    ``emotion_list`` is split into ``num_processes`` groups; each group is
    handled by ``generate_audio_files_for_emotion_group`` in its own process.
    The call returns after every worker has finished, and an exception raised
    in a worker is re-raised here.
    """
    # Split the emotions into num_processes roughly equal groups.
    emotion_groups = np.array_split(emotion_list, num_processes)

    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        pending = []
        for group in emotion_groups:
            pending.append(
                executor.submit(
                    generate_audio_files_for_emotion_group,
                    url_composer, text_list, group, output_dir_path,
                )
            )
        # Wait for every worker to finish.
        for task in pending:
            task.result()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_audio_files_for_emotion_group(url_composer, text_list, emotion_list, output_dir_path):
    """Request audio for every (text, emotion) pair and store it twice.

    Each result is written to ``<text dir>/<text>/<emotion>.wav`` and to
    ``<emotion dir>/<emotion>/<text>.wav`` below ``output_dir_path``. Pairs for
    which both files already exist are skipped. Progress and a final timing
    summary are logged.

    Args:
        url_composer: Object providing ``is_emotion``, ``build_url_with_emotion``
            and ``build_url_with_ref``.
        text_list: Texts to synthesise; each text is also used as a
            directory/file name.
        emotion_list: Dicts with an ``'emotion'`` key and, in reference mode,
            ``'ref_path'`` and ``'ref_text'`` keys.
        output_dir_path: Root output directory (created if missing).
    """
    started_at = time.perf_counter()  # high-resolution start time

    # Make sure the output root and the two aggregation directories exist.
    output_root = os.path.abspath(output_dir_path)
    os.makedirs(output_root, exist_ok=True)
    by_text_root = os.path.join(output_root, params.inference_audio_text_aggregation_dir)
    os.makedirs(by_text_root, exist_ok=True)
    by_emotion_root = os.path.join(output_root, params.inference_audio_emotion_aggregation_dir)
    os.makedirs(by_emotion_root, exist_ok=True)

    all_count = len(text_list) * len(emotion_list)
    has_generated_count = 0
    all_text_count = sum(len(item) for item in text_list)

    # Cartesian product, texts in the outer position.
    for text, emotion in list(itertools.product(text_list, emotion_list)):
        emotion_name = emotion['emotion']

        by_text_dir = os.path.join(by_text_root, text)
        os.makedirs(by_text_dir, exist_ok=True)
        by_text_file = os.path.join(by_text_dir, emotion_name + '.wav')

        by_emotion_dir = os.path.join(by_emotion_root, emotion_name)
        os.makedirs(by_emotion_dir, exist_ok=True)
        by_emotion_file = os.path.join(by_emotion_dir, text + '.wav')

        # Only call the API when at least one of the two files is missing.
        already_done = os.path.exists(by_text_file) and os.path.exists(by_emotion_file)
        if not already_done:
            if url_composer.is_emotion():
                real_url = url_composer.build_url_with_emotion(text, emotion['emotion'], False)
            else:
                real_url = url_composer.build_url_with_ref(text, emotion['ref_path'], emotion['ref_text'], False)

            audio_bytes = inference_audio_from_api(real_url)

            # The same bytes go to both aggregation layouts.
            for target in (by_text_file, by_emotion_file):
                with open(target, 'wb') as f:
                    f.write(audio_bytes)

        has_generated_count += 1
        logger.info(f"进程ID: {os.getpid()}, 进度: {has_generated_count}/{all_count}")

    elapsed_time = time.perf_counter() - started_at
    # Summary line, written to both the performance log and the regular log.
    log_message = f"进程ID: {os.getpid()}, generate_audio_files_for_emotion_group 执行耗时: {elapsed_time:.6f} 秒;推理数量: {has_generated_count}; 字符总数:{all_text_count};每秒推理字符数:{all_text_count*len(emotion_list) / elapsed_time:.3f};"
    p_logger.info(log_message)
    logger.info(log_message)
|
||||||
|
|
||||||
|
|
||||||
|
def inference_audio_from_api(url):
    """Fetch synthesised audio from the inference API with a GET request.

    Args:
        url: Complete request URL.

    Returns:
        The response body as bytes when the server answers with status 200.

    Raises:
        Exception: For any other status code. The message contains the status
            code and the server's reply (decoded JSON when possible, otherwise
            the raw text).
    """
    logger.info(f'inference_audio_from_api url: {url}')
    # The body is always read in full, so streaming mode is not needed.
    response = requests.get(url)

    if response.status_code == 200:
        return response.content

    # An error body is not guaranteed to be JSON; fall back to the raw text so
    # a decode error cannot hide the real HTTP failure.
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    raise Exception(f"Failed to fetch audio from API. Server responded with status code {response.status_code}.message: {detail}")
|
||||||
|
|
||||||
|
|
||||||
|
def start_api_set_model(set_model_url_composer, gpt_models, sovits_models):
    """Switch both GPT and SoVITS weights with a single POST request.

    Returns:
        The response text on status 200, otherwise a failure message that
        contains the status code.
    """
    url, post_body = set_model_url_composer.build_post_url([gpt_models, sovits_models], True)
    logger.info(f'set_model_url_composer url: {set_model_url_composer}')
    logger.info(f'start_api_set_model url: {url}')
    logger.info(f'start_api_set_model post_body: {post_body}')

    response = requests.post(url, json=post_body)
    if response.status_code != 200:
        return f'请求失败,状态码:{response.status_code}'
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def start_api_v2_set_gpt_model(set_model_url_composer, gpt_models):
    """Switch the GPT weights with a GET request.

    Returns:
        The response text on status 200, otherwise a failure message that
        contains the status code.
    """
    url = set_model_url_composer.build_get_url([gpt_models], False)
    logger.info(f'start_api_v2_set_gpt_model url: {url}')

    response = requests.get(url)
    if response.status_code != 200:
        return f'请求失败,状态码:{response.status_code}'
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def start_api_v2_set_sovits_model(set_model_url_composer, sovits_models):
    """Switch the SoVITS weights with a GET request.

    Returns:
        The response text on status 200, otherwise a failure message that
        contains the status code.
    """
    url = set_model_url_composer.build_get_url([sovits_models], False)
    logger.info(f'start_api_v2_set_sovits_model url: {url}')

    response = requests.get(url)
    if response.status_code != 200:
        return f'请求失败,状态码:{response.status_code}'
    return response.text
|
||||||
162
Ref_Audio_Selector/tool/audio_sample.py
Normal file
162
Ref_Audio_Selector/tool/audio_sample.py
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import random
|
||||||
|
import librosa
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
def check_audio_duration(path, min_duration=3, max_duration=10):
    """Check whether an audio file's duration lies within the given bounds.

    Args:
        path: Audio file to inspect.
        min_duration: Lower bound in seconds (inclusive).
        max_duration: Upper bound in seconds (inclusive).

    Returns:
        ``True`` or ``False`` for an in-range / out-of-range duration, or
        ``None`` when the file cannot be opened or processed (logged).
    """
    try:
        # Duration of the audio file in seconds.
        duration = librosa.get_duration(filename=path)
    except Exception as e:
        logger.error(f"无法打开或处理音频文件:{e}")
        return None

    return bool(min_duration <= duration <= max_duration)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_from_list(list_file, output_dir):
    """Copy the audio files named in a ``.list`` file, renamed to their transcript.

    Every line must have four ``|``-separated fields; the first is the audio
    path and the last the transcript. Audio whose duration passes
    ``check_audio_duration`` is copied to ``<output_dir>/<transcript>.wav``.
    Malformed lines, existing targets, missing sources and files rejected by
    the duration check are logged and skipped.

    Args:
        list_file: Path of the UTF-8 ``.list`` file.
        output_dir: Destination directory (created if missing).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    with open(list_file, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('|')
            if len(parts) != 4:
                logger.error(f"Line format incorrect: {line}")
                continue

            audio_path, _, _, transcription = parts

            # The transcript becomes the new file name.
            new_path = os.path.join(output_dir, transcription.strip() + '.wav')

            # Never overwrite an existing target.
            if os.path.exists(new_path):
                logger.info(f"File already exists: {new_path}")
                continue

            try:
                if not os.path.exists(audio_path):
                    logger.info(f"Audio file does not exist: {audio_path}")
                    continue

                if check_audio_duration(audio_path):
                    # Copy into the output directory under the new name.
                    shutil.copy2(audio_path, new_path)
                    logger.info(f"File copied and renamed to: {new_path}")
                else:
                    logger.info(f"File skipped due to duration: {audio_path}")
            except Exception as e:
                logger.error(f"An error occurred while processing: {audio_path}")
                logger.error(e)

    logger.info("Processing complete.")
|
||||||
|
|
||||||
|
|
||||||
|
def sample(output_audio_dir, similarity_list, subsection_num, sample_num):
    """Randomly sample audio files from score-ranked sections.

    ``similarity_list`` is sorted in place by descending ``'score'`` and cut
    into ``subsection_num`` consecutive sections. From each section up to
    ``sample_num`` randomly chosen files are copied into
    ``<output_audio_dir>/emotion_<n>`` (``n`` starting at 1).

    Args:
        output_audio_dir: Directory receiving the ``emotion_<n>`` sub-directories.
        similarity_list: List of dicts with ``'score'`` and ``'wav_path'`` keys.
        subsection_num: Number of sections.
        sample_num: Maximum number of files taken per section.
    """
    # Highest similarity first.
    similarity_list.sort(key=lambda entry: entry['score'], reverse=True)

    # Section length, rounded up so that every item belongs to a section.
    total = len(similarity_list)
    step, remainder = divmod(total, subsection_num)
    if remainder:
        step += 1

    for section_no in range(1, subsection_num + 1):
        first = (section_no - 1) * step
        last = min(section_no * step, total)  # the final section may be shorter

        # Shuffle a copy of the section and keep the leading entries.
        section = similarity_list[first:last]
        random.shuffle(section)
        chosen = section[:min(sample_num, len(section))]

        target_dir = os.path.join(output_audio_dir, f'emotion_{section_no}')
        os.makedirs(target_dir, exist_ok=True)

        # Copy the sampled audio into the section's directory.
        for entry in chosen:
            source = entry['wav_path']
            shutil.copyfile(source, os.path.join(target_dir, os.path.basename(source)))

    logger.info("Sampling completed.")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_similarity_file(file_path):
    """Parse a similarity report into a list of score/path records.

    Each non-blank line has the form ``<score>|<wav path>``. Only the first
    ``|`` separates the fields, so a path may itself contain ``|``. Blank
    lines are ignored.

    Args:
        file_path (str): Path of the UTF-8 text file to parse.

    Returns:
        list[dict]: One ``{'score': float, 'wav_path': str}`` dict per line, in
        file order.

    Raises:
        ValueError: If a non-blank line has no ``|`` or its score is not a number.
    """
    result_list = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            score, filepath = line.split('|', 1)
            result_list.append({
                'score': float(score),
                'wav_path': filepath
            })

    return result_list
|
||||||
|
|
||||||
|
|
||||||
|
def copy_and_move(output_audio_directory, similarity_scores):
    """Copy scored audio files into one directory, prefixed with their score.

    Each file is copied as ``<score * 10000, zero-padded to 4 digits>-<stem>.wav``.

    Args:
        output_audio_directory: Destination directory (created if missing).
        similarity_scores: Iterable of dicts with ``'score'`` and ``'wav_path'`` keys.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_audio_directory, exist_ok=True)

    for item in similarity_scores:
        source_path = item['wav_path']
        # splitext drops the real extension instead of assuming four characters.
        base_name = os.path.splitext(os.path.basename(source_path))[0]
        new_name = f"{item['score'] * 10000:04.0f}-{base_name}.wav"

        shutil.copyfile(source_path, os.path.join(output_audio_directory, new_name))

    logger.info("已完成复制和重命名操作。")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual run against hard-coded, developer-local paths:
    # 10 sections, up to 4 samples per section.
    report_path = "D:/tt/similarity/啊,除了伊甸和樱,竟然还有其他人会提起我?.txt"
    similarity_list = parse_similarity_file(report_path)
    sample('D:/tt/similarity/output', similarity_list, 10, 4)
|
||||||
142
Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py
Normal file
142
Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import torchaudio
|
||||||
|
import torchaudio.transforms as T
|
||||||
|
import platform
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
import Ref_Audio_Selector.config_param.log_config as log_config
|
||||||
|
from Ref_Audio_Selector.common.time_util import timeit_decorator
|
||||||
|
from Ref_Audio_Selector.common.model_manager import speaker_verification_models as models
|
||||||
|
|
||||||
|
from modelscope.pipelines import pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def init_model(model_type='speech_campplus_sv_zh-cn_16k-common'):
    """Create the speaker-verification pipeline for ``model_type``.

    ``model_type`` is a key of the ``speaker_verification_models`` table, which
    supplies the task, model and model revision passed to ``pipeline``.
    """
    log_config.logger.info(f'人声识别模型类型:{model_type}')
    model_spec = models[model_type]
    return pipeline(
        task=model_spec['task'],
        model=model_spec['model'],
        model_revision=model_spec['model_revision'],
    )
|
||||||
|
|
||||||
|
|
||||||
|
@timeit_decorator
def compare_audio_and_generate_report(reference_audio_path, comparison_dir_path, output_file_path, model_type):
    """Score every ``.wav`` in a directory against a reference audio.

    The results are sorted by descending score and written to
    ``output_file_path`` as UTF-8 lines of the form ``<score>|<wav path>``.

    Args:
        reference_audio_path: Reference WAV file.
        comparison_dir_path: Directory whose ``.wav`` files are compared
            (not recursive).
        output_file_path: Report file; created or overwritten.
        model_type: Key selecting the speaker-verification model.
    """
    sv_pipeline = init_model(model_type)

    # Step 1: collect the audio files of the comparison directory.
    comparison_audio_paths = [
        os.path.join(comparison_dir_path, name)
        for name in os.listdir(comparison_dir_path)
        if name.endswith('.wav')
    ]

    # The model was trained on 16 kHz audio; resampling the reference once up
    # front avoids resampling it again for every comparison. Windows does not
    # support torchaudio.sox_effects.apply_effects_tensor, so the resampling in
    # the dependency was rewritten to use torchaudio.transforms.Resample. On
    # other systems, where the dependency's resampler is unchanged,
    # pre-sampling here would use a different method and make the similarity
    # scores inaccurate (the same applies on Windows if another resampling
    # method is in use), so it is only done on Windows and only when enabled.
    pre_sample = platform.system() == 'Windows' and params.enable_pre_sample == 'true'
    reference_audio_16k = ensure_16k_wav(reference_audio_path) if pre_sample else reference_audio_path

    # Step 2: compare the reference with each audio file.
    all_count = len(comparison_audio_paths)
    similarity_scores = []
    for has_processed_count, audio_path in enumerate(comparison_audio_paths, start=1):
        score = sv_pipeline([reference_audio_16k, audio_path])['score']
        similarity_scores.append({
            'score': score,
            'path': audio_path
        })
        log_config.logger.info(f'进度:{has_processed_count}/{all_count}')

    # Step 3: highest similarity first.
    similarity_scores.sort(key=lambda x: x['score'], reverse=True)

    # Step 4: write the report. Mode 'w' creates the file when it is missing,
    # so no separate creation step is needed.
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(f'{item["score"]}|{item["path"]}' for item in similarity_scores))
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_16k_wav(audio_file_path, target_sample_rate=16000):
    """
    Return the path of an audio file that is at ``target_sample_rate``.

    The file is loaded with ``torchaudio.load``. When its sample rate already
    equals the target, the input path is returned untouched. Otherwise the
    waveform is resampled with ``T.Resample`` and saved into
    ``params.temp_dir`` under the input file's base name.

    Args:
        audio_file_path (str): Path of the audio file to inspect.
        target_sample_rate (int, optional): Desired sample rate in Hz.
            Defaults to 16000.

    Returns:
        str: ``audio_file_path`` itself when no resampling was needed,
        otherwise the path of the resampled copy inside ``params.temp_dir``.
    """
    waveform, source_rate = torchaudio.load(audio_file_path)

    # Already at the requested rate: reuse the input file as-is.
    if source_rate == target_sample_rate:
        return audio_file_path

    converted = T.Resample(orig_freq=source_rate, new_freq=target_sample_rate)(waveform)

    # The resampled copy keeps the original base name inside the temp directory.
    os.makedirs(params.temp_dir, exist_ok=True)
    resampled_path = os.path.join(params.temp_dir, os.path.basename(audio_file_path))
    torchaudio.save(resampled_path, converted, target_sample_rate)

    return resampled_path
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """
    Parse the command-line options of the audio comparison script.

    All four options are required strings: ``-r/--reference_audio``,
    ``-c/--comparison_dir``, ``-o/--output_file`` and ``-m/--model_type``.

    Returns:
        argparse.Namespace: the parsed options, read from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Audio processing script arguments")

    # (short flag, long flag, help text) for every required string option.
    option_specs = (
        ("-r", "--reference_audio", "Path to the reference WAV file."),
        ("-c", "--comparison_dir", "Path to the directory containing comparison WAV files."),
        ("-o", "--output_file", "Path to the output file where results will be written."),
        ("-m", "--model_type", "Path to the model type."),
    )
    for short_flag, long_flag, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, type=str, required=True, help=help_text)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: parse the CLI options and forward them to the
    # comparison/report routine.
    cmd = parse_arguments()
    compare_audio_and_generate_report(
        reference_audio_path=cmd.reference_audio,
        comparison_dir_path=cmd.comparison_dir,
        output_file_path=cmd.output_file,
        model_type=cmd.model_type,
    )

    # Example of a direct call with hard-coded paths (kept for manual debugging):
    # compare_audio_and_generate_report(
    #     reference_audio_path="D:/tt/渡鸦/refer_audio_all/也对,你的身份和我们不同吗?.wav",
    #     comparison_dir_path='D:/tt/渡鸦/refer_audio_all',
    #     output_file_path='D:/tt/渡鸦/test.txt',
    # )
|
||||||
77
Ref_Audio_Selector/tool/text_check.py
Normal file
77
Ref_Audio_Selector/tool/text_check.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import os
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
import Ref_Audio_Selector.tool.audio_check as audio_check
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
def parse_text_similarity_result_txt(file_path):
    """
    Parse a text-similarity report whose lines look like
    ``average_similarity_score|count|emotion``.

    Lines that do not split into exactly three ``|``-separated fields are
    ignored. Lines whose score or count cannot be converted are reported
    through ``logger.error`` and skipped.

    :param file_path: path of the report file (read as UTF-8)
    :return: list of dicts with keys ``average_similarity_score`` (float),
             ``count`` (int) and ``emotion`` (str)
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as report:
        for raw_line in report:
            fields = raw_line.strip().split('|')
            if len(fields) != 3:
                continue

            score_text, count_text, emotion = fields
            try:
                records.append({
                    'average_similarity_score': float(score_text),
                    'count': int(count_text),
                    'emotion': emotion,
                })
            except ValueError as e:
                # Conversion failed: report the offending line and move on.
                logger.error(f"Error parsing line: {raw_line.strip()} - {e}")

    return records
|
||||||
|
|
||||||
|
|
||||||
|
def remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary):
    """
    Delete reference audio files whose emotion scored below the boundary.

    :param ref_audio_list: list of dicts carrying ``emotion`` and ``ref_path``
    :param report_list: list of dicts carrying ``average_similarity_score``
                        and ``emotion``
    :param audio_text_similarity_boundary: reports strictly below this score
                                           trigger deletion
    :return: number of files actually deleted
    """
    # Reports whose average similarity is strictly below the threshold.
    flagged = [entry for entry in report_list
               if entry['average_similarity_score'] < audio_text_similarity_boundary]

    removed = 0
    for entry in flagged:
        target_emotion = entry['emotion']
        candidates = [ref for ref in ref_audio_list if ref['emotion'] == target_emotion]

        for candidate in candidates:
            path = candidate['ref_path']

            if not os.path.exists(path):
                logger.error(f"File not found: {path}")
                continue

            # Best effort: a failed deletion is logged and does not stop the sweep.
            try:
                os.remove(path)
                removed += 1
                logger.info(f"Deleted file: {path}")
            except Exception as e:
                logger.error(f"Error deleting file {path}: {e}")

    return removed
|
||||||
|
|
||||||
|
|
||||||
|
def delete_ref_audio_below_boundary(ref_audio_path, text_similarity_result_path, sync_inference_audio_dir,
                                    audio_text_similarity_boundary):
    """
    Delete low-scoring reference audio, then re-sync the inference audio directory.

    The reference list comes from ``common.RefAudioListManager`` and the scores
    from the report at ``text_similarity_result_path``. After deletion,
    ``audio_check.sync_ref_audio`` is called with the reference and inference
    directories.

    :return: number of reference audio files that were deleted
    """
    manager = common.RefAudioListManager(ref_audio_path)
    deleted = remove_low_similarity_files(
        manager.get_ref_audio_list(),
        parse_text_similarity_result_txt(text_similarity_result_path),
        audio_text_similarity_boundary,
    )
    audio_check.sync_ref_audio(ref_audio_path, sync_inference_audio_dir)
    return deleted
|
||||||
0
Ref_Audio_Selector/tool/text_comparison/__init__.py
Normal file
0
Ref_Audio_Selector/tool/text_comparison/__init__.py
Normal file
161
Ref_Audio_Selector/tool/text_comparison/asr_text_process.py
Normal file
161
Ref_Audio_Selector/tool/text_comparison/asr_text_process.py
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
from operator import itemgetter
|
||||||
|
from Ref_Audio_Selector.common.time_util import timeit_decorator
|
||||||
|
import Ref_Audio_Selector.tool.text_comparison.text_comparison as text_comparison
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
def parse_asr_file(file_path):
    """
    Parse an ASR result list into a list of record dicts.

    Each non-blank line must have exactly four ``|``-separated fields:
    ``input_file_path|original_text|language|asr_text``. The record's
    ``emotion`` is obtained from ``common.get_filename_without_extension``
    applied to the input file path, and ``similarity_score`` starts at 0.

    Blank lines (for instance a trailing newline at the end of the file) are
    skipped instead of crashing the tuple unpacking.

    :param file_path: path of the ASR list file (read as UTF-8)
    :return: list of dicts with keys ``emotion``, ``input_file_path``,
             ``original_text``, ``language``, ``asr_text``, ``similarity_score``
    :raises ValueError: if a non-blank line does not have exactly four fields
    """
    output = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                # A blank line carries no record.
                continue

            fields = stripped.split('|')
            if len(fields) != 4:
                # Same exception type as the bare unpacking raised, but the
                # message now says where the malformed row is.
                raise ValueError(
                    f"{file_path}:{line_number}: expected 4 '|'-separated fields, got {len(fields)}"
                )
            input_file_path, original_text, language, asr_text = fields

            emotion = common.get_filename_without_extension(input_file_path)

            output.append({
                'emotion': emotion,
                'input_file_path': input_file_path,
                'original_text': original_text,
                'language': language,
                'asr_text': asr_text,
                'similarity_score': 0
            })

    return output
|
||||||
|
|
||||||
|
|
||||||
|
@timeit_decorator
def calculate_similarity_and_append_to_list(input_list, boundary):
    """
    Score every record's ASR text against its original text, in place.

    For each dict, ``text_comparison.calculate_result`` is called with the
    record's ``original_text``, ``asr_text`` and ``boundary``. The first value
    of the returned pair is stored under ``original_score`` and the second
    under ``similarity_score``. Progress is logged after every record.

    :param input_list: list of record dicts (mutated in place)
    :param boundary: boundary forwarded to ``calculate_result``
    :return: the same ``input_list`` object
    """
    total = len(input_list)
    for done, record in enumerate(input_list, start=1):
        raw_score, amplified_score = text_comparison.calculate_result(
            record['original_text'], record['asr_text'], boundary)
        record['similarity_score'] = amplified_score
        record['original_score'] = raw_score
        logger.info(f'进度：{done}/{total}')

    return input_list
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_average_similarity_by_emotion(data_list):
    """
    Average the ``similarity_score`` of the records for each ``emotion``.

    :param data_list: list of dicts carrying ``emotion`` and ``similarity_score``
    :return: list of dicts with keys ``emotion``, ``average_similarity_score``
             and ``count``, sorted by average score in descending order
             (ties keep first-seen order)
    """
    scores_by_emotion = defaultdict(list)
    for record in data_list:
        scores_by_emotion[record['emotion']].append(record['similarity_score'])

    summary = []
    for emotion, scores in scores_by_emotion.items():
        summary.append({
            'emotion': emotion,
            'average_similarity_score': sum(scores) / len(scores),
            'count': len(scores),
        })

    return sorted(summary, key=lambda entry: entry['average_similarity_score'], reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def group_and_sort_by_field(data, group_by_field):
    """
    Group records by one field and sort each group by similarity.

    :param data: list of dicts, each with ``group_by_field`` and
                 ``similarity_score``
    :param group_by_field: key whose value decides the group of a record
    :return: list of ``(group_value, records)`` tuples in first-seen group
             order, with each group's records sorted by ``similarity_score``
             in descending order
    """
    buckets = defaultdict(list)
    for record in data:
        buckets[record[group_by_field]].append(record)

    # Highest similarity first inside every group.
    for members in buckets.values():
        members.sort(key=itemgetter('similarity_score'), reverse=True)

    return list(buckets.items())
|
||||||
|
|
||||||
|
|
||||||
|
def format_list_to_text(data_list, output_filename):
    """
    Write grouped records to a UTF-8 report whose rows end with the original text.

    The file starts with a fixed header line. Each group then contributes its
    key on a line of its own, followed by one
    ``similarity_score|original_score|asr_text|original_text`` row per record.

    :param data_list: iterable of ``(key, records)`` pairs
    :param output_filename: path of the file to (over)write
    """
    with open(output_filename, 'w', encoding='utf-8') as report:
        report.write('放大后的相似度分值|原始分值|ASR文本|原文文本\n')
        for group_key, records in data_list:
            # Group title line.
            report.write(group_key + '\n')
            # One row per record of the group.
            report.writelines(
                f"{rec['similarity_score']}|{rec['original_score']}|{rec['asr_text']}|{rec['original_text']}\n"
                for rec in records
            )
|
||||||
|
|
||||||
|
|
||||||
|
def format_list_to_emotion(data_list, output_filename):
    """
    Write grouped records to a UTF-8 report whose rows end with the emotion.

    The file starts with a fixed header line. Each group then contributes its
    key on a line of its own, followed by one
    ``similarity_score|original_score|asr_text|emotion`` row per record.

    :param data_list: iterable of ``(key, records)`` pairs
    :param output_filename: path of the file to (over)write
    """
    with open(output_filename, 'w', encoding='utf-8') as report:
        report.write('放大后的相似度分值|原始分值|ASR文本|情绪类型\n')
        for group_key, records in data_list:
            # Group title line (the key the records were grouped by).
            report.write(group_key + '\n')
            # One row per record of the group.
            report.writelines(
                f"{rec['similarity_score']}|{rec['original_score']}|{rec['asr_text']}|{rec['emotion']}\n"
                for rec in records
            )
|
||||||
|
|
||||||
|
|
||||||
|
@timeit_decorator
def process(asr_file_path, output_dir, similarity_enlarge_boundary):
    """
    Run the text-similarity analysis for one ASR list and write three reports.

    Steps:
      1. Parse the ASR list and score every record.
      2. Write the per-emotion average report
         (``average_similarity_score|count|emotion`` per line).
      3. Write the detail report grouped by emotion.
      4. Write the detail report grouped by original text.

    Report file names come from ``params`` and get a ``.txt`` suffix.

    :param asr_file_path: path of the ASR list file to analyse
    :param output_dir: directory receiving the reports (created if missing)
    :param similarity_enlarge_boundary: boundary forwarded to the scoring step
    """
    # exist_ok=True replaces the check-then-create sequence, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    records = parse_asr_file(asr_file_path)
    calculate_similarity_and_append_to_list(records, similarity_enlarge_boundary)
    average_similarity_list = calculate_average_similarity_by_emotion(records)

    average_similarity_file = os.path.join(output_dir,
                                           f'{params.text_emotion_average_similarity_report_filename}.txt')
    average_similarity_content = \
        '\n'.join([f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}" for item in average_similarity_list])
    common.write_text_to_file(average_similarity_content, average_similarity_file)

    # Grouped by emotion: each row ends with the original text.
    emotion_detail_list = group_and_sort_by_field(records, 'emotion')
    emotion_detail_file = os.path.join(output_dir, f'{params.text_similarity_by_emotion_detail_filename}.txt')
    format_list_to_text(emotion_detail_list, emotion_detail_file)

    # Grouped by original text: each row ends with the emotion.
    original_text_detail_list = group_and_sort_by_field(records, 'original_text')
    original_text_detail_file = os.path.join(output_dir, f'{params.text_similarity_by_text_detail_filename}.txt')
    format_list_to_emotion(original_text_detail_list, original_text_detail_file)

    logger.info('文本相似度分析完成。')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """
    Parse the command-line options of the ASR text-similarity script.

    Required options: ``-a/--asr_file_path`` (str), ``-o/--output_dir`` (str)
    and ``-b/--similarity_enlarge_boundary`` (float).

    Returns:
        argparse.Namespace: the parsed options, read from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Process ASR files and analyze similarity.")

    # (short flag, long flag, value type, help text) for every required option.
    option_specs = (
        ("-a", "--asr_file_path", str,
         "Path to the directory containing ASR files or path to a single ASR file."),
        ("-o", "--output_dir", str,
         "Path to the directory where the analysis results should be saved."),
        ("-b", "--similarity_enlarge_boundary", float,
         "Similarity score boundary value to be used in your calculations."),
    )
    for short_flag, long_flag, value_type, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, type=value_type, required=True, help=help_text)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: parse the CLI options and run the analysis.
    cmd = parse_arguments()
    # print(cmd)
    process(cmd.asr_file_path, cmd.output_dir, cmd.similarity_enlarge_boundary)
|
||||||
128
Ref_Audio_Selector/tool/text_comparison/text_comparison.py
Normal file
128
Ref_Audio_Selector/tool/text_comparison/text_comparison.py
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
import os
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
from scipy.spatial.distance import cosine
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
# Location of the pretrained model; the "bert_path" environment variable
# overrides the bundled default path.
bert_path = os.environ.get(
    "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
)

# Set device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

logger.info(f'使用计算设备: {device}')

# NOTE: tokenizer and model are loaded at import time, so importing this
# module already requires the checkpoint at ``bert_path`` to be available.
tokenizer = AutoTokenizer.from_pretrained(bert_path)
model = AutoModel.from_pretrained(bert_path).to(device)
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_similarity(text1, text2, max_length=512):
    """
    Return the cosine similarity between the embeddings of two texts.

    Each text is tokenized separately (truncated to ``max_length`` tokens) and
    passed through the module-level ``model`` without gradient tracking. The
    vector at sequence position 0 of the model's first output is used as the
    sentence embedding (the original comment identifies it as the CLS token).

    :param text1: first text
    :param text2: second text
    :param max_length: maximum token length passed to the tokenizer
    :return: ``1 - cosine_distance`` of the two flattened embedding vectors
    """
    # Tokenize both inputs first, then embed them.
    encoded_inputs = [
        tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device)
        for text in (text1, text2)
    ]

    with torch.no_grad():
        embeddings = [model(**encoded)[0][:, 0, :].flatten() for encoded in encoded_inputs]

    # scipy expects 1-D numpy arrays, so move the tensors to the CPU first.
    vector_a, vector_b = (emb.cpu().numpy().flatten() for emb in embeddings)
    return 1 - cosine(vector_a, vector_b)
|
||||||
|
|
||||||
|
|
||||||
|
# 对boundary到1区间的值进行放大
|
||||||
|
def adjusted_similarity(similarity_score2, boundary=0.8):
    """
    Stretch scores from the interval ``[boundary, 1]`` onto ``[0, 1]``.

    Scores below ``boundary`` map to 0. Scores at or above it are rescaled
    linearly, so ``boundary`` maps to 0 and 1 maps to 1.

    :param similarity_score2: raw similarity score
    :param boundary: lower end of the interval to amplify (default 0.8)
    :return: 0 for scores below the boundary, otherwise the rescaled score
    """
    if similarity_score2 < boundary:
        return 0

    interval = 1 - boundary
    if interval == 0:
        # boundary == 1 leaves no interval to stretch and the original formula
        # divided by zero. A score that reaches the boundary here is at the
        # top of the range, so it maps to 1.
        return 1.0

    # Scale factor that maps the interval width onto 1.
    multiple = 1 / interval

    return (similarity_score2 - boundary) * multiple
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_result(t1, t2, boundary):
    """
    Compute the raw and the boundary-adjusted similarity of two texts.

    :param t1: first text
    :param t2: second text
    :param boundary: boundary passed to ``adjusted_similarity``
    :return: tuple ``(raw_similarity, adjusted_similarity)``
    """
    raw_score = calculate_similarity(t1, t2)
    return raw_score, adjusted_similarity(raw_score, boundary)
|
||||||
|
|
||||||
|
|
||||||
|
def print_result(t1, t2, boundary):
    """
    Print ``t2`` together with its raw and adjusted similarity to ``t1``.

    :param t1: reference text
    :param t2: text compared against the reference
    :param boundary: boundary passed to ``adjusted_similarity``
    """
    print(f't2: {t2}')

    # Raw similarity.
    raw_score = calculate_similarity(t1, t2)
    print(f"两句话的相似度为: {raw_score:.4f}")

    # Similarity after amplification above the boundary.
    amplified_score = adjusted_similarity(raw_score, boundary)
    print(f"调整后的相似度为: {amplified_score:.4f}")
|
||||||
|
|
||||||
|
|
||||||
|
def test(boundary):
    """
    Manual smoke test: compare a fixed reference sentence against a list of
    sample sentences and print the raw and adjusted similarity of each.

    :param boundary: boundary passed on to ``print_result``
    """
    # Reference sentence.
    reference = "这是第一个句子"

    # One candidate per line: near-duplicates, paraphrases, unrelated
    # sentences, random ASCII strings and a digit string.
    candidates_block = """
这是第一个句子
这是第二个句子。
那么，这是第三个表达。
当前呈现的是第四个句子。
接下来，我们有第五句话。
在此，展示第六条陈述。
继续下去，这是第七个短句。
不容忽视的是第八个表述。
顺延着序列，这是第九句。
此处列举的是第十个说法。
进入新的篇章，这是第十一个句子。
下一段内容即为第十二个句子。
显而易见，这是第十三个叙述。
渐进地，我们来到第十四句话。
向下滚动，您会看到第十五个表达。
此刻，呈现在眼前的是第十六个句子。
它们中的一个——第十七个句子在此。
如同链条般连接，这是第十八个断言。
按照顺序排列，接下来是第十九个话语。
逐一列举，这是第二十个陈述句。
结构相似，本例给出第二十一个实例句。
这是最初的陈述句。
首先表达的是这一个句子。
第一句内容即为此处所示。
这是起始的叙述段落。
开篇所展示的第一句话就是这个。
明媚的阳光洒满大地
窗外飘落粉色樱花瓣
笔尖轻触纸面思绪万千
深夜的月光如水般静谧
穿越丛林的小径蜿蜒曲折
浅酌清茶品味人生百态
破晓时分雄鸡一唱天下白
草原上奔驰的骏马无拘无束
秋叶纷飞描绘季节更替画卷
寒冬雪夜炉火旁围坐共话家常
kszdRjYXw
pfsMgTlVHnB
uQaGxIbWz
ZtqNhPmKcOe
jfyrXsStVUo
wDiEgLkZbn
yhNvAfUmqC
TpKjxMrWgs
eBzHUaFJtYd
oQnXcVSiPkL
00000
"""
    for candidate in candidates_block.strip().split('\n'):
        print_result(reference, candidate, boundary)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run the manual smoke test with an amplification boundary of 0.9.
    test(0.9)
|
||||||
0
Ref_Audio_Selector/ui_init/__init__.py
Normal file
0
Ref_Audio_Selector/ui_init/__init__.py
Normal file
197
Ref_Audio_Selector/ui_init/init_ui_param.py
Normal file
197
Ref_Audio_Selector/ui_init/init_ui_param.py
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
import os
|
||||||
|
import multiprocessing
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
import Ref_Audio_Selector.tool.audio_inference as audio_inference
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
|
||||||
|
# Accessor used by the init_* functions below to read persisted UI parameters.
rw_param = params.config_manager.get_rw_param()
# ------------------- Basic information ---------------------------

# Working directory of the current role
base_dir_default = None
# Workspace directory
text_work_space_dir_default = None
# Role name
text_role_default = None
# Directory holding the reference audio
text_refer_audio_file_dir_default = None
# Directory holding the inference audio
text_inference_audio_file_dir_default = None

# ------------------- Step 1 ------------------------------

# Directory of the sampled reference audio
text_sample_dir_default = None
# Number of subsections
slider_subsection_num_default = None
# Number of random samples per subsection
slider_sample_num_default = None

# ------------------- Step 2 ------------------------------

# URL of the API endpoint that switches models
text_api_set_model_base_url_default = None
# Name of the GPT model parameter
text_api_gpt_param_default = None
# Name of the SoVITS model parameter
text_api_sovits_param_default = None
# URL of the API (v2) endpoint that switches the GPT model
text_api_v2_set_gpt_model_base_url_default = None
# Name of the GPT model parameter (v2)
text_api_v2_gpt_model_param_default = None
# URL of the API (v2) endpoint that switches the SoVITS model
text_api_v2_set_sovits_model_base_url_default = None
# Name of the SoVITS model parameter (v2)
text_api_v2_sovits_model_param_default = None
# Inference service URL with its fixed parameters
text_url_default = None
# Complete inference service URL
text_whole_url_default = None
# Name of the text parameter
text_text_default = None
# Kind of reference parameter
dropdown_refer_type_param_default = None
# Name of the reference-audio path parameter
text_ref_path_default = None
# Name of the reference-audio text parameter
text_ref_text_default = None
# Name of the role emotion parameter
text_emotion_default = None
# Path of the text to run inference on
text_test_content_default = None
# Number of concurrent requests
slider_request_concurrency_num_default = 3
# Maximum number of concurrent requests
slider_request_concurrency_max_num = None

# ------------------- Step 3 ------------------------------

# Directory of the audio to run ASR on
text_asr_audio_dir_default = None
# Path of the file to analyse
text_text_similarity_analysis_path_default = None
# Amplification boundary of the text similarity
slider_text_similarity_amplification_boundary_default = 0.90
# Path of the text-similarity analysis result file
text_text_similarity_result_path_default = None

# ------------------- Step 4 ------------------------------
# ------------------- Step 5 ------------------------------
# Template content
text_template_default = None
|
||||||
|
|
||||||
|
|
||||||
|
def empty_default(vale, default_value):
    """
    Return ``default_value`` when ``vale`` is ``None`` or the empty string.

    Any other value, including falsy ones such as ``0``, is returned unchanged.

    :param vale: value to check
    :param default_value: fallback used for ``None`` / ``""``
    :return: ``vale`` or ``default_value``
    """
    is_missing = vale is None or vale == ""
    return default_value if is_missing else vale
|
||||||
|
|
||||||
|
|
||||||
|
def init_base():
    """
    Initialise the module-level defaults of the "basic information" section.

    Reads the workspace directory and role name from ``rw_param``, joins them
    into ``base_dir_default`` and derives the reference and inference audio
    directories beneath it.
    """
    global text_work_space_dir_default, text_role_default, base_dir_default, text_refer_audio_file_dir_default, text_inference_audio_file_dir_default

    def _checked_subdir(sub_dir):
        # NOTE(review): check_path_existence_and_return presumably yields the
        # path only when it exists; confirm in the common module.
        return common.check_path_existence_and_return(os.path.join(base_dir_default, sub_dir))

    text_work_space_dir_default = rw_param.read(rw_param.work_dir)
    text_role_default = rw_param.read(rw_param.role)
    base_dir_default = os.path.join(text_work_space_dir_default, text_role_default)

    text_refer_audio_file_dir_default = _checked_subdir(params.reference_audio_dir)
    text_inference_audio_file_dir_default = _checked_subdir(params.inference_audio_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def init_first():
    """
    Initialise the module-level defaults of step 1 (reference audio sampling).

    Sets the sample directory under ``base_dir_default`` and reads the stored
    subsection count (fallback 10) and per-subsection sample count
    (fallback 4) from ``rw_param``.
    """
    global text_sample_dir_default, slider_subsection_num_default, slider_sample_num_default

    text_sample_dir_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.list_to_convert_reference_audio_dir))

    slider_subsection_num_default = int(empty_default(rw_param.read(rw_param.subsection_num), 10))

    # Converted to int like the subsection count above, so a stored value
    # does not reach the slider with a different type than the fallback.
    slider_sample_num_default = int(empty_default(rw_param.read(rw_param.sample_num), 4))
|
||||||
|
|
||||||
|
|
||||||
|
def init_second():
    """
    Initialise the module-level defaults of step 2 (inference requests).

    Every value is read from ``rw_param`` and falls back to a built-in default
    when the stored value is ``None`` or empty. The complete request URL is
    then composed from those values, and the request concurrency is capped at
    the CPU count.
    """
    global text_api_set_model_base_url_default, text_api_gpt_param_default, text_api_sovits_param_default, text_api_v2_set_gpt_model_base_url_default, text_api_v2_gpt_model_param_default
    global text_api_v2_set_sovits_model_base_url_default, text_api_v2_sovits_model_param_default, text_url_default, text_whole_url_default, text_text_default, dropdown_refer_type_param_default, text_ref_path_default
    global text_ref_text_default, text_emotion_default, text_test_content_default, slider_request_concurrency_num_default, slider_request_concurrency_max_num

    def _stored(key, fallback):
        # Persisted value for ``key``, or ``fallback`` when it is None / "".
        return empty_default(rw_param.read(key), fallback)

    # Model-switching endpoint and its parameter names.
    text_api_set_model_base_url_default = _stored(rw_param.api_set_model_base_url,
                                                  'http://localhost:9880/set_model')
    text_api_gpt_param_default = _stored(rw_param.api_gpt_param, 'gpt_model_path')
    text_api_sovits_param_default = _stored(rw_param.api_sovits_param, 'sovits_model_path')

    # v2 endpoints: separate URLs for the GPT and SoVITS weights.
    text_api_v2_set_gpt_model_base_url_default = _stored(rw_param.api_v2_set_gpt_model_base_url,
                                                         'http://localhost:9880/set_gpt_weights')
    text_api_v2_gpt_model_param_default = _stored(rw_param.api_v2_gpt_model_param, 'weights_path')

    text_api_v2_set_sovits_model_base_url_default = _stored(rw_param.api_v2_set_sovits_model_base_url,
                                                            'http://localhost:9880/set_sovits_weights')
    text_api_v2_sovits_model_param_default = _stored(rw_param.api_v2_sovits_model_param, 'weights_path')

    # Inference request URL and the names of its parameters.
    text_url_default = _stored(rw_param.text_url,
                               'http://localhost:9880?prompt_language=中文&text_language=中文&cut_punc=,.;?!、，。？！；：…')
    text_text_default = _stored(rw_param.text_param, 'text')
    dropdown_refer_type_param_default = _stored(rw_param.refer_type_param, '参考音频')

    text_ref_path_default = _stored(rw_param.ref_path_param, 'refer_wav_path')
    text_ref_text_default = _stored(rw_param.ref_text_param, 'prompt_text')
    text_emotion_default = _stored(rw_param.emotion_param, 'emotion')

    # Preview of the complete URL, built from the values resolved above.
    text_whole_url_default = whole_url(text_url_default, dropdown_refer_type_param_default, text_text_default,
                                       text_ref_path_default, text_ref_text_default, text_emotion_default)

    text_test_content_default = _stored(rw_param.test_content_path, params.default_test_text_path)

    # The concurrency default never exceeds the number of CPUs.
    slider_request_concurrency_max_num = multiprocessing.cpu_count()
    slider_request_concurrency_num_default = _stored(rw_param.request_concurrency_num, 3)
    slider_request_concurrency_num_default = min(int(slider_request_concurrency_num_default),
                                                 slider_request_concurrency_max_num)
|
||||||
|
|
||||||
|
|
||||||
|
# 基于请求路径和参数,合成完整的请求路径
|
||||||
|
def whole_url(text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, text_emotion):
    """
    Compose a preview of the complete request URL from its parts.

    A ``TTSURLComposer`` is built from the arguments. Depending on
    ``is_emotion()``, the URL is built either with an emotion placeholder or
    with reference path/text placeholders.

    :return: the composed URL string containing placeholder values
    """
    # NOTE: the composer takes the emotion parameter before the text parameter.
    composer = audio_inference.TTSURLComposer(text_url, dropdown_refer_type_param, text_emotion, text_text,
                                              text_ref_path, text_ref_text)

    if not composer.is_emotion():
        return composer.build_url_with_ref('测试内容', '参考路径', '参考文本', False)
    return composer.build_url_with_emotion('测试内容', '情绪类型', False)
|
||||||
|
|
||||||
|
|
||||||
|
def init_third():
    """
    Initialise the module-level defaults of step 3 (ASR and text similarity).

    Paths are derived from ``base_dir_default`` and names in ``params``. The
    amplification boundary is read from ``rw_param`` with a fallback of 0.90.
    """
    global text_asr_audio_dir_default, text_text_similarity_analysis_path_default, slider_text_similarity_amplification_boundary_default, text_text_similarity_result_path_default

    def _checked(*parts):
        # NOTE(review): check_path_existence_and_return presumably yields the
        # path only when it exists; confirm in the common module.
        return common.check_path_existence_and_return(os.path.join(base_dir_default, *parts))

    text_asr_audio_dir_default = _checked(params.inference_audio_dir,
                                          params.inference_audio_text_aggregation_dir)
    text_text_similarity_analysis_path_default = _checked(params.asr_filename + '.list')
    slider_text_similarity_amplification_boundary_default = empty_default(
        rw_param.read(rw_param.text_similarity_amplification_boundary), 0.90)
    text_text_similarity_result_path_default = _checked(
        params.text_emotion_average_similarity_report_filename + '.txt')
|
||||||
|
|
||||||
|
|
||||||
|
def init_fourth():
    """Placeholder for step 4: there are no defaults to initialise."""
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def init_fifth():
    """
    Initialise the module-level default of step 5 (template content).

    Uses the template stored in ``rw_param`` when there is one. The default
    template file at ``params.default_template_path`` is read only when the
    stored value is ``None`` or empty, so a stored template no longer causes
    an unnecessary file read.
    """
    global text_template_default

    default_template_path = params.default_template_path
    stored_template = rw_param.read(rw_param.text_template)

    if stored_template is None or stored_template == "":
        # Nothing stored: fall back to the bundled default template.
        text_template_default = common.read_file(default_template_path)
    else:
        text_template_default = stored_template
|
||||||
|
|
||||||
|
|
||||||
|
def init_all():
    """
    Run every section initialiser in order.

    ``init_base`` goes first because the later steps build their paths on
    ``base_dir_default``.
    """
    for initialise in (init_base, init_first, init_second, init_third, init_fourth, init_fifth):
        initialise()
|
||||||
BIN
Ref_Audio_Selector/参考音频筛选流程.png
Normal file
BIN
Ref_Audio_Selector/参考音频筛选流程.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 95 KiB |
@ -594,11 +594,11 @@
|
|||||||
- 内容: 修复实验名结尾出现空格在win中路径不正确的问题
|
- 内容: 修复实验名结尾出现空格在win中路径不正确的问题
|
||||||
- 类型: 修复
|
- 类型: 修复
|
||||||
- 提交: RVC-Boss
|
- 提交: RVC-Boss
|
||||||
- 2025.06.10 [Commit#746cb536](https://github.com/RVC-Boss/GPT-SoVITS/commit/746cb536c68b1fe6ce3ca7e882235375b8a8dd89)
|
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
|
||||||
- 内容: 语种分割优化
|
- 内容: 语种分割优化
|
||||||
- 类型: 优化
|
- 类型: 优化
|
||||||
- 提交: KamioRinn
|
- 提交: KamioRinn
|
||||||
- 2025.06.11 [Commit#dd2b9253](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2b9253aabb09db32db7a3344570ed9df043351)
|
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
|
||||||
- 内容: 修复并行推理对v2pro支持bug
|
- 内容: 修复并行推理对v2pro支持bug
|
||||||
- 类型: 修复
|
- 类型: 修复
|
||||||
- 提交: YYuX-1145
|
- 提交: YYuX-1145
|
||||||
@ -606,21 +606,132 @@
|
|||||||
- 内容: v2pro对ge提取时会出现数值溢出的问题修复
|
- 内容: v2pro对ge提取时会出现数值溢出的问题修复
|
||||||
- 类型: 修复
|
- 类型: 修复
|
||||||
- 提交: RVC-Boss
|
- 提交: RVC-Boss
|
||||||
- 2025.06.11 [Commit#37f5abfc](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fdc67ca83418306f11e90b9139278313ac5c3e9)[Commit#6fdc67ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/37f5abfcb4a6553652235909db2e124b6f8ff3a5)
|
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
|
||||||
- 内容: install.sh逻辑优化
|
- 内容: install.sh逻辑优化
|
||||||
- 类型: 优化
|
- 类型: 优化
|
||||||
- 提交: XXXXRT666
|
- 提交: XXXXRT666
|
||||||
- 2025.06.27 [Commit#90ebefa7](https://github.com/RVC-Boss/GPT-SoVITS/commit/90ebefa78fd544da36eebe0b2003620879c921b0)
|
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
|
||||||
- 内容: onnxruntime加载逻辑优化(对gpu/cpu的判断)
|
- 内容: onnxruntime加载逻辑优化(对gpu/cpu的判断)
|
||||||
- 类型: 优化
|
- 类型: 优化
|
||||||
- 提交: KamioRinn
|
- 提交: KamioRinn
|
||||||
- 2025.06.27 [Commit#6df61f58](https://github.com/RVC-Boss/GPT-SoVITS/commit/6df61f58e4d18d4c2ad9d1eddd6a1bd690034c23)
|
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
|
||||||
- 内容: 语言分割及格式化优化
|
- 内容: 语言分割及格式化优化
|
||||||
- 类型: 优化
|
- 类型: 优化
|
||||||
- 提交: KamioRinn
|
- 提交: KamioRinn
|
||||||
|
|
||||||
|
## 202507
|
||||||
|
|
||||||
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
|
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
|
||||||
- 内容: 提升推理进程优先级(修复win11下可能GPU利用率受限的问题)
|
- 内容: 提升推理进程优先级(修复win11下可能GPU利用率受限的问题)
|
||||||
- 类型: 修复
|
- 类型: 优化
|
||||||
- 提交: XianYue0125
|
- 提交: XianYue0125
|
||||||
|
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
|
||||||
|
- 内容: 解决 TTS.py 无法识别真正支持版本 v2Pro、v2ProPlus 的问题, 同时更新一版默认配置。
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: jiangsier-xyz
|
||||||
|
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
|
||||||
|
- 内容: 修复并行推理模式下v2pro模型识别问题
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: RVC-Boss
|
||||||
|
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
|
||||||
|
- 内容: whisper asr支持性价比更高的distill模型
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: XXXXRT666
|
||||||
|
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
|
||||||
|
- 内容: 优化TTS_Config的代码逻辑
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: ChasonJiang
|
||||||
|
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
|
||||||
|
- 内容: 修复gpt的loss计算问题
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: ChasonJiang
|
||||||
|
|
||||||
|
## 202508
|
||||||
|
|
||||||
|
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
|
||||||
|
- 内容: WSL Rocm
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: XXXXRT666
|
||||||
|
|
||||||
|
## 202509
|
||||||
|
|
||||||
|
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
|
||||||
|
- 内容: 修复环境变量可能不为str的问题
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: RVC-Boss
|
||||||
|
|
||||||
|
## 202511
|
||||||
|
|
||||||
|
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
|
||||||
|
- 内容: 流式推理
|
||||||
|
- 类型: 新功能
|
||||||
|
- 提交: ChasonJiang
|
||||||
|
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
|
||||||
|
- 内容: 数学计算文本前端逻辑优化
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: KamioRinn
|
||||||
|
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
|
||||||
|
- 内容: 流式推理
|
||||||
|
- 类型: 新功能
|
||||||
|
- 提交: L-jasmine
|
||||||
|
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
|
||||||
|
- 内容: 支持vq分布式训练
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: wzy3650
|
||||||
|
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
|
||||||
|
- 内容: ASR模型下载逻辑优化
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: XXXXRT666
|
||||||
|
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
|
||||||
|
- 内容: default batch size bug 修复
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: Spr-Aachen
|
||||||
|
|
||||||
|
## 202512
|
||||||
|
|
||||||
|
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
|
||||||
|
- 内容: 修复采样错误
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: ChasonJiang
|
||||||
|
|
||||||
|
## 202602
|
||||||
|
|
||||||
|
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
|
||||||
|
- 内容: 修复 Conda 条款未同意导致的构建失败
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: Oarora
|
||||||
|
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
|
||||||
|
- 内容: 环境自动构建优化
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: XXXXRT666
|
||||||
|
|
||||||
|
## 202604
|
||||||
|
|
||||||
|
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
|
||||||
|
- 内容: 优化 G2PW 的推理输入构造与多音字处理流程,减少重复计算,降低长句场景下的推理开销
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: baicai-1145
|
||||||
|
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
|
||||||
|
- 内容: 改进 Windows 单卡 v3 LoRA 训练流程
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: 2409324124
|
||||||
|
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
|
||||||
|
- 内容: 修复多个模块中的独立 bug
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: wishhyt
|
||||||
|
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
|
||||||
|
- 内容: 添加数据集的错误处理提示
|
||||||
|
- 类型: 优化
|
||||||
|
- 提交: mushroomcowisheggs
|
||||||
|
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
|
||||||
|
- 内容: 并行推理部分bug修复
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: wishhyt
|
||||||
|
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
|
||||||
|
- 内容: bug修复:DPO 训练不支持漏字模拟
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: Mr-Neutr0n
|
||||||
|
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
|
||||||
|
- 内容: 修复onnx脚本未导入Optional等的问题
|
||||||
|
- 类型: 修复
|
||||||
|
- 提交: RVC-Boss
|
||||||
@ -578,3 +578,160 @@
|
|||||||
- Content: Optimized automatic precision detection logic; added collapsible functionality to WebUI frontend modules.
|
- Content: Optimized automatic precision detection logic; added collapsible functionality to WebUI frontend modules.
|
||||||
- Type: New Feature
|
- Type: New Feature
|
||||||
- Contributors: XXXXRT666, RVC-Boss
|
- Contributors: XXXXRT666, RVC-Boss
|
||||||
|
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
|
||||||
|
- Content: Fix polyphone detection for "X一X" pattern
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: wzy3650
|
||||||
|
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
|
||||||
|
- Content: Config fix; fix SoVITS model loading
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: wzy3650
|
||||||
|
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
|
||||||
|
- Content: Fix possible numerical explosion of `ge.sum` causing silent inference
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: RVC-Boss
|
||||||
|
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
|
||||||
|
- Content: Fix incorrect Windows path when experiment name ends with a space
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: RVC-Boss
|
||||||
|
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
|
||||||
|
- Content: Optimize language segmentation
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: KamioRinn
|
||||||
|
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
|
||||||
|
- Content: Fix bug in parallel inference support for v2pro
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: YYuX-1145
|
||||||
|
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
|
||||||
|
- Content: Fix numerical overflow issue when extracting `ge` for v2pro
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: RVC-Boss
|
||||||
|
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
|
||||||
|
- Content: Optimize `install.sh` logic
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: XXXXRT666
|
||||||
|
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
|
||||||
|
- Content: Optimize onnxruntime loading logic (GPU/CPU detection)
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: KamioRinn
|
||||||
|
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
|
||||||
|
- Content: Optimize language segmentation and formatting
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: KamioRinn
|
||||||
|
|
||||||
|
## 202507
|
||||||
|
|
||||||
|
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
|
||||||
|
- Content: Increase inference process priority (fix possible GPU utilization limitation on Win11)
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: XianYue0125
|
||||||
|
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
|
||||||
|
- Content: Fix TTS.py not recognizing actually supported versions v2Pro and v2ProPlus, and update default configuration
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: jiangsier-xyz
|
||||||
|
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
|
||||||
|
- Content: Fix v2pro model recognition issue in parallel inference mode
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: RVC-Boss
|
||||||
|
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
|
||||||
|
- Content: Whisper ASR supports more cost-effective distilled models
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: XXXXRT666
|
||||||
|
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
|
||||||
|
- Content: Optimize `TTS_Config` code logic
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: ChasonJiang
|
||||||
|
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
|
||||||
|
- Content: Fix GPT loss calculation issue
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: ChasonJiang
|
||||||
|
|
||||||
|
## 202508
|
||||||
|
|
||||||
|
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
|
||||||
|
- Content: WSL ROCm
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: XXXXRT666
|
||||||
|
|
||||||
|
## 202509
|
||||||
|
|
||||||
|
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
|
||||||
|
- Content: Fix issue where environment variable may not be a string
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: RVC-Boss
|
||||||
|
|
||||||
|
## 202511
|
||||||
|
|
||||||
|
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
|
||||||
|
- Content: Streaming inference
|
||||||
|
- Type: New Feature
|
||||||
|
- Contributor: ChasonJiang
|
||||||
|
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
|
||||||
|
- Content: Optimize text frontend logic for mathematical expression text
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: KamioRinn
|
||||||
|
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
|
||||||
|
- Content: Streaming inference
|
||||||
|
- Type: New Feature
|
||||||
|
- Contributor: L-jasmine
|
||||||
|
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
|
||||||
|
- Content: Support VQ distributed training
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: wzy3650
|
||||||
|
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
|
||||||
|
- Content: Optimize ASR model download logic
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: XXXXRT666
|
||||||
|
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
|
||||||
|
- Content: Fix default batch size bug
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: Spr-Aachen
|
||||||
|
|
||||||
|
## 202512
|
||||||
|
|
||||||
|
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
|
||||||
|
- Content: Fix sampling error
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: ChasonJiang
|
||||||
|
|
||||||
|
## 202602
|
||||||
|
|
||||||
|
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
|
||||||
|
- Content: Fix build failure caused by unaccepted Conda terms
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: Oarora
|
||||||
|
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
|
||||||
|
- Content: Optimize automatic environment setup
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: XXXXRT666
|
||||||
|
|
||||||
|
## 202604
|
||||||
|
|
||||||
|
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
|
||||||
|
- Content: Optimize G2PW inference input construction and polyphone handling to reduce redundant computation and inference overhead for long sentences
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: baicai-1145
|
||||||
|
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
|
||||||
|
- Content: Improve the LoRA training flow for GPT-SoVITS v3 on a single card under Windows
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: 2409324124
|
||||||
|
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
|
||||||
|
- Content: Fix miscellaneous bugs in multiple modules
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: wishhyt
|
||||||
|
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
|
||||||
|
- Content: Add error handling hints for dataset processing
|
||||||
|
- Type: Optimization
|
||||||
|
- Contributor: mushroomcowisheggs
|
||||||
|
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
|
||||||
|
- Content: Fix some bugs in parallel inference
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: wishhyt
|
||||||
|
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
|
||||||
|
- Content: Fix bug where DPO training does not support missing word simulation
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: Mr-Neutr0n
|
||||||
|
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
|
||||||
|
- Content: Fix missing imports (e.g., Optional) in ONNX script
|
||||||
|
- Type: Fix
|
||||||
|
- Contributor: RVC-Boss
|
||||||
@ -578,3 +578,160 @@
|
|||||||
- 内容: 自動精度検出ロジックを最適化し、WebUI フロントエンドモジュールに折り畳み(Collapsible)機能を追加
|
- 内容: 自動精度検出ロジックを最適化し、WebUI フロントエンドモジュールに折り畳み(Collapsible)機能を追加
|
||||||
- タイプ: 新機能
|
- タイプ: 新機能
|
||||||
- 貢献者: XXXXRT666, RVC-Boss
|
- 貢献者: XXXXRT666, RVC-Boss
|
||||||
|
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
|
||||||
|
- 内容: 「X一X」パターンの多音字検出を修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: wzy3650
|
||||||
|
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
|
||||||
|
- 内容: 設定の修正;SoVITSモデル読み込みの修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: wzy3650
|
||||||
|
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
|
||||||
|
- 内容: `ge.sum`の数値爆発による推論の無音化を修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: RVC-Boss
|
||||||
|
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
|
||||||
|
- 内容: 実験名がスペースで終わる場合のWindowsパスの誤りを修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: RVC-Boss
|
||||||
|
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
|
||||||
|
- 内容: 言語分割の最適化
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: KamioRinn
|
||||||
|
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
|
||||||
|
- 内容: v2proの並列推論対応におけるバグを修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: YYuX-1145
|
||||||
|
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
|
||||||
|
- 内容: v2proの`ge`抽出時の数値オーバーフロー問題を修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: RVC-Boss
|
||||||
|
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
|
||||||
|
- 内容: `install.sh`のロジックを最適化
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: XXXXRT666
|
||||||
|
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
|
||||||
|
- 内容: onnxruntime読み込みロジックを最適化(GPU/CPU検出)
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: KamioRinn
|
||||||
|
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
|
||||||
|
- 内容: 言語分割と書式を最適化
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: KamioRinn
|
||||||
|
|
||||||
|
## 202507
|
||||||
|
|
||||||
|
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
|
||||||
|
- 内容: 推論プロセスの優先度を上げる(Win11でのGPU利用制限の可能性を修正)
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: XianYue0125
|
||||||
|
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
|
||||||
|
- 内容: TTS.pyが実際にサポートされているバージョンv2Proおよびv2ProPlusを認識しない問題を修正し、デフォルト設定を更新
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: jiangsier-xyz
|
||||||
|
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
|
||||||
|
- 内容: 並列推論モードでのv2proモデル認識問題を修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: RVC-Boss
|
||||||
|
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
|
||||||
|
- 内容: Whisper ASRがよりコスト効率の高い蒸留モデルをサポート
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: XXXXRT666
|
||||||
|
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
|
||||||
|
- 内容: `TTS_Config`のコードロジックを最適化
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: ChasonJiang
|
||||||
|
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
|
||||||
|
- 内容: GPT損失計算の問題を修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: ChasonJiang
|
||||||
|
|
||||||
|
## 202508
|
||||||
|
|
||||||
|
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
|
||||||
|
- 内容: WSL Rocm対応
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: XXXXRT666
|
||||||
|
|
||||||
|
## 202509
|
||||||
|
|
||||||
|
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
|
||||||
|
- 内容: 環境変数が文字列でない可能性がある問題を修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: RVC-Boss
|
||||||
|
|
||||||
|
## 202511
|
||||||
|
|
||||||
|
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
|
||||||
|
- 内容: ストリーミング推論
|
||||||
|
- タイプ: 新機能
|
||||||
|
- 貢献者: ChasonJiang
|
||||||
|
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
|
||||||
|
- 内容: 数式テキストに対するテキスト前処理ロジックを最適化
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: KamioRinn
|
||||||
|
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
|
||||||
|
- 内容: ストリーミング推論
|
||||||
|
- タイプ: 新機能
|
||||||
|
- 貢献者: L-jasmine
|
||||||
|
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
|
||||||
|
- 内容: VQ分散学習をサポート
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: wzy3650
|
||||||
|
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
|
||||||
|
- 内容: ASRモデルダウンロードロジックを最適化
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: XXXXRT666
|
||||||
|
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
|
||||||
|
- 内容: デフォルトのバッチサイズのバグを修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: Spr-Aachen
|
||||||
|
|
||||||
|
## 202512
|
||||||
|
|
||||||
|
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
|
||||||
|
- 内容: サンプリングエラーを修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: ChasonJiang
|
||||||
|
|
||||||
|
## 202602
|
||||||
|
|
||||||
|
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
|
||||||
|
- 内容: 受け入れられなかったConda利用規約によるビルド失敗を修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: Oarora
|
||||||
|
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
|
||||||
|
- 内容: 自動環境セットアップを最適化
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: XXXXRT666
|
||||||
|
|
||||||
|
## 202604
|
||||||
|
|
||||||
|
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
|
||||||
|
- 内容: G2PW推論入力の構築と多音字処理を最適化し、長文における冗長な計算と推論オーバーヘッドを削減
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: baicai-1145
|
||||||
|
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
|
||||||
|
- 内容: WindowsでのシングルカードにおけるGPT-SoVITS v3のLoRAトレーニングフローを改善
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: 2409324124
|
||||||
|
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
|
||||||
|
- 内容: 複数モジュールの雑多なバグを修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: wishhyt
|
||||||
|
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
|
||||||
|
- 内容: データセット処理時のエラーハンドリングヒントを追加
|
||||||
|
- タイプ: 最適化
|
||||||
|
- 貢献者: mushroomcowisheggs
|
||||||
|
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
|
||||||
|
- 内容: 並列推論の一部バグを修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: wishhyt
|
||||||
|
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
|
||||||
|
- 内容: DPOトレーニングが欠落単語シミュレーションをサポートしないバグを修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: Mr-Neutr0n
|
||||||
|
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
|
||||||
|
- 内容: ONNXスクリプトでの(Optionalなどの)不足インポートを修正
|
||||||
|
- タイプ: 修正
|
||||||
|
- 貢献者: RVC-Boss
|
||||||
@ -578,3 +578,160 @@
|
|||||||
- 내용: 자동 정밀도 감지 로직 최적화; WebUI 프론트엔드 모듈에 접기 기능 추가
|
- 내용: 자동 정밀도 감지 로직 최적화; WebUI 프론트엔드 모듈에 접기 기능 추가
|
||||||
- 유형: 신규 기능
|
- 유형: 신규 기능
|
||||||
- 기여자: XXXXRT666, RVC-Boss
|
- 기여자: XXXXRT666, RVC-Boss
|
||||||
|
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
|
||||||
|
- 내용: "X一X" 패턴의 다중 발음 감지 오류 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: wzy3650
|
||||||
|
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
|
||||||
|
- 내용: 설정 오류 수정; SoVITS 모델 로딩 오류 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: wzy3650
|
||||||
|
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
|
||||||
|
- 내용: `ge.sum`의 수치 폭발 가능성으로 인한 추론 무음 현상 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: RVC-Boss
|
||||||
|
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
|
||||||
|
- 내용: 실험 이름이 공백으로 끝날 때 발생하는 잘못된 Windows 경로 문제 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: RVC-Boss
|
||||||
|
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
|
||||||
|
- 내용: 언어 분할 최적화
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: KamioRinn
|
||||||
|
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
|
||||||
|
- 내용: v2pro 병렬 추론 지원 버그 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: YYuX-1145
|
||||||
|
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
|
||||||
|
- 내용: v2pro의 `ge` 추출 시 수치 오버플로우 문제 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: RVC-Boss
|
||||||
|
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
|
||||||
|
- 내용: `install.sh` 로직 최적화
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: XXXXRT666
|
||||||
|
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
|
||||||
|
- 내용: onnxruntime 로딩 로직 최적화 (GPU/CPU 감지)
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: KamioRinn
|
||||||
|
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
|
||||||
|
- 내용: 언어 분할 및 형식 최적화
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: KamioRinn
|
||||||
|
|
||||||
|
## 202507
|
||||||
|
|
||||||
|
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
|
||||||
|
- 내용: 추론 프로세스 우선순위 증가 (Win11에서 GPU 활용 제한 가능성 수정)
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: XianYue0125
|
||||||
|
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
|
||||||
|
- 내용: TTS.py가 실제 지원되는 버전 v2Pro 및 v2ProPlus를 인식하지 못하는 문제 수정 및 기본 설정 업데이트
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: jiangsier-xyz
|
||||||
|
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
|
||||||
|
- 내용: 병렬 추론 모드에서 v2pro 모델 인식 문제 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: RVC-Boss
|
||||||
|
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
|
||||||
|
- 내용: Whisper ASR이 더 비용 효율적인 distill 모델 지원
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: XXXXRT666
|
||||||
|
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
|
||||||
|
- 내용: `TTS_Config` 코드 로직 최적화
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: ChasonJiang
|
||||||
|
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
|
||||||
|
- 내용: GPT 손실(loss) 계산 문제 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: ChasonJiang
|
||||||
|
|
||||||
|
## 202508
|
||||||
|
|
||||||
|
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
|
||||||
|
- 내용: WSL Rocm
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: XXXXRT666
|
||||||
|
|
||||||
|
## 202509
|
||||||
|
|
||||||
|
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
|
||||||
|
- 내용: 환경 변수가 문자열이 아닐 수 있는 문제 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: RVC-Boss
|
||||||
|
|
||||||
|
## 202511
|
||||||
|
|
||||||
|
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
|
||||||
|
- 내용: 스트리밍 추론
|
||||||
|
- 유형: 신규 기능
|
||||||
|
- 기여자: ChasonJiang
|
||||||
|
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
|
||||||
|
- 내용: 수학 표현식 텍스트에 대한 텍스트 전처리 로직 최적화
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: KamioRinn
|
||||||
|
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
|
||||||
|
- 내용: 스트리밍 추론
|
||||||
|
- 유형: 신규 기능
|
||||||
|
- 기여자: L-jasmine
|
||||||
|
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
|
||||||
|
- 내용: VQ 분산 학습 지원
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: wzy3650
|
||||||
|
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
|
||||||
|
- 내용: ASR 모델 다운로드 로직 최적화
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: XXXXRT666
|
||||||
|
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
|
||||||
|
- 내용: 기본 배치 크기 버그 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: Spr-Aachen
|
||||||
|
|
||||||
|
## 202512
|
||||||
|
|
||||||
|
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
|
||||||
|
- 내용: 샘플링 오류 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: ChasonJiang
|
||||||
|
|
||||||
|
## 202602
|
||||||
|
|
||||||
|
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
|
||||||
|
- 내용: Conda 약관 미동의로 인한 빌드 실패 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: Oarora
|
||||||
|
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
|
||||||
|
- 내용: 자동 환경 설정 최적화
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: XXXXRT666
|
||||||
|
|
||||||
|
## 202604
|
||||||
|
|
||||||
|
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
|
||||||
|
- 내용: G2PW 추론 입력 구성 및 다중 발음 처리를 최적화하여 긴 문장에 대한 중복 계산 및 추론 오버헤드 감소
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: baicai-1145
|
||||||
|
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
|
||||||
|
- 내용: Windows 환경 단일 GPU에서 GPT-SoVITS v3의 LoRA 학습 흐름 개선
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: 2409324124
|
||||||
|
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
|
||||||
|
- 내용: 여러 모듈의 잡다한 버그 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: wishhyt
|
||||||
|
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
|
||||||
|
- 내용: 데이터셋 처리를 위한 오류 처리 힌트 추가
|
||||||
|
- 유형: 최적화
|
||||||
|
- 기여자: mushroomcowisheggs
|
||||||
|
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
|
||||||
|
- 내용: 병렬 추론의 일부 버그 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: wishhyt
|
||||||
|
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
|
||||||
|
- 내용: DPO 학습이 누락 단어 시뮬레이션을 지원하지 않는 버그 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: Mr-Neutr0n
|
||||||
|
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
|
||||||
|
- 내용: ONNX 스크립트에서 Optional 등 누락된 임포트 문제 수정
|
||||||
|
- 유형: 수정
|
||||||
|
- 기여자: RVC-Boss
|
||||||
@ -2,8 +2,6 @@
|
|||||||
|
|
||||||
## 202401
|
## 202401
|
||||||
|
|
||||||
## 202401
|
|
||||||
|
|
||||||
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)
|
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)
|
||||||
- İçerik: WebUI'ya İngilizce sistem çeviri desteği eklendi.
|
- İçerik: WebUI'ya İngilizce sistem çeviri desteği eklendi.
|
||||||
- Tür: Dokümantasyon
|
- Tür: Dokümantasyon
|
||||||
@ -332,6 +330,8 @@
|
|||||||
- Tür: Optimizasyon
|
- Tür: Optimizasyon
|
||||||
- Katkıda Bulunan: RVC-Boss, GoHomeToMacDonal
|
- Katkıda Bulunan: RVC-Boss, GoHomeToMacDonal
|
||||||
- İlgili: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
|
- İlgili: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
|
||||||
|
- Gelecek güncellemeler, `fast_inference` dalındaki değişikliklerin tutarlılığını doğrulamaya devam edecek.
|
||||||
|
|
||||||
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298)
|
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298)
|
||||||
- İçerik: i18n taraması yeniden düzenlendi ve çok dilli yapılandırma dosyaları güncellendi
|
- İçerik: i18n taraması yeniden düzenlendi ve çok dilli yapılandırma dosyaları güncellendi
|
||||||
- Tür: Dokümantasyon
|
- Tür: Dokümantasyon
|
||||||
@ -578,3 +578,160 @@
|
|||||||
- İçerik: Otomatik hassasiyet algılama mantığı optimize edildi; WebUI önyüz modüllerine katlanabilir özellik eklendi
|
- İçerik: Otomatik hassasiyet algılama mantığı optimize edildi; WebUI önyüz modüllerine katlanabilir özellik eklendi
|
||||||
- Tür: Yeni Özellik
|
- Tür: Yeni Özellik
|
||||||
- Katkıda Bulunanlar: XXXXRT666, RVC-Boss
|
- Katkıda Bulunanlar: XXXXRT666, RVC-Boss
|
||||||
|
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
|
||||||
|
- İçerik: "X一X" kalıbı için çok sesli harf tespitini düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: wzy3650
|
||||||
|
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
|
||||||
|
- İçerik: Yapılandırma düzeltmesi; SoVITS model yüklemesini düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: wzy3650
|
||||||
|
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
|
||||||
|
- İçerik: `ge.sum` kaynaklı olası sayısal patlamayı (sessiz çıkarıma yol açan) düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: RVC-Boss
|
||||||
|
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
|
||||||
|
- İçerik: Deney adı boşlukla bittiğinde oluşan hatalı Windows yolunu düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: RVC-Boss
|
||||||
|
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
|
||||||
|
- İçerik: Dil bölütlemeyi optimize et
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: KamioRinn
|
||||||
|
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
|
||||||
|
- İçerik: v2pro için paralel çıkarım desteğindeki hatayı düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: YYuX-1145
|
||||||
|
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
|
||||||
|
- İçerik: v2pro için `ge` çıkarımındaki sayısal taşma sorununu düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: RVC-Boss
|
||||||
|
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
|
||||||
|
- İçerik: `install.sh` mantığını optimize et
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: XXXXRT666
|
||||||
|
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
|
||||||
|
- İçerik: onnxruntime yükleme mantığını optimize et (GPU/CPU algılama)
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: KamioRinn
|
||||||
|
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
|
||||||
|
- İçerik: Dil bölütleme ve biçimlendirmeyi optimize et
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: KamioRinn
|
||||||
|
|
||||||
|
## 202507
|
||||||
|
|
||||||
|
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
|
||||||
|
- İçerik: Çıkarım işlem önceliğini artır (Win11'de olası GPU kullanım sınırlamasını düzelt)
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: XianYue0125
|
||||||
|
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
|
||||||
|
- İçerik: TTS.py'nin gerçekte desteklenen sürümler olan v2Pro ve v2ProPlus'ı tanımaması sorununu düzelt ve varsayılan yapılandırmayı güncelle
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: jiangsier-xyz
|
||||||
|
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
|
||||||
|
- İçerik: Paralel çıkarım modunda v2pro model tanıma sorununu düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: RVC-Boss
|
||||||
|
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
|
||||||
|
- İçerik: Whisper ASR daha uygun maliyetli distill modellerini destekler
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: XXXXRT666
|
||||||
|
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
|
||||||
|
- İçerik: `TTS_Config` kod mantığını optimize et
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: ChasonJiang
|
||||||
|
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
|
||||||
|
- İçerik: GPT kayıp (loss) hesaplama sorununu düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: ChasonJiang
|
||||||
|
|
||||||
|
## 202508
|
||||||
|
|
||||||
|
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
|
||||||
|
- İçerik: WSL Rocm
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: XXXXRT666
|
||||||
|
|
||||||
|
## 202509
|
||||||
|
|
||||||
|
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
|
||||||
|
- İçerik: Ortam değişkeninin dize (string) olmaması sorununu düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: RVC-Boss
|
||||||
|
|
||||||
|
## 202511
|
||||||
|
|
||||||
|
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
|
||||||
|
- İçerik: Akışlı çıkarım (streaming inference)
|
||||||
|
- Tür: Yeni Özellik
|
||||||
|
- Katkıda Bulunan: ChasonJiang
|
||||||
|
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
|
||||||
|
- İçerik: Matematiksel ifade metinleri için metin ön uç (frontend) mantığını optimize et
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: KamioRinn
|
||||||
|
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
|
||||||
|
- İçerik: Akışlı çıkarım (streaming inference)
|
||||||
|
- Tür: Yeni Özellik
|
||||||
|
- Katkıda Bulunan: L-jasmine
|
||||||
|
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
|
||||||
|
- İçerik: VQ dağıtılmış eğitimi destekle
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: wzy3650
|
||||||
|
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
|
||||||
|
- İçerik: ASR model indirme mantığını optimize et
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: XXXXRT666
|
||||||
|
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
|
||||||
|
- İçerik: Varsayılan parti boyutu (batch size) hatasını düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: Spr-Aachen
|
||||||
|
|
||||||
|
## 202512
|
||||||
|
|
||||||
|
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
|
||||||
|
- İçerik: Örnekleme (sampling) hatasını düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: ChasonJiang
|
||||||
|
|
||||||
|
## 202602
|
||||||
|
|
||||||
|
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
|
||||||
|
- İçerik: Kabul edilmeyen Conda koşullarının neden olduğu derleme hatasını düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: Oarora
|
||||||
|
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
|
||||||
|
- İçerik: Otomatik ortam kurulumunu optimize et
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: XXXXRT666
|
||||||
|
|
||||||
|
## 202604
|
||||||
|
|
||||||
|
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
|
||||||
|
- İçerik: Uzun cümlelerde gereksiz hesaplama ve çıkarım yükünü azaltmak için G2PW çıkarım girdi oluşturmayı ve çok sesli harf işlemeyi optimize et
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: baicai-1145
|
||||||
|
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
|
||||||
|
- İçerik: Windows altında tek kartta GPT-SoVITS v3 için LoRA eğitim akışını iyileştir
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: 2409324124
|
||||||
|
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
|
||||||
|
- İçerik: Birden çok modüldeki çeşitli hataları düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: wishhyt
|
||||||
|
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
|
||||||
|
- İçerik: Veri kümesi işleme için hata işleme ipuçları ekle
|
||||||
|
- Tür: Optimizasyon
|
||||||
|
- Katkıda Bulunan: mushroomcowisheggs
|
||||||
|
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
|
||||||
|
- İçerik: Paralel çıkarımdaki bazı hataları düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: wishhyt
|
||||||
|
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
|
||||||
|
- İçerik: DPO eğitiminin eksik kelime simülasyonunu desteklememe hatasını düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: Mr-Neutr0n
|
||||||
|
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
|
||||||
|
- İçerik: ONNX betiğinde (Optional vb.) eksik içe aktarmaları düzelt
|
||||||
|
- Tür: Düzeltme
|
||||||
|
- Katkıda Bulunan: RVC-Boss
|
||||||
@ -39,6 +39,7 @@ def create_model(language="zh"):
|
|||||||
local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
||||||
)
|
)
|
||||||
model_revision = "v2.0.4"
|
model_revision = "v2.0.4"
|
||||||
|
vad_model_revision = punc_model_revision = "v2.0.4"
|
||||||
elif language == "yue":
|
elif language == "yue":
|
||||||
path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
|
path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
|
||||||
snapshot_download(
|
snapshot_download(
|
||||||
@ -51,8 +52,6 @@ def create_model(language="zh"):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"{language} is not supported")
|
raise ValueError(f"{language} is not supported")
|
||||||
|
|
||||||
vad_model_revision = punc_model_revision = "v2.0.4"
|
|
||||||
|
|
||||||
if language in funasr_models:
|
if language in funasr_models:
|
||||||
return funasr_models[language]
|
return funasr_models[language]
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -485,6 +485,8 @@ def istft(spec, hl):
|
|||||||
wave_right = librosa.istft(spec_right, hop_length=hl)
|
wave_right = librosa.istft(spec_right, hop_length=hl)
|
||||||
wave = np.asfortranarray([wave_left, wave_right])
|
wave = np.asfortranarray([wave_left, wave_right])
|
||||||
|
|
||||||
|
return wave
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import argparse
|
import argparse
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user