Compare commits

...

13 Commits

Author SHA1 Message Date
逸游仙人
56180a4c26
Merge 7604f36bb270d0a897df0b8d4dd9d35f860d06cb into ea2d2a81667239d37615697e8f0056e35bab2db6 2026-04-20 17:49:18 +00:00
RVC-Boss
ea2d2a8166
Update README.md 2026-04-19 21:02:57 +08:00
SapphireLab
d9f03dad3e
Update Documentation (#2768)
* 调整日志格式

* docs: Update other languages' changelogs
2026-04-18 22:33:55 +08:00
RVC-Boss
647935357a
Update Changelog_CN.md 2026-04-18 19:01:11 +08:00
RVC-Boss
02425ea256
Fixed issues such as missing imports for types like Optional.
Fixed issues such as missing imports for types like `Optional`.
2026-04-18 17:33:53 +08:00
Harikrishna KP
938f05fce8
fix: correct torch.randint upper bound to include both values (#2733) 2026-04-18 17:19:55 +08:00
huang yutong
445d18ccce
fix: 修复 TTS 音频后处理中的多个缺陷 (#2753)
1. 修复音频超采样时 int16 双重转换导致整数溢出(CRITICAL)
   - audio_postprocess 中 `audio = (audio * 32768).astype(np.int16)` 位于
     if/else 块之外无条件执行,当 super_sampling=True 时音频已在分支内
     转为 int16,再次乘以 32768 导致溢出和音频完全失真
   - 同时修复 super_sampling=True 但超分模型不存在时 torch.Tensor 调用
     .astype() 的 AttributeError

2. 修复 batched vocoder 推理中 padding_len=0 导致音频丢失(HIGH)
   - 当 padding_len 恰好为 0 时,`-0 * upsample_rate == 0`,切片
     `audio[x:0]` 返回空张量,导致整段音频丢失

3. 修复文件不存在时错误地抛出 FileExistsError(LOW)
   - 应为 FileNotFoundError

Made-with: Cursor
2026-04-18 17:16:24 +08:00
Mushroomcowisheggs
00ce973412
feat: 添加数据集的错误处理提示 (#2758)
Co-authored-by: moomushroom <107208254+moomushroom@users.noreply.github.com>
2026-04-18 17:13:30 +08:00
huang yutong
14191901cd
fix: 修复多个模块中的独立 bug (#2755)
1. 修复 sync_buffer 中除以函数对象而非调用结果(distrib.py)
   - `buffer.data /= world_size` 中 world_size 是函数,缺少 (),
     导致 TypeError 使分布式训练 buffer 同步失败

2. 修复 istft 函数缺少 return 语句(spec_utils.py)
   - 函数计算了结果但未返回,调用者始终得到 None

3. 修复 cut0 返回字面量 "/n" 而非换行符 "\n"(text_segmentation_method.py)
   - 导致后续 text.split("\n") 无法正确切分,字面 /n 被当作文本内容

4. 修复粤语 ASR 的 vad/punc model_revision 被无条件覆盖(funasr_asr.py)
   - 粤语分支将 vad_model_revision 设为空(因不使用 VAD/标点模型),
     但 if/else 外的赋值将其覆盖为 "v2.0.4",传入错误的 revision 参数

Made-with: Cursor
2026-04-18 17:10:56 +08:00
东云
780383d5bd
[codex] Improve Windows single-GPU v3 LoRA training / 改进 Windows 单卡 v3 LoRA 训练流程 (#2767)
* Improve Windows single-GPU v3 LoRA training

* Drop unrelated checkpoint helper change from PR

* Tighten PR scope to single-GPU training path fixes
2026-04-18 16:54:26 +08:00
白菜工厂1145号员工
ba8de9b760
优化 G2PW 的推理输入构造与多音字处理流程,减少重复计算,降低长句场景下的推理开销 (#2763)
* Enhance G2P processing by implementing batch input handling in _g2p function, improving efficiency. Update prepare_onnx_input to utilize caching for tokenization and add optional parameters for character ID mapping and phoneme masks. Refactor G2PWOnnxConverter to streamline model loading and configuration management.

* Enhance G2PW model input handling by introducing polyphonic context character support and updating the data preparation method to return additional query IDs. This improves the processing of polyphonic characters in sentences.
2026-04-18 16:52:32 +08:00
逸游仙人
7604f36bb2 用于特殊用途的大数据量实验,请勿合并!!!!!!! 2025-06-22 03:51:59 +08:00
逸游仙人
6c88f1ea32 cc 2025-06-22 03:35:34 +08:00
20 changed files with 1737 additions and 152 deletions

View File

@ -67,8 +67,10 @@ class Text2SemanticDataset(Dataset):
)
) # "%s/3-bert"%exp_dir#bert_dir
self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
assert os.path.exists(self.path2)
assert os.path.exists(self.path6)
if not os.path.exists(self.path2):
raise FileNotFoundError(f"Phoneme data file not found: {self.path2}")
if not os.path.exists(self.path6):
raise FileNotFoundError(f"Semantic data file not found: {self.path6}")
self.phoneme_data = {}
with open(self.path2, "r", encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
@ -131,7 +133,7 @@ class Text2SemanticDataset(Dataset):
phoneme, word2ph, text = self.phoneme_data[item_name]
except Exception:
traceback.print_exc()
# print(f"{item_name} not in self.phoneme_data !")
print(f"Warning: File \"{item_name}\" not in self.phoneme_data! Skipped. ")
num_not_in += 1
continue
@ -152,7 +154,7 @@ class Text2SemanticDataset(Dataset):
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except:
traceback.print_exc()
# print(f"{item_name} not in self.phoneme_data !")
print(f"Warning: Failed to convert phonemes to sequence for file \"{item_name}\"! Skipped. ")
num_not_in += 1
continue
# if len(phoneme_ids) >400:###########2改为恒定限制为semantic/2.5就行
@ -228,7 +230,11 @@ class Text2SemanticDataset(Dataset):
# bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32)
bert_feature = None
else:
assert bert_feature.shape[-1] == len(phoneme_ids)
try:
assert bert_feature.shape[-1] == len(phoneme_ids)
except AssertionError:
print(f"AssertionError: The BERT feature dimension ({bert_feature.shape[-1]}) of the file '{item_name}' does not match the length of the phoneme sequence ({len(phoneme_ids)}).")
raise
return {
"idx": idx,
"phoneme_ids": phoneme_ids,

View File

@ -262,7 +262,7 @@ def make_reject_y(y_o, y_lens):
reject_y = []
reject_y_lens = []
for b in range(bs):
process_item_idx = torch.randint(0, 1, size=(1,))[0]
process_item_idx = torch.randint(0, 2, size=(1,))[0]
if process_item_idx == 0:
new_y = repeat_P(y_o[b])
reject_y.append(new_y)

View File

@ -8,30 +8,30 @@ def multi_head_attention_forward_patched(
query,
key,
value,
embed_dim_to_check: int,
num_heads: int,
embed_dim_to_check,
num_heads,
in_proj_weight,
in_proj_bias: Optional[Tensor],
bias_k: Optional[Tensor],
bias_v: Optional[Tensor],
add_zero_attn: bool,
dropout_p: float,
out_proj_weight: Tensor,
out_proj_bias: Optional[Tensor],
training: bool = True,
key_padding_mask: Optional[Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[Tensor] = None,
use_separate_proj_weight: bool = False,
q_proj_weight: Optional[Tensor] = None,
k_proj_weight: Optional[Tensor] = None,
v_proj_weight: Optional[Tensor] = None,
static_k: Optional[Tensor] = None,
static_v: Optional[Tensor] = None,
average_attn_weights: bool = True,
is_causal: bool = False,
in_proj_bias,
bias_k,
bias_v,
add_zero_attn,
dropout_p,
out_proj_weight,
out_proj_bias,
training=True,
key_padding_mask=None,
need_weights=True,
attn_mask=None,
use_separate_proj_weight=False,
q_proj_weight=None,
k_proj_weight=None,
v_proj_weight=None,
static_k=None,
static_v=None,
average_attn_weights=True,
is_causal=False,
cache=None,
) -> Tuple[Tensor, Optional[Tensor]]:
):
# set up shape vars
_, _, embed_dim = query.shape
attn_mask = _canonical_mask(

View File

@ -499,7 +499,7 @@ class TTS:
if if_lora_v3 == True and os.path.exists(path_sovits) == False:
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
raise FileExistsError(info)
raise FileNotFoundError(info)
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
dict_s2 = load_sovits_new(weights_path)
@ -1578,16 +1578,15 @@ class TTS:
max_audio = np.abs(audio).max()
if max_audio > 1:
audio /= max_audio
audio = (audio * 32768).astype(np.int16)
audio = (audio * 32768).astype(np.int16)
else:
audio = audio.cpu().numpy()
audio = (audio * 32768).astype(np.int16)
t2 = time.perf_counter()
print(f"超采样用时:{t2 - t1:.3f}s")
else:
# audio = audio.float() * 32768
# audio = audio.to(dtype=torch.int16).clamp(-32768, 32767).cpu().numpy()
audio = audio.cpu().numpy()
audio = (audio * 32768).astype(np.int16)
audio = (audio * 32768).astype(np.int16)
# try:
@ -1768,7 +1767,10 @@ class TTS:
pos += chunk_len * upsample_rate
audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
if padding_len > 0:
audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
else:
audio = audio[overlapped_len * upsample_rate :]
audio_fragments = []
for feat_len in feat_lens:

View File

@ -92,7 +92,7 @@ def cut0(inp):
if not set(inp).issubset(punctuation):
return inp
else:
return "/n"
return "\n"
# 凑四句一切

View File

@ -27,11 +27,14 @@ import re
import sys
import traceback
import warnings
import soundfile # 新增导入
import torch
import torchaudio
from text.LangSegmenter import LangSegmenter
logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)
@ -1001,6 +1004,255 @@ def get_tts_wav(
yield opt_sr, (audio_opt * 32767).astype(np.int16)
import uuid
import shutil
from pydub import AudioSegment
TEMP_FOLDER = "TEMP" # 临时文件夹路径
os.makedirs(TEMP_FOLDER, exist_ok=True)
def clean_temp_folder_on_startup():
    """Reset the temporary folder so it exists and is empty.

    If ``TEMP_FOLDER`` already exists it is removed recursively and then
    recreated; otherwise it is simply created. Any exception is reported on
    stdout and swallowed, so this helper never raises.
    """
    try:
        already_present = os.path.exists(TEMP_FOLDER)
        if already_present:
            # Drop everything left over from a previous session.
            shutil.rmtree(TEMP_FOLDER)
        os.makedirs(TEMP_FOLDER, exist_ok=True)
        print("启动时已清理临时文件夹" if already_present else "临时文件夹已创建")
    except Exception as e:
        print(f"启动时清理临时文件夹失败: {str(e)}")
def split_text_by_punctuation(text, period_pause=0.3, comma_pause=0.15):
    """Split ``text`` into segments that each end at a comma or a full stop.

    Both the ASCII and the full-width (CJK) forms of the comma and the full
    stop terminate a segment. The terminating mark stays attached to the
    segment text.

    Args:
        text: The text to split. Empty or non-string input yields ``[]``.
        period_pause: Pause (seconds) recorded for segments ending in a
            full stop.
        comma_pause: Pause (seconds) recorded for segments ending in a comma
            and for a trailing remainder that has no terminating mark.

    Returns:
        A list of dicts with keys ``"text"`` (whitespace-stripped segment),
        ``"pause"`` and ``"punctuation"`` (the terminating mark, or ``""``
        for the trailing remainder).
    """
    if not text or not isinstance(text, str):
        print("收到空或非字符串文本输入")
        return []

    # The full-width marks must be listed explicitly: an empty string can
    # never equal a single character, so it would silently disable them.
    period_marks = ("。", ".")
    comma_marks = ("，", ",")

    segments = []
    current_segment = ""
    for char in text:
        current_segment += char
        if char in period_marks:
            pause = period_pause
        elif char in comma_marks:
            pause = comma_pause
        else:
            continue
        segments.append({
            "text": current_segment.strip(),
            "pause": pause,
            "punctuation": char
        })
        current_segment = ""

    # Keep a trailing remainder only when it has real content; a tail made of
    # whitespace (e.g. a final newline) would otherwise become an empty
    # segment.
    if current_segment.strip():
        segments.append({
            "text": current_segment.strip(),
            "pause": comma_pause,  # no full stop seen, use the shorter pause
            "punctuation": ""
        })

    print(f"分割结果: {[seg['text'] for seg in segments]}")
    return segments
def generate_segment_audio(segment_data, ref_wav_path, prompt_text, prompt_language, text_language, top_k, top_p, temperature):
    """Synthesize one text segment and append its trailing pause.

    The first chunk yielded by ``get_tts_wav`` is written to a scratch WAV
    file inside ``TEMP_FOLDER``, reloaded with pydub, extended with
    ``segment_data["pause"]`` seconds of silence and exported back to the
    same path.

    Args:
        segment_data: Dict providing at least ``"text"`` and ``"pause"``
            (seconds).
        ref_wav_path: Path of the reference audio; must exist.
        prompt_text, prompt_language, text_language, top_k, top_p,
        temperature: Forwarded unchanged to ``get_tts_wav``.

    Returns:
        A dict with keys ``"success"``, ``"audio_path"``, ``"text"``,
        ``"pause"`` and ``"message"``. On failure ``"success"`` is ``False``
        and ``"audio_path"`` is ``None``; the exception is reported rather
        than raised.
    """
    try:
        if not os.path.exists(ref_wav_path):
            raise FileNotFoundError(f"参考音频不存在: {ref_wav_path}")

        # Cutting is disabled and no internal pause is requested: the caller
        # has already split the text and the pause is appended below.
        sr, audio_data = next(get_tts_wav(
            ref_wav_path=ref_wav_path,
            prompt_text=prompt_text,
            prompt_language=prompt_language,
            text=segment_data["text"],
            text_language=text_language,
            how_to_cut=i18n("不切"),
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            pause_second=0
        ))

        # Fixed scratch name: the caller renames the file to its final
        # segment id right after this call returns.
        temp_path = os.path.join(TEMP_FOLDER, "temp_generate.wav")
        soundfile.write(temp_path, audio_data, sr)

        audio = AudioSegment.from_wav(temp_path)
        pause = AudioSegment.silent(duration=int(segment_data["pause"] * 1000))
        (audio + pause).export(temp_path, format="wav")

        return {
            "success": True,
            "audio_path": temp_path,
            "text": segment_data["text"],
            "pause": segment_data["pause"],
            "message": "生成成功"
        }
    except Exception as e:
        # print() has no exc_info keyword (that belongs to logging); passing
        # it raised TypeError here and hid the real failure.
        print(f"生成片段失败: {str(e)}")
        traceback.print_exc()
        return {
            "success": False,
            "audio_path": None,
            "text": segment_data["text"],
            "pause": segment_data["pause"],
            "message": f"生成失败: {str(e)}"
        }
def process_all_segments(text, ref_wav_path, prompt_text, prompt_language, text_language,
                         top_k, top_p, temperature, period_pause, comma_pause):
    """Split ``text`` and synthesize every segment into ``TEMP_FOLDER``.

    Segments are numbered from 1 and stored as ``"<n>temp.wav"``.

    Args:
        text: Full text to synthesize.
        ref_wav_path: Path of the reference audio (may be ``None`` when the
            user supplied none).
        prompt_text, prompt_language, text_language, top_k, top_p,
        temperature: Forwarded to ``generate_segment_audio``.
        period_pause, comma_pause: Pause lengths (seconds) forwarded to
            ``split_text_by_punctuation``.

    Returns:
        ``(rows, first_audio)`` where ``rows`` is a list of
        ``[segment_id, text, status_message]`` and ``first_audio`` is the
        path of the first successfully generated file or ``None``. On invalid
        input ``rows`` is a single error row and ``first_audio`` is ``None``.
    """
    def _error(message):
        # Report the problem and return it as a single table row.
        print(message)
        return [[1, message, "错误"]], None

    if not text or not isinstance(text, str):
        return _error("输入文本无效")

    # os.path.exists(None) raises TypeError, so reject a missing path first.
    if not ref_wav_path or not os.path.exists(ref_wav_path):
        return _error(f"参考音频不存在: {ref_wav_path}")

    segments = split_text_by_punctuation(text, period_pause, comma_pause)
    if not segments:
        return _error("无法分割文本")

    df_data = []
    audio_files = []
    total = len(segments)
    for i, segment in enumerate(segments, 1):
        result = generate_segment_audio(
            segment, ref_wav_path, prompt_text,
            prompt_language, text_language, top_k, top_p, temperature
        )
        segment_id = f"{i}temp"
        result["segment_id"] = segment_id

        if result["success"] and result["audio_path"]:
            new_path = os.path.join(TEMP_FOLDER, f"{segment_id}.wav")
            # os.replace overwrites a file left by an earlier run; os.rename
            # raises FileExistsError in that situation on Windows.
            os.replace(result["audio_path"], new_path)
            result["audio_path"] = new_path
            audio_files.append(new_path)

        print(f"处理进度: {i}/{total} - {result['message']}")
        df_data.append([segment_id, result["text"], result["message"]])

    first_audio = audio_files[0] if audio_files else None
    return df_data, first_audio
def regenerate_segment(segment_id, new_text, ref_wav_path, prompt_text,
                       prompt_language, text_language, top_k, top_p, temperature,
                       period_pause, comma_pause):
    """Re-synthesize one segment with new text, replacing its audio file.

    Args:
        segment_id: Identifier of the segment; its audio lives at
            ``TEMP_FOLDER/<segment_id>.wav``.
        new_text: Replacement text for the segment.
        ref_wav_path, prompt_text, prompt_language, text_language, top_k,
        top_p, temperature: Forwarded to ``generate_segment_audio``.
        period_pause: Pause (seconds) used when ``new_text`` ends with a
            full stop.
        comma_pause: Pause (seconds) used otherwise.

    Returns:
        ``(audio_path, segment_id, message)``. ``audio_path`` is ``None``
        when generation failed or an exception occurred; exceptions are
        reported, not raised.
    """
    try:
        if not segment_id or not new_text:
            raise ValueError("缺少片段ID或新文本内容")

        # The pause depends on how the *text* ends. Segment ids look like
        # "3temp" and never carry punctuation, so testing the id (and testing
        # against an empty suffix, which always matches) chose the wrong
        # pause.
        is_period = new_text.rstrip().endswith(("。", "."))
        pause = period_pause if is_period else comma_pause

        segment_data = {
            "text": new_text,
            "pause": pause,
            "punctuation": "。" if is_period else "，"
        }

        result = generate_segment_audio(
            segment_data, ref_wav_path, prompt_text,
            prompt_language, text_language, top_k, top_p, temperature
        )

        if result["success"]:
            old_path = os.path.join(TEMP_FOLDER, f"{segment_id}.wav")
            # Overwrites the previous audio for this segment in one step.
            os.replace(result["audio_path"], old_path)
            result["audio_path"] = old_path

        return (
            result["audio_path"],
            segment_id,
            result["message"]
        )
    except Exception as e:
        # print() does not accept exc_info; use traceback for the stack.
        print(f"重新生成片段失败: {str(e)}")
        traceback.print_exc()
        return None, segment_id, f"重新生成失败: {str(e)}"
def merge_all_segments():
    """Concatenate all numbered segment files into one WAV file.

    Only files named ``"<number>temp.wav"`` inside ``TEMP_FOLDER`` are
    merged, in ascending numeric order. The result is written to
    ``TEMP_FOLDER/final_output.wav``.

    Returns:
        ``(output_path, message)`` on success, or ``(None, message)`` when no
        segment exists or any step fails; exceptions are reported, not raised.
    """
    try:
        # Accept only properly numbered segments. Other WAV files in the
        # folder (the scratch "temp_generate.wav", a previous
        # "final_output.wav") have no numeric prefix and would make the
        # integer conversion fail.
        segment_pattern = re.compile(r"^(\d+)temp\.wav$")
        numbered = []
        for filename in os.listdir(TEMP_FOLDER):
            matched = segment_pattern.match(filename)
            if matched:
                numbered.append((int(matched.group(1)), filename))

        if not numbered:
            raise ValueError("没有找到可合并的音频片段")

        numbered.sort(key=lambda item: item[0])

        combined = AudioSegment.empty()
        for _, filename in numbered:
            combined += AudioSegment.from_wav(os.path.join(TEMP_FOLDER, filename))

        output_path = os.path.join(TEMP_FOLDER, "final_output.wav")
        combined.export(output_path, format="wav")

        print(f"成功合并 {len(numbered)} 个片段")
        return output_path, "合并成功"
    except Exception as e:
        # print() does not accept exc_info; use traceback for the stack.
        print(f"合并片段失败: {str(e)}")
        traceback.print_exc()
        return None, f"合并失败: {str(e)}"
def clean_temp_files():
    """Delete every regular file directly inside ``TEMP_FOLDER``.

    Sub-directories are left alone. A failure to delete one file is printed
    and does not stop the sweep.

    Returns:
        A status string: the success message, or a failure message when the
        folder itself could not be listed.
    """
    try:
        entries = os.listdir(TEMP_FOLDER)
    except Exception as e:
        return f"清理失败: {str(e)}"

    for entry in entries:
        file_path = os.path.join(TEMP_FOLDER, entry)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(f"删除文件 {file_path} 失败: {e}")
    return "临时文件已清理"
def on_segment_select(df, evt: gr.SelectData):
    """Return the fields to display for the row selected in the segment table.

    Args:
        df: Table contents; indexed with ``.iloc`` and by the column names
            ``"编号"`` and ``"文本内容"``.
        evt: Selection event; ``evt.index[0]`` is used as the row position.

    Returns:
        ``(segment_id, text, audio_path)``. ``audio_path`` is ``None`` when
        the segment's WAV file does not exist. When the event carries no
        index, ``("1temp", "", None)`` is returned.
    """
    if not evt.index:
        return "1temp", "", None

    selected_row = df.iloc[evt.index[0]]
    segment_id = selected_row["编号"]
    audio_path = os.path.join(TEMP_FOLDER, f"{segment_id}.wav")
    preview = audio_path if os.path.exists(audio_path) else None
    return segment_id, selected_row["文本内容"], preview
def split(todo_text):
todo_text = todo_text.replace("……", "").replace("——", "")
if todo_text[-1] not in splits:
@ -1018,6 +1270,236 @@ def split(todo_text):
else:
i_split_head += 1
return todo_texts
# ======================== 合并功能实现 ========================
def merge_selected_segments(merge_range, segment_list_data, ref_wav_path, prompt_text,
                            prompt_language, text_language, top_k, top_p, temperature,
                            pause_period, pause_comma):
    """Merge a contiguous range of segments into one and regenerate its audio.

    Args:
        merge_range: Inclusive 1-based range written as ``"start-end"``.
        segment_list_data: Segment table; accessed with ``.iloc`` and
            ``.values``, with the id in column 0 and the text in column 1.
        ref_wav_path, prompt_text, prompt_language, text_language, top_k,
        top_p, temperature: Forwarded to ``generate_segment_audio``.
        pause_period: Pause (seconds) used when the merged text ends with a
            full stop.
        pause_comma: Pause (seconds) used otherwise.

    Returns:
        ``(rows, message, merged_id)``. On success ``rows`` is the
        renumbered list of ``[id, text, status]`` and ``merged_id`` is the id
        of the merged segment. On any failure the original table is returned
        unchanged together with an explanatory message and ``None``.
    """
    try:
        if not merge_range:
            return segment_list_data, "请输入合并范围例如1-3", None

        start, end = map(int, merge_range.split('-'))
        if start <= 0 or end <= 0 or start > end:
            return segment_list_data, "无效的合并范围", None

        total = len(segment_list_data)
        if end > total:
            return segment_list_data, f"结束编号 {end} 超过总段数 {total}", None

        # Build the merged text and pick the pause from its final character.
        merged_text = ""
        for i in range(start - 1, end):
            merged_text += segment_list_data.iloc[i, 1]

        last_text = segment_list_data.iloc[end - 1, 1]
        last_punctuation = last_text[-1] if last_text else ""
        is_period = last_punctuation in ("。", ".")
        pause = pause_period if is_period else pause_comma

        segment_data = {
            "text": merged_text,
            "pause": pause,
            "punctuation": last_punctuation
        }

        # Generate BEFORE touching the existing files: if synthesis fails the
        # original segments are still intact and the table stays valid.
        result = generate_segment_audio(
            segment_data, ref_wav_path, prompt_text,
            prompt_language, text_language, top_k, top_p, temperature
        )
        if not result["success"]:
            return segment_list_data, result["message"], None

        # Remove the audio of the segments that were merged away.
        for i in range(start - 1, end):
            stale_path = os.path.join(TEMP_FOLDER, f"{segment_list_data.iloc[i, 0]}.wav")
            if os.path.exists(stale_path):
                os.remove(stale_path)

        # The merged segment takes the first id of the range.
        merged_id = f"{start}temp"
        shutil.move(result["audio_path"], os.path.join(TEMP_FOLDER, f"{merged_id}.wav"))

        new_segment_list = []
        if start > 1:
            new_segment_list.extend(segment_list_data.iloc[:start - 1].values.tolist())
        new_segment_list.append([merged_id, merged_text, "已生成"])
        if end < total:
            new_segment_list.extend(segment_list_data.iloc[end:].values.tolist())

        # Renumber consecutively. Going in ascending order is safe because
        # every target id is lower than (or equal to) its source id and has
        # already been vacated.
        reindexed_list = []
        for position, segment in enumerate(new_segment_list, 1):
            old_id = segment[0]
            renumbered_id = f"{position}temp"
            old_path = os.path.join(TEMP_FOLDER, f"{old_id}.wav")
            if os.path.exists(old_path) and old_id != renumbered_id:
                os.replace(old_path, os.path.join(TEMP_FOLDER, f"{renumbered_id}.wav"))
            segment[0] = renumbered_id
            reindexed_list.append(segment)

        # Report the merged segment's id, not the id of whichever row the
        # renumbering loop happened to visit last.
        return reindexed_list, "合并成功", merged_id

    except Exception as e:
        traceback.print_exc()
        return segment_list_data, f"合并失败: {str(e)}", None
# ======================== 实现拆分逻辑函数 ========================
def split_selected_segment(split_id, segment_list_data, ref_wav_path, prompt_text,
                           prompt_language, text_language, top_k, top_p, temperature,
                           pause_period, pause_comma):
    """Split one segment at its punctuation and regenerate the pieces.

    Later segments are renamed from the last one backwards so that no file is
    overwritten while ids are shifted up.

    Args:
        split_id: Id of the segment to split (e.g. ``"2temp"``); a bare
            number such as ``"2"`` is accepted as well.
        segment_list_data: Segment table; iterated with ``itertuples`` and
            accessed with ``.iloc``, with id, text and status in columns
            0, 1 and 2.
        ref_wav_path, prompt_text, prompt_language, text_language, top_k,
        top_p, temperature: Forwarded to ``generate_segment_audio``.
        pause_period: Pause (seconds) after sentence-ending marks.
        pause_comma: Pause (seconds) after other marks and for a trailing
            remainder.

    Returns:
        ``(rows, message, first_new_id)``. On success ``rows`` is the new
        list of ``[id, text, status]`` sorted by id number. On failure the
        original table is returned unchanged with a message and ``None``.
    """
    try:
        if not split_id:
            return segment_list_data, "请输入要拆分的句子编号", None

        # Tolerate surrounding whitespace and a bare number typed by hand.
        split_id = str(split_id).strip()
        if split_id.isdigit():
            split_id = f"{split_id}temp"

        df = segment_list_data
        target_idx = None
        original_text = None
        # itertuples() puts the index at position 0, so the id column is
        # row[1] and the text column is row[2].
        for idx, row in enumerate(df.itertuples()):
            if str(row[1]) == split_id:
                target_idx = idx
                original_text = row[2]
                break

        if target_idx is None:
            return segment_list_data, f"未找到编号为 {split_id} 的句子", None

        # Both ASCII and full-width forms must be listed explicitly; an empty
        # string never equals a character and would disable the CJK marks.
        sentence_end_marks = ("。", ".", "？", "?", "！", "!")
        other_marks = ("，", ",", "；", ";")

        segments = []
        current_segment = ""
        for char in original_text:
            current_segment += char
            if char in sentence_end_marks:
                pause = pause_period
            elif char in other_marks:
                pause = pause_comma
            else:
                continue
            segments.append({
                "text": current_segment.strip(),
                "pause": pause,
                "punctuation": char
            })
            current_segment = ""

        # A whitespace-only tail is not a sentence of its own.
        if current_segment.strip():
            segments.append({
                "text": current_segment.strip(),
                "pause": pause_comma,
                "punctuation": ""
            })

        if len(segments) <= 1:
            return segment_list_data, "句子无法拆分(没有标点符号)", None

        num_new_segments = len(segments)
        offset = num_new_segments - 1  # how many extra rows the split adds

        # Step 1: drop the audio of the segment being split.
        original_path = os.path.join(TEMP_FOLDER, f"{split_id}.wav")
        if os.path.exists(original_path):
            os.remove(original_path)

        # Step 2: keep the rows before the target and shift the later ones.
        new_segment_list = [df.iloc[i].tolist() for i in range(target_idx)]

        total_segments = len(df)
        # Highest id first, so a rename never lands on a file that still has
        # to be moved.
        for i in range(total_segments - 1, target_idx, -1):
            old_id = df.iloc[i, 0]
            shifted_id = f"{int(old_id.replace('temp', '')) + offset}temp"

            old_path = os.path.join(TEMP_FOLDER, f"{old_id}.wav")
            if os.path.exists(old_path):
                os.replace(old_path, os.path.join(TEMP_FOLDER, f"{shifted_id}.wav"))

            new_segment_list.append([shifted_id, df.iloc[i, 1], df.iloc[i, 2]])

        # Step 3: insert the new pieces, numbered from the target's position.
        piece_ids = [f"{target_idx + 1 + i}temp" for i in range(num_new_segments)]
        for piece_id, segment in zip(piece_ids, segments):
            new_segment_list.append([piece_id, segment["text"], "待生成"])

        # Step 4: restore numeric order.
        new_segment_list.sort(key=lambda x: int(x[0].replace("temp", "")))

        # Step 5: synthesize each piece; pieces that fail keep "待生成".
        for piece_id, segment in zip(piece_ids, segments):
            result = generate_segment_audio(
                segment, ref_wav_path, prompt_text,
                prompt_language, text_language, top_k, top_p, temperature
            )
            if not result["success"]:
                continue

            if os.path.exists(result["audio_path"]):
                os.replace(result["audio_path"], os.path.join(TEMP_FOLDER, f"{piece_id}.wav"))

            for item in new_segment_list:
                if item[0] == piece_id:
                    item[2] = "已生成"
                    break

        return new_segment_list, f"成功拆分为 {num_new_segments} 个句子", f"{target_idx+1}temp"

    except Exception as e:
        traceback.print_exc()
        return segment_list_data, f"拆分失败: {str(e)}", None
def cut1(inp):
@ -1327,6 +1809,160 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
)
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
# ======================== 插入分段合成UI开始 ========================
with gr.Tab(i18n("分段合成模式")):
with gr.Row():
with gr.Column():
segmented_text = gr.Textbox(label=i18n("需要分段的文本"), lines=10, value="")
segment_button = gr.Button(i18n("分割文本并生成所有片段"), variant="primary")
segmented_output = gr.Audio(label=i18n("当前选中片段"), interactive=False)
with gr.Row():
segment_index = gr.Textbox(label=i18n("片段编号"), interactive=False, visible=False)
new_segment_text = gr.Textbox(label=i18n("新文本内容"), lines=2, max_lines=4)
regenerate_button = gr.Button(i18n("重新生成当前片段"), variant="primary")
with gr.Row():
pause_period = gr.Slider(
minimum=0.1,
maximum=1.0,
step=0.05,
label=i18n("句号停顿时间(秒)"),
value=0.3,
interactive=True
)
pause_comma = gr.Slider(
minimum=0.1,
maximum=0.5,
step=0.05,
label=i18n("非句号停顿时间(秒)"),
value=0.15,
interactive=True
)
with gr.Row():
clean_button = gr.Button(i18n("清理临时文件"), variant="secondary", scale=1)
confirm_button = gr.Button(i18n("确认并合并所有片段"), variant="primary", scale=2)
final_output = gr.Audio(label=i18n("最终合成结果"), interactive=False)
segment_status = gr.Textbox(label=i18n("状态"), interactive=False)
# 添加合并功能
with gr.Row():
merge_range = gr.Textbox(label=i18n("合并范围例如1-3"), scale=3)
merge_button = gr.Button(i18n("合并句子"), variant="primary", scale=1)
# 添加拆分功能控件
with gr.Row():
split_index = gr.Textbox(label=i18n("要拆分的句子编号"), scale=1)
split_button = gr.Button(i18n("拆分句子"), variant="primary", scale=1)
with gr.Column():
segment_list = gr.Dataframe(
headers=["编号", "文本内容", "状态"],
datatype=["str", "str", "str"],
interactive=False,
label=i18n("分段列表"),
value=[]
)
# 在分段合成UI部分添加以下代码大约在line 3200附近
# ======================== 插入分段合成UI结束 ========================
# ======================== 插入分段合成事件绑定开始 ========================
# 分割文本并生成所有片段
segment_button.click(
process_all_segments,
inputs=[
segmented_text,
inp_ref,
prompt_text,
prompt_language,
text_language,
top_k,
top_p,
temperature,
pause_period, # 新增句号停顿参数
pause_comma # 新增非句号停顿参数
],
outputs=[segment_list, segmented_output]
)
# 重新生成当前片段
regenerate_button.click(
regenerate_segment,
inputs=[
segment_index, # 第一个参数应该是segment_id
new_segment_text,
inp_ref,
prompt_text,
prompt_language,
text_language,
top_k,
top_p,
temperature,
pause_period,
pause_comma
],
outputs=[segmented_output, segment_index, segment_status]
)
# 合并所有片段
confirm_button.click(
merge_all_segments,
inputs=[],
outputs=[final_output, segment_status]
)
# 清理临时文件
clean_button.click(
clean_temp_files,
inputs=[],
outputs=[segment_status]
)
# 当选择分段列表中的项目时
segment_list.select(
fn=on_segment_select,
inputs=[segment_list],
outputs=[segment_index, new_segment_text, segmented_output],
)
merge_button.click(
fn=merge_selected_segments,
inputs=[
merge_range,
segment_list,
inp_ref,
prompt_text,
prompt_language,
text_language,
top_k,
top_p,
temperature,
pause_period,
pause_comma
],
outputs=[segment_list, segment_status, segment_index]
)
# ======================== 添加拆分按钮的事件绑定 ========================
split_button.click(
fn=split_selected_segment,
inputs=[
split_index,
segment_list,
inp_ref,
prompt_text,
prompt_language,
text_language,
top_k,
top_p,
temperature,
pause_period,
pause_comma
],
outputs=[segment_list, segment_status, segment_index]
)
# ======================== 插入分段合成事件绑定结束 ========================
# gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
# with gr.Row():
# text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
@ -1351,3 +1987,4 @@ if __name__ == "__main__":
server_port=infer_ttswebui,
# quiet=True,
)

View File

@ -87,7 +87,7 @@ def sync_buffer(buffers, average=True):
for buffer, handle in handles:
handle.wait()
if average:
buffer.data /= world_size
buffer.data /= world_size()
def sync_grad(params):

View File

@ -55,6 +55,10 @@ def main():
n_gpus = torch.cuda.device_count()
else:
n_gpus = 1
if n_gpus <= 1:
run(0, n_gpus, hps)
return
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
@ -77,12 +81,14 @@ def run(rank, n_gpus, hps):
writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
dist.init_process_group(
backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
init_method="env://?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
use_ddp = n_gpus > 1
if use_ddp:
dist.init_process_group(
backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
init_method="env://?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
torch.manual_seed(hps.train.seed)
if torch.cuda.is_available():
torch.cuda.set_device(rank)
@ -118,15 +124,20 @@ def run(rank, n_gpus, hps):
shuffle=True,
)
collate_fn = TextAudioSpeakerCollate()
train_loader = DataLoader(
train_dataset,
num_workers=5,
worker_count = 0 if os.name == "nt" and n_gpus <= 1 else min(2 if os.name == "nt" else 5, os.cpu_count() or 1)
loader_kwargs = dict(
num_workers=worker_count,
shuffle=False,
pin_memory=True,
pin_memory=torch.cuda.is_available(),
collate_fn=collate_fn,
batch_sampler=train_sampler,
persistent_workers=True,
prefetch_factor=3,
)
if worker_count > 0:
loader_kwargs["persistent_workers"] = True
loader_kwargs["prefetch_factor"] = 2 if os.name == "nt" else 3
train_loader = DataLoader(
train_dataset,
**loader_kwargs,
)
save_root = "%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir, hps.model.version, hps.train.lora_rank)
os.makedirs(save_root, exist_ok=True)
@ -156,7 +167,9 @@ def run(rank, n_gpus, hps):
def model2cuda(net_g, rank):
if torch.cuda.is_available():
net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True)
net_g = net_g.cuda(rank)
if use_ddp:
net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
else:
net_g = net_g.to(device)
return net_g
@ -242,6 +255,8 @@ def run(rank, n_gpus, hps):
None,
)
scheduler_g.step()
if use_ddp and dist.is_initialized():
dist.destroy_process_group()
print("training done")

View File

@ -180,10 +180,15 @@ def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) ->
def _g2p(segments):
phones_list = []
word2ph = []
for seg in segments:
g2pw_batch_results = []
g2pw_batch_cursor = 0
processed_segments = [re.sub("[a-zA-Z]+", "", seg) for seg in segments]
if is_g2pw:
batch_inputs = [seg for seg in processed_segments if seg]
g2pw_batch_results = g2pw._g2pw(batch_inputs) if batch_inputs else []
for seg in processed_segments:
pinyins = []
# Replace all English words in the sentence
seg = re.sub("[a-zA-Z]+", "", seg)
seg_cut = psg.lcut(seg)
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
initials = []
@ -204,8 +209,10 @@ def _g2p(segments):
finals = sum(finals, [])
print("pypinyin结果", initials, finals)
else:
# g2pw采用整句推理
pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)
# g2pw采用整句推理批量推理逐句取结果
if seg:
pinyins = g2pw_batch_results[g2pw_batch_cursor]
g2pw_batch_cursor += 1
pre_word_length = 0
for word, pos in seg_cut:

View File

@ -18,6 +18,7 @@ Credits
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np
@ -37,6 +38,8 @@ def prepare_onnx_input(
use_mask: bool = False,
window_size: int = None,
max_len: int = 512,
char2id: Optional[Dict[str, int]] = None,
char_phoneme_masks: Optional[Dict[str, List[int]]] = None,
) -> Dict[str, np.array]:
if window_size is not None:
truncated_texts, truncated_query_ids = _truncate_texts(
@ -48,33 +51,88 @@ def prepare_onnx_input(
phoneme_masks = []
char_ids = []
position_ids = []
tokenized_cache = {}
if char2id is None:
char2id = {char: idx for idx, char in enumerate(chars)}
if use_mask:
if char_phoneme_masks is None:
char_phoneme_masks = {
char: [1 if i in char2phonemes[char] else 0 for i in range(len(labels))]
for char in char2phonemes
}
else:
full_phoneme_mask = [1] * len(labels)
for idx in range(len(texts)):
text = (truncated_texts if window_size else texts)[idx].lower()
query_id = (truncated_query_ids if window_size else query_ids)[idx]
try:
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
except Exception:
print(f'warning: text "{text}" is invalid')
return {}
cached = tokenized_cache.get(text)
if cached is None:
try:
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
except Exception:
print(f'warning: text "{text}" is invalid')
return {}
text, query_id, tokens, text2token, token2text = _truncate(
max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text
)
if len(tokens) <= max_len - 2:
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
shared_input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
shared_token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
shared_attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
cached = {
"is_short": True,
"tokens": tokens,
"text2token": text2token,
"token2text": token2text,
"input_id": shared_input_id,
"token_type_id": shared_token_type_id,
"attention_mask": shared_attention_mask,
}
else:
cached = {
"is_short": False,
"tokens": tokens,
"text2token": text2token,
"token2text": token2text,
}
tokenized_cache[text] = cached
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
if cached["is_short"]:
text_for_query = text
query_id_for_query = query_id
text2token_for_query = cached["text2token"]
input_id = cached["input_id"]
token_type_id = cached["token_type_id"]
attention_mask = cached["attention_mask"]
else:
(
text_for_query,
query_id_for_query,
tokens_for_query,
text2token_for_query,
_token2text_for_query,
) = _truncate(
max_len=max_len,
text=text,
query_id=query_id,
tokens=cached["tokens"],
text2token=cached["text2token"],
token2text=cached["token2text"],
)
processed_tokens = ["[CLS]"] + tokens_for_query + ["[SEP]"]
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
query_char = text[query_id]
phoneme_mask = (
[1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels)
)
char_id = chars.index(query_char)
position_id = text2token[query_id] + 1 # [CLS] token locate at first place
query_char = text_for_query[query_id_for_query]
if use_mask:
phoneme_mask = char_phoneme_masks[query_char]
else:
phoneme_mask = full_phoneme_mask
char_id = char2id[query_char]
position_id = text2token_for_query[query_id_for_query] + 1 # [CLS] token locate at first place
input_ids.append(input_id)
token_type_ids.append(token_type_id)
@ -83,10 +141,15 @@ def prepare_onnx_input(
char_ids.append(char_id)
position_ids.append(position_id)
max_token_length = max(len(seq) for seq in input_ids)
def _pad_sequences(sequences, pad_value=0):
return [seq + [pad_value] * (max_token_length - len(seq)) for seq in sequences]
outputs = {
"input_ids": np.array(input_ids).astype(np.int64),
"token_type_ids": np.array(token_type_ids).astype(np.int64),
"attention_masks": np.array(attention_masks).astype(np.int64),
"input_ids": np.array(_pad_sequences(input_ids, pad_value=0)).astype(np.int64),
"token_type_ids": np.array(_pad_sequences(token_type_ids, pad_value=0)).astype(np.int64),
"attention_masks": np.array(_pad_sequences(attention_masks, pad_value=0)).astype(np.int64),
"phoneme_masks": np.array(phoneme_masks).astype(np.float32),
"char_ids": np.array(char_ids).astype(np.int64),
"position_ids": np.array(position_ids).astype(np.int64),

View File

@ -10,7 +10,6 @@ from typing import Any, Dict, List, Tuple
import numpy as np
import onnxruntime
import requests
import torch
from opencc import OpenCC
from pypinyin import Style, pinyin
from transformers.models.auto.tokenization_auto import AutoTokenizer
@ -22,9 +21,8 @@ from .utils import load_config
onnxruntime.set_default_logger_severity(3)
try:
onnxruntime.preload_dlls()
except:
except Exception:
pass
# traceback.print_exc()
warnings.filterwarnings("ignore")
model_version = "1.1"
@ -55,6 +53,24 @@ def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[Lis
return all_preds, all_confidences
def _load_json_from_candidates(filename: str, candidate_dirs: List[str]) -> Dict[str, Any]:
    """Load ``filename`` as JSON from the first candidate directory that has it.

    Args:
        filename: Name of the JSON file to look for inside each directory.
        candidate_dirs: Directories to probe, in priority order. Falsy
            entries (``""`` or ``None``) are ignored.

    Returns:
        The decoded JSON content of the first match.

    Raises:
        FileNotFoundError: If none of the candidate directories contains
            ``filename``.
    """
    usable_dirs = (directory for directory in candidate_dirs if directory)
    for directory in usable_dirs:
        candidate = os.path.join(directory, filename)
        if not os.path.exists(candidate):
            continue
        with open(candidate, "r", encoding="utf-8") as handle:
            return json.load(handle)
    raise FileNotFoundError(f"Cannot locate {filename} in candidate dirs: {candidate_dirs}")
def _find_first_existing_file(*paths: str) -> str:
    """Return the first of ``paths`` that is non-empty and exists on disk.

    Args:
        *paths: Candidate paths, checked in the order given. Falsy entries
            are skipped without touching the filesystem.

    Returns:
        The first candidate for which ``os.path.exists`` is true.

    Raises:
        FileNotFoundError: If no candidate exists.
    """
    existing = next(
        (candidate for candidate in paths if candidate and os.path.exists(candidate)),
        None,
    )
    if existing is None:
        raise FileNotFoundError(f"Files not found: {paths}")
    return existing
def download_and_decompress(model_dir: str = "G2PWModel/"):
if not os.path.exists(model_dir):
parent_directory = os.path.dirname(model_dir)
@ -62,7 +78,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory, "G2PWModel")
print("Downloading g2pw model...")
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip" # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"
with requests.get(modelscope_url, stream=True) as r:
r.raise_for_status()
with open(zip_dir, "wb") as f:
@ -79,7 +95,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
return model_dir
class G2PWOnnxConverter:
class _G2PWBaseOnnxConverter:
def __init__(
self,
model_dir: str = "G2PWModel/",
@ -87,33 +103,16 @@ class G2PWOnnxConverter:
model_source: str = None,
enable_non_tradional_chinese: bool = False,
):
uncompress_path = download_and_decompress(model_dir)
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0
if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
else:
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CPUExecutionProvider"],
)
self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True)
self.model_dir = download_and_decompress(model_dir)
self.config = load_config(config_path=os.path.join(self.model_dir, "config.py"), use_default=True)
self.model_source = model_source if model_source else self.config.model_source
self.enable_opencc = enable_non_tradional_chinese
self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt")
monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt")
polyphonic_chars_path = os.path.join(self.model_dir, "POLYPHONIC_CHARS.txt")
monophonic_chars_path = os.path.join(self.model_dir, "MONOPHONIC_CHARS.txt")
self.polyphonic_chars = [
line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n")
]
@ -149,31 +148,47 @@ class G2PWOnnxConverter:
)
self.chars = sorted(list(self.char2phonemes.keys()))
self.char2id = {char: idx for idx, char in enumerate(self.chars)}
self.char_phoneme_masks = (
{
char: [1 if i in self.char2phonemes[char] else 0 for i in range(len(self.labels))]
for char in self.char2phonemes
}
if self.config.use_mask
else None
)
self.polyphonic_chars_new = set(self.chars)
for char in self.non_polyphonic:
if char in self.polyphonic_chars_new:
self.polyphonic_chars_new.remove(char)
self.polyphonic_chars_new.discard(char)
self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars}
for char in self.non_monophonic:
if char in self.monophonic_chars_dict:
self.monophonic_chars_dict.pop(char)
self.monophonic_chars_dict.pop(char, None)
self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]
default_asset_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "G2PWModel"))
candidate_asset_dirs = [self.model_dir, default_asset_dir]
self.bopomofo_convert_dict = _load_json_from_candidates(
"bopomofo_to_pinyin_wo_tune_dict.json", candidate_asset_dirs
)
self.char_bopomofo_dict = _load_json_from_candidates("char_bopomofo_dict.json", candidate_asset_dirs)
with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr:
self.bopomofo_convert_dict = json.load(fr)
self.style_convert_func = {
"bopomofo": lambda x: x,
"pinyin": self._convert_bopomofo_to_pinyin,
}[style]
with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr:
self.char_bopomofo_dict = json.load(fr)
if self.enable_opencc:
self.cc = OpenCC("s2tw")
self.enable_sentence_dedup = os.getenv("g2pw_sentence_dedup", "true").strip().lower() in {
"1",
"true",
"yes",
"y",
"on",
}
# 聚焦到多音字附近上下文,默认左右各16字;设为0表示关闭裁剪(整句)。
self.polyphonic_context_chars = max(0, int(os.getenv("g2pw_polyphonic_context_chars", "16")))
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
tone = bopomofo[-1]
@ -181,9 +196,8 @@ class G2PWOnnxConverter:
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
if component:
return component + tone
else:
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
return None
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
return None
def __call__(self, sentences: List[str]) -> List[List[str]]:
if isinstance(sentences, str):
@ -197,51 +211,147 @@ class G2PWOnnxConverter:
translated_sentences.append(translated_sent)
sentences = translated_sentences
texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
texts, model_query_ids, result_query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
if len(texts) == 0:
# sentences no polyphonic words
return partial_results
onnx_input = prepare_onnx_input(
model_input = prepare_onnx_input(
tokenizer=self.tokenizer,
labels=self.labels,
char2phonemes=self.char2phonemes,
chars=self.chars,
texts=texts,
query_ids=query_ids,
query_ids=model_query_ids,
use_mask=self.config.use_mask,
window_size=None,
char2id=self.char2id,
char_phoneme_masks=self.char_phoneme_masks,
)
preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels)
if not model_input:
return partial_results
if self.enable_sentence_dedup:
preds, _confidences = self._predict_with_sentence_dedup(model_input=model_input, texts=texts)
else:
preds, _confidences = self._predict(model_input=model_input)
if self.config.use_char_phoneme:
preds = [pred.split(" ")[1] for pred in preds]
results = partial_results
for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
for sent_id, query_id, pred in zip(sent_ids, result_query_ids, preds):
results[sent_id][query_id] = self.style_convert_func(pred)
return results
def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
texts, query_ids, sent_ids, partial_results = [], [], [], []
def _prepare_data(
self, sentences: List[str]
) -> Tuple[List[str], List[int], List[int], List[int], List[List[str]]]:
texts, model_query_ids, result_query_ids, sent_ids, partial_results = [], [], [], [], []
for sent_id, sent in enumerate(sentences):
# pypinyin works better for Simplified Chinese than for Traditional Chinese
sent_s = tranditional_to_simplified(sent)
pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3)
partial_result = [None] * len(sent)
polyphonic_indices: List[int] = []
for i, char in enumerate(sent):
if char in self.polyphonic_chars_new:
texts.append(sent)
query_ids.append(i)
sent_ids.append(sent_id)
polyphonic_indices.append(i)
elif char in self.monophonic_chars_dict:
partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char])
elif char in self.char_bopomofo_dict:
partial_result[i] = pypinyin_result[i][0]
# partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
else:
partial_result[i] = pypinyin_result[i][0]
if polyphonic_indices:
if self.polyphonic_context_chars > 0:
left = max(0, polyphonic_indices[0] - self.polyphonic_context_chars)
right = min(len(sent), polyphonic_indices[-1] + self.polyphonic_context_chars + 1)
sent_for_predict = sent[left:right]
query_offset = left
else:
sent_for_predict = sent
query_offset = 0
for index in polyphonic_indices:
texts.append(sent_for_predict)
model_query_ids.append(index - query_offset)
result_query_ids.append(index)
sent_ids.append(sent_id)
partial_results.append(partial_result)
return texts, query_ids, sent_ids, partial_results
return texts, model_query_ids, result_query_ids, sent_ids, partial_results
def _predict(self, model_input: Dict[str, Any]) -> Tuple[List[str], List[float]]:
    """Backend hook that turns ``model_input`` into predictions.

    The base class has no inference backend, so this always raises;
    subclasses such as ``G2PWOnnxConverter`` override it.

    Args:
        model_input: Mapping of input names to batched arrays, as passed
            by ``__call__`` and ``_predict_with_sentence_dedup``.

    Returns:
        A ``(predictions, confidences)`` pair with one entry per query.

    Raises:
        NotImplementedError: Always, in the base class.
    """
    raise NotImplementedError
def _predict_with_sentence_dedup(
    self, model_input: Dict[str, Any], texts: List[str]
) -> Tuple[List[str], List[float]]:
    """Predict while sending each distinct sentence's token inputs only once.

    Queries are grouped by their sentence text. When no sentence repeats
    (or there is at most one query) the whole batch goes to ``_predict``
    unchanged. Otherwise ``_predict`` is called once per distinct sentence:
    every input is restricted to that sentence's rows, and for sentences
    with more than one query the token-level inputs (``input_ids``,
    ``token_type_ids``, ``attention_masks``) are cut down to their first row.

    NOTE(review): this presumably relies on the backend accepting a single
    token row alongside several per-query rows -- confirm against the model.

    Args:
        model_input: Mapping of input names to arrays whose first axis is
            indexed by query, aligned with ``texts``.
        texts: Sentence text of each query.

    Returns:
        ``(predictions, confidences)`` in the same order as ``texts``.
    """
    total = len(texts)
    if total <= 1:
        return self._predict(model_input=model_input)

    # Map each distinct sentence to the rows (query positions) that use it.
    rows_by_text: Dict[str, List[int]] = {}
    for row, sentence in enumerate(texts):
        if sentence in rows_by_text:
            rows_by_text[sentence].append(row)
        else:
            rows_by_text[sentence] = [row]

    # As many groups as queries means nothing repeats: nothing to dedup.
    if len(rows_by_text) == total:
        return self._predict(model_input=model_input)

    shared_inputs = ("input_ids", "token_type_ids", "attention_masks")
    preds: List[str] = [""] * total
    confidences: List[float] = [0.0] * total

    for rows in rows_by_text.values():
        batch = {key: array[rows] for key, array in model_input.items()}
        if len(rows) > 1:
            # Rows of one sentence share token inputs; keep only the first.
            for key in shared_inputs:
                batch[key] = batch[key][:1]

        batch_preds, batch_confidences = self._predict(model_input=batch)
        # Scatter the group's results back to their original positions.
        for row, pred, confidence in zip(rows, batch_preds, batch_confidences):
            preds[row] = pred
            confidences[row] = confidence

    return preds, confidences
class G2PWOnnxConverter(_G2PWBaseOnnxConverter):
    """G2PW converter whose predictions come from an onnxruntime session.

    The base-class constructor runs first; this class then opens an
    ``onnxruntime.InferenceSession`` on the model file found in
    ``self.model_dir`` and routes ``_predict`` through it.
    """

    def __init__(
        self,
        model_dir: str = "G2PWModel/",
        style: str = "bopomofo",
        model_source: str = None,
        enable_non_tradional_chinese: bool = False,
    ):
        """Initialise the base converter and create the inference session.

        Args:
            model_dir: Forwarded to the base constructor, which sets
                ``self.model_dir``.
            style: Forwarded to the base constructor.
            model_source: Forwarded to the base constructor.
            enable_non_tradional_chinese: Forwarded to the base constructor.

        Raises:
            FileNotFoundError: If neither ``g2pW.onnx`` nor ``g2pw.onnx``
                exists in ``self.model_dir``.
        """
        super().__init__(
            model_dir=model_dir,
            style=style,
            model_source=model_source,
            enable_non_tradional_chinese=enable_non_tradional_chinese,
        )

        options = onnxruntime.SessionOptions()
        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
        options.intra_op_num_threads = 2

        # Accept either capitalisation of the model file name.
        model_file = _find_first_existing_file(
            *(os.path.join(self.model_dir, name) for name in ("g2pW.onnx", "g2pw.onnx"))
        )

        # CPU is always listed; CUDA is put in front of it when available.
        providers = ["CPUExecutionProvider"]
        if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
            providers.insert(0, "CUDAExecutionProvider")

        self.session_g2pw = onnxruntime.InferenceSession(
            model_file,
            sess_options=options,
            providers=providers,
        )

    def _predict(self, model_input: Dict[str, Any]) -> Tuple[List[str], List[float]]:
        """Run ``model_input`` through the ONNX session via ``predict``."""
        return predict(session=self.session_g2pw, onnx_input=model_input, labels=self.labels)

View File

@ -48,6 +48,8 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
请不要尬黑GPT-SoVITS推理速度慢,谢谢!
CPU-Optimized Inference Version: https://github.com/baicai-1145/GPT-SoVITS-CPUFast
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
## Installation

View File

@ -594,11 +594,11 @@
- 内容: 修复实验名结尾出现空格在win中路径不正确的问题
- 类型: 修复
- 提交: RVC-Boss
- 2025.06.10 [Commit#746cb536](https://github.com/RVC-Boss/GPT-SoVITS/commit/746cb536c68b1fe6ce3ca7e882235375b8a8dd89)
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- 内容: 语种分割优化
- 类型: 优化
- 提交: KamioRinn
- 2025.06.11 [Commit#dd2b9253](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2b9253aabb09db32db7a3344570ed9df043351)
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- 内容: 修复并行推理对v2pro支持bug
- 类型: 修复
- 提交: YYuX-1145
@ -606,21 +606,132 @@
- 内容: v2pro对ge提取时会出现数值溢出的问题修复
- 类型: 修复
- 提交: RVC-Boss
- 2025.06.11 [Commit#37f5abfc](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fdc67ca83418306f11e90b9139278313ac5c3e9)[Commit#6fdc67ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/37f5abfcb4a6553652235909db2e124b6f8ff3a5)
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- 内容: install.sh逻辑优化
- 类型: 优化
- 提交: XXXXRT666
- 2025.06.27 [Commit#90ebefa7](https://github.com/RVC-Boss/GPT-SoVITS/commit/90ebefa78fd544da36eebe0b2003620879c921b0)
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- 内容: onnxruntime加载逻辑优化(对gpu/cpu的判断)
- 类型: 优化
- 提交: KamioRinn
- 2025.06.27 [Commit#6df61f58](https://github.com/RVC-Boss/GPT-SoVITS/commit/6df61f58e4d18d4c2ad9d1eddd6a1bd690034c23)
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- 内容: 语言分割及格式化优化
- 类型: 优化
- 提交: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- 内容: 提升推理进程优先级(修复win11下可能GPU利用率受限的问题)
- 类型: 修复
- 类型: 优化
- 提交: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- 内容: 解决 TTS.py 无法识别真正支持版本 v2Pro、v2ProPlus 的问题, 同时更新一版默认配置。
- 类型: 修复
- 提交: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- 内容: 修复并行推理模式下v2pro模型识别问题
- 类型: 修复
- 提交: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- 内容: whisper asr支持性价比更高的distill模型
- 类型: 优化
- 提交: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- 内容: 优化TTS_Config的代码逻辑
- 类型: 优化
- 提交: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- 内容: 修复gpt的loss计算问题
- 类型: 修复
- 提交: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- 内容: WSL Rocm
- 类型: 修复
- 提交: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- 内容: 修复环境变量可能不为str的问题
- 类型: 修复
- 提交: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- 内容: 流式推理
- 类型: 新功能
- 提交: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- 内容: 数学计算文本前端逻辑优化
- 类型: 优化
- 提交: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- 内容: 流式推理
- 类型: 新功能
- 提交: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- 内容: 支持vq分布式训练
- 类型: 优化
- 提交: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- 内容: ASR模型下载逻辑优化
- 类型: 优化
- 提交: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- 内容: default batch size bug 修复
- 类型: 修复
- 提交: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- 内容: 修复采样错误
- 类型: 修复
- 提交: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- 内容: 修复 Conda 条款未同意导致的构建失败
- 类型: 修复
- 提交: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- 内容: 环境自动构建优化
- 类型: 优化
- 提交: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- 内容: 优化 G2PW 的推理输入构造与多音字处理流程,减少重复计算,降低长句场景下的推理开销
- 类型: 优化
- 提交: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- 内容: 改进 Windows 单卡 v3 LoRA 训练流程
- 类型: 优化
- 提交: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- 内容: 修复多个模块中的独立 bug
- 类型: 修复
- 提交: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- 内容: 添加数据集的错误处理提示
- 类型: 优化
- 提交: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- 内容: 并行推理部分bug修复
- 类型: 修复
- 提交: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- 内容: bug修复:DPO 训练不支持漏字模拟
- 类型: 修复
- 提交: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- 内容: 修复onnx脚本未导入Optional等的问题
- 类型: 修复
- 提交: RVC-Boss

View File

@ -578,3 +578,160 @@
- Content: Optimized automatic precision detection logic; added collapsible functionality to WebUI frontend modules.
- Type: New Feature
- Contributors: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- Content: Fix polyphone detection for "X一X" pattern
- Type: Fix
- Contributor: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- Content: Config fix; fix SoVITS model loading
- Type: Fix
- Contributor: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- Content: Fix possible numerical explosion of `ge.sum` causing silent inference
- Type: Fix
- Contributor: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- Content: Fix incorrect Windows path when experiment name ends with a space
- Type: Fix
- Contributor: RVC-Boss
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- Content: Optimize language segmentation
- Type: Optimization
- Contributor: KamioRinn
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- Content: Fix bug in parallel inference support for v2pro
- Type: Fix
- Contributor: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- Content: Fix numerical overflow issue when extracting `ge` for v2pro
- Type: Fix
- Contributor: RVC-Boss
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- Content: Optimize `install.sh` logic
- Type: Optimization
- Contributor: XXXXRT666
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- Content: Optimize onnxruntime loading logic (GPU/CPU detection)
- Type: Optimization
- Contributor: KamioRinn
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- Content: Optimize language segmentation and formatting
- Type: Optimization
- Contributor: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- Content: Increase inference process priority (fix possible GPU utilization limitation on Win11)
- Type: Optimization
- Contributor: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- Content: Fix TTS.py not recognizing actually supported versions v2Pro and v2ProPlus, and update default configuration
- Type: Fix
- Contributor: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- Content: Fix v2pro model recognition issue in parallel inference mode
- Type: Fix
- Contributor: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- Content: Whisper ASR supports more cost-effective distill models
- Type: Optimization
- Contributor: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- Content: Optimize `TTS_Config` code logic
- Type: Optimization
- Contributor: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- Content: Fix GPT loss calculation issue
- Type: Fix
- Contributor: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- Content: WSL Rocm
- Type: Fix
- Contributor: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- Content: Fix issue where environment variable may not be a string
- Type: Fix
- Contributor: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- Content: Streaming inference
- Type: New Feature
- Contributor: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- Content: Optimize text frontend logic for mathematical expression text
- Type: Optimization
- Contributor: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- Content: Streaming inference
- Type: New Feature
- Contributor: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- Content: Support VQ distributed training
- Type: Optimization
- Contributor: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- Content: Optimize ASR model download logic
- Type: Optimization
- Contributor: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- Content: Fix default batch size bug
- Type: Fix
- Contributor: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- Content: Fix sampling error
- Type: Fix
- Contributor: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- Content: Fix build failure caused by unaccepted Conda terms
- Type: Fix
- Contributor: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- Content: Optimize automatic environment setup
- Type: Optimization
- Contributor: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- Content: Optimize G2PW inference input construction and polyphone handling to reduce redundant computation and inference overhead for long sentences
- Type: Optimization
- Contributor: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- Content: Improve the LoRA training flow for GPT-SoVITS v3 on a single card under Windows
- Type: Optimization
- Contributor: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- Content: Fix miscellaneous bugs in multiple modules
- Type: Fix
- Contributor: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- Content: Add error handling hints for dataset processing
- Type: Optimization
- Contributor: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- Content: Fix some bugs in parallel inference
- Type: Fix
- Contributor: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- Content: Fix bug where DPO training does not support missing word simulation
- Type: Fix
- Contributor: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- Content: Fix missing imports (e.g., Optional) in ONNX script
- Type: Fix
- Contributor: RVC-Boss

View File

@ -578,3 +578,160 @@
- 内容: 自動精度検出ロジックを最適化し、WebUI フロントエンドモジュールに折り畳み(Collapsible)機能を追加
- タイプ: 新機能
- 貢献者: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- 内容: 「X一X」パターンの多音字検出を修正
- タイプ: 修正
- 貢献者: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- 内容: 設定の修正;SoVITSモデル読み込みの修正
- タイプ: 修正
- 貢献者: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- 内容: `ge.sum`の数値爆発による推論の無音化を修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- 内容: 実験名がスペースで終わる場合のWindowsパスの誤りを修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- 内容: 言語分割の最適化
- タイプ: 最適化
- 貢献者: KamioRinn
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- 内容: v2proの並列推論対応におけるバグを修正
- タイプ: 修正
- 貢献者: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- 内容: v2proの`ge`抽出時の数値オーバーフロー問題を修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- 内容: `install.sh`のロジックを最適化
- タイプ: 最適化
- 貢献者: XXXXRT666
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- 内容: onnxruntime読み込みロジックを最適化(GPU/CPU検出)
- タイプ: 最適化
- 貢献者: KamioRinn
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- 内容: 言語分割と書式を最適化
- タイプ: 最適化
- 貢献者: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- 内容: 推論プロセスの優先度を上げる(Win11でのGPU利用制限の可能性を修正)
- タイプ: 最適化
- 貢献者: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- 内容: TTS.pyが実際にサポートされているバージョンv2Proおよびv2ProPlusを認識しない問題を修正し、デフォルト設定を更新
- タイプ: 修正
- 貢献者: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- 内容: 並列推論モードでのv2proモデル認識問題を修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- 内容: Whisper ASRがよりコスト効率の高い蒸留モデルをサポート
- タイプ: 最適化
- 貢献者: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- 内容: `TTS_Config`のコードロジックを最適化
- タイプ: 最適化
- 貢献者: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- 内容: GPT損失計算の問題を修正
- タイプ: 修正
- 貢献者: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- 内容: WSL Rocm対応
- タイプ: 修正
- 貢献者: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- 内容: 環境変数が文字列でない可能性がある問題を修正
- タイプ: 修正
- 貢献者: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- 内容: ストリーミング推論
- タイプ: 新機能
- 貢献者: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- 内容: 数式テキストに対するテキスト前処理ロジックを最適化
- タイプ: 最適化
- 貢献者: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- 内容: ストリーミング推論
- タイプ: 新機能
- 貢献者: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- 内容: VQ分散学習をサポート
- タイプ: 最適化
- 貢献者: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- 内容: ASRモデルダウンロードロジックを最適化
- タイプ: 最適化
- 貢献者: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- 内容: デフォルトのバッチサイズのバグを修正
- タイプ: 修正
- 貢献者: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- 内容: サンプリングエラーを修正
- タイプ: 修正
- 貢献者: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- 内容: Conda利用規約への未同意によるビルド失敗を修正
- タイプ: 修正
- 貢献者: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- 内容: 自動環境セットアップを最適化
- タイプ: 最適化
- 貢献者: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- 内容: G2PW推論入力の構築と多音字処理を最適化し、長文における冗長な計算と推論オーバーヘッドを削減
- タイプ: 最適化
- 貢献者: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- 内容: WindowsでのシングルカードにおけるGPT-SoVITS v3のLoRAトレーニングフローを改善
- タイプ: 最適化
- 貢献者: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- 内容: 複数モジュールの雑多なバグを修正
- タイプ: 修正
- 貢献者: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- 内容: データセット処理時のエラーハンドリングヒントを追加
- タイプ: 最適化
- 貢献者: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- 内容: 並列推論の一部バグを修正
- タイプ: 修正
- 貢献者: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- 内容: DPOトレーニングが欠落単語シミュレーションをサポートしないバグを修正
- タイプ: 修正
- 貢献者: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- 内容: ONNXスクリプトでのOptionalなどの不足インポートを修正
- タイプ: 修正
- 貢献者: RVC-Boss

View File

@ -578,3 +578,160 @@
- 내용: 자동 정밀도 감지 로직 최적화; WebUI 프론트엔드 모듈에 접기 기능 추가
- 유형: 신규 기능
- 기여자: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- 내용: "X一X" 패턴의 다중 발음 감지 오류 수정
- 유형: 수정
- 기여자: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- 내용: 설정 오류 수정; SoVITS 모델 로딩 오류 수정
- 유형: 수정
- 기여자: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- 내용: `ge.sum`의 수치 폭발 가능성으로 인한 추론 무음 현상 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- 내용: 실험 이름이 공백으로 끝날 때 발생하는 잘못된 Windows 경로 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- 내용: 언어 분할 최적화
- 유형: 최적화
- 기여자: KamioRinn
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- 내용: v2pro 병렬 추론 지원 버그 수정
- 유형: 수정
- 기여자: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- 내용: v2pro의 `ge` 추출 시 수치 오버플로우 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- 내용: `install.sh` 로직 최적화
- 유형: 최적화
- 기여자: XXXXRT666
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- 내용: onnxruntime 로딩 로직 최적화 (GPU/CPU 감지)
- 유형: 최적화
- 기여자: KamioRinn
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- 내용: 언어 분할 및 형식 최적화
- 유형: 최적화
- 기여자: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- 내용: 추론 프로세스 우선순위 증가 (Win11에서 GPU 활용 제한 가능성 수정)
- 유형: 최적화
- 기여자: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- 내용: TTS.py가 실제 지원되는 버전 v2Pro 및 v2ProPlus를 인식하지 못하는 문제 수정 및 기본 설정 업데이트
- 유형: 수정
- 기여자: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- 내용: 병렬 추론 모드에서 v2pro 모델 인식 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- 내용: Whisper ASR이 더 비용 효율적인 distill 모델 지원
- 유형: 최적화
- 기여자: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- 내용: `TTS_Config` 코드 로직 최적화
- 유형: 최적화
- 기여자: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- 내용: GPT 손실(loss) 계산 문제 수정
- 유형: 수정
- 기여자: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- 내용: WSL Rocm
- 유형: 수정
- 기여자: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- 내용: 환경 변수가 문자열이 아닐 수 있는 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- 내용: 스트리밍 추론
- 유형: 신규 기능
- 기여자: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- 내용: 수학 표현식 텍스트에 대한 텍스트 전처리 로직 최적화
- 유형: 최적화
- 기여자: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- 내용: 스트리밍 추론
- 유형: 신규 기능
- 기여자: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- 내용: VQ 분산 학습 지원
- 유형: 최적화
- 기여자: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- 내용: ASR 모델 다운로드 로직 최적화
- 유형: 최적화
- 기여자: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- 내용: 기본 배치 크기 버그 수정
- 유형: 수정
- 기여자: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- 내용: 샘플링 오류 수정
- 유형: 수정
- 기여자: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- 내용: Conda 약관 미동의로 인한 빌드 실패 수정
- 유형: 수정
- 기여자: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- 내용: 자동 환경 설정 최적화
- 유형: 최적화
- 기여자: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- 내용: G2PW 추론 입력 구성 및 다중 발음 처리를 최적화하여 긴 문장에 대한 중복 계산 및 추론 오버헤드 감소
- 유형: 최적화
- 기여자: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- 내용: Windows 환경 단일 GPU에서 GPT-SoVITS v3의 LoRA 학습 흐름 개선
- 유형: 최적화
- 기여자: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- 내용: 여러 모듈의 잡다한 버그 수정
- 유형: 수정
- 기여자: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- 내용: 데이터셋 처리를 위한 오류 처리 힌트 추가
- 유형: 최적화
- 기여자: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- 내용: 병렬 추론의 일부 버그 수정
- 유형: 수정
- 기여자: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- 내용: DPO 학습이 누락 단어 시뮬레이션을 지원하지 않는 버그 수정
- 유형: 수정
- 기여자: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- 내용: ONNX 스크립트에서 Optional 등 누락된 임포트 문제 수정
- 유형: 수정
- 기여자: RVC-Boss

View File

@ -2,8 +2,6 @@
## 202401
## 202401
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)
- İçerik: WebUI'ya İngilizce sistem çeviri desteği eklendi.
- Tür: Dokümantasyon
@ -332,6 +330,8 @@
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss, GoHomeToMacDonal
- İlgili: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- Gelecek güncellemeler, `fast_inference` dalındaki değişikliklerin tutarlılığını doğrulamaya devam edecek.
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298)
- İçerik: i18n taraması yeniden düzenlendi ve çok dilli yapılandırma dosyaları güncellendi
- Tür: Dokümantasyon
@ -578,3 +578,160 @@
- İçerik: Otomatik hassasiyet algılama mantığı optimize edildi; WebUI önyüz modüllerine katlanabilir özellik eklendi
- Tür: Yeni Özellik
- Katkıda Bulunanlar: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- İçerik: "X一X" kalıbı için çok sesli harf tespitini düzelt
- Tür: Düzeltme
- Katkıda Bulunan: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- İçerik: Yapılandırma düzeltmesi; SoVITS model yüklemesini düzelt
- Tür: Düzeltme
- Katkıda Bulunan: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- İçerik: `ge.sum` kaynaklı olası sayısal patlamayı (sessiz çıkarıma yol açan) düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- İçerik: Deney adı boşlukla bittiğinde oluşan hatalı Windows yolunu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
- İçerik: Dil bölütlemeyi optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
- İçerik: v2pro için paralel çıkarım desteğindeki hatayı düzelt
- Tür: Düzeltme
- Katkıda Bulunan: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- İçerik: v2pro için `ge` çıkarımındaki sayısal taşma sorununu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
- İçerik: `install.sh` mantığını optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
- İçerik: onnxruntime yükleme mantığını optimize et (GPU/CPU algılama)
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
- İçerik: Dil bölütleme ve biçimlendirmeyi optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
## 202507
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- İçerik: Çıkarım işlem önceliğini artır (Win11'de olası GPU kullanım sınırlamasını düzelt)
- Tür: Optimizasyon
- Katkıda Bulunan: XianYue0125
- 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
- İçerik: TTS.py'nin gerçekte desteklenen sürümler olan v2Pro ve v2ProPlus'ı tanımaması sorununu düzelt ve varsayılan yapılandırmayı güncelle
- Tür: Düzeltme
- Katkıda Bulunan: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- İçerik: Paralel çıkarım modunda v2pro model tanıma sorununu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
- İçerik: Whisper ASR daha uygun maliyetli distill modellerini destekler
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
- 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
- İçerik: `TTS_Config` kod mantığını optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: ChasonJiang
- 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
- İçerik: GPT kayıp (loss) hesaplama sorununu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: ChasonJiang
## 202508
- 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
- İçerik: WSL ROCm
- Tür: Düzeltme
- Katkıda Bulunan: XXXXRT666
## 202509
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- İçerik: Ortam değişkeninin dize (string) olmaması sorununu düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
## 202511
- 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
- İçerik: Akışlı çıkarım (streaming inference)
- Tür: Yeni Özellik
- Katkıda Bulunan: ChasonJiang
- 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
- İçerik: Matematiksel ifade metinleri için metin ön uç (frontend) mantığını optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
- İçerik: Akışlı çıkarım (streaming inference)
- Tür: Yeni Özellik
- Katkıda Bulunan: L-jasmine
- 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
- İçerik: VQ dağıtılmış eğitimi destekle
- Tür: Optimizasyon
- Katkıda Bulunan: wzy3650
- 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
- İçerik: ASR model indirme mantığını optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
- 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
- İçerik: Varsayılan parti boyutu (batch size) hatasını düzelt
- Tür: Düzeltme
- Katkıda Bulunan: Spr-Aachen
## 202512
- 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
- İçerik: Örnekleme (sampling) hatasını düzelt
- Tür: Düzeltme
- Katkıda Bulunan: ChasonJiang
## 202602
- 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
- İçerik: Kabul edilmeyen Conda koşullarının neden olduğu derleme hatasını düzelt
- Tür: Düzeltme
- Katkıda Bulunan: Oarora
- 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
- İçerik: Otomatik ortam kurulumunu optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
## 202604
- 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
- İçerik: Uzun cümlelerde gereksiz hesaplama ve çıkarım yükünü azaltmak için G2PW çıkarım girdi oluşturmayı ve çok sesli harf işlemeyi optimize et
- Tür: Optimizasyon
- Katkıda Bulunan: baicai-1145
- 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
- İçerik: Windows altında tek kartta GPT-SoVITS v3 için LoRA eğitim akışını iyileştir
- Tür: Optimizasyon
- Katkıda Bulunan: 2409324124
- 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
- İçerik: Birden çok modüldeki çeşitli hataları düzelt
- Tür: Düzeltme
- Katkıda Bulunan: wishhyt
- 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
- İçerik: Veri kümesi işleme için hata işleme ipuçları ekle
- Tür: Optimizasyon
- Katkıda Bulunan: mushroomcowisheggs
- 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
- İçerik: Paralel çıkarımdaki bazı hataları düzelt
- Tür: Düzeltme
- Katkıda Bulunan: wishhyt
- 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
- İçerik: DPO eğitiminin eksik kelime simülasyonunu desteklememe hatasını düzelt
- Tür: Düzeltme
- Katkıda Bulunan: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- İçerik: ONNX betiğinde (Optional vb.) eksik içe aktarmaları düzelt
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss

View File

@ -39,6 +39,7 @@ def create_model(language="zh"):
local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
)
model_revision = "v2.0.4"
vad_model_revision = punc_model_revision = "v2.0.4"
elif language == "yue":
path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
snapshot_download(
@ -51,8 +52,6 @@ def create_model(language="zh"):
else:
raise ValueError(f"{language} is not supported")
vad_model_revision = punc_model_revision = "v2.0.4"
if language in funasr_models:
return funasr_models[language]
else:

View File

@ -485,6 +485,8 @@ def istft(spec, hl):
wave_right = librosa.istft(spec_right, hop_length=hl)
wave = np.asfortranarray([wave_left, wave_right])
return wave
if __name__ == "__main__":
import argparse

View File

@ -1,3 +1,6 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # 限制为单卡
import os
import sys
@ -123,13 +126,13 @@ def set_default():
if version not in v3v4set:
default_sovits_epoch = 8
default_sovits_save_every_epoch = 4
max_sovits_epoch = 25 # 40
max_sovits_save_every_epoch = 25 # 10
max_sovits_epoch = 255 # 40
max_sovits_save_every_epoch = 255 # 10
else:
default_sovits_epoch = 2
default_sovits_save_every_epoch = 1
max_sovits_epoch = 16 # 40 # 3 #训太多=作死
max_sovits_save_every_epoch = 10 # 10 # 3
max_sovits_epoch = 255 # 40 # 3 #训太多=作死
max_sovits_save_every_epoch = 255 # 10 # 3
default_batch_size = max(1, default_batch_size)
default_batch_size_s1 = max(1, default_batch_size_s1)
@ -503,7 +506,7 @@ def open1Ba(
):
global p_train_SoVITS
if p_train_SoVITS == None:
exp_name = exp_name.rstrip(" ")
exp_name=exp_name.rstrip(" ")
config_file = (
"GPT_SoVITS/configs/s2.json"
if version not in {"v2Pro", "v2ProPlus"}
@ -600,7 +603,7 @@ def open1Bb(
):
global p_train_GPT
if p_train_GPT == None:
exp_name = exp_name.rstrip(" ")
exp_name=exp_name.rstrip(" ")
with open(
"GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml"
) as f:
@ -1724,8 +1727,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
)
with gr.Row():
text_low_lr_rate = gr.Slider(
minimum=0.2,
maximum=0.6,
minimum=0,
maximum=1,
step=0.05,
label=i18n("文本模块学习率权重"),
value=0.4,
@ -1734,7 +1737,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
lora_rank = gr.Radio(
label=i18n("LoRA秩"),
value="32",
choices=["16", "32", "64", "128"],
choices=["16", "32", "64", "128", "256", "512", "1024","2048", "4096"],
visible=True if version in v3v4set else False,
) # v1v2 not need
save_every_epoch = gr.Slider(
@ -1796,7 +1799,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
)
total_epoch1Bb = gr.Slider(
minimum=2,
maximum=50,
maximum=max_sovits_epoch,
step=1,
label=i18n("总训练轮数total_epoch"),
value=15,