chasonjiang 17832e5c4a 忽略ffmpeg .gitignore
使t2s模型支持批量推理:   GPT_SoVITS/AR/models/t2s_model.py
	修复batch bug   GPT_SoVITS/AR/models/utils.py
    重构的tts infer   GPT_SoVITS/TTS_infer_pack/TTS.py
	文本预处理模块   GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
	new file   GPT_SoVITS/TTS_infer_pack/__init__.py
	文本拆分方法模块   GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
	tts infer配置文件   GPT_SoVITS/configs/tts_infer.yaml
	modified   GPT_SoVITS/feature_extractor/cnhubert.py
	modified   GPT_SoVITS/inference_gui.py
	重构的webui   GPT_SoVITS/inference_webui.py
	new file   GPT_SoVITS/inference_webui_old.py
2024-03-08 23:41:59 +08:00

108 lines
3.3 KiB
Python

import time
import librosa
import torch
import torch.nn.functional as F
import soundfile as sf
import logging
logging.getLogger("numba").setLevel(logging.WARNING)
from transformers import (
Wav2Vec2FeatureExtractor,
HubertModel,
)
import utils
import torch.nn as nn
# Default checkpoint location used by CNHubert when it is constructed without
# an explicit base_path. It starts out as None, so it has to be assigned
# (presumably by the importing code — confirm against callers) before
# get_model() is called; otherwise CNHubert() has no path to load from.
cnhubert_base_path = None
class CNHubert(nn.Module):
    """HuBERT wrapper that returns the model's last hidden state.

    A ``HubertModel`` and a ``Wav2Vec2FeatureExtractor`` are both loaded from
    the same location via ``from_pretrained``.
    """

    def __init__(self, base_path: str = None):
        """Load the model and its feature extractor.

        Args:
            base_path: Location handed to ``from_pretrained``. When ``None``,
                the module-level ``cnhubert_base_path`` is used instead.

        Raises:
            ValueError: If neither ``base_path`` nor the module-level
                ``cnhubert_base_path`` has been set.
        """
        super().__init__()
        if base_path is None:
            base_path = cnhubert_base_path
        if base_path is None:
            # Fail early with an actionable message instead of passing None
            # on to from_pretrained.
            raise ValueError(
                "No CNHubert checkpoint path given: pass base_path or set "
                "cnhubert_base_path before constructing CNHubert."
            )
        self.model = HubertModel.from_pretrained(base_path)
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            base_path
        )

    def forward(self, x):
        """Extract features for ``x``.

        Args:
            x: Input audio. It must expose a ``.device`` attribute (e.g. a
                torch tensor) because the preprocessed values are moved to
                that device. The feature extractor is called with
                ``sampling_rate=16000``, so presumably 16 kHz audio is
                expected — confirm against callers.

        Returns:
            The ``"last_hidden_state"`` entry of the HuBERT model output.
        """
        input_values = self.feature_extractor(
            x, return_tensors="pt", sampling_rate=16000
        ).input_values.to(x.device)
        feats = self.model(input_values)["last_hidden_state"]
        return feats
# class CNHubertLarge(nn.Module):
# def __init__(self):
# super().__init__()
# self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
# def forward(self, x):
# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
# feats = self.model(input_values)["last_hidden_state"]
# return feats
#
# class CVec(nn.Module):
# def __init__(self):
# super().__init__()
# self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
# def forward(self, x):
# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
# feats = self.model(input_values)["last_hidden_state"]
# return feats
#
# class cnw2v2base(nn.Module):
# def __init__(self):
# super().__init__()
# self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
# def forward(self, x):
# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
# feats = self.model(input_values)["last_hidden_state"]
# return feats
def get_model():
    """Build a CNHubert from the module-level default path, in eval mode."""
    # nn.Module.eval() returns the module itself, so this yields the same
    # object the constructor produced, already switched to eval mode.
    return CNHubert().eval()
# def get_large_model():
# model = CNHubertLarge()
# model.eval()
# return model
#
# def get_model_cvec():
# model = CVec()
# model.eval()
# return model
#
# def get_model_cnw2v2base():
# model = cnw2v2base()
# model.eval()
# return model
def get_content(hmodel, wav_16k_tensor):
    """Run ``hmodel`` on the waveform with gradient tracking disabled.

    Args:
        hmodel: Callable applied to ``wav_16k_tensor`` (e.g. a CNHubert).
        wav_16k_tensor: Input passed to ``hmodel`` unchanged.

    Returns:
        The output of ``hmodel`` with dimensions 1 and 2 swapped.
    """
    with torch.no_grad():
        hidden = hmodel(wav_16k_tensor)
    return hidden.transpose(1, 2)
if __name__ == "__main__":
    # Manual smoke test: load the model, extract features from one local
    # recording and print the resulting tensor shape.
    # NOTE(review): cnhubert_base_path is not assigned anywhere in the visible
    # part of this file before this point, so get_model() has no checkpoint
    # path when the file is run directly — confirm and set it here if needed.
    model = get_model()
    src_path = "/Users/Shared/原音频2.wav"
    wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
    feats = get_content(model, wav_16k_tensor)
    print(feats.shape)