mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-04-05 19:41:56 +08:00
112 lines
3.5 KiB
Python
112 lines
3.5 KiB
Python
import time
|
|
|
|
import librosa
|
|
import torch
|
|
import torch.nn.functional as F
|
|
import soundfile as sf
|
|
import os
|
|
from transformers import logging as tf_logging
|
|
tf_logging.set_verbosity_error()
|
|
|
|
import logging
|
|
logging.getLogger("numba").setLevel(logging.WARNING)
|
|
|
|
from transformers import (
|
|
Wav2Vec2FeatureExtractor,
|
|
HubertModel,
|
|
)
|
|
|
|
import utils
|
|
import torch.nn as nn
|
|
|
|
# Default directory of the pretrained HuBERT files. CNHubert falls back to this
# value when it is constructed without an explicit base_path. It starts as None,
# so it is presumably assigned by the importing code before a model is built
# (confirm with callers).
cnhubert_base_path = None
|
|
|
|
|
|
class CNHubert(nn.Module):
    """Wrap a locally stored HuBERT model together with its feature extractor.

    Both ``HubertModel`` and ``Wav2Vec2FeatureExtractor`` are loaded from the
    same directory with ``local_files_only=True``.
    """

    def __init__(self, base_path:str=None):
        """Load the model and the feature extractor from ``base_path``.

        Args:
            base_path: Directory holding the pretrained files. When ``None``,
                the module-level ``cnhubert_base_path`` is used instead.

        Raises:
            FileNotFoundError: If no path is configured at all, or if the
                resolved path does not exist.
        """
        super().__init__()
        if base_path is None:
            base_path = cnhubert_base_path
        if base_path is None:
            # os.path.exists(None) would fail with an opaque TypeError; report
            # the real problem (nothing configured) instead.
            raise FileNotFoundError(
                "CNHubert base path is not set: pass base_path or assign "
                "cnhubert_base_path before creating the model"
            )
        if not os.path.exists(base_path):
            raise FileNotFoundError(base_path)
        self.model = HubertModel.from_pretrained(base_path, local_files_only=True)
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            base_path, local_files_only=True
        )

    def forward(self, x):
        """Return the model's ``last_hidden_state`` for the audio ``x``.

        Args:
            x: Audio handed to the feature extractor with
                ``sampling_rate=16000``. It must expose ``.device``, so it is
                presumably a torch tensor of 16 kHz samples (confirm with
                callers).

        Returns:
            The ``"last_hidden_state"`` entry of the HuBERT model output.
        """
        # The extractor returns new "pt" tensors; move them to the device of
        # the input before running the model.
        input_values = self.feature_extractor(
            x, return_tensors="pt", sampling_rate=16000
        ).input_values.to(x.device)
        feats = self.model(input_values)["last_hidden_state"]
        return feats
|
|
|
|
|
|
# class CNHubertLarge(nn.Module):
|
|
# def __init__(self):
|
|
# super().__init__()
|
|
# self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
|
|
# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
|
|
# def forward(self, x):
|
|
# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
|
|
# feats = self.model(input_values)["last_hidden_state"]
|
|
# return feats
|
|
#
|
|
# class CVec(nn.Module):
|
|
# def __init__(self):
|
|
# super().__init__()
|
|
# self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
|
|
# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
|
|
# def forward(self, x):
|
|
# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
|
|
# feats = self.model(input_values)["last_hidden_state"]
|
|
# return feats
|
|
#
|
|
# class cnw2v2base(nn.Module):
|
|
# def __init__(self):
|
|
# super().__init__()
|
|
# self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
|
|
# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
|
|
# def forward(self, x):
|
|
# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
|
|
# feats = self.model(input_values)["last_hidden_state"]
|
|
# return feats
|
|
|
|
|
|
def get_model():
    """Create a ``CNHubert`` with its default path and switch it to eval mode.

    Returns:
        The ``CNHubert`` instance after ``eval()`` has been called on it.
    """
    hubert = CNHubert()
    # nn.Module.eval() returns the module itself, so it can be returned directly.
    return hubert.eval()
|
|
|
|
|
|
# def get_large_model():
|
|
# model = CNHubertLarge()
|
|
# model.eval()
|
|
# return model
|
|
#
|
|
# def get_model_cvec():
|
|
# model = CVec()
|
|
# model.eval()
|
|
# return model
|
|
#
|
|
# def get_model_cnw2v2base():
|
|
# model = cnw2v2base()
|
|
# model.eval()
|
|
# return model
|
|
|
|
|
|
def get_content(hmodel, wav_16k_tensor):
    """Run ``hmodel`` on the input without gradient tracking and swap dims 1 and 2.

    Args:
        hmodel: Callable applied to ``wav_16k_tensor`` (e.g. the object
            returned by ``get_model``).
        wav_16k_tensor: Input passed straight to ``hmodel``; going by the
            name, presumably a 16 kHz waveform tensor (confirm with callers).

    Returns:
        The output of ``hmodel`` with dimensions 1 and 2 exchanged.
    """
    with torch.no_grad():
        hidden = hmodel(wav_16k_tensor)
    return torch.transpose(hidden, 1, 2)
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: extract features from one wav file and print their shape.
    # Usage: python <this file> [wav_path] [cnhubert_dir]
    import sys

    cli_args = sys.argv[1:]
    # The previously hard-coded sample path stays as the default.
    src_path = cli_args[0] if cli_args else "/Users/Shared/原音频2.wav"
    if len(cli_args) > 1:
        # CNHubert() falls back to this module-level path when built by get_model().
        cnhubert_base_path = cli_args[1]

    model = get_model()
    wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
    feats = get_content(model, wav_16k_tensor)
    print(feats.shape)
|