diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py index 0696c35f..ca49efb4 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py @@ -10,9 +10,9 @@ from typing import Dict import torch from pytorch_lightning import LightningModule -from AR.models.t2s_model import Text2SemanticDecoder -from AR.modules.lr_schedulers import WarmupCosineLRSchedule -from AR.modules.optim import ScaledAdam +from GPT_SoVITS.AR.models.t2s_model import Text2SemanticDecoder +from GPT_SoVITS.AR.modules.lr_schedulers import WarmupCosineLRSchedule +from GPT_SoVITS.AR.modules.optim import ScaledAdam class Text2SemanticLightningModule(LightningModule): diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py index 4725b7a3..5f9a5bd1 100644 --- a/GPT_SoVITS/AR/models/t2s_model.py +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -9,7 +9,7 @@ from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy from tqdm import tqdm -from AR.models.utils import ( +from GPT_SoVITS.AR.models.utils import ( dpo_loss, get_batch_logps, make_pad_mask, @@ -18,8 +18,8 @@ from AR.models.utils import ( sample, topk_sampling, ) -from AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding -from AR.modules.transformer import LayerNorm, TransformerEncoder, TransformerEncoderLayer +from GPT_SoVITS.AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding +from GPT_SoVITS.AR.modules.transformer import LayerNorm, TransformerEncoder, TransformerEncoderLayer default_config = { "embedding_dim": 512, diff --git a/GPT_SoVITS/AR/modules/activation.py b/GPT_SoVITS/AR/modules/activation.py index 936f9c3f..3474bf41 100644 --- a/GPT_SoVITS/AR/modules/activation.py +++ b/GPT_SoVITS/AR/modules/activation.py @@ -9,7 +9,7 @@ from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter -from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched +from GPT_SoVITS.AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched F.multi_head_attention_forward = multi_head_attention_forward_patched diff --git a/GPT_SoVITS/AR/modules/transformer.py b/GPT_SoVITS/AR/modules/transformer.py index 1bf21cdb..5446fd3e 100644 --- a/GPT_SoVITS/AR/modules/transformer.py +++ b/GPT_SoVITS/AR/modules/transformer.py @@ -10,8 +10,8 @@ from typing import Tuple from typing import Union import torch -from AR.modules.activation import MultiheadAttention -from AR.modules.scaling import BalancedDoubleSwish +from GPT_SoVITS.AR.modules.activation import MultiheadAttention +from GPT_SoVITS.AR.modules.scaling import BalancedDoubleSwish from torch import nn from torch import Tensor from torch.nn import functional as F diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 1b7ad11a..c1638dc0 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -21,20 +21,20 @@ import numpy as np import torch import torch.nn.functional as F import yaml -from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from BigVGAN.bigvgan import BigVGAN -from feature_extractor.cnhubert import CNHubert -from module.mel_processing import mel_spectrogram_torch, spectrogram_torch -from module.models import SynthesizerTrn, SynthesizerTrnV3 +from GPT_SoVITS.AR.models.t2s_lightning_module import 
Text2SemanticLightningModule +from GPT_SoVITS.BigVGAN.bigvgan import BigVGAN +from GPT_SoVITS.feature_extractor.cnhubert import CNHubert +from GPT_SoVITS.module.mel_processing import mel_spectrogram_torch, spectrogram_torch +from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3 from peft import LoraConfig, get_peft_model -from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new +from GPT_SoVITS.process_ckpt import get_sovits_version_from_path_fast, load_sovits_new from transformers import AutoModelForMaskedLM, AutoTokenizer from tools.audio_sr import AP_BWE from tools.i18n.i18n import I18nAuto, scan_language_list from tools.my_utils import load_audio -from TTS_infer_pack.text_segmentation_method import splits -from TTS_infer_pack.TextPreprocessor import TextPreprocessor +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import splits +from GPT_SoVITS.TTS_infer_pack.TextPreprocessor import TextPreprocessor language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 426929f8..82e24e69 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -9,13 +9,13 @@ sys.path.append(now_dir) import re import torch -from text.LangSegmenter import LangSegmenter -from text import chinese +from GPT_SoVITS.text.LangSegmenter import LangSegmenter +from GPT_SoVITS.text import chinese from typing import Dict, List, Tuple -from text.cleaner import clean_text -from text import cleaned_text_to_sequence +from GPT_SoVITS.text.cleaner import clean_text +from GPT_SoVITS.text import cleaned_text_to_sequence from transformers import AutoModelForMaskedLM, AutoTokenizer -from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method from tools.i18n.i18n import I18nAuto, scan_language_list diff --git a/GPT_SoVITS/TTS_infer_pack/zero_crossing.py b/GPT_SoVITS/TTS_infer_pack/zero_crossing.py new file mode 100644 index 00000000..fe93355e --- /dev/null +++ b/GPT_SoVITS/TTS_infer_pack/zero_crossing.py @@ -0,0 +1,84 @@ +import numpy as np + +""" +Essentially returns the index of the middle of the zero zone + the starting index. +So if the starting index was 0 and we found the zero zone to be from 12789:12800, +then we would return 0 + 12794 = 12794 (the window is 11 samples wide, so its midpoint offset is 11 // 2 = 5). +This method works by sliding a window over each chunk, moving it +from the end toward the start. If all the values in the window +are below the threshold, then we take that window as the zero zone. +TLDR: Returns the midpoint of a region in the audio that has enough silence (the zero zone). 
+""" +def find_zero_zone(chunk, start_index, search_length, search_window_size=11): + zone = chunk[start_index:start_index + search_length] + # print(f"Zero-crossing search zone: Start={start_index}, Length={len(zone)}") + + zero_threshold = 1.0e-4 + # Check for search_window_size consecutive near-zero samples + for idx in range(len(zone), -1 + search_window_size, -1): + index_to_start = idx - search_window_size + abs_zone = np.abs(zone[index_to_start:idx]) + if np.all(abs_zone < zero_threshold): + # print(f"Found Abs Zone: {abs_zone}") + # print(f"Extended Abs Zone: {chunk[idx-21:idx+10]}") + index_midpoint = index_to_start + int(search_window_size // 2) + # print(f"Returning {start_index} + {index_midpoint}") + return (start_index + index_midpoint), None + + # print("Falling back to zero crossing due to no zero zone found. You may hear more prominent pops and clicks in the audio. Try increasing search length or cumulative tokens.") + return find_zero_crossing(chunk, start_index, search_length) + +def find_zero_crossing(chunk, start_index, search_length): + # If the model is falling back on this function, it may be a sign that the search length is too low + + zone = chunk[start_index:start_index + search_length] + sign_changes = np.where(np.diff(np.sign(zone)) != 0)[0] + + if len(sign_changes) == 0: + raise ValueError("No zero-crossings found in this zone. This should not be happening, debugging time.") + else: + zc_index = start_index + sign_changes[0] + 1 + # print(f"Zero-crossing found at index {zc_index}") + # Determine the crossing direction in chunk1 + prev_value = chunk[zc_index - 1] + curr_value = chunk[zc_index] + crossing_direction = np.sign(curr_value) - np.sign(prev_value) + # print(f"Crossing direction in chunk1: {np.sign(prev_value)} to {np.sign(curr_value)}") + return zc_index, crossing_direction + +def find_matching_index(chunk, center_index, max_offset, crossing_direction): + """ + Finds a zero-crossing in data that matches the specified crossing direction, + starting from center_index and searching outward. 
+ """ + if crossing_direction == None: + return center_index # if zero zone + + # fall back for zero_crossing + data_length = len(chunk) + # print(f"Center index in chunk2: {center_index}") + for offset in range(max_offset + 1): + # Check index bounds + idx_forward = center_index + offset + idx_backward = center_index - offset + + # Check forward direction + if idx_forward < data_length - 1: + prev_sign = np.sign(chunk[idx_forward]) + curr_sign = np.sign(chunk[idx_forward + 1]) + direction = curr_sign - prev_sign + if direction == crossing_direction: + # print(f"Matching zero-crossing found at index {idx_forward + 1} (forward)") + return idx_forward + 1 + + # Check backward direction + if idx_backward > 0: + prev_sign = np.sign(chunk[idx_backward - 1]) + curr_sign = np.sign(chunk[idx_backward]) + direction = curr_sign - prev_sign + if direction == crossing_direction: + # print(f"Matching zero-crossing found at index {idx_backward} (backward)") + return idx_backward + + # print("No matching zero-crossings found in this zone.") + return None \ No newline at end of file diff --git a/GPT_SoVITS/f5_tts/model/backbones/dit.py b/GPT_SoVITS/f5_tts/model/backbones/dit.py index 7d98a858..105578ed 100644 --- a/GPT_SoVITS/f5_tts/model/backbones/dit.py +++ b/GPT_SoVITS/f5_tts/model/backbones/dit.py @@ -25,7 +25,7 @@ from GPT_SoVITS.f5_tts.model.modules import ( get_pos_embed_indices, ) -from module.commons import sequence_mask +from GPT_SoVITS.module.commons import sequence_mask class TextEmbedding(nn.Module): diff --git a/GPT_SoVITS/feature_extractor/cnhubert.py b/GPT_SoVITS/feature_extractor/cnhubert.py index f22b8d09..44b8f881 100644 --- a/GPT_SoVITS/feature_extractor/cnhubert.py +++ b/GPT_SoVITS/feature_extractor/cnhubert.py @@ -13,7 +13,7 @@ from transformers import ( HubertModel, ) -import utils +import GPT_SoVITS.utils import torch.nn as nn cnhubert_base_path = None diff --git a/GPT_SoVITS/module/attentions.py b/GPT_SoVITS/module/attentions.py index 341de4ac..65d1c87c 100644 --- a/GPT_SoVITS/module/attentions.py +++ b/GPT_SoVITS/module/attentions.py @@ -3,8 +3,8 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons -from module.modules import LayerNorm +from GPT_SoVITS.module import commons +from GPT_SoVITS.module.modules import LayerNorm class Encoder(nn.Module): diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index aac520a0..6bc4abf5 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -7,19 +7,19 @@ import torch from torch import nn from torch.nn import functional as F -from module import commons -from module import modules -from module import attentions -from f5_tts.model import DiT +from GPT_SoVITS.module import commons +from GPT_SoVITS.module import modules +from GPT_SoVITS.module import attentions +from GPT_SoVITS.f5_tts.model import DiT from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from module.commons import init_weights, get_padding -from module.mrte_model import MRTE -from module.quantize import ResidualVectorQuantizer +from GPT_SoVITS.module.commons import init_weights, get_padding +from GPT_SoVITS.module.mrte_model import MRTE +from GPT_SoVITS.module.quantize import ResidualVectorQuantizer # from text import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 from 
torch.cuda.amp import autocast import contextlib import random diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index 7493f0b4..1b39f691 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -7,9 +7,9 @@ from torch.nn import functional as F from torch.nn import Conv1d from torch.nn.utils import weight_norm, remove_weight_norm -from module import commons -from module.commons import init_weights, get_padding -from module.transforms import piecewise_rational_quadratic_transform +from GPT_SoVITS.module import commons +from GPT_SoVITS.module.commons import init_weights, get_padding +from GPT_SoVITS.module.transforms import piecewise_rational_quadratic_transform import torch.distributions as D diff --git a/GPT_SoVITS/module/mrte_model.py b/GPT_SoVITS/module/mrte_model.py index e889b7e9..33c35601 100644 --- a/GPT_SoVITS/module/mrte_model.py +++ b/GPT_SoVITS/module/mrte_model.py @@ -3,7 +3,7 @@ import torch from torch import nn from torch.nn.utils import remove_weight_norm, weight_norm -from module.attentions import MultiHeadAttention +from GPT_SoVITS.module.attentions import MultiHeadAttention class MRTE(nn.Module): diff --git a/GPT_SoVITS/module/quantize.py b/GPT_SoVITS/module/quantize.py index 0afed835..186902a3 100644 --- a/GPT_SoVITS/module/quantize.py +++ b/GPT_SoVITS/module/quantize.py @@ -12,7 +12,7 @@ import typing as tp import torch from torch import nn -from module.core_vq import ResidualVectorQuantization +from GPT_SoVITS.module.core_vq import ResidualVectorQuantization @dataclass diff --git a/GPT_SoVITS/text/__init__.py b/GPT_SoVITS/text/__init__.py index 82df1fbb..c0ac180d 100644 --- a/GPT_SoVITS/text/__init__.py +++ b/GPT_SoVITS/text/__init__.py @@ -4,8 +4,8 @@ import os # else: # from text.symbols2 import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py index ce44215f..767bc1a6 100644 --- a/GPT_SoVITS/text/chinese.py +++ b/GPT_SoVITS/text/chinese.py @@ -4,9 +4,9 @@ import re import cn2an from pypinyin import lazy_pinyin, Style -from text.symbols import punctuation -from text.tone_sandhi import ToneSandhi -from text.zh_normalization.text_normlization import TextNormalizer +from GPT_SoVITS.text.symbols import punctuation +from GPT_SoVITS.text.tone_sandhi import ToneSandhi +from GPT_SoVITS.text.zh_normalization.text_normlization import TextNormalizer normalizer = lambda x: cn2an.transform(x, "an2cn") diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index 7ba8f376..38e28852 100644 --- a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -1,4 +1,4 @@ -from text import cleaned_text_to_sequence +from GPT_SoVITS.text import cleaned_text_to_sequence import os # if os.environ.get("version","v1")=="v1": # from text import chinese @@ -7,8 +7,8 @@ import os # from text import chinese2 as chinese # from text.symbols2 import symbols -from text import symbols as symbols_v1 -from text import symbols2 as symbols_v2 +from GPT_SoVITS.text import symbols as symbols_v1 +from GPT_SoVITS.text import symbols2 as symbols_v2 special = [ # ("%", "zh", "SP"), @@ -34,7 +34,7 @@ def clean_text(text, language, version=None): for special_s, special_l, target_symbol in special: if special_s in 
text and language == special_l: return clean_special(text, language, special_s, target_symbol, version) - language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) + language_module = __import__("GPT_SoVITS.text." + language_module_map[language], fromlist=[language_module_map[language]]) if hasattr(language_module, "text_normalize"): norm_text = language_module.text_normalize(text) else: diff --git a/GPT_SoVITS/text/english.py b/GPT_SoVITS/text/english.py index f6c69449..879b251d 100644 --- a/GPT_SoVITS/text/english.py +++ b/GPT_SoVITS/text/english.py @@ -4,12 +4,12 @@ import re import wordsegment from g2p_en import G2p -from text.symbols import punctuation +from GPT_SoVITS.text.symbols import punctuation -from text.symbols2 import symbols +from GPT_SoVITS.text.symbols2 import symbols from builtins import str as unicode -from text.en_normalization.expend import normalize +from GPT_SoVITS.text.en_normalization.expend import normalize from nltk.tokenize import TweetTokenizer word_tokenize = TweetTokenizer().tokenize diff --git a/GPT_SoVITS/text/zh_normalization/__init__.py b/GPT_SoVITS/text/zh_normalization/__init__.py index 46b367a6..ad201c81 100644 --- a/GPT_SoVITS/text/zh_normalization/__init__.py +++ b/GPT_SoVITS/text/zh_normalization/__init__.py @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from text.zh_normalization.text_normlization import * +from GPT_SoVITS.text.zh_normalization.text_normlization import * diff --git a/audio/ayaka/aux_ref_audio/11_audio.wav b/audio/ayaka/aux_ref_audio/11_audio.wav new file mode 100644 index 00000000..e1bd5072 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/11_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/12_audio.wav b/audio/ayaka/aux_ref_audio/12_audio.wav new file mode 100644 index 00000000..4e09482a Binary files /dev/null and b/audio/ayaka/aux_ref_audio/12_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/13_audio.wav b/audio/ayaka/aux_ref_audio/13_audio.wav new file mode 100644 index 00000000..11f527a9 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/13_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/14_audio.wav b/audio/ayaka/aux_ref_audio/14_audio.wav new file mode 100644 index 00000000..ad07d72d Binary files /dev/null and b/audio/ayaka/aux_ref_audio/14_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/15_audio.wav b/audio/ayaka/aux_ref_audio/15_audio.wav new file mode 100644 index 00000000..379e4874 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/15_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/16_audio.wav b/audio/ayaka/aux_ref_audio/16_audio.wav new file mode 100644 index 00000000..6741691c Binary files /dev/null and b/audio/ayaka/aux_ref_audio/16_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/17_audio.wav b/audio/ayaka/aux_ref_audio/17_audio.wav new file mode 100644 index 00000000..931110d4 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/17_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/18_audio.wav b/audio/ayaka/aux_ref_audio/18_audio.wav new file mode 100644 index 00000000..02b012dc Binary files /dev/null and b/audio/ayaka/aux_ref_audio/18_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/19_audio.wav b/audio/ayaka/aux_ref_audio/19_audio.wav new file mode 100644 index 00000000..7f0932f0 Binary files /dev/null and 
b/audio/ayaka/aux_ref_audio/19_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/20_audio.wav b/audio/ayaka/aux_ref_audio/20_audio.wav new file mode 100644 index 00000000..f1499293 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/20_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/21_audio.wav b/audio/ayaka/aux_ref_audio/21_audio.wav new file mode 100644 index 00000000..06f621b8 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/21_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/22_audio.wav b/audio/ayaka/aux_ref_audio/22_audio.wav new file mode 100644 index 00000000..fdf130a4 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/22_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/23_audio.wav b/audio/ayaka/aux_ref_audio/23_audio.wav new file mode 100644 index 00000000..4a758fbb Binary files /dev/null and b/audio/ayaka/aux_ref_audio/23_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/24_audio.wav b/audio/ayaka/aux_ref_audio/24_audio.wav new file mode 100644 index 00000000..469a299a Binary files /dev/null and b/audio/ayaka/aux_ref_audio/24_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/25_audio.wav b/audio/ayaka/aux_ref_audio/25_audio.wav new file mode 100644 index 00000000..b29cfe53 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/25_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/26_audio.wav b/audio/ayaka/aux_ref_audio/26_audio.wav new file mode 100644 index 00000000..887a7cbf Binary files /dev/null and b/audio/ayaka/aux_ref_audio/26_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/27_audio.wav b/audio/ayaka/aux_ref_audio/27_audio.wav new file mode 100644 index 00000000..bada94cc Binary files /dev/null and b/audio/ayaka/aux_ref_audio/27_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/28_audio.wav b/audio/ayaka/aux_ref_audio/28_audio.wav new file mode 100644 index 00000000..8362a549 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/28_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/29_audio.wav b/audio/ayaka/aux_ref_audio/29_audio.wav new file mode 100644 index 00000000..7dbaeb9e Binary files /dev/null and b/audio/ayaka/aux_ref_audio/29_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/30_audio.wav b/audio/ayaka/aux_ref_audio/30_audio.wav new file mode 100644 index 00000000..f394ac84 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/30_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/31_audio.wav b/audio/ayaka/aux_ref_audio/31_audio.wav new file mode 100644 index 00000000..a76aea13 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/31_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/32_audio.wav b/audio/ayaka/aux_ref_audio/32_audio.wav new file mode 100644 index 00000000..4e8ad0a5 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/32_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/33_audio.wav b/audio/ayaka/aux_ref_audio/33_audio.wav new file mode 100644 index 00000000..e3b2c64a Binary files /dev/null and b/audio/ayaka/aux_ref_audio/33_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/34_audio.wav b/audio/ayaka/aux_ref_audio/34_audio.wav new file mode 100644 index 00000000..b69d11d6 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/34_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/35_audio.wav b/audio/ayaka/aux_ref_audio/35_audio.wav new file mode 100644 index 00000000..a8819b7e Binary files /dev/null and b/audio/ayaka/aux_ref_audio/35_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/36_audio.wav 
b/audio/ayaka/aux_ref_audio/36_audio.wav new file mode 100644 index 00000000..2e746419 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/36_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/37_audio.wav b/audio/ayaka/aux_ref_audio/37_audio.wav new file mode 100644 index 00000000..32079e8d Binary files /dev/null and b/audio/ayaka/aux_ref_audio/37_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/38_audio.wav b/audio/ayaka/aux_ref_audio/38_audio.wav new file mode 100644 index 00000000..b9ad65a4 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/38_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/39_audio.wav b/audio/ayaka/aux_ref_audio/39_audio.wav new file mode 100644 index 00000000..8a2dd49e Binary files /dev/null and b/audio/ayaka/aux_ref_audio/39_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/40_audio.wav b/audio/ayaka/aux_ref_audio/40_audio.wav new file mode 100644 index 00000000..cf17f645 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/40_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/41_audio.wav b/audio/ayaka/aux_ref_audio/41_audio.wav new file mode 100644 index 00000000..ef16a32f Binary files /dev/null and b/audio/ayaka/aux_ref_audio/41_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/42_audio.wav b/audio/ayaka/aux_ref_audio/42_audio.wav new file mode 100644 index 00000000..a2f720a1 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/42_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/43_audio.wav b/audio/ayaka/aux_ref_audio/43_audio.wav new file mode 100644 index 00000000..25d7f625 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/43_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/44_audio.wav b/audio/ayaka/aux_ref_audio/44_audio.wav new file mode 100644 index 00000000..542067b5 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/44_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/45_audio.wav b/audio/ayaka/aux_ref_audio/45_audio.wav new file mode 100644 index 00000000..7c717ab6 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/45_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/46_audio.wav b/audio/ayaka/aux_ref_audio/46_audio.wav new file mode 100644 index 00000000..4c1fa38c Binary files /dev/null and b/audio/ayaka/aux_ref_audio/46_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/47_audio.wav b/audio/ayaka/aux_ref_audio/47_audio.wav new file mode 100644 index 00000000..f4ed0971 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/47_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/48_audio.wav b/audio/ayaka/aux_ref_audio/48_audio.wav new file mode 100644 index 00000000..20a49830 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/48_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/49_audio.wav b/audio/ayaka/aux_ref_audio/49_audio.wav new file mode 100644 index 00000000..0f5d3364 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/49_audio.wav differ diff --git a/audio/ayaka/aux_ref_audio/50_audio.wav b/audio/ayaka/aux_ref_audio/50_audio.wav new file mode 100644 index 00000000..efc8cc64 Binary files /dev/null and b/audio/ayaka/aux_ref_audio/50_audio.wav differ diff --git a/audio/ayaka/ref_audio/10_audio.wav b/audio/ayaka/ref_audio/10_audio.wav new file mode 100644 index 00000000..a3381874 Binary files /dev/null and b/audio/ayaka/ref_audio/10_audio.wav differ diff --git a/inference.py b/inference.py new file mode 100644 index 00000000..b3fab842 --- /dev/null +++ b/inference.py @@ -0,0 +1,102 @@ +import torch +import sounddevice as sd +import time 
+from queue import Queue +from threading import Thread +import os + +class TTS: + def __init__(self): + # Replace with your checkpoints and reference audio here + # Note: Using a venv may require updating the default paths provided here + self.bert_checkpoint = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" + self.cnhuhbert_checkpoint = "GPT_SoVITS/pretrained_models/chinese-hubert-base" + # self.t2s_checkpoint = "GPT_SoVITS/pretrained_models/ayaka/Ayaka-e15.ckpt" + # self.vits_checkpoint = "GPT_SoVITS/pretrained_models/ayaka/Ayaka_e3_s1848_l32.pth" + self.t2s_checkpoint = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt" + self.vits_checkpoint = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth" + self.ref_audio = "audio/ayaka/ref_audio/10_audio.wav" + + from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config + + self.config = { + "custom": { + "bert_base_path": self.bert_checkpoint, + "cnhuhbert_base_path": self.cnhuhbert_checkpoint, + "device": "cuda" if torch.cuda.is_available() else "cpu", + "is_half": True, + "t2s_weights_path": self.t2s_checkpoint, + "vits_weights_path": self.vits_checkpoint, + "version": "v3" + } + } + + self.tts = TTS(TTS_Config(self.config)) + + self.audio_queue = Queue() + self.generating_audio = False + + def audio_stream(self, start_time): + with sd.OutputStream(samplerate=32000, channels=1, dtype="int16") as stream: + while True: + sr, audio_data = self.audio_queue.get() + if audio_data is None: + print(f"Stream Thread Done ({time.time() - start_time:.2f}s)") + break + print((sr, audio_data)) + stream.write(audio_data) + self.generating_audio = False + + def synthesize(self, text, start_time, generating_text=False): + if not self.generating_audio: + Thread(target=self.audio_stream, args=(start_time,)).start() + self.generating_audio = True + + path = "audio/ayaka/aux_ref_audio" + aux_ref_audios = [f"{path}/{file_name}" for file_name in os.listdir(path)] + + args = { + "text": text, + "text_lang": "en", + "ref_audio_path": self.ref_audio, + "aux_ref_audio_paths": aux_ref_audios, + "prompt_text": "Don't worry. Now that I've experienced the event once already, I won't be easily frightened. I'll see you later. Have a lovely chat with your friend.", + "prompt_lang": "en", + "temperature": 0.8, + "top_k": 50, + "top_p": 0.9, + "parallel_infer": True, + "sample_steps": 32, + "super_sampling": True, + "speed_factor": 1, + "fragment_interval": 0.2 + # "stream_output": True, + # "max_chunk_size": 20, + } + + if text: + print(f"Synthesis Start: {time.time() - start_time}") + generator = self.tts.run(args) + while True: + try: + audio_chunk = next(generator) + self.audio_queue.put(audio_chunk) + except StopIteration: + break + + if not generating_text: + self.audio_queue.put((None, None)) + + print(f"Synthesis End ({time.time() - start_time:.2f}s)") + +# Usage +tts = TTS() +""" +Time is only for debugging purposes. If not needed, feel free to remove. +Since this TTS model was built to be paired with LLM text streaming, we use a generating_text flag: +it signifies whether more streamed text is still on the way; only the final call (generating_text=False) queues the stop sentinel that ends the audio stream. 
+""" +tts.synthesize("One day, a fierce storm rolled in, bringing heavy rain and strong winds that threatened to destroy the wheat crops.", time.time(), False) +while tts.generating_audio: + time.sleep(0.1) +tts.synthesize("One day, a fierce storm rolled in, bringing heavy rain and strong winds that threatened to destroy the wheat crops.", time.time(), False) \ No newline at end of file