From 72c0eca0a26ff64a6cd7e3185682799a8ae8624e Mon Sep 17 00:00:00 2001 From: SapphireLab <36986837+SapphireLab@users.noreply.github.com> Date: Wed, 3 Apr 2024 17:42:23 +0800 Subject: [PATCH] spellcheck (#916) Co-authored-by: starylan --- GPT_SoVITS/TTS_infer_pack/TTS.py | 68 ++++++++++++++++---------------- GPT_SoVITS/inference_webui.py | 10 ++--- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 18ce2e6..690be18 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -140,7 +140,7 @@ class TTS_Config: self.win_length:int = 2048 self.n_speakers:int = 300 - self.langauges:list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"] + self.languages:list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"] # print(self) def _load_configs(self, configs_path: str)->dict: @@ -207,19 +207,19 @@ class TTS: self.prompt_cache:dict = { - "ref_audio_path":None, - "prompt_semantic":None, - "refer_spepc":None, - "prompt_text":None, - "prompt_lang":None, - "phones":None, - "bert_features":None, - "norm_text":None, + "ref_audio_path" : None, + "prompt_semantic": None, + "refer_spec" : None, + "prompt_text" : None, + "prompt_lang" : None, + "phones" : None, + "bert_features" : None, + "norm_text" : None, } self.stop_flag:bool = False - self.precison:torch.dtype = torch.float16 if self.configs.is_half else torch.float32 + self.precision:torch.dtype = torch.float16 if self.configs.is_half else torch.float32 def _init_models(self,): self.init_t2s_weights(self.configs.t2s_weights_path) @@ -312,7 +312,7 @@ class TTS: return self.configs.is_half = enable - self.precison = torch.float16 if enable else torch.float32 + self.precision = torch.float16 if enable else torch.float32 self.configs.save_configs() if enable: if self.t2s_model is not None: @@ -358,9 +358,9 @@ class TTS: ref_audio_path: str, the path of the reference audio. ''' self._set_prompt_semantic(ref_audio_path) - self._set_ref_spepc(ref_audio_path) + self._set_ref_spec(ref_audio_path) - def _set_ref_spepc(self, ref_audio_path): + def _set_ref_spec(self, ref_audio_path): audio = load_audio(ref_audio_path, int(self.configs.sampling_rate)) audio = torch.FloatTensor(audio) audio_norm = audio @@ -376,8 +376,8 @@ class TTS: spec = spec.to(self.configs.device) if self.configs.is_half: spec = spec.half() - # self.refer_spepc = spec - self.prompt_cache["refer_spepc"] = spec + # self.refer_spec = spec + self.prompt_cache["refer_spec"] = spec def _set_prompt_semantic(self, ref_wav_path:str): @@ -435,7 +435,7 @@ class TTS: threshold:float=0.75, split_bucket:bool=True, device:torch.device=torch.device("cpu"), - precison:torch.dtype=torch.float32, + precision:torch.dtype=torch.float32, ): _data:list = [] @@ -488,13 +488,13 @@ class TTS: for item in item_list: if prompt_data is not None: all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1)\ - .to(dtype=precison, device=device) + .to(dtype=precision, device=device) all_phones = torch.LongTensor(prompt_data["phones"]+item["phones"]).to(device) phones = torch.LongTensor(item["phones"]).to(device) # norm_text = prompt_data["norm_text"]+item["norm_text"] else: all_bert_features = item["bert_features"]\ - .to(dtype=precison, device=device) + .to(dtype=precision, device=device) phones = torch.LongTensor(item["phones"]).to(device) all_phones = phones # norm_text = item["norm_text"] @@ -519,7 +519,7 @@ class TTS: #### 直接对phones和bert_features进行pad,会增大复读概率。 # all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len) # all_bert_features_batch = all_bert_features_list - # all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=precison, device=device) + # all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=precision, device=device) # for idx, item in enumerate(all_bert_features_list): # all_bert_features_batch[idx, :, : item.shape[-1]] = item @@ -555,8 +555,8 @@ class TTS: Returns: list (List[np.ndarray]): the data in the original order. ''' - lenght = len(sum(batch_index_list, [])) - _data = [None]*lenght + length = len(sum(batch_index_list, [])) + _data = [None]*length for i, index_list in enumerate(batch_index_list): for j, index in enumerate(index_list): _data[index] = data[i][j] @@ -584,7 +584,7 @@ class TTS: "top_k": 5, # int. top k sampling "top_p": 1, # float. top p sampling "temperature": 1, # float. temperature for sampling - "text_split_method": "cut0", # str. text split method, see text_segmentaion_method.py for details. + "text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details. "batch_size": 1, # int. batch size for inference "batch_threshold": 0.75, # float. threshold for batch splitting. "split_bucket: True, # bool. whether to split the batch into multiple buckets. @@ -594,7 +594,7 @@ class TTS: "seed": -1, # int. random seed for reproducibility. } returns: - tulpe[int, np.ndarray]: sampling rate and audio data. + tuple[int, np.ndarray]: sampling rate and audio data. """ ########## variables initialization ########### self.stop_flag:bool = False @@ -635,12 +635,12 @@ class TTS: if prompt_text in [None, ""]: no_prompt_text = True - assert text_lang in self.configs.langauges + assert text_lang in self.configs.languages if not no_prompt_text: - assert prompt_lang in self.configs.langauges + assert prompt_lang in self.configs.languages if ref_audio_path in [None, ""] and \ - ((self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spepc"] is None)): + ((self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] is None)): raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()") @@ -682,7 +682,7 @@ class TTS: threshold=batch_threshold, split_bucket=split_bucket, device=self.configs.device, - precison=self.precison + precision=self.precision ) else: print(i18n("############ 切分文本 ############")) @@ -714,7 +714,7 @@ class TTS: threshold=batch_threshold, split_bucket=False, device=self.configs.device, - precison=self.precison + precision=self.precision ) return batch[0] @@ -760,8 +760,8 @@ class TTS: t4 = ttime() t_34 += t4 - t3 - refer_audio_spepc:torch.Tensor = self.prompt_cache["refer_spepc"]\ - .to(dtype=self.precison, device=self.configs.device) + refer_audio_spec:torch.Tensor = self.prompt_cache["refer_spec"]\ + .to(dtype=self.precision, device=self.configs.device) batch_audio_fragment = [] @@ -775,7 +775,7 @@ class TTS: # batch_phones = self.batch_sequences(batch_phones, axis=0, pad_value=0, max_length=max_len) # batch_phones = batch_phones.to(self.configs.device) # batch_audio_fragment = (self.vits_model.batched_decode( - # pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spepc + # pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spec # )) # ## vits并行推理 method 2 @@ -786,7 +786,7 @@ class TTS: all_pred_semantic = torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device) _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device) _batch_audio_fragment = (self.vits_model.decode( - all_pred_semantic, _batch_phones,refer_audio_spepc + all_pred_semantic, _batch_phones, refer_audio_spec ).detach()[0, 0, :]) audio_frag_end_idx.insert(0, 0) batch_audio_fragment= [_batch_audio_fragment[audio_frag_end_idx[i-1]:audio_frag_end_idx[i]] for i in range(1, len(audio_frag_end_idx))] @@ -797,7 +797,7 @@ class TTS: # phones = batch_phones[i].unsqueeze(0).to(self.configs.device) # _pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次 # audio_fragment =(self.vits_model.decode( - # _pred_semantic, phones, refer_audio_spepc + # _pred_semantic, phones, refer_audio_spec # ).detach()[0, 0, :]) # batch_audio_fragment.append( # audio_fragment @@ -866,7 +866,7 @@ class TTS: )->tuple[int, np.ndarray]: zero_wav = torch.zeros( int(self.configs.sampling_rate * fragment_interval), - dtype=self.precison, + dtype=self.precision, device=self.configs.device ) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 199948c..ff72c26 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -82,7 +82,7 @@ if bert_path is not None: tts_config.bert_base_path = bert_path print(tts_config) -tts_pipline = TTS(tts_config) +tts_pipeline = TTS(tts_config) gpt_path = tts_config.t2s_weights_path sovits_path = tts_config.vits_weights_path @@ -113,7 +113,7 @@ def inference(text, text_lang, "fragment_interval":fragment_interval, "seed":actual_seed, } - for item in tts_pipline.run(inputs): + for item in tts_pipeline.run(inputs): yield item, actual_seed def custom_sort_key(s): @@ -162,8 +162,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True) refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) - SoVITS_dropdown.change(tts_pipline.init_vits_weights, [SoVITS_dropdown], []) - GPT_dropdown.change(tts_pipline.init_t2s_weights, [GPT_dropdown], []) + SoVITS_dropdown.change(tts_pipeline.init_vits_weights, [SoVITS_dropdown], []) + GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], []) with gr.Row(): with gr.Column(): @@ -227,7 +227,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: ], [output, seed], ) - stop_infer.click(tts_pipline.stop, [], []) + stop_infer.click(tts_pipeline.stop, [], []) with gr.Group(): gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))