Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git (synced 2025-06-29 00:59:16 +08:00)

Compare commits: dd9145c58a ... 7d84c9685a (7 commits)
7d84c9685a
ee4a466f79
b65ea9181e
c0ce55a132
13573a1b06
fef65d40fe
09f28ae518
@@ -3,3 +3,44 @@ from modelscope import snapshot_download
 model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',revision="v2.0.4")
 model_dir = snapshot_download('damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',revision="v2.0.4")
 model_dir = snapshot_download('damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',revision="v2.0.4")
+
+import nltk
+
+nltk.download('averaged_perceptron_tagger_eng')
+
+# Download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip, unzip and rename it to G2PWModel, then place it in GPT_SoVITS/text.
+
+import os
+import zipfile
+import shutil
+import requests
+
+# Get the path of the current file
+current_file_path = os.path.abspath(__file__)
+current_dir = os.path.dirname(current_file_path)
+
+# Define the download URL and target paths
+url = 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip'
+download_path = os.path.join(current_dir, 'G2PWModel_1.1.zip')
+target_dir = os.path.join(current_dir, '../GPT_SoVITS/text/')
+
+# Download the archive
+response = requests.get(url)
+with open(download_path, 'wb') as file:
+    file.write(response.content)
+
+# Extract the archive
+with zipfile.ZipFile(download_path, 'r') as zip_ref:
+    zip_ref.extractall(current_dir)
+
+# Rename the extracted folder
+os.rename(os.path.join(current_dir, 'G2PWModel_1.1'), os.path.join(current_dir, 'G2PWModel'))
+
+# Move the folder to the target directory
+if not os.path.exists(target_dir):
+    os.makedirs(target_dir)
+shutil.move(os.path.join(current_dir, 'G2PWModel'), target_dir)
+
+# Clean up the temporary archive
+os.remove(download_path)
+
+print("Download G2PWModel successfully")
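The download step above buffers the whole archive in memory with a bare requests.get and never checks the HTTP status. As a hedged alternative (a sketch only, not part of the commit; the download_file name is made up here), the same fetch could be streamed and verified:

import requests


def download_file(url: str, dest: str, chunk_size: int = 1 << 20) -> None:
    # Stream the archive to disk instead of holding it all in memory,
    # and fail loudly if the server answers with an error status.
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(dest, "wb") as fh:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                fh.write(chunk)

Calling download_file(url, download_path) would then stand in for the requests.get / file.write pair shown in the diff.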
@@ -17,6 +17,12 @@ https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-la
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
out=GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/gsv-v2final-pretrained/s2D2333k.pth
out=GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2D2333k.pth
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/gsv-v2final-pretrained/s2G2333k.pth
out=GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
# UVR5
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth
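The URL lines paired with out= lines above match aria2c's input-file format (a download URL followed by an out= option giving the target path), so the list is presumably meant to be fed to something like aria2c -i <list-file>. As a rough fallback in plain Python (a sketch; the parse_manifest/fetch_all names and the links.txt filename are invented for illustration), such a list could be consumed like this:

import os
from typing import Iterator, Tuple

import requests


def parse_manifest(path: str) -> Iterator[Tuple[str, str]]:
    # Yield (url, output_path) pairs from an aria2c-style list:
    # a URL line followed by an "out=..." line; '#' lines are comments.
    url = None
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("out=") and url is not None:
                yield url, line[len("out="):]
                url = None
            else:
                url = line


def fetch_all(manifest: str) -> None:
    # Download every entry to its declared output path, streaming to disk.
    for url, out_path in parse_manifest(manifest):
        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
        with requests.get(url, stream=True, timeout=60) as resp:
            resp.raise_for_status()
            with open(out_path, "wb") as fh:
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    fh.write(chunk)


if __name__ == "__main__":
    fetch_all("links.txt")  # hypothetical filename for the list shown above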
@@ -12,33 +12,33 @@ import torch


 def multi_head_attention_forward_patched(
-    query: Tensor,
-    key: Tensor,
-    value: Tensor,
-    embed_dim_to_check: int,
-    num_heads: int,
-    in_proj_weight: Optional[Tensor],
-    in_proj_bias: Optional[Tensor],
-    bias_k: Optional[Tensor],
-    bias_v: Optional[Tensor],
-    add_zero_attn: bool,
+    query,
+    key,
+    value,
+    embed_dim_to_check,
+    num_heads,
+    in_proj_weight,
+    in_proj_bias,
+    bias_k,
+    bias_v,
+    add_zero_attn,
     dropout_p: float,
-    out_proj_weight: Tensor,
-    out_proj_bias: Optional[Tensor],
-    training: bool = True,
-    key_padding_mask: Optional[Tensor] = None,
-    need_weights: bool = True,
-    attn_mask: Optional[Tensor] = None,
-    use_separate_proj_weight: bool = False,
-    q_proj_weight: Optional[Tensor] = None,
-    k_proj_weight: Optional[Tensor] = None,
-    v_proj_weight: Optional[Tensor] = None,
-    static_k: Optional[Tensor] = None,
-    static_v: Optional[Tensor] = None,
-    average_attn_weights: bool = True,
-    is_causal: bool = False,
-) -> Tuple[Tensor, Optional[Tensor]]:
+    out_proj_weight,
+    out_proj_bias,
+    training = True,
+    key_padding_mask = None,
+    need_weights = True,
+    attn_mask = None,
+    use_separate_proj_weight = False,
+    q_proj_weight = None,
+    k_proj_weight = None,
+    v_proj_weight = None,
+    static_k = None,
+    static_v = None,
+    average_attn_weights = True,
+    is_causal = False,
+    cache=None,
+):
     r"""
     Args:
         query, key, value: map a query and a set of key-value pairs to an output.
@@ -1,5 +1,6 @@
 import os, sys
+import threading

 from tqdm import tqdm
 now_dir = os.getcwd()
@@ -54,6 +55,7 @@ class TextPreprocessor:
         self.bert_model = bert_model
         self.tokenizer = tokenizer
         self.device = device
+        self.bert_lock = threading.RLock()

     def preprocess(self, text:str, lang:str, text_split_method:str, version:str="v2")->List[Dict]:
         print(f'############ {i18n("切分文本")} ############')
@@ -117,70 +119,71 @@ class TextPreprocessor:
         return self.get_phones_and_bert(text, language, version)

     def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False):
-        if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
-            # language = language.replace("all_","")
-            formattext = text
-            while "  " in formattext:
-                formattext = formattext.replace("  ", " ")
-            if language == "all_zh":
-                if re.search(r'[A-Za-z]', formattext):
-                    formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
-                    formattext = chinese.mix_text_normalize(formattext)
-                    return self.get_phones_and_bert(formattext,"zh",version)
-                else:
-                    phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
-                    bert = self.get_bert_feature(norm_text, word2ph).to(self.device)
-            elif language == "all_yue" and re.search(r'[A-Za-z]', formattext):
-                formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
-                formattext = chinese.mix_text_normalize(formattext)
-                return self.get_phones_and_bert(formattext,"yue",version)
-            else:
-                phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
-                bert = torch.zeros(
-                    (1024, len(phones)),
-                    dtype=torch.float32,
-                ).to(self.device)
-        elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
-            textlist=[]
-            langlist=[]
-            if language == "auto":
-                for tmp in LangSegmenter.getTexts(text):
-                    langlist.append(tmp["lang"])
-                    textlist.append(tmp["text"])
-            elif language == "auto_yue":
-                for tmp in LangSegmenter.getTexts(text):
-                    if tmp["lang"] == "zh":
-                        tmp["lang"] = "yue"
-                    langlist.append(tmp["lang"])
-                    textlist.append(tmp["text"])
-            else:
-                for tmp in LangSegmenter.getTexts(text):
-                    if tmp["lang"] == "en":
-                        langlist.append(tmp["lang"])
-                    else:
-                        # Chinese/Japanese/Korean Han characters cannot be told apart, so follow the user-specified language
-                        langlist.append(language)
-                    textlist.append(tmp["text"])
-            # print(textlist)
-            # print(langlist)
-            phones_list = []
-            bert_list = []
-            norm_text_list = []
-            for i in range(len(textlist)):
-                lang = langlist[i]
-                phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version)
-                bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
-                phones_list.append(phones)
-                norm_text_list.append(norm_text)
-                bert_list.append(bert)
-            bert = torch.cat(bert_list, dim=1)
-            phones = sum(phones_list, [])
-            norm_text = ''.join(norm_text_list)
+        with self.bert_lock:
+            if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
+                # language = language.replace("all_","")
+                formattext = text
+                while "  " in formattext:
+                    formattext = formattext.replace("  ", " ")
+                if language == "all_zh":
+                    if re.search(r'[A-Za-z]', formattext):
+                        formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
+                        formattext = chinese.mix_text_normalize(formattext)
+                        return self.get_phones_and_bert(formattext,"zh",version)
+                    else:
+                        phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
+                        bert = self.get_bert_feature(norm_text, word2ph).to(self.device)
+                elif language == "all_yue" and re.search(r'[A-Za-z]', formattext):
+                    formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
+                    formattext = chinese.mix_text_normalize(formattext)
+                    return self.get_phones_and_bert(formattext,"yue",version)
+                else:
+                    phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
+                    bert = torch.zeros(
+                        (1024, len(phones)),
+                        dtype=torch.float32,
+                    ).to(self.device)
+            elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
+                textlist=[]
+                langlist=[]
+                if language == "auto":
+                    for tmp in LangSegmenter.getTexts(text):
+                        langlist.append(tmp["lang"])
+                        textlist.append(tmp["text"])
+                elif language == "auto_yue":
+                    for tmp in LangSegmenter.getTexts(text):
+                        if tmp["lang"] == "zh":
+                            tmp["lang"] = "yue"
+                        langlist.append(tmp["lang"])
+                        textlist.append(tmp["text"])
+                else:
+                    for tmp in LangSegmenter.getTexts(text):
+                        if tmp["lang"] == "en":
+                            langlist.append(tmp["lang"])
+                        else:
+                            # Chinese/Japanese/Korean Han characters cannot be told apart, so follow the user-specified language
+                            langlist.append(language)
+                        textlist.append(tmp["text"])
+                # print(textlist)
+                # print(langlist)
+                phones_list = []
+                bert_list = []
+                norm_text_list = []
+                for i in range(len(textlist)):
+                    lang = langlist[i]
+                    phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version)
+                    bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
+                    phones_list.append(phones)
+                    norm_text_list.append(norm_text)
+                    bert_list.append(bert)
+                bert = torch.cat(bert_list, dim=1)
+                phones = sum(phones_list, [])
+                norm_text = ''.join(norm_text_list)

-        if not final and len(phones) < 6:
-            return self.get_phones_and_bert("." + text,language,version,final=True)
+            if not final and len(phones) < 6:
+                return self.get_phones_and_bert("." + text,language,version,final=True)

-        return phones, bert, norm_text
+            return phones, bert, norm_text


     def get_bert_feature(self, text:str, word2ph:list)->torch.Tensor:
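The only functional change in this hunk is that get_phones_and_bert now runs inside self.bert_lock, serializing access to the shared BERT model when several threads preprocess text at once. An RLock (rather than a plain Lock) matters because the method calls itself recursively, for example through the "." + text retry when too few phones are produced. A stripped-down sketch of that pattern, with illustrative names rather than the repo's real classes:

import threading


class SharedFeatureExtractor:
    """Illustrative stand-in for a preprocessor that owns one shared model."""

    def __init__(self, model):
        self.model = model
        # RLock, not Lock: the guarded method may re-enter itself from the same thread.
        self._lock = threading.RLock()

    def extract(self, text):
        with self._lock:
            # Re-entrant retry, mirroring the "." + text fallback above;
            # this would deadlock under a plain Lock but is fine with an RLock.
            if len(text) < 3:
                return self.extract("." + text)
            return self.model(text)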
@@ -81,7 +81,7 @@ if os.path.exists(semantic_path) == False:
     # utils.load_checkpoint(pretrained_s2G, vq_model, None, True)
     print(
         vq_model.load_state_dict(
-            torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False
+            torch.load(pretrained_s2G, map_location="cpu", weights_only=False)["weight"], strict=False
         )
     )
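For context on the torch.load change: recent PyTorch releases flipped the default of weights_only to True, which refuses to unpickle arbitrary Python objects inside a checkpoint; passing weights_only=False restores the permissive behaviour and is only appropriate for checkpoints you trust. A minimal sketch, with a hypothetical checkpoint path:

import torch

ckpt_path = "pretrained_s2G.pth"  # hypothetical path, for illustration only

# Strict mode (the newer default): only tensors and basic containers may be unpickled.
# state = torch.load(ckpt_path, map_location="cpu", weights_only=True)

# Permissive mode, as in the diff above -- needed when the checkpoint pickles
# arbitrary Python objects, and safe only for files you trust.
state = torch.load(ckpt_path, map_location="cpu", weights_only=False)
print(list(state.keys())[:5])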
install.sh
@@ -2,8 +2,13 @@

-# 安装构建工具
+# Install build tools
+echo "Installing GCC..."
 conda install -c conda-forge gcc=14
+
+echo "Installing G++..."
 conda install -c conda-forge gxx
+
+echo "Installing ffmpeg and cmake..."
 conda install ffmpeg cmake

 # 设置编译环境
@@ -12,10 +17,60 @@ export CMAKE_MAKE_PROGRAM="$CONDA_PREFIX/bin/cmake"
 export CC="$CONDA_PREFIX/bin/gcc"
 export CXX="$CONDA_PREFIX/bin/g++"

-conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
+echo "Checking for CUDA installation..."
+if command -v nvidia-smi &> /dev/null; then
+    USE_CUDA=true
+    echo "CUDA found."
+else
+    echo "CUDA not found."
+    USE_CUDA=false
+fi
+
+
+if [ "$USE_CUDA" = false ]; then
+    echo "Checking for ROCm installation..."
+    if [ -d "/opt/rocm" ]; then
+        USE_ROCM=true
+        echo "ROCm found."
+        if grep -qi "microsoft" /proc/version; then
+            echo "You are running WSL."
+            IS_WSL=true
+        else
+            echo "You are NOT running WSL."
+            IS_WSL=false
+        fi
+    else
+        echo "ROCm not found."
+        USE_ROCM=false
+    fi
+fi
+
+if [ "$USE_CUDA" = true ]; then
+    echo "Installing PyTorch with CUDA support..."
+    conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
+elif [ "$USE_ROCM" = true ] ; then
+    echo "Installing PyTorch with ROCm support..."
+    pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
+else
+    echo "Installing PyTorch for CPU..."
+    conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
+fi
+
+
+echo "Installing Python dependencies from requirements.txt..."

-# 刷新环境
+# Refresh environment
 hash -r
 pip install -r requirements.txt
+
+if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ] ; then
+    echo "Update to WSL compatible runtime lib..."
+    location=`pip show torch | grep Location | awk -F ": " '{print $2}'`
+    cd ${location}/torch/lib/
+    rm libhsa-runtime64.so*
+    cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
+fi
+
+echo "Installation completed successfully!"
@@ -32,7 +32,7 @@ def clean_path(path_str:str):
     if path_str.endswith(('\\','/')):
        return clean_path(path_str[0:-1])
    path_str = path_str.replace('/', os.sep).replace('\\', os.sep)
-    return path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")
+    return path_str.strip(" \'\n\"\u202a")#path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")


 def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False):
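The clean_path change above swaps a chain of single-purpose strip calls for one strip with a character set. The difference is that strip(chars) keeps removing any of the listed characters from both ends in a single pass, while the chained form only peels them off in a fixed order. A small self-contained illustration (the path below is made up):

# Mixed quote/space/newline padding around a pasted path.
raw = ' "\n /tmp/audio.wav\n" '

chained = raw.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")
charset = raw.strip(" '\n\"\u202a")

print(repr(chained))  # '\n /tmp/audio.wav\n' -- some padding survives the fixed strip order
print(repr(charset))  # '/tmp/audio.wav' -- every listed character is removed from both ends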