Add New LangSegmenter

This commit is contained in:
KamioRinn 2025-02-14 02:38:25 +08:00
parent c2b3298bed
commit 6976f77572
4 changed files with 11 additions and 10 deletions

View File

@@ -18,9 +18,10 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
logging.getLogger("multipart.multipart").setLevel(logging.ERROR) logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
warnings.simplefilter(action='ignore', category=FutureWarning) warnings.simplefilter(action='ignore', category=FutureWarning)
import LangSegment, os, re, sys, json import os, re, sys, json
import pdb import pdb
import torch import torch
from text.LangSegmenter import LangSegmenter
try: try:
import gradio.analytics as analytics import gradio.analytics as analytics
@@ -379,8 +380,7 @@ def get_phones_and_bert(text,language,version,final=False):
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
language = language.replace("all_","") language = language.replace("all_","")
if language == "en": if language == "en":
LangSegment.setfilters(["en"]) formattext = text
formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
else: else:
# 因无法区别中日韩文汉字,以用户输入为准 # 因无法区别中日韩文汉字,以用户输入为准
formattext = text formattext = text
@@ -407,19 +407,18 @@ def get_phones_and_bert(text,language,version,final=False):
elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
textlist=[] textlist=[]
langlist=[] langlist=[]
LangSegment.setfilters(["zh","ja","en","ko"])
if language == "auto": if language == "auto":
for tmp in LangSegment.getTexts(text): for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
textlist.append(tmp["text"]) textlist.append(tmp["text"])
elif language == "auto_yue": elif language == "auto_yue":
for tmp in LangSegment.getTexts(text): for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh": if tmp["lang"] == "zh":
tmp["lang"] = "yue" tmp["lang"] = "yue"
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
textlist.append(tmp["text"]) textlist.append(tmp["text"])
else: else:
for tmp in LangSegment.getTexts(text): for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "en": if tmp["lang"] == "en":
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
else: else:

View File

@@ -112,7 +112,7 @@ def replace_phs(phs):
def replace_consecutive_punctuation(text): def replace_consecutive_punctuation(text):
punctuations = ''.join(re.escape(p) for p in punctuation) punctuations = ''.join(re.escape(p) for p in punctuation)
pattern = f'([{punctuations}])([{punctuations}])+' pattern = f'([{punctuations}\s])([{punctuations}])+'
result = re.sub(pattern, r'\1', text) result = re.sub(pattern, r'\1', text)
return result return result
@@ -233,6 +233,7 @@ def text_normalize(text):
# 来自 g2p_en 文本格式化处理 # 来自 g2p_en 文本格式化处理
# 增加大写兼容 # 增加大写兼容
# 增加纯大写单词拆分
text = unicode(text) text = unicode(text)
text = normalize_numbers(text) text = normalize_numbers(text)
text = ''.join(char for char in unicodedata.normalize('NFD', text) text = ''.join(char for char in unicodedata.normalize('NFD', text)
@@ -240,6 +241,7 @@ def text_normalize(text):
text = re.sub("[^ A-Za-z'.,?!\-]", "", text) text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
text = re.sub(r"(?i)i\.e\.", "that is", text) text = re.sub(r"(?i)i\.e\.", "that is", text)
text = re.sub(r"(?i)e\.g\.", "for example", text) text = re.sub(r"(?i)e\.g\.", "for example", text)
text = re.sub(r'(?<!^)(?<![\s])([A-Z])', r' \1', text)
# 避免重复标点引起的参考泄露 # 避免重复标点引起的参考泄露
text = replace_consecutive_punctuation(text) text = replace_consecutive_punctuation(text)

View File

@@ -329,7 +329,7 @@ Special thanks to the following projects and contributors:
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### Text Frontend for Inference ### Text Frontend for Inference
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [LangSegment](https://github.com/juntaosun/LangSegment) - [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW) - [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)

View File

@@ -23,7 +23,7 @@ PyYAML
psutil psutil
jieba_fast jieba_fast
jieba jieba
LangSegment>=0.2.0 split-lang
Faster_Whisper Faster_Whisper
wordsegment wordsegment
rotary_embedding_torch rotary_embedding_torch