mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-06-01 05:09:16 +08:00
* Update README * Optimize-English-G2P * docs: change akward expression * docs: update Changelog_KO.md * Fix CN punc in EN,add 's match * Adjust normalize and g2p logic * Update zh_CN.json * Update README (#827) Update README.md Update some outdated file paths and commands * 修复英文多音字,调整字典热加载,新增姓名匹配 (#869) * Fix homograph dict * Add JSON in dict * Adjust hot dict to hot reload * Add English name dict * Adjust get name dict logic * Make API Great Again (#894) * Add zh/jp/en mix * Optimize code readability and formatted output. * Try OGG streaming * Add stream mode arg * Add media type arg * Add cut punc arg * Eliminate punc risk * Update README (#895) * Update README * Update README * update README * update README * fix typo s/Licence /License (#904) * fix reformat cmd (#917) Co-authored-by: starylan <starylan@outlook.com> * Update README.md * Normalize chinese arithmetic operations (#947) * 改变训练和推理时的mask策略,以修复当batch_size>1时,产生的复读现象 * 同步main分支代码,增加“保持随机”选项 * 在colab中运行colab_webui.ipynb发生的uvr5模型缺失问题 (#968) 在colab中使用git下载uvr5模型时报错: fatal: destination path 'uvr5_weights' already exists and is not an empty directory. 
通过在下载前将原本从本仓库下载的uvr5_weights文件夹删除可以解决问题。 * [ASR] 修复FasterWhisper遍历输入路径失败 (#956) * remove glob * rename * reset mirror pos * 回退mask策略; 回退pad策略; 在T2SBlock中添加padding_mask,以减少pad的影响; 开放repetition_penalty参数,让用户自行调整重复惩罚的强度; 增加parallel_infer参数,用于开启或关闭并行推理,关闭时与0307版本保持一致; 在webui中增加“保持随机”选项; 同步main分支代码。 * 删除无用注释 --------- Co-authored-by: Lion <drain.daters.0p@icloud.com> Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Co-authored-by: KamioRinn <snowsdream@live.com> Co-authored-by: Pengoose <pengoose_dev@naver.com> Co-authored-by: Yuan-Man <68322456+Yuan-ManX@users.noreply.github.com> Co-authored-by: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Co-authored-by: KamioRinn <63162909+KamioRinn@users.noreply.github.com> Co-authored-by: Lion-Wu <130235128+Lion-Wu@users.noreply.github.com> Co-authored-by: digger yu <digger-yu@outlook.com> Co-authored-by: SapphireLab <36986837+SapphireLab@users.noreply.github.com> Co-authored-by: starylan <starylan@outlook.com> Co-authored-by: shadow01a <141255649+shadow01a@users.noreply.github.com>
166 lines
7.0 KiB
Python
166 lines
7.0 KiB
Python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||
#
|
||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
# you may not use this file except in compliance with the License.
|
||
# You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing, software
|
||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
# See the License for the specific language governing permissions and
|
||
# limitations under the License.
|
||
import re
|
||
from typing import List
|
||
|
||
from .char_convert import tranditional_to_simplified
|
||
from .chronology import RE_DATE
|
||
from .chronology import RE_DATE2
|
||
from .chronology import RE_TIME
|
||
from .chronology import RE_TIME_RANGE
|
||
from .chronology import replace_date
|
||
from .chronology import replace_date2
|
||
from .chronology import replace_time
|
||
from .constants import F2H_ASCII_LETTERS
|
||
from .constants import F2H_DIGITS
|
||
from .constants import F2H_SPACE
|
||
from .num import RE_DECIMAL_NUM
|
||
from .num import RE_DEFAULT_NUM
|
||
from .num import RE_FRAC
|
||
from .num import RE_INTEGER
|
||
from .num import RE_NUMBER
|
||
from .num import RE_PERCENTAGE
|
||
from .num import RE_POSITIVE_QUANTIFIERS
|
||
from .num import RE_RANGE
|
||
from .num import RE_TO_RANGE
|
||
from .num import RE_ASMD
|
||
from .num import replace_default_num
|
||
from .num import replace_frac
|
||
from .num import replace_negative_num
|
||
from .num import replace_number
|
||
from .num import replace_percentage
|
||
from .num import replace_positive_quantifier
|
||
from .num import replace_range
|
||
from .num import replace_to_range
|
||
from .num import replace_asmd
|
||
from .phonecode import RE_MOBILE_PHONE
|
||
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
|
||
from .phonecode import RE_TELEPHONE
|
||
from .phonecode import replace_mobile
|
||
from .phonecode import replace_phone
|
||
from .quantifier import RE_TEMPERATURE
|
||
from .quantifier import replace_measure
|
||
from .quantifier import replace_temperature
|
||
|
||
|
||
class TextNormalizer:
    """Rule-based normalizer that splits text into sentences and verbalizes
    non-standard words (dates, times, numbers, phone numbers, symbols).

    The public entry point is :meth:`normalize`; :meth:`normalize_sentence`
    can be used directly on text that is already a single sentence.
    """

    # Symbols dropped before sentence splitting.
    _SPLIT_FILTER = re.compile(r'[——《》【】<>{}()()#&@“”^_|\\]')

    # Symbols dropped at the very end of normalization. Compared with
    # _SPLIT_FILTER this additionally removes '-' and '=', which must survive
    # until the number/range/arithmetic rules have run.
    _POST_FILTER = re.compile(r'[-——《》【】<=>{}()()#&@“”^_|\\]')

    # Single-character -> spoken-form table applied by _post_replace.
    # None of the replacement strings contains a character that is itself a
    # key here (or a member of _POST_FILTER), so one translate() pass yields
    # the same result as replacing the characters one after another.
    # NOTE: '~' is intentionally absent; tildes used as ranges are handled by
    # RE_TO_RANGE in normalize_sentence.
    _POST_TABLE = str.maketrans({
        '/': '每',
        # circled digits
        '①': '一',
        '②': '二',
        '③': '三',
        '④': '四',
        '⑤': '五',
        '⑥': '六',
        '⑦': '七',
        '⑧': '八',
        '⑨': '九',
        '⑩': '十',
        # Greek letters
        'α': '阿尔法',
        'β': '贝塔',
        'γ': '伽玛',
        'Γ': '伽玛',
        'δ': '德尔塔',
        'Δ': '德尔塔',
        'ε': '艾普西龙',
        'ζ': '捷塔',
        'η': '依塔',
        'θ': '西塔',
        'Θ': '西塔',
        'ι': '艾欧塔',
        'κ': '喀帕',
        'λ': '拉姆达',
        'Λ': '拉姆达',
        'μ': '缪',
        'ν': '拗',
        'ξ': '克西',
        'Ξ': '克西',
        'ο': '欧米克伦',
        'π': '派',
        'Π': '派',
        'ρ': '肉',
        'ς': '西格玛',
        'Σ': '西格玛',
        'σ': '西格玛',
        'τ': '套',
        'υ': '宇普西龙',
        'φ': '服艾',
        'Φ': '服艾',
        'χ': '器',
        'ψ': '普赛',
        'Ψ': '普赛',
        'ω': '欧米伽',
        'Ω': '欧米伽',
    })

    def __init__(self):
        # A sentence-ending punctuation mark, optionally followed by a
        # closing quote; captured so that it can be kept in the output.
        self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)')

    def _split(self, text: str, lang: str = "zh") -> List[str]:
        """Split long text into sentences with sentence-splitting punctuations.

        Args:
            text (str): The input text.
            lang (str): Language tag. For ``"zh"`` all ASCII spaces are
                removed before splitting; any other value keeps them.

        Returns:
            List[str]: Sentences, each stripped of surrounding whitespace and
                still carrying its trailing punctuation. An empty input
                yields ``['']``.
        """
        # Only for pure Chinese here
        if lang == "zh":
            text = text.replace(" ", "")
        # Filter out special characters.
        text = self._SPLIT_FILTER.sub('', text)
        # Put a newline after every splitting punctuation, then cut on runs
        # of newlines so that pre-existing line breaks also split sentences.
        text = self.SENTENCE_SPLITOR.sub(r'\1\n', text).strip()
        return [sentence.strip() for sentence in re.split(r'\n+', text)]

    def _post_replace(self, sentence: str) -> str:
        """Verbalize leftover symbols and drop the remaining special characters.

        Args:
            sentence (str): Sentence after all number-related rules have run.

        Returns:
            str: The sentence with '/', circled digits and Greek letters
                replaced by their Chinese readings and the characters matched
                by ``_POST_FILTER`` removed.
        """
        sentence = sentence.translate(self._POST_TABLE)
        return self._POST_FILTER.sub('', sentence)

    def normalize_sentence(self, sentence: str) -> str:
        """Normalize a single sentence.

        The rules are applied in a fixed order; earlier rules consume the
        more specific patterns so that the generic number rules at the end
        only see what is left.

        Args:
            sentence (str): One sentence of raw text.

        Returns:
            str: The normalized sentence.
        """
        # basic character conversions
        sentence = tranditional_to_simplified(sentence)
        sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
            F2H_DIGITS).translate(F2H_SPACE)

        # number related NSW verbalization
        sentence = RE_DATE.sub(replace_date, sentence)
        sentence = RE_DATE2.sub(replace_date2, sentence)

        # range first
        sentence = RE_TIME_RANGE.sub(replace_time, sentence)
        sentence = RE_TIME.sub(replace_time, sentence)

        # Handle the tilde '~' used as "to" (至) between values.
        sentence = RE_TO_RANGE.sub(replace_to_range, sentence)
        sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
        sentence = replace_measure(sentence)
        sentence = RE_FRAC.sub(replace_frac, sentence)
        sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
        sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)

        sentence = RE_TELEPHONE.sub(replace_phone, sentence)
        sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)

        sentence = RE_RANGE.sub(replace_range, sentence)

        # Handle addition, subtraction, multiplication and division.
        # A single sub() pass only rewrites non-overlapping matches, so the
        # substitution is repeated until nothing matches any more
        # (presumably for chained expressions -- confirm against RE_ASMD).
        while RE_ASMD.search(sentence):
            sentence = RE_ASMD.sub(replace_asmd, sentence)

        sentence = RE_INTEGER.sub(replace_negative_num, sentence)
        sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
        sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
                                               sentence)
        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
        sentence = RE_NUMBER.sub(replace_number, sentence)
        sentence = self._post_replace(sentence)

        return sentence

    def normalize(self, text: str) -> List[str]:
        """Split ``text`` into sentences and normalize each of them.

        Args:
            text (str): The input text.

        Returns:
            List[str]: Normalized sentences, in their original order.
        """
        sentences = self._split(text)
        return [self.normalize_sentence(sent) for sent in sentences]
|