ChasonJiang 29f22115fb
[fast_inference] 回退策略,减少padding影响,开放选项,同步代码 (#986)
* Update README

* Optimize-English-G2P

* docs: change awkward expression

* docs: update Changelog_KO.md

* Fix CN punc in EN,add 's match

* Adjust normalize and g2p logic

* Update zh_CN.json

* Update README (#827)

Update README.md
Update some outdated file paths and commands

* 修复英文多音字,调整字典热加载,新增姓名匹配 (#869)

* Fix homograph dict

* Add JSON in dict

* Adjust hot dict to hot reload

* Add English name dict

* Adjust get name dict logic

* Make API Great Again (#894)

* Add zh/jp/en mix

* Optimize code readability and formatted output.

* Try OGG streaming

* Add stream mode arg

* Add media type arg

* Add cut punc arg

* Eliminate punc risk

* Update README (#895)

* Update README

* Update README

* update README

* update README

* fix typo s/Licence /License (#904)

* fix reformat cmd (#917)

Co-authored-by: starylan <starylan@outlook.com>

* Update README.md

* Normalize chinese arithmetic operations (#947)

* 改变训练和推理时的mask策略,以修复当batch_size>1时,产生的复读现象

* 同步main分支代码,增加“保持随机”选项

* 在colab中运行colab_webui.ipynb发生的uvr5模型缺失问题 (#968)

在colab中使用git下载uvr5模型时报错:
fatal: destination path 'uvr5_weights' already exists and is not an empty directory.
通过在下载前将原本从本仓库下载的uvr5_weights文件夹删除可以解决问题。

* [ASR] 修复FasterWhisper遍历输入路径失败 (#956)

* remove glob

* rename

* reset mirror pos

* 回退mask策略;
回退pad策略;
在T2SBlock中添加padding_mask,以减少pad的影响;
开放repetition_penalty参数,让用户自行调整重复惩罚的强度;
增加parallel_infer参数,用于开启或关闭并行推理,关闭时与0307版本保持一致;
在webui中增加“保持随机”选项;
同步main分支代码。

* 删除无用注释

---------

Co-authored-by: Lion <drain.daters.0p@icloud.com>
Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Co-authored-by: KamioRinn <snowsdream@live.com>
Co-authored-by: Pengoose <pengoose_dev@naver.com>
Co-authored-by: Yuan-Man <68322456+Yuan-ManX@users.noreply.github.com>
Co-authored-by: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Co-authored-by: KamioRinn <63162909+KamioRinn@users.noreply.github.com>
Co-authored-by: Lion-Wu <130235128+Lion-Wu@users.noreply.github.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: SapphireLab <36986837+SapphireLab@users.noreply.github.com>
Co-authored-by: starylan <starylan@outlook.com>
Co-authored-by: shadow01a <141255649+shadow01a@users.noreply.github.com>
2024-04-19 14:35:28 +08:00

166 lines
7.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List
from .char_convert import tranditional_to_simplified
from .chronology import RE_DATE
from .chronology import RE_DATE2
from .chronology import RE_TIME
from .chronology import RE_TIME_RANGE
from .chronology import replace_date
from .chronology import replace_date2
from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
from .num import RE_INTEGER
from .num import RE_NUMBER
from .num import RE_PERCENTAGE
from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import RE_TO_RANGE
from .num import RE_ASMD
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
from .num import replace_number
from .num import replace_percentage
from .num import replace_positive_quantifier
from .num import replace_range
from .num import replace_to_range
from .num import replace_asmd
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
from .phonecode import RE_TELEPHONE
from .phonecode import replace_mobile
from .phonecode import replace_phone
from .quantifier import RE_TEMPERATURE
from .quantifier import replace_measure
from .quantifier import replace_temperature
class TextNormalizer():
    """Split Chinese text into sentences and verbalize non-standard words.

    ``normalize`` is the entry point: it splits the input with ``_split`` and
    passes every sentence through ``normalize_sentence``, which applies the
    date/time/number/phone/measure rewrite rules imported by this module and
    finishes with ``_post_replace``.
    """

    # Characters dropped from Chinese input by ``_split``.
    _SPLIT_FILTER = re.compile(r'[——《》【】<>{}()#&@“”^_|\\]')
    # Characters dropped at the end of ``_post_replace``: the ``_split`` set
    # plus "-" and "=".
    _POST_FILTER = re.compile(r'[-——《》【】<=>{}()#&@“”^_|\\]')

    # Single-pass symbol -> reading table used by ``_post_replace``.
    # NOTE(review): the single-character readings ('每', the Chinese numerals,
    # '缪', '拗', '派', '肉', '套', '器') were empty strings in the copy under
    # review; they are restored here following the upstream PaddleSpeech
    # normalizer -- confirm against the repository copy.
    _POST_REPLACE_TABLE = str.maketrans({
        '/': '每',
        # Circled digits U+2460..U+2469 -> Chinese numerals one..ten. They are
        # generated from code points so the keys cannot be lost to
        # Unicode-stripping tools.
        **{chr(0x2460 + offset): numeral
           for offset, numeral in enumerate('一二三四五六七八九十')},
        'α': '阿尔法',
        'β': '贝塔',
        'γ': '伽玛', 'Γ': '伽玛',
        'δ': '德尔塔', 'Δ': '德尔塔',
        'ε': '艾普西龙',
        'ζ': '捷塔',
        'η': '依塔',
        'θ': '西塔', 'Θ': '西塔',
        'ι': '艾欧塔',
        'κ': '喀帕',
        'λ': '拉姆达', 'Λ': '拉姆达',
        'μ': '缪',
        'ν': '拗',
        'ξ': '克西', 'Ξ': '克西',
        'ο': '欧米克伦',
        'π': '派', 'Π': '派',
        'ρ': '肉',
        'ς': '西格玛', 'Σ': '西格玛', 'σ': '西格玛',
        'τ': '套',
        'υ': '宇普西龙',
        'φ': '服艾', 'Φ': '服艾',
        'χ': '器',
        'ψ': '普赛', 'Ψ': '普赛',
        'ω': '欧米伽', 'Ω': '欧米伽',
    })

    def __init__(self):
        # Sentence-ending punctuation, optionally followed by a closing quote.
        # Both the ASCII and the fullwidth forms of : , ; ? ! are listed; the
        # fullwidth ones are written as \uXXXX escapes (U+FF1A, U+FF0C,
        # U+FF1B, U+FF1F, U+FF01) so that width-normalizing tools cannot
        # silently fold them into their ASCII twins.
        self.SENTENCE_SPLITOR = re.compile(
            r'([:\uff1a、,\uff0c;\uff1b。?\uff1f!\uff01][”’]?)')

    def _split(self, text: str, lang="zh") -> List[str]:
        """Split long text into sentences with sentence-splitting punctuations.

        Args:
            text (str): The input text.
            lang (str): When "zh" (the default), spaces and the characters in
                ``_SPLIT_FILTER`` are removed before splitting.

        Returns:
            List[str]: Stripped sentences, each keeping its trailing
                punctuation. An empty input yields ``['']``.
        """
        # Only for pure Chinese here
        if lang == "zh":
            text = text.replace(" ", "")
            # Filter out special characters.
            text = self._SPLIT_FILTER.sub('', text)
        # Put a newline after every sentence-ending punctuation mark, then
        # split on the newlines.
        text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
        text = text.strip()
        return [piece.strip() for piece in re.split(r'\n+', text)]

    def _post_replace(self, sentence: str) -> str:
        """Verbalize leftover symbols and drop special characters.

        Replaces "/", circled digits and Greek letters with the Chinese text
        listed in ``_POST_REPLACE_TABLE``, then removes every character
        matched by ``_POST_FILTER``.

        Args:
            sentence (str): A sentence that already went through the numeric
                rewrite rules.

        Returns:
            str: The sentence with the symbols replaced or removed.
        """
        # "~" is intentionally not handled here; normalize_sentence rewrites
        # it through RE_TO_RANGE.
        sentence = sentence.translate(self._POST_REPLACE_TABLE)
        # Same filter as in _split, plus "-" and "=".
        return self._POST_FILTER.sub('', sentence)

    def normalize_sentence(self, sentence: str) -> str:
        """Apply all rewrite rules to one sentence, in a fixed order.

        Args:
            sentence (str): A single sentence.

        Returns:
            str: The normalized sentence.
        """
        # basic character conversions
        sentence = tranditional_to_simplified(sentence)
        sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
            F2H_DIGITS).translate(F2H_SPACE)

        # number related NSW verbalization
        sentence = RE_DATE.sub(replace_date, sentence)
        sentence = RE_DATE2.sub(replace_date2, sentence)

        # range first
        sentence = RE_TIME_RANGE.sub(replace_time, sentence)
        sentence = RE_TIME.sub(replace_time, sentence)

        # Handle the tilde "~" used as a range marker ("至").
        sentence = RE_TO_RANGE.sub(replace_to_range, sentence)
        sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
        sentence = replace_measure(sentence)
        sentence = RE_FRAC.sub(replace_frac, sentence)
        sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
        sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)

        sentence = RE_TELEPHONE.sub(replace_phone, sentence)
        sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)

        sentence = RE_RANGE.sub(replace_range, sentence)

        # Handle add / subtract / multiply / divide; re-apply until RE_ASMD
        # no longer matches.
        while RE_ASMD.search(sentence):
            sentence = RE_ASMD.sub(replace_asmd, sentence)

        sentence = RE_INTEGER.sub(replace_negative_num, sentence)
        sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
        sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
                                               sentence)
        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
        sentence = RE_NUMBER.sub(replace_number, sentence)
        sentence = self._post_replace(sentence)

        return sentence

    def normalize(self, text: str) -> List[str]:
        """Split ``text`` into sentences and normalize each of them.

        Args:
            text (str): The input text.

        Returns:
            List[str]: One normalized string per sentence.
        """
        sentences = self._split(text)
        return [self.normalize_sentence(sent) for sent in sentences]