Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git, synced 2025-04-06 03:57:44 +08:00
Add japanese cleaner with prosody info.
Commit: a806bb20f9
Parent: 53817b983c
GPT_SoVITS/text/japanese.py

@@ -4,8 +4,8 @@ import sys
 
 import pyopenjtalk
 
 from text import symbols
 
 # Regular expression matching Japanese without punctuation marks:
 _japanese_characters = re.compile(
     r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
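As a quick sanity check on the character class (hypothetical REPL lines, assuming this module is imported), it matches kana, kanji, and full-width alphanumerics but not bare punctuation:

    bool(_japanese_characters.match("こんにちは"))  # True: hiragana
    bool(_japanese_characters.match("、。!?"))      # False: punctuation only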
@@ -71,7 +71,7 @@ def symbols_to_japanese(text):
     return text
 
 
-def preprocess_jap(text):
+def preprocess_jap(text, with_prosody=False):
     """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
     text = symbols_to_japanese(text)
     sentences = re.split(_japanese_marks, text)

@@ -79,10 +79,15 @@ def preprocess_jap(text):
     text = []
     for i, sentence in enumerate(sentences):
         if re.match(_japanese_characters, sentence):
-            p = pyopenjtalk.g2p(sentence)
-            text += p.split(" ")
+            if with_prosody:
+                text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
+            else:
+                p = pyopenjtalk.g2p(sentence)
+                text += p.split(" ")
         if i < len(marks):
+            if marks[i] == " ":  # prevent unexpected UNK
+                continue
             text += [marks[i].replace(" ", "")]
     return text
 
 
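For orientation, a minimal sketch of what the two branches of preprocess_jap now yield for the same greeting (assuming pyopenjtalk is installed; pyopenjtalk_g2p_prosody is the espnet-derived helper added in the next hunk):

    import pyopenjtalk

    # default branch: plain phonemes, unchanged behaviour
    pyopenjtalk.g2p("こんにちは")                 # 'k o N n i ch i w a'

    # prosody branch; [1:-1] strips the '^'/'$' sentence sentinels
    pyopenjtalk_g2p_prosody("こんにちは。")[1:-1]
    # ['k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a']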
@@ -91,16 +96,96 @@ def text_normalize(text):
     # todo: jap text normalize
     return text
 
 
-def g2p(norm_text):
-    phones = preprocess_jap(norm_text)
+# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> list[str]:
+    """Extract phoneme + prosody symbol sequence from input full-context labels.
+
+    The algorithm is based on `Prosodic features control by symbols as input of
+    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
+
+    Args:
+        text (str): Input text.
+        drop_unvoiced_vowels (bool): Whether to drop unvoiced vowels.
+
+    Returns:
+        List[str]: List of phoneme + prosody symbols.
+
+    Examples:
+        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
+        >>> pyopenjtalk_g2p_prosody("こんにちは。")
+        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
+
+    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
+        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
+
+    """
+    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
+    N = len(labels)
+
+    phones = []
+    for n in range(N):
+        lab_curr = labels[n]
+
+        # current phoneme
+        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
+        # treat unvoiced vowels as normal vowels
+        if drop_unvoiced_vowels and p3 in "AEIOU":
+            p3 = p3.lower()
+
+        # deal with sil at the beginning and the end of text
+        if p3 == "sil":
+            assert n == 0 or n == N - 1
+            if n == 0:
+                phones.append("^")
+            elif n == N - 1:
+                # check question form or not
+                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
+                if e3 == 0:
+                    phones.append("$")
+                elif e3 == 1:
+                    phones.append("?")
+            continue
+        elif p3 == "pau":
+            phones.append("_")
+            continue
+        else:
+            phones.append(p3)
+
+        # accent type and position info (forward or backward)
+        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
+        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
+        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
+
+        # number of mora in accent phrase
+        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
+
+        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
+        # accent phrase border
+        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
+            phones.append("#")
+        # pitch falling
+        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
+            phones.append("]")
+        # pitch rising
+        elif a2 == 1 and a2_next == 2:
+            phones.append("[")
+
+    return phones
+
+
+# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+def _numeric_feature_by_regex(regex, s):
+    match = re.search(regex, s)
+    if match is None:
+        return -50
+    return int(match.group(1))
+
+
+def g2p(norm_text, with_prosody=False):
+    phones = preprocess_jap(norm_text, with_prosody)
     phones = [post_replace_ph(i) for i in phones]
     # todo: implement tones and word2ph
     return phones
 
 
 if __name__ == "__main__":
-    for line in open("../../../Downloads/transcript_utf8.txt").readlines():
-        text = line.split(":")[1]
-        phones = g2p(text)
-        print(phones)
+    phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
+    print(phones)
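To make the helper's contract concrete, a hedged sketch (the label fragments below are illustrative stand-ins for pyopenjtalk's HTS-style full-context labels, not real output):

    # extracts the first captured group as an int; -50 is the
    # "feature not present in this label" sentinel used by the caller
    _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", "/A:-2+1+5/B:...")  # -> -2
    _numeric_feature_by_regex(r"!(\d+)_", "/A:xx+xx+xx/B:...")        # -> -50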
GPT_SoVITS/text/symbols.py

@@ -315,6 +315,10 @@ ja_symbols = [
     "w",
     "y",
     "z",
+    # "[",  # rising pitch pattern
+    # "]",  # falling pitch pattern
+    # "$",  # end marker
+    # "^",  # start marker
 ]
 
 arpa = {
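For reference, the full marker set that pyopenjtalk_g2p_prosody above can emit is: '^' sentence start, '$' declarative sentence end, '?' question end, '_' pause, '#' accent phrase border, '[' pitch rise, and ']' pitch fall. Only the four commented out here ('[', ']', '$', '^') are proposed for ja_symbols, and they are left disabled in this commit.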