|
from typing import List, Union |
|
from unittest import TestCase |
|
from unittest.mock import Mock |
|
|
|
import numpy |
|
|
|
from voicevox_engine.model import AccentPhrase, AudioQuery, Mora |
|
from voicevox_engine.synthesis_engine import SynthesisEngine |
|
|
|
|
|
def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray):
    """Deterministic stand-in for the core's ``yukarin_s_forward``.

    The arithmetic carries no meaning; it only has to give reproducible
    numbers that the tests can compare against.

    Args:
        length: Number of leading entries of ``phoneme_list`` to process.
        phoneme_list: Numeric phoneme values, read as ``phoneme_list[i]``.
        speaker_id: Speaker id, either a plain number or a size-1 array.

    Returns:
        A 1-D ``numpy.ndarray`` holding ``length`` floats, each
        ``phoneme_list[i] * 0.0625 + speaker_id`` rounded to two decimals.

    Raises:
        TypeError: If ``speaker_id`` holds anything other than one element.
    """
    return numpy.array(
        [
            # When speaker_id is a 1-D size-1 array the sum is one too, and
            # float() on an array with ndim > 0 is deprecated since NumPy 1.25.
            # Squeezing to 0-d first keeps the same value without the warning.
            round(float(numpy.squeeze(phoneme_list[i] * 0.0625 + speaker_id)), 2)
            for i in range(length)
        ]
    )
|
|
|
|
|
def yukarin_sa_mock(
    length: int,
    vowel_phoneme_list: numpy.ndarray,
    consonant_phoneme_list: numpy.ndarray,
    start_accent_list: numpy.ndarray,
    end_accent_list: numpy.ndarray,
    start_accent_phrase_list: numpy.ndarray,
    end_accent_phrase_list: numpy.ndarray,
    speaker_id: numpy.ndarray,
):
    """Deterministic stand-in for the core's ``yukarin_sa_forward``.

    The arithmetic carries no meaning; it only has to give reproducible
    numbers that the tests can compare against.

    Args:
        length: Number of positions to process.
        vowel_phoneme_list: Read as ``[0][i]``, like the five lists below.
        consonant_phoneme_list: See ``vowel_phoneme_list``.
        start_accent_list: See ``vowel_phoneme_list``.
        end_accent_list: See ``vowel_phoneme_list``.
        start_accent_phrase_list: See ``vowel_phoneme_list``.
        end_accent_phrase_list: See ``vowel_phoneme_list``.
        speaker_id: Speaker id, either a plain number or a size-1 array.

    Returns:
        A ``numpy.ndarray`` of shape ``(1, length)``. Entry ``i`` is the sum
        of the six inputs at ``[0][i]`` times 0.0625, plus ``speaker_id``,
        rounded to two decimals.

    Raises:
        TypeError: If ``speaker_id`` holds anything other than one element.
    """
    result = []
    for i in range(length):
        feature_sum = (
            vowel_phoneme_list[0][i]
            + consonant_phoneme_list[0][i]
            + start_accent_list[0][i]
            + end_accent_list[0][i]
            + start_accent_phrase_list[0][i]
            + end_accent_phrase_list[0][i]
        )
        # When speaker_id is a 1-D size-1 array the sum is one too, and
        # float() on an array with ndim > 0 is deprecated since NumPy 1.25.
        # Squeezing to 0-d first keeps the same value without the warning.
        value = float(numpy.squeeze(feature_sum * 0.0625 + speaker_id))
        result.append(round(value, 2))
    # Add the leading axis so the output is indexed like the inputs.
    return numpy.array(result)[numpy.newaxis]
|
|
|
|
|
def decode_mock(
    length: int,
    phoneme_size: int,
    f0: numpy.ndarray,
    phoneme: numpy.ndarray,
    speaker_id: Union[numpy.ndarray, int],
):
    """Deterministic stand-in for the core's ``decode_forward``.

    The arithmetic carries no meaning; it only has to give reproducible
    numbers. Each of the ``length`` frames contributes 256 identical samples.

    Args:
        length: Number of frames to process.
        phoneme_size: Divisor applied to the index of the entry equal to 1.
        f0: Read as ``f0[i][0]`` for each frame ``i``.
        phoneme: One row per frame. Each row must contain exactly one entry
            equal to 1 (presumably a one-hot phoneme encoding).
        speaker_id: Speaker id, either a plain number or a size-1 array.

    Returns:
        A 1-D ``numpy.ndarray`` of ``length * 256`` floats.

    Raises:
        TypeError: If a processed row of ``phoneme`` does not contain exactly
            one entry equal to 1, or ``speaker_id`` holds more than one element.
    """
    samples_per_frame = 256
    result = []
    for i in range(length):
        hot_index = numpy.where(phoneme[i] == 1)[0]
        # hot_index is always 1-D, so the expression is an array with ndim > 0.
        # float() on such an array is deprecated since NumPy 1.25; squeezing a
        # size-1 array to 0-d first keeps the same value without the warning.
        # The value is constant for the frame, so it is computed only once.
        sample = float(
            numpy.squeeze(f0[i][0] * (hot_index / phoneme_size) + speaker_id)
        )
        result.extend([sample] * samples_per_frame)
    return numpy.array(result)
|
|
|
|
|
def koreha_arimasuka_base_expected():
    """Build the baseline accent phrases the tests expect for "これはありますか".

    The result has two phrases ("コレワ" and "アリマスカ"), both with accent 3,
    no pause mora and ``is_interrogative=False``. New objects are created on
    every call, so callers can mutate the result freely.

    Returns:
        A list of two ``AccentPhrase`` objects.
    """
    # One row per mora:
    # (text, consonant, consonant_length, vowel, vowel_length, pitch)
    rows_per_phrase = (
        (
            ("コ", "k", 2.44, "o", 2.88, 4.38),
            ("レ", "r", 3.06, "e", 1.88, 4.0),
            ("ワ", "w", 3.62, "a", 1.44, 4.19),
        ),
        (
            ("ア", None, None, "a", 1.44, 1.44),
            ("リ", "r", 3.06, "i", 2.31, 4.44),
            ("マ", "m", 2.62, "a", 1.44, 3.12),
            ("ス", "s", 3.19, "U", 1.38, 0.0),
            ("カ", "k", 2.44, "a", 1.44, 2.94),
        ),
    )

    accent_phrases = []
    for rows in rows_per_phrase:
        moras = [
            Mora(
                text=text,
                consonant=consonant,
                consonant_length=consonant_length,
                vowel=vowel,
                vowel_length=vowel_length,
                pitch=pitch,
            )
            for text, consonant, consonant_length, vowel, vowel_length, pitch in rows
        ]
        accent_phrases.append(
            AccentPhrase(
                moras=moras,
                accent=3,
                pause_mora=None,
                is_interrogative=False,
            )
        )
    return accent_phrases
|
|
|
|
|
def create_mock_query(accent_phrases):
    """Wrap ``accent_phrases`` in an ``AudioQuery`` with fixed settings.

    Args:
        accent_phrases: Accent phrases to place in the query.

    Returns:
        An ``AudioQuery`` holding ``accent_phrases`` and the constant
        settings listed below.
    """
    fixed_settings = {
        "speedScale": 1,
        "pitchScale": 0,
        "intonationScale": 1,
        "volumeScale": 1,
        "prePhonemeLength": 0.1,
        "postPhonemeLength": 0.1,
        "outputSamplingRate": 24000,
        "outputStereo": False,
        "kana": "",
    }
    return AudioQuery(accent_phrases=accent_phrases, **fixed_settings)
|
|
|
|
|
class MockCore:
    """Minimal stand-in for the core object passed to ``SynthesisEngine``."""

    # Class-level mocks, shared by every MockCore instance. Each call is
    # forwarded to the matching module-level mock function via side_effect.
    yukarin_s_forward = Mock(side_effect=yukarin_s_mock)
    yukarin_sa_forward = Mock(side_effect=yukarin_sa_mock)
    decode_forward = Mock(side_effect=decode_mock)

    def is_model_loaded(self, speaker_id) -> bool:
        """Report the model as loaded for any ``speaker_id``."""
        return True

    def supported_devices(self) -> str:
        """Return an empty string as the supported-devices value."""
        return ""

    def metas(self) -> str:
        """Return an empty string as the metas value."""
        return ""
|
|
|
|
|
class TestSynthesisEngineBase(TestCase):
    """Tests for where ``SynthesisEngine`` applies interrogative mora handling.

    ``create_accent_phrases`` is expected only to flag the last accent phrase
    as interrogative. ``synthesis`` is expected to append the extra (upspeak)
    mora when ``enable_interrogative_upspeak`` is true.
    (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)
    """

    def setUp(self):
        """Create an engine on ``MockCore`` with ``_synthesis_impl`` mocked out."""
        super().setUp()
        self.synthesis_engine = SynthesisEngine(
            core=MockCore(),
        )
        # Replaced with a Mock so the tests can read back the query that
        # ``synthesis`` passes on.
        self.synthesis_engine._synthesis_impl = Mock()

    def create_accent_phrases_test_base(self, text: str, expected: List[AccentPhrase]):
        """Assert that ``create_accent_phrases(text, 1)`` equals ``expected``.

        Args:
            text: Input text given to the engine.
            expected: Accent phrases the engine is expected to return.
        """
        actual = self.synthesis_engine.create_accent_phrases(text, 1)
        self.assertEqual(
            expected,
            actual,
            "case(text:" + text + ")",
        )

    def create_synthesis_test_base(
        self,
        text: str,
        expected: List[AccentPhrase],
        enable_interrogative_upspeak: bool,
    ):
        """Verify whether interrogative mora handling is applied at synthesis time.

        (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)

        Args:
            text: Input text given to ``create_accent_phrases``.
            expected: Accent phrases expected in the query passed to the
                mocked ``_synthesis_impl``.
            enable_interrogative_upspeak: Flag forwarded to ``synthesis``.
        """
        accent_phrases = self.synthesis_engine.create_accent_phrases(text, 1)
        query = create_mock_query(accent_phrases=accent_phrases)
        self.synthesis_engine.synthesis(
            query, 0, enable_interrogative_upspeak=enable_interrogative_upspeak
        )

        # First positional argument of the latest call to the mocked backend.
        actual = self.synthesis_engine._synthesis_impl.call_args[0][0].accent_phrases

        self.assertEqual(
            expected,
            actual,
            "case(text:" + text + ")",
        )

    @staticmethod
    def _single_mora_phrase(
        text, consonant, consonant_length, vowel, vowel_length, pitch
    ):
        """Return a fresh one-phrase list holding one non-interrogative mora."""
        return [
            AccentPhrase(
                moras=[
                    Mora(
                        text=text,
                        consonant=consonant,
                        consonant_length=consonant_length,
                        vowel=vowel,
                        vowel_length=vowel_length,
                        pitch=pitch,
                    )
                ],
                accent=1,
                pause_mora=None,
                is_interrogative=False,
            )
        ]

    @staticmethod
    def _as_question(accent_phrases, upspeak=None):
        """Flag the last phrase as interrogative, optionally adding a mora.

        Args:
            accent_phrases: Phrases to modify in place.
            upspeak: ``None`` to add nothing, or a ``(text, vowel)`` pair for
                the extra mora. Its length is 0.15 and its pitch is the
                previous last mora's pitch plus 0.3.

        Returns:
            The same ``accent_phrases`` list, for convenient inline use.
        """
        last_phrase = accent_phrases[-1]
        last_phrase.is_interrogative = True
        if upspeak is not None:
            mora_text, mora_vowel = upspeak
            last_phrase.moras += [
                Mora(
                    text=mora_text,
                    consonant=None,
                    consonant_length=None,
                    vowel=mora_vowel,
                    vowel_length=0.15,
                    pitch=last_phrase.moras[-1].pitch + 0.3,
                )
            ]
        return accent_phrases

    def test_create_accent_phrases(self):
        """Creating accent phrases must not apply interrogative mora handling.

        (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)
        """
        expected = koreha_arimasuka_base_expected()
        expected[-1].is_interrogative = True
        self.create_accent_phrases_test_base(text="これはありますか?", expected=expected)

    def test_synthesis_interrogative(self):
        """Check the accent phrases passed to the backend for each scenario.

        Each scenario runs in its own ``subTest`` so one failure does not
        hide the results of the others.
        """
        koreha = koreha_arimasuka_base_expected
        question = self._as_question

        def nn():
            return self._single_mora_phrase(
                text="ン",
                consonant=None,
                consonant_length=None,
                vowel="N",
                vowel_length=1.25,
                pitch=1.44,
            )

        def ltu():
            return self._single_mora_phrase(
                text="ッ",
                consonant=None,
                consonant_length=None,
                vowel="cl",
                vowel_length=1.69,
                pitch=0.0,
            )

        def su():
            return self._single_mora_phrase(
                text="ス",
                consonant="s",
                consonant_length=3.19,
                vowel="u",
                vowel_length=3.5,
                pitch=5.94,
            )

        # (text, enable_interrogative_upspeak, expected accent phrases)
        cases = [
            ("これはありますか?", True, question(koreha(), upspeak=("ア", "a"))),
            ("これはありますか?", False, question(koreha())),
            ("これはありますか", True, koreha()),
            ("ん", True, nn()),
            ("ん?", True, question(nn(), upspeak=("ン", "N"))),
            ("ん?", False, question(nn())),
            ("っ", True, ltu()),
            # "ッ" is expected to gain no extra mora even with upspeak enabled.
            ("っ?", True, question(ltu())),
            ("っ?", False, question(ltu())),
            ("す", True, su()),
            ("す?", True, question(su(), upspeak=("ウ", "u"))),
            ("す?", False, question(su())),
        ]

        for text, enable_upspeak, expected in cases:
            with self.subTest(text=text, enable_interrogative_upspeak=enable_upspeak):
                self.create_synthesis_test_base(
                    text=text,
                    expected=expected,
                    enable_interrogative_upspeak=enable_upspeak,
                )
|
|