diff --git a/test/e2e/__snapshots__/test_tts.ambr b/test/e2e/__snapshots__/test_tts.ambr index 091a376a..09f2fdf1 100644 --- a/test/e2e/__snapshots__/test_tts.ambr +++ b/test/e2e/__snapshots__/test_tts.ambr @@ -1,4 +1,4 @@ # serializer version: 1 # name: test_テキストとキャラクターIDから音声を合成できる - 'MD5:8f7ddc461c68542d4d8ef4cd5c54ca82' + 'MD5:015a2f6482f2a893c4c61e63c10993b3' # --- diff --git a/test/e2e/conftest.py b/test/e2e/conftest.py index 2cb3ed02..de503769 100644 --- a/test/e2e/conftest.py +++ b/test/e2e/conftest.py @@ -9,12 +9,15 @@ from voicevox_engine.aivm_manager import AivmManager from voicevox_engine.app.application import generate_app -from voicevox_engine.core.core_initializer import initialize_cores +from voicevox_engine.core.core_initializer import MOCK_VER, initialize_cores from voicevox_engine.engine_manifest import load_manifest from voicevox_engine.library.library_manager import LibraryManager from voicevox_engine.preset.preset_manager import PresetManager from voicevox_engine.setting.setting_manager import SettingHandler -from voicevox_engine.tts_pipeline.tts_engine import make_tts_engines_from_cores +from voicevox_engine.tts_pipeline.style_bert_vits2_tts_engine import ( + StyleBertVITS2TTSEngine, +) +from voicevox_engine.tts_pipeline.tts_engine import TTSEngineManager from voicevox_engine.user_dict.user_dict_manager import ( DEFAULT_DICT_DIR_PATH, UserDictionary, @@ -36,7 +39,11 @@ def _copy_under_dir(file_path: Path, dir_path: Path) -> Path: def app_params(tmp_path: Path) -> dict[str, Any]: aivm_manager = AivmManager(get_save_dir() / "Models") core_manager = initialize_cores(use_gpu=False, enable_mock=True) - tts_engines = make_tts_engines_from_cores(core_manager) + tts_engines = TTSEngineManager() + tts_engines.register_engine( + StyleBertVITS2TTSEngine(aivm_manager, False, False), + MOCK_VER, + ) setting_loader = SettingHandler(tmp_path / "not_exist.yaml") # テスト用に隔離されたプリセットを生成する diff --git a/test/e2e/single_api/engine_info/__snapshots__/test_supported_devices/test_get_supported_devices_200.json b/test/e2e/single_api/engine_info/__snapshots__/test_supported_devices/test_get_supported_devices_200.json index 8ca772ae..ebaa7746 100644 --- a/test/e2e/single_api/engine_info/__snapshots__/test_supported_devices/test_get_supported_devices_200.json +++ b/test/e2e/single_api/engine_info/__snapshots__/test_supported_devices/test_get_supported_devices_200.json @@ -1,5 +1,5 @@ { "cpu": true, - "cuda": true, + "cuda": false, "dml": false } diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_accent_phrases/test_post_accent_phrases_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_accent_phrases/test_post_accent_phrases_200.json index 9a61f7ac..a3683fd4 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_accent_phrases/test_post_accent_phrases_200.json +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_accent_phrases/test_post_accent_phrases_200.json @@ -5,43 +5,43 @@ "moras": [ { "consonant": "t", - "consonant_length": 2.31, - "pitch": 3.38, + "consonant_length": 0.0, + "pitch": 0.0, "text": "テ", "vowel": "e", - "vowel_length": 0.88 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 2.19, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", - "vowel": "U", - "vowel_length": 0.38 + "vowel": "u", + "vowel_length": 0.0 }, { "consonant": "t", - "consonant_length": 2.31, - "pitch": 4.19, + "consonant_length": 0.0, + "pitch": 0.0, "text": "ト", "vowel": "o", - "vowel_length": 1.88 + "vowel_length": 0.0 }, { "consonant": "d", - "consonant_length": 0.75, - "pitch": 1.62, + "consonant_length": 0.0, + "pitch": 0.0, "text": "デ", "vowel": "e", - "vowel_length": 0.88 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 2.19, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", - "vowel": "U", - "vowel_length": 0.38 + "vowel": "u", + "vowel_length": 0.0 } ], "pause_mora": null diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_audio_query/test_post_audio_query_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_audio_query/test_post_audio_query_200.json index 56ec0a2d..74bc14df 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_audio_query/test_post_audio_query_200.json +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_audio_query/test_post_audio_query_200.json @@ -6,43 +6,43 @@ "moras": [ { "consonant": "t", - "consonant_length": 2.31, - "pitch": 3.38, + "consonant_length": 0.0, + "pitch": 0.0, "text": "テ", "vowel": "e", - "vowel_length": 0.88 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 2.19, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", - "vowel": "U", - "vowel_length": 0.38 + "vowel": "u", + "vowel_length": 0.0 }, { "consonant": "t", - "consonant_length": 2.31, - "pitch": 4.19, + "consonant_length": 0.0, + "pitch": 0.0, "text": "ト", "vowel": "o", - "vowel_length": 1.88 + "vowel_length": 0.0 }, { "consonant": "d", - "consonant_length": 0.75, - "pitch": 1.62, + "consonant_length": 0.0, + "pitch": 0.0, "text": "デ", "vowel": "e", - "vowel_length": 0.88 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 2.19, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", - "vowel": "U", - "vowel_length": 0.38 + "vowel": "u", + "vowel_length": 0.0 } ], "pause_mora": null @@ -50,7 +50,7 @@ ], "intonationScale": 1.0, "kana": "テストです", - "outputSamplingRate": 24000, + "outputSamplingRate": 44100, "outputStereo": false, "pauseLength": null, "pauseLengthScale": 1.0, diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_audio_query_from_preset/test_post_audio_query_from_preset_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_audio_query_from_preset/test_post_audio_query_from_preset_200.json index 7f1d5805..29f055ef 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_audio_query_from_preset/test_post_audio_query_from_preset_200.json +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_audio_query_from_preset/test_post_audio_query_from_preset_200.json @@ -6,43 +6,43 @@ "moras": [ { "consonant": "t", - "consonant_length": 10001.31, - "pitch": 10002.38, + "consonant_length": 0.0, + "pitch": 0.0, "text": "テ", "vowel": "e", - "vowel_length": 9999.88 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 10001.19, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", - "vowel": "U", - "vowel_length": 9999.38 + "vowel": "u", + "vowel_length": 0.0 }, { "consonant": "t", - "consonant_length": 10001.31, - "pitch": 10003.19, + "consonant_length": 0.0, + "pitch": 0.0, "text": "ト", "vowel": "o", - "vowel_length": 10000.88 + "vowel_length": 0.0 }, { "consonant": "d", - "consonant_length": 9999.75, - "pitch": 10000.62, + "consonant_length": 0.0, + "pitch": 0.0, "text": "デ", "vowel": "e", - "vowel_length": 9999.88 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 10001.19, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", - "vowel": "U", - "vowel_length": 9999.38 + "vowel": "u", + "vowel_length": 0.0 } ], "pause_mora": null @@ -50,7 +50,7 @@ ], "intonationScale": 1.2, "kana": "テストです", - "outputSamplingRate": 24000, + "outputSamplingRate": 44100, "outputStereo": false, "pauseLength": 15.0, "pauseLengthScale": 1.4, diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_is_initialized_speaker/test_get_is_initialized_speaker_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_is_initialized_speaker/test_get_is_initialized_speaker_200.json index 27ba77dd..c508d536 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_is_initialized_speaker/test_get_is_initialized_speaker_200.json +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_is_initialized_speaker/test_get_is_initialized_speaker_200.json @@ -1 +1 @@ -true +false diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_data/test_post_mora_data_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_data/test_post_mora_data_200.json index 8882a2db..b551592d 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_data/test_post_mora_data_200.json +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_data/test_post_mora_data_200.json @@ -5,27 +5,27 @@ "moras": [ { "consonant": "t", - "consonant_length": 2.31, - "pitch": 3.38, + "consonant_length": 0.0, + "pitch": 0.0, "text": "テ", "vowel": "e", - "vowel_length": 0.88 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 2.19, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", "vowel": "U", - "vowel_length": 0.38 + "vowel_length": 0.0 }, { "consonant": "t", - "consonant_length": 2.31, - "pitch": 4.25, + "consonant_length": 0.0, + "pitch": 0.0, "text": "ト", "vowel": "o", - "vowel_length": 1.88 + "vowel_length": 0.0 } ], "pause_mora": null diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_length/test_post_mora_length_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_length/test_post_mora_length_200.json index 2fcfc63f..b551592d 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_length/test_post_mora_length_200.json +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_length/test_post_mora_length_200.json @@ -5,27 +5,27 @@ "moras": [ { "consonant": "t", - "consonant_length": 2.31, - "pitch": 3.3, + "consonant_length": 0.0, + "pitch": 0.0, "text": "テ", "vowel": "e", - "vowel_length": 0.88 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 2.19, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", "vowel": "U", - "vowel_length": 0.38 + "vowel_length": 0.0 }, { "consonant": "t", - "consonant_length": 2.31, - "pitch": 4.1, + "consonant_length": 0.0, + "pitch": 0.0, "text": "ト", "vowel": "o", - "vowel_length": 1.88 + "vowel_length": 0.0 } ], "pause_mora": null diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_pitch/test_post_mora_pitch_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_pitch/test_post_mora_pitch_200.json index 075eac34..b551592d 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_pitch/test_post_mora_pitch_200.json +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_mora_pitch/test_post_mora_pitch_200.json @@ -5,27 +5,27 @@ "moras": [ { "consonant": "t", - "consonant_length": 2.3, - "pitch": 3.38, + "consonant_length": 0.0, + "pitch": 0.0, "text": "テ", "vowel": "e", - "vowel_length": 0.8 + "vowel_length": 0.0 }, { "consonant": "s", - "consonant_length": 2.1, + "consonant_length": 0.0, "pitch": 0.0, "text": "ス", "vowel": "U", - "vowel_length": 0.3 + "vowel_length": 0.0 }, { "consonant": "t", - "consonant_length": 2.3, - "pitch": 4.25, + "consonant_length": 0.0, + "pitch": 0.0, "text": "ト", "vowel": "o", - "vowel_length": 1.8 + "vowel_length": 0.0 } ], "pause_mora": null diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_synthesis.ambr b/test/e2e/single_api/tts_pipeline/__snapshots__/test_synthesis.ambr index a8970cf0..a7180ae1 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_synthesis.ambr +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_synthesis.ambr @@ -1,7 +1,7 @@ # serializer version: 1 # name: test_post_synthesis_200 - 'MD5:f7d42ce5787856549abc3d2d7561c06f' + 'MD5:bb372629df13cbec280585d3d34dc217' # --- # name: test_post_synthesis_old_audio_query_200 - 'MD5:f7d42ce5787856549abc3d2d7561c06f' + 'MD5:f76a5639ff8327ccee59844624be2ef5' # --- diff --git a/test/e2e/single_api/tts_pipeline/test_accent_phrases.py b/test/e2e/single_api/tts_pipeline/test_accent_phrases.py index 5b1d6000..b44a099a 100644 --- a/test/e2e/single_api/tts_pipeline/test_accent_phrases.py +++ b/test/e2e/single_api/tts_pipeline/test_accent_phrases.py @@ -12,7 +12,7 @@ def test_post_accent_phrases_200( client: TestClient, snapshot_json: SnapshotAssertion ) -> None: response = client.post( - "/accent_phrases", params={"text": "テストです", "speaker": 0} + "/accent_phrases", params={"text": "テストです", "speaker": 888753760} ) assert response.status_code == 200 assert snapshot_json == round_floats(response.json(), 2) diff --git a/test/e2e/single_api/tts_pipeline/test_audio_query.py b/test/e2e/single_api/tts_pipeline/test_audio_query.py index e57ee66a..ba539900 100644 --- a/test/e2e/single_api/tts_pipeline/test_audio_query.py +++ b/test/e2e/single_api/tts_pipeline/test_audio_query.py @@ -11,6 +11,8 @@ def test_post_audio_query_200( client: TestClient, snapshot_json: SnapshotAssertion ) -> None: - response = client.post("/audio_query", params={"text": "テストです", "speaker": 0}) + response = client.post( + "/audio_query", params={"text": "テストです", "speaker": 888753760} + ) assert response.status_code == 200 assert snapshot_json == round_floats(response.json(), round_value=2) diff --git a/test/e2e/single_api/tts_pipeline/test_initialize_speaker.py b/test/e2e/single_api/tts_pipeline/test_initialize_speaker.py index 0aceeff4..48e3745a 100644 --- a/test/e2e/single_api/tts_pipeline/test_initialize_speaker.py +++ b/test/e2e/single_api/tts_pipeline/test_initialize_speaker.py @@ -9,6 +9,6 @@ def test_post_initialize_speaker_204( client: TestClient, snapshot: SnapshotAssertion ) -> None: - response = client.post("/initialize_speaker", params={"speaker": 0}) + response = client.post("/initialize_speaker", params={"speaker": 888753760}) assert response.status_code == 204 assert snapshot == response.content diff --git a/test/e2e/single_api/tts_pipeline/test_is_initialized_speaker.py b/test/e2e/single_api/tts_pipeline/test_is_initialized_speaker.py index 8da91b9e..dca40f64 100644 --- a/test/e2e/single_api/tts_pipeline/test_is_initialized_speaker.py +++ b/test/e2e/single_api/tts_pipeline/test_is_initialized_speaker.py @@ -9,6 +9,6 @@ def test_get_is_initialized_speaker_200( client: TestClient, snapshot_json: SnapshotAssertion ) -> None: - response = client.get("/is_initialized_speaker", params={"speaker": 0}) + response = client.get("/is_initialized_speaker", params={"speaker": 888753760}) assert response.status_code == 200 assert snapshot_json == response.json() diff --git a/test/e2e/single_api/tts_pipeline/test_mora_data.py b/test/e2e/single_api/tts_pipeline/test_mora_data.py index d5b97d7b..df7e2a41 100644 --- a/test/e2e/single_api/tts_pipeline/test_mora_data.py +++ b/test/e2e/single_api/tts_pipeline/test_mora_data.py @@ -15,15 +15,17 @@ def test_post_mora_data_200( accent_phrases = [ { "moras": [ - gen_mora("テ", "t", 2.3, "e", 0.8, 3.3), - gen_mora("ス", "s", 2.1, "U", 0.3, 0.0), - gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), + gen_mora("テ", "t", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ト", "t", 0.0, "o", 0.0, 0.0), ], "accent": 1, "pause_mora": None, "is_interrogative": False, } ] - response = client.post("/mora_data", params={"speaker": 0}, json=accent_phrases) + response = client.post( + "/mora_data", params={"speaker": 888753760}, json=accent_phrases + ) assert response.status_code == 200 assert snapshot_json == round_floats(response.json(), 2) diff --git a/test/e2e/single_api/tts_pipeline/test_mora_length.py b/test/e2e/single_api/tts_pipeline/test_mora_length.py index 33f6bb83..ed7d51f4 100644 --- a/test/e2e/single_api/tts_pipeline/test_mora_length.py +++ b/test/e2e/single_api/tts_pipeline/test_mora_length.py @@ -15,15 +15,17 @@ def test_post_mora_length_200( accent_phrases = [ { "moras": [ - gen_mora("テ", "t", 2.3, "e", 0.8, 3.3), - gen_mora("ス", "s", 2.1, "U", 0.3, 0.0), - gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), + gen_mora("テ", "t", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ト", "t", 0.0, "o", 0.0, 0.0), ], "accent": 1, "pause_mora": None, "is_interrogative": False, } ] - response = client.post("/mora_length", params={"speaker": 0}, json=accent_phrases) + response = client.post( + "/mora_length", params={"speaker": 888753760}, json=accent_phrases + ) assert response.status_code == 200 assert snapshot_json == round_floats(response.json(), 2) diff --git a/test/e2e/single_api/tts_pipeline/test_mora_pitch.py b/test/e2e/single_api/tts_pipeline/test_mora_pitch.py index c0fb5c9c..14b6b67a 100644 --- a/test/e2e/single_api/tts_pipeline/test_mora_pitch.py +++ b/test/e2e/single_api/tts_pipeline/test_mora_pitch.py @@ -15,15 +15,17 @@ def test_post_mora_pitch_200( accent_phrases = [ { "moras": [ - gen_mora("テ", "t", 2.3, "e", 0.8, 3.3), - gen_mora("ス", "s", 2.1, "U", 0.3, 0.0), - gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), + gen_mora("テ", "t", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ト", "t", 0.0, "o", 0.0, 0.0), ], "accent": 1, "pause_mora": None, "is_interrogative": False, } ] - response = client.post("/mora_pitch", params={"speaker": 0}, json=accent_phrases) + response = client.post( + "/mora_pitch", params={"speaker": 888753760}, json=accent_phrases + ) assert response.status_code == 200 assert snapshot_json == round_floats(response.json(), 2) diff --git a/test/e2e/single_api/tts_pipeline/test_multi_synthesis.py b/test/e2e/single_api/tts_pipeline/test_multi_synthesis.py index 50a8b50a..4d79dc66 100644 --- a/test/e2e/single_api/tts_pipeline/test_multi_synthesis.py +++ b/test/e2e/single_api/tts_pipeline/test_multi_synthesis.py @@ -13,9 +13,9 @@ def test_post_multi_synthesis_200(client: TestClient) -> None: "accent_phrases": [ { "moras": [ - gen_mora("テ", "t", 2.3, "e", 0.8, 3.3), - gen_mora("ス", "s", 2.1, "U", 0.3, 0.0), - gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), + gen_mora("テ", "t", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ト", "t", 0.0, "o", 0.0, 0.0), ], "accent": 1, "pause_mora": None, @@ -30,18 +30,18 @@ def test_post_multi_synthesis_200(client: TestClient) -> None: "postPhonemeLength": 0.1, "pauseLength": None, "pauseLengthScale": 1.0, - "outputSamplingRate": 24000, + "outputSamplingRate": 44100, "outputStereo": False, - "kana": "テ'_スト", + "kana": "テスト", }, { "accent_phrases": [ { "moras": [ - gen_mora("テ", "t", 2.3, "e", 0.8, 3.3), - gen_mora("ス", "s", 2.1, "U", 0.3, 0.0), - gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), - gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), + gen_mora("テ", "t", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ト", "t", 0.0, "o", 0.0, 0.0), + gen_mora("ト", "t", 0.0, "o", 0.0, 0.0), ], "accent": 1, "pause_mora": None, @@ -56,12 +56,14 @@ def test_post_multi_synthesis_200(client: TestClient) -> None: "postPhonemeLength": 0.1, "pauseLength": None, "pauseLengthScale": 1.0, - "outputSamplingRate": 24000, + "outputSamplingRate": 44100, "outputStereo": False, - "kana": "テ'_ストト", + "kana": "テストト", }, ] - response = client.post("/multi_synthesis", params={"speaker": 0}, json=queries) + response = client.post( + "/multi_synthesis", params={"speaker": 888753760}, json=queries + ) assert response.status_code == 200 # FileResponse 内の zip ファイルに圧縮された .wav から抽出された音声波形が一致する diff --git a/test/e2e/single_api/tts_pipeline/test_synthesis.py b/test/e2e/single_api/tts_pipeline/test_synthesis.py index 1827eba3..7bc34217 100644 --- a/test/e2e/single_api/tts_pipeline/test_synthesis.py +++ b/test/e2e/single_api/tts_pipeline/test_synthesis.py @@ -14,9 +14,9 @@ def test_post_synthesis_200(client: TestClient, snapshot: SnapshotAssertion) -> "accent_phrases": [ { "moras": [ - gen_mora("テ", "t", 2.3, "e", 0.8, 3.3), - gen_mora("ス", "s", 2.1, "U", 0.3, 0.0), - gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), + gen_mora("テ", "t", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ト", "t", 0.0, "o", 0.0, 0.0), ], "accent": 1, "pause_mora": None, @@ -31,11 +31,11 @@ def test_post_synthesis_200(client: TestClient, snapshot: SnapshotAssertion) -> "postPhonemeLength": 0.1, "pauseLength": None, "pauseLengthScale": 1.0, - "outputSamplingRate": 24000, + "outputSamplingRate": 44100, "outputStereo": False, - "kana": "テ'_スト", + "kana": "テスト", } - response = client.post("/synthesis", params={"speaker": 0}, json=query) + response = client.post("/synthesis", params={"speaker": 888753760}, json=query) assert response.status_code == 200 # 音声波形が一致する @@ -51,9 +51,9 @@ def test_post_synthesis_old_audio_query_200( "accent_phrases": [ { "moras": [ - gen_mora("テ", "t", 2.3, "e", 0.8, 3.3), - gen_mora("ス", "s", 2.1, "U", 0.3, 0.0), - gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), + gen_mora("テ", "t", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ト", "t", 0.0, "o", 0.0, 0.0), ], "accent": 1, "pause_mora": None, @@ -66,10 +66,10 @@ def test_post_synthesis_old_audio_query_200( "volumeScale": 1.0, "prePhonemeLength": 0.1, "postPhonemeLength": 0.1, - "outputSamplingRate": 24000, + "outputSamplingRate": 44100, "outputStereo": False, } - response = client.post("/synthesis", params={"speaker": 0}, json=query) + response = client.post("/synthesis", params={"speaker": 888753760}, json=query) assert response.status_code == 200 # 音声波形が一致する diff --git a/test/e2e/test_tts.py b/test/e2e/test_tts.py index 66c8e60a..87f315d9 100644 --- a/test/e2e/test_tts.py +++ b/test/e2e/test_tts.py @@ -13,12 +13,14 @@ def test_テキストとキャラクターIDから音声を合成できる( ) -> None: # テキストとキャラクター ID から AudioQuery を生成する audio_query_res = client.post( - "/audio_query", params={"text": "テストです", "speaker": 0} + "/audio_query", params={"text": "テストです", "speaker": 888753760} ) audio_query = audio_query_res.json() # AudioQuery から音声波形を生成する - synthesis_res = client.post("/synthesis", params={"speaker": 0}, json=audio_query) + synthesis_res = client.post( + "/synthesis", params={"speaker": 888753760}, json=audio_query + ) # リクエストが成功している assert synthesis_res.status_code == 200 diff --git a/tools/check_release_build.py b/tools/check_release_build.py index ecf7bbc5..c93ca9f6 100644 --- a/tools/check_release_build.py +++ b/tools/check_release_build.py @@ -47,19 +47,21 @@ def test_release_build(dist_dir: Path, skip_run_process: bool) -> None: # テキスト -> クエリ text = "こんにちは、音声合成の世界へようこそ" req = Request( - base_url + "audio_query?" + urlencode({"speaker": "1", "text": text}), + base_url + + "audio_query?" + + urlencode({"speaker": "888753760", "text": text}), method="POST", ) with urlopen(req) as res: query = json.loads(res.read().decode("utf-8")) # クエリ -> 音声 - # req = Request(base_url + "synthesis?speaker=1", method="POST") - # req.add_header("Content-Type", "application/json") - # req.data = json.dumps(query).encode("utf-8") - # with urlopen(req) as res: - # wave = res.read() - # soundfile.read(BytesIO(wave)) + req = Request(base_url + "synthesis?speaker=888753760", method="POST") + req.add_header("Content-Type", "application/json") + req.data = json.dumps(query).encode("utf-8") + with urlopen(req) as res: + wave = res.read() + soundfile.read(BytesIO(wave)) # エンジンマニフェスト req = Request(base_url + "engine_manifest", method="GET") diff --git a/voicevox_engine/tts_pipeline/style_bert_vits2_tts_engine.py b/voicevox_engine/tts_pipeline/style_bert_vits2_tts_engine.py index 9a8175e9..c070c140 100644 --- a/voicevox_engine/tts_pipeline/style_bert_vits2_tts_engine.py +++ b/voicevox_engine/tts_pipeline/style_bert_vits2_tts_engine.py @@ -502,7 +502,7 @@ def synthesize_wave( else: logger.warning("AudioQuery.kana is not specified. Using accent phrases instead.") # fmt: skip # 読み仮名 (カタカナのみ) のテキストを取得 - ## ひらがなの方がまだ抑揚の棒読み度がマシになるため、カタカナをひらがなに変換した上で句点を付ける + ## ひらがなの方がまだ抑揚の棒読み度がマシになるため、カタカナをひらがなに変換する flatten_moras = to_flatten_moras(query.accent_phrases) text = "".join([mora.text for mora in flatten_moras]) text = jaconv.kata2hira(text)