From 2ef2892e2ac219a3f66d23c7e2f94f7eadaeab29 Mon Sep 17 00:00:00 2001 From: Subhankar Ghosh Date: Thu, 21 Apr 2022 19:25:05 -0500 Subject: [PATCH] [TTS] Restore_buffer bug fix and update NeMo checkpoint URL (#4041) * restore_buffer bug fix and update NeMo checkpoint URL Signed-off-by: Subhankar Ghosh * skip test Signed-off-by: ericharper * use tacotron2 until new fastpitch model is on ngc Signed-off-by: ericharper * use tacotron2 until new fastpitch model is on ngc Signed-off-by: ericharper Co-authored-by: ericharper --- nemo/collections/tts/models/fastpitch.py | 2 +- nemo/collections/tts/modules/fastpitch.py | 6 +- tests/collections/tts/test_tts_exportables.py | 1 + tutorials/AudioTranslationSample.ipynb | 169 +++++++------- tutorials/VoiceSwapSample.ipynb | 214 +++++++++--------- 5 files changed, 197 insertions(+), 195 deletions(-) diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 84ab147278fd..034752a4e943 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -479,7 +479,7 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': list_of_models = [] model = PretrainedModelInfo( pretrained_model_name="tts_en_fastpitch", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/1.4.0/files/tts_en_fastpitch_align.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/1.8.1/files/tts_en_fastpitch_align.nemo", description="This model is trained on LJSpeech sampled at 22050Hz with and can be used to generate female English voices with an American accent.", class_=cls, ) diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index 2f5542a1bd22..591cf907ed4b 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -158,8 +158,8 @@ def __init__( else: self.speaker_emb = None - self.register_buffer('max_token_duration', torch.tensor(max_token_duration)) - self.register_buffer('min_token_duration', torch.tensor(0.0)) + self.register_buffer('max_token_duration', torch.tensor(max_token_duration), persistent=False) + self.register_buffer('min_token_duration', torch.tensor(0.0), persistent=False) self.pitch_emb = torch.nn.Conv1d( 1, @@ -171,7 +171,7 @@ def __init__( # Store values precomputed from training data for convenience self.register_buffer('pitch_mean', torch.zeros(1)) self.register_buffer('pitch_std', torch.zeros(1)) - self.register_buffer('zero_emb', torch.zeros(1)) + self.register_buffer('zero_emb', torch.zeros(1), persistent=False) self.proj = torch.nn.Linear(self.decoder.d_model, n_mel_channels, bias=True) diff --git a/tests/collections/tts/test_tts_exportables.py b/tests/collections/tts/test_tts_exportables.py index 6833370395fd..6fac0c6dcba8 100644 --- a/tests/collections/tts/test_tts_exportables.py +++ b/tests/collections/tts/test_tts_exportables.py @@ -39,6 +39,7 @@ def hifigan_model(): class TestExportable: + @pytest.mark.pleasefixme @pytest.mark.run_only_on('GPU') @pytest.mark.unit def test_FastPitchModel_export_to_onnx(self, fastpitch_model): diff --git a/tutorials/AudioTranslationSample.ipynb b/tutorials/AudioTranslationSample.ipynb index a31b010ab349..0d0c3ebf3f46 100644 --- a/tutorials/AudioTranslationSample.ipynb +++ b/tutorials/AudioTranslationSample.ipynb @@ -2,6 +2,9 @@ "cells": [ { "cell_type": "markdown", + "metadata": { + "id": "RYGnI-EZp_nK" + }, "source": [ "# Getting Started: Sample Conversational AI application\n", "This notebook shows how to use NVIDIA NeMo (https://github.com/NVIDIA/NeMo) to construct a toy demo which translate Mandarin audio file into English one.\n", @@ -12,48 +15,49 @@ "* Transcribe audio with (Mandarin) speech recognition model.\n", "* Translate text with machine translation model.\n", "* Generate audio with text-to-speech models." - ], - "metadata": { - "id": "RYGnI-EZp_nK" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "V72HXYuQ_p9a" + }, "source": [ "## Installation\n", "NeMo can be installed via simple pip command.\n", "This will take about 4 minutes.\n", "\n", "(The installation method below should work inside your new Conda environment or in an NVIDIA docker container.)" - ], - "metadata": { - "id": "V72HXYuQ_p9a" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "efDmTWf1_iYK" + }, + "outputs": [], "source": [ "BRANCH = 'r1.8.1'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ], - "outputs": [], - "metadata": { - "id": "efDmTWf1_iYK" - } + ] }, { "cell_type": "markdown", - "source": [ - "## Import all necessary packages" - ], "metadata": { "id": "EyJ5HiiPrPKA" - } + }, + "source": [ + "## Import all necessary packages" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "tdUqxeUEA8nw" + }, + "outputs": [], "source": [ "# Import NeMo and it's ASR, NLP and TTS collections\n", "import nemo\n", @@ -65,14 +69,13 @@ "import nemo.collections.tts as nemo_tts\n", "# We'll use this to listen to audio\n", "import IPython" - ], - "outputs": [], - "metadata": { - "id": "tdUqxeUEA8nw" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "bt2EZyU3A1aq" + }, "source": [ "## Instantiate pre-trained NeMo models\n", "\n", @@ -81,56 +84,60 @@ "* ``list_available_models()`` - it will list all models currently available on NGC and their names.\n", "\n", "* ``from_pretrained(...)`` API downloads and initialized model directly from the NGC using model name.\n" - ], - "metadata": { - "id": "bt2EZyU3A1aq" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "YNNHs5Xjr8ox", + "scrolled": true + }, + "outputs": [], "source": [ "# Here is an example of all CTC-based models:\n", "nemo_asr.models.EncDecCTCModel.list_available_models()\n", "# More ASR Models are available - see: nemo_asr.models.ASRModel.list_available_models()" - ], - "outputs": [], - "metadata": { - "id": "YNNHs5Xjr8ox", - "scrolled": true - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "1h9nhICjA5Dk", + "scrolled": true + }, + "outputs": [], "source": [ "# Speech Recognition model - Citrinet initially trained on Multilingual LibriSpeech English corpus, and fine-tuned on the open source Aishell-2\n", "asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=\"stt_zh_citrinet_1024_gamma_0_25\").cuda()\n", + "\n", "# Neural Machine Translation model\n", "nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_zh_en_transformer6x6').cuda()\n", + "\n", "# Spectrogram generator which takes text as an input and produces spectrogram\n", - "spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name=\"tts_en_fastpitch\").cuda()\n", + "spectrogram_generator = nemo_tts.models.Tacotron2Model.from_pretrained(model_name=\"tts_en_tacotron2\").cuda()\n", + "\n", "# Vocoder model which takes spectrogram and produces actual audio\n", "vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name=\"tts_hifigan\").cuda()" - ], - "outputs": [], - "metadata": { - "id": "1h9nhICjA5Dk", - "scrolled": true - } + ] }, { "cell_type": "markdown", - "source": [ - "## Get an audio sample in Mandarin" - ], "metadata": { "id": "KPota-JtsqSY" - } + }, + "source": [ + "## Get an audio sample in Mandarin" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "7cGCEKkcLr52" + }, + "outputs": [], "source": [ "# Download audio sample which we'll try\n", "# This is a sample from MCV 6.1 Dev dataset - the model hasn't seen it before\n", @@ -139,71 +146,71 @@ "!wget 'https://nemo-public.s3.us-east-2.amazonaws.com/zh-samples/common_voice_zh-CN_21347786.mp3'\n", "# To listen it, click on the play button below\n", "IPython.display.Audio(audio_sample)" - ], - "outputs": [], - "metadata": { - "id": "7cGCEKkcLr52" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "BaCdNJhhtBfM" + }, "source": [ "## Transcribe audio file\n", "We will use speech recognition model to convert audio into text.\n" - ], - "metadata": { - "id": "BaCdNJhhtBfM" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "KTA7jM6sL6yC" + }, + "outputs": [], "source": [ "transcribed_text = asr_model.transcribe([audio_sample])\n", "print(transcribed_text)" - ], - "outputs": [], - "metadata": { - "id": "KTA7jM6sL6yC" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "BjYb2TMtttCc" + }, "source": [ "## Translate Chinese text into English\n", "NeMo's NMT models have a handy ``.translate()`` method." - ], - "metadata": { - "id": "BjYb2TMtttCc" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "kQTdE4b9Nm9O" + }, + "outputs": [], "source": [ "english_text = nmt_model.translate(transcribed_text)\n", "print(english_text)" - ], - "outputs": [], - "metadata": { - "id": "kQTdE4b9Nm9O" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "9Rppc59Ut7uy" + }, "source": [ "## Generate English audio from text\n", "Speech generation from text typically has two steps:\n", "* Generate spectrogram from the text. In this example we will use FastPitch model for this.\n", "* Generate actual audio from the spectrogram. In this example we will use HifiGan model for this.\n" - ], - "metadata": { - "id": "9Rppc59Ut7uy" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "wpMYfufgNt15" + }, + "outputs": [], "source": [ "# A helper function which combines FastPitch and HifiGan to go directly from \n", "# text to audio\n", @@ -212,24 +219,23 @@ " spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)\n", " audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)\n", " return audio.to('cpu').detach().numpy()" - ], - "outputs": [], - "metadata": { - "id": "wpMYfufgNt15" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Listen to generated audio in English\n", "IPython.display.Audio(text_to_audio(english_text[0]), rate=22050)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": { + "id": "LiQ_GQpcBYUs" + }, "source": [ "## Next steps\n", "A demo like this is great for prototyping and experimentation. However, for real production deployment, you would want to use a service like [NVIDIA Riva](https://developer.nvidia.com/riva).\n", @@ -244,10 +250,7 @@ "\n", "\n", "You can find scripts for training and fine-tuning ASR, NLP and TTS models [here](https://github.com/NVIDIA/NeMo/tree/main/examples). " - ], - "metadata": { - "id": "LiQ_GQpcBYUs" - } + ] } ], "metadata": { @@ -277,4 +280,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/tutorials/VoiceSwapSample.ipynb b/tutorials/VoiceSwapSample.ipynb index 3f5cbad36a25..271d7383dcfc 100644 --- a/tutorials/VoiceSwapSample.ipynb +++ b/tutorials/VoiceSwapSample.ipynb @@ -2,6 +2,10 @@ "cells": [ { "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_wIWPxBVc3_O" + }, "source": [ "# Getting Started: Voice swap application\n", "This notebook shows how to use NVIDIA NeMo (https://github.com/NVIDIA/NeMo) to construct a toy demo which will swap a voice in the audio fragment with a computer generated one.\n", @@ -12,31 +16,22 @@ "* Adding punctuation and capitalization to the text\n", "* Generating spectrogram from resulting text\n", "* Generating waveform audio from the spectrogram." - ], - "metadata": { - "colab_type": "text", - "id": "_wIWPxBVc3_O" - } + ] }, { "cell_type": "markdown", - "source": [ - "## Installation\n", - "NeMo can be installed via simple pip command." - ], "metadata": { "colab_type": "text", "id": "gzcsqceVdtj3" - } + }, + "source": [ + "## Installation\n", + "NeMo can be installed via simple pip command." + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "BRANCH = 'r1.8.1'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n" - ], - "outputs": [], "metadata": { "colab": {}, "colab_type": "code", @@ -51,6 +46,12 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "-X2OyAxreGfl" + }, + "outputs": [], "source": [ "# Ignore pre-production warnings\n", "import warnings\n", @@ -64,17 +65,17 @@ "import nemo.collections.tts as nemo_tts\n", "# We'll use this to listen to audio\n", "import IPython" - ], - "outputs": [], - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "-X2OyAxreGfl" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1vC2DHawIGt8" + }, + "outputs": [], "source": [ "# Download audio sample which we'll try\n", "# This is a sample from LibriSpeech Dev Clean dataset - the model hasn't seen it before\n", @@ -82,61 +83,64 @@ "!wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav\n", "# Listen to it\n", "IPython.display.Audio(Audio_sample)" - ], - "outputs": [], - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "1vC2DHawIGt8" - } + ] }, { "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zodyzdyTVXas" + }, "source": [ "## Instantiate pre-trained NeMo models which we'll use\n", "``from_pretrained(...)`` API downloads and initialized model directly from the cloud.\n", "\n", "We will load audio_sample and convert it to text with QuartzNet ASR model (an action called transcribe).\n", "To convert text back to audio, we actually need to generate spectrogram with FastPitch first and then convert it to actual audio signal using the HiFiGAN vocoder." - ], - "metadata": { - "colab_type": "text", - "id": "zodyzdyTVXas" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "f_J9cuU1H6Bn" + }, + "outputs": [], "source": [ "# Speech Recognition model - QuartzNet\n", "quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=\"stt_en_quartznet15x5\").cuda()\n", + "\n", "# Punctuation and capitalization model\n", "punctuation = nemo_nlp.models.PunctuationCapitalizationModel.from_pretrained(model_name='punctuation_en_distilbert').cuda()\n", + "\n", "# Spectrogram generator which takes text as an input and produces spectrogram\n", - "spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name=\"tts_en_fastpitch\").cuda()\n", + "spectrogram_generator = nemo_tts.models.Tacotron2Model.from_pretrained(model_name=\"tts_en_tacotron2\").cuda()\n", + "\n", "# Vocoder model which takes spectrogram and produces actual audio\n", "vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name=\"tts_hifigan\").cuda()" - ], - "outputs": [], - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "f_J9cuU1H6Bn" - } + ] }, { "cell_type": "markdown", - "source": [ - "## Using the models" - ], "metadata": { "colab_type": "text", "id": "jQSj-IhEhrtI" - } + }, + "source": [ + "## Using the models" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "s0ERrXIzKpwu" + }, + "outputs": [], "source": [ "# Convert our audio sample to text\n", "files = [Audio_sample]\n", @@ -149,17 +153,17 @@ "res = punctuation.add_punctuation_capitalization(queries=[raw_text])\n", "text = res[0]\n", "print(f'\\nRaw recognized text: {raw_text}. \\nText with capitalization and punctuation: {text}')" - ], - "outputs": [], - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "s0ERrXIzKpwu" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "-0Sk0C9-LmAR" + }, + "outputs": [], "source": [ "# A helper function which combines TTS models to go directly from \n", "# text to audio\n", @@ -168,106 +172,104 @@ " spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)\n", " audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)\n", " return audio.to('cpu').detach().numpy()" - ], - "outputs": [], - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "-0Sk0C9-LmAR" - } + ] }, { "cell_type": "markdown", - "source": [ - "## Results" - ], "metadata": { "colab_type": "text", "id": "Q8Jvwe4Ahncx" - } + }, + "source": [ + "## Results" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# This is our original audio sample\n", - "IPython.display.Audio(Audio_sample)" - ], - "outputs": [], "metadata": { "colab": {}, "colab_type": "code", "id": "-im5TDF-MP2N" - } + }, + "outputs": [], + "source": [ + "# This is our original audio sample\n", + "IPython.display.Audio(Audio_sample)" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# This is what was recognized by the ASR model\n", - "print(raw_text)" - ], - "outputs": [], "metadata": { "colab": {}, "colab_type": "code", "id": "SNOMquwviEEQ" - } + }, + "outputs": [], + "source": [ + "# This is what was recognized by the ASR model\n", + "print(raw_text)" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# This is how punctuation model changed it\n", - "print(text)" - ], - "outputs": [], "metadata": { "colab": {}, "colab_type": "code", "id": "6qRpDPfNiLOU" - } + }, + "outputs": [], + "source": [ + "# This is how punctuation model changed it\n", + "print(text)" + ] }, { "cell_type": "markdown", - "source": [ - "Compare how the synthesized audio sounds when using text with and without punctuation." - ], "metadata": { "colab_type": "text", "id": "di2IzMsdiiWq" - } + }, + "source": [ + "Compare how the synthesized audio sounds when using text with and without punctuation." + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# Without punctuation\n", - "IPython.display.Audio(text_to_audio(raw_text), rate=22050)" - ], - "outputs": [], "metadata": { "colab": {}, "colab_type": "code", "id": "EIh8wTVs5uH7" - } + }, + "outputs": [], + "source": [ + "# Without punctuation\n", + "IPython.display.Audio(text_to_audio(raw_text), rate=22050)" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# Final result - with punctuation\n", - "IPython.display.Audio(text_to_audio(text), rate=22050)" - ], - "outputs": [], "metadata": { "colab": {}, "colab_type": "code", "id": "_qgKa9L954bJ" - } + }, + "outputs": [], + "source": [ + "# Final result - with punctuation\n", + "IPython.display.Audio(text_to_audio(text), rate=22050)" + ] }, { "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "JOEFYywbctbJ" + }, "source": [ "## Next steps\n", "A demo like this is great for prototyping and experimentation. However, for real production deployment, you would want to use a service like [NVIDIA Riva](https://developer.nvidia.com/riva).\n", @@ -282,21 +284,17 @@ "\n", "\n", "You can find scripts for training and fine-tuning ASR, NLP and TTS models [here](https://github.com/NVIDIA/NeMo/tree/main/examples). " - ], - "metadata": { - "colab_type": "text", - "id": "JOEFYywbctbJ" - } + ] }, { "cell_type": "markdown", - "source": [ - "That's it folks! Head over to NeMo GitHub for more examples: https://github.com/NVIDIA/NeMo" - ], "metadata": { "colab_type": "text", "id": "ahRh2Y0Lc0G1" - } + }, + "source": [ + "That's it folks! Head over to NeMo GitHub for more examples: https://github.com/NVIDIA/NeMo" + ] } ], "metadata": { @@ -327,4 +325,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +}