From d14ec6d56197b0f35535c16723aa36c3b6bce95d Mon Sep 17 00:00:00 2001 From: yonishelach Date: Tue, 31 Dec 2024 09:35:10 +0200 Subject: [PATCH] [text to audio generator] Replaced bark with openai tts models --- text_to_audio_generator/function.yaml | 88 ++++++++---------- text_to_audio_generator/item.yaml | 7 +- text_to_audio_generator/requirements.txt | 5 +- .../test_text_to_audio_generator.py | 18 ++-- .../text_to_audio_generator.ipynb | 12 +-- .../text_to_audio_generator.py | 91 +++++++++++++------ 6 files changed, 122 insertions(+), 99 deletions(-) diff --git a/text_to_audio_generator/function.yaml b/text_to_audio_generator/function.yaml index 88ef9cb89..65d8d82aa 100644 --- a/text_to_audio_generator/function.yaml +++ b/text_to_audio_generator/function.yaml @@ -1,32 +1,28 @@ -kind: job -metadata: - name: text-to-audio-generator - tag: '' - hash: 89fcaf3fab53e7b7fbba448a5e65c253d7fa66ed - project: '' - labels: - author: yonatans - categories: - - data-preparation - - machine-learning - - pytorch spec: - command: '' - args: [] image: '' + default_handler: generate_multi_speakers_audio build: - functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import pathlib
import random
import tempfile
from typing import Dict, List, Optional, Tuple, Union

import bark
import numpy as np
import pandas as pd
import torch
import torchaudio
import tqdm

# Get the global logger. `logging.getLogger()` without a name returns the root logger; it serves as the default
# until `generate_multi_speakers_audio` rebinds this name to the result of `_get_logger()`:
_LOGGER = logging.getLogger()


def generate_multi_speakers_audio(
    data_path: str,
    speakers: Union[List[str], Dict[str, int]],
    available_voices: List[str],
    output_directory: str = None,
    use_gpu: bool = True,
    use_small_models: bool = False,
    offload_cpu: bool = False,
    sample_rate: int = 16000,
    file_format: str = "wav",
    verbose: bool = True,
    bits_per_sample: Optional[int] = None,
) -> Tuple[str, pd.DataFrame, dict]:
    """
    Generate audio files from text files.

    Every text file is read line by line. A line is expected to be in the format "<speaker>: <text>"; lines that do
    not contain ": " are skipped. A failure while processing a file does not stop the run - the exception message is
    collected into the returned errors dictionary under the file name.

    :param data_path:           Path to the text file or directory containing the text files to generate audio from.
    :param speakers:            List / Dict of speakers to generate audio for.
                                If a list is given, all the speakers are written one after the other into a single
                                channel.
                                If a dictionary is given, the keys are the speakers and the values are the channels:
                                each speaker gets its own channel and the channels are ordered by the given values.
    :param available_voices:    List of available voices to use for the generation. A different voice is randomly
                                picked for each speaker per file, so at least as many voices as speakers are needed.
                                See here for the available voices:
                        https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
    :param output_directory:    Path to the directory to save the generated audio files to. If not given, a temporary
                                directory is created.
    :param use_gpu:             Whether to use the GPU for the generation.
    :param use_small_models:    Whether to use the small models for the generation.
    :param offload_cpu:         To reduce the memory footprint, the models can be offloaded to the CPU after loading.
    :param sample_rate:         The sampling rate of the generated audio.
    :param file_format:         The format of the generated audio files.
    :param verbose:             Whether to print the progress of the generation.
    :param bits_per_sample:     Changes the bit depth for the supported formats.
                                Supported only in "wav" or "flac" formats.

    :returns:                   A tuple of:
                                - The output directory path.
                                - The generated audio files dataframe.
                                - The errors dictionary.
    """

    global _LOGGER
    _LOGGER = _get_logger()
    # Get the input text files to turn to audio:
    data_path = pathlib.Path(data_path).absolute()
    text_files = _get_text_files(data_path=data_path)

    # Load the bark models according to the given configurations:
    bark.preload_models(
        text_use_gpu=use_gpu,
        text_use_small=use_small_models,
        coarse_use_gpu=use_gpu,
        coarse_use_small=use_small_models,
        fine_use_gpu=use_gpu,
        fine_use_small=use_small_models,
        codec_use_gpu=use_gpu,
        force_reload=offload_cpu,
    )

    # Check for per channel generation:
    if isinstance(speakers, dict):
        speaker_per_channel = True
        # Sort the given speakers by channels:
        speakers = {
            speaker: channel
            for speaker, channel in sorted(speakers.items(), key=lambda item: item[1])
        }
    else:
        speaker_per_channel = False

    # Prepare the resampling module:
    resampler = torchaudio.transforms.Resample(
        orig_freq=bark.SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
    )

    # Prepare the gap between each speaker (half a second of silence in the generation sample rate):
    gap_between_speakers = np.zeros(int(0.5 * bark.SAMPLE_RATE))

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Create the output directory:
    if output_directory is None:
        output_directory = tempfile.mkdtemp()
    output_directory = pathlib.Path(output_directory)
    if not output_directory.exists():
        output_directory.mkdir(exist_ok=True, parents=True)

    # Start generating audio:
    # Go over the text files and generate audio:
    for text_file in tqdm.tqdm(
        text_files, desc="Generating", unit="file", disable=not verbose
    ):

        try:
            # Randomize voices for each speaker (a chosen voice is removed so it won't be used twice in a file):
            chosen_voices = {}
            available_voices_copy = available_voices.copy()
            for speaker in speakers:
                voice = random.choice(available_voices_copy)
                chosen_voices[speaker] = voice
                available_voices_copy.remove(voice)
            # Read text:
            with open(text_file, "r") as fp:
                text = fp.read()
            # Prepare a holder for all the generated pieces (if per channel each speaker will have its own):
            audio_pieces = (
                {speaker: [] for speaker in speakers}
                if speaker_per_channel
                else {"all": []}
            )

            # Generate audio per line:
            for line in text.splitlines():
                # Validate line is in correct speaker format:

                if ": " not in line:
                    if verbose:
                        _LOGGER.warning(f"Skipping line: {line}")
                    continue
                # Split line to speaker and his words:
                current_speaker, sentences = line.split(": ", 1)
                # Validate speaker is known:
                if current_speaker not in speakers:
                    raise ValueError(
                        f"Unknown speaker: {current_speaker}. Given speakers are: {speakers}"
                    )
                for sentence in _split_line(line=sentences):
                    # Generate words audio:
                    audio = bark.generate_audio(
                        sentence,
                        history_prompt=chosen_voices[current_speaker],
                        silent=True,
                    )
                    if speaker_per_channel:
                        # Keep all channels aligned: the speaking channel gets the audio, the rest get silence of
                        # the same length.
                        silence = np.zeros_like(audio)
                        for speaker in audio_pieces:
                            if speaker == current_speaker:
                                audio_pieces[speaker] += [audio, gap_between_speakers]
                            else:
                                audio_pieces[speaker] += [silence, gap_between_speakers]
                    else:
                        audio_pieces["all"] += [audio, gap_between_speakers]
            # Construct a single audio array from all the pieces and channels. The holder's own keys are iterated
            # (and not the speakers) because in the single channel mode the only key is "all" - indexing it by the
            # speakers' names would raise a `KeyError`. In the per channel mode the keys are the sorted speakers.
            audio = np.vstack(
                [np.concatenate(pieces) for pieces in audio_pieces.values()]
            ).astype(dtype=np.float32)
            # Resample:
            audio = torch.from_numpy(audio)
            audio = resampler(audio)
            # Save to audio file:
            audio_file = output_directory / f"{text_file.stem}.{file_format}"

            torchaudio.save(
                uri=str(audio_file),
                src=audio,
                sample_rate=sample_rate,
                format=file_format,
                bits_per_sample=bits_per_sample,
            )

            # Collect to the successes:
            successes.append([text_file.name, audio_file.name])
        except Exception as exception:
            # Note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{text_file.name}'")
            print(exception)
            errors[text_file.name] = str(exception)

    # Construct the successes dataframe:
    successes = pd.DataFrame(
        successes,
        columns=["text_file", "audio_file"],
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Translations summary:\n"
            f"{successes.head()}"
        )
    return str(output_directory), successes, errors


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _split_line(line: str, max_length: int = 250) -> List[str]:
    if len(line) < max_length:
        return [line]

    sentences = [
        f"{sentence.strip()}." for sentence in line.split(".") if sentence.strip()
    ]

    splits = []
    current_length = len(sentences[0])
    split = sentences[0]
    for sentence in sentences[1:]:
        if current_length + len(sentence) > max_length:
            splits.append(split)
            split = sentence
            current_length = len(sentence)
        else:
            current_length += len(sentence)
            split += " " + sentence
    if split:
        splits.append(split)

    return splits


def _get_logger():
    """
    Get the logger to use: the MLRun context logger if MLRun is installed, otherwise the module's global logger.

    :returns: The logger of the MLRun context named "mlrun", or `_LOGGER` when importing MLRun fails with a
              `ModuleNotFoundError`.
    """
    try:
        import mlrun

        # MLRun is available, use its context logger:
        return mlrun.get_or_create_ctx(name="mlrun").logger
    except ModuleNotFoundError:
        pass
    # MLRun is not installed, fall back to the global logger:
    return _LOGGER
 - base_image: mlrun/mlrun - commands: [] - code_origin: '' - origin_filename: '' + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import os
import pathlib
import random
import tempfile
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import openai
import pandas as pd
import torch
import torchaudio
import tqdm
from pydub import AudioSegment

# Get the global logger. `logging.getLogger()` without a name returns the root logger; it serves as the default
# until `generate_multi_speakers_audio` rebinds this name to the result of `_get_logger()`:
_LOGGER = logging.getLogger()

# Names of the environment variables (and, as a fallback, of the MLRun secrets) that `_get_openai_client` reads
# the OpenAI credentials and endpoint from:
OPENAI_API_KEY = "OPENAI_API_KEY"
OPENAI_BASE_URL = "OPENAI_BASE_URL"
# The sample rate (Hz) the generated audio is treated as having: used as the resampler's original frequency and for
# sizing the silence gap between speakers.
# NOTE(review): presumably the fixed output rate of the OpenAI TTS models - confirm against the API documentation.
SAMPLE_RATE = 24000


def generate_multi_speakers_audio(
    data_path: str,
    speakers: Union[List[str], Dict[str, int]],
    available_voices: List[str],
    output_directory: str = None,
    model: str = "tts-1",
    sample_rate: int = 16000,
    file_format: str = "wav",
    verbose: bool = True,
    bits_per_sample: Optional[int] = None,
    speed: float = 1.0,
) -> Tuple[str, pd.DataFrame, dict]:
    """
    Generate audio files from text files.

    Every text file is read line by line. A line is expected to be in the format "<speaker>: <text>"; lines that do
    not contain ": " are skipped. A failure while processing a file does not stop the run - the exception message is
    collected into the returned errors dictionary under the file name.

    :param data_path:           Path to the text file or directory containing the text files to generate audio from.
    :param speakers:            List / Dict of speakers to generate audio for.
                                If a list is given, all the speakers are written one after the other into a single
                                channel.
                                If a dictionary is given, the keys are the speakers and the values are the channels:
                                each speaker gets its own channel and the channels are ordered by the given values.
    :param available_voices:    List of available voices to use for the generation. A different voice is randomly
                                picked for each speaker per file, so at least as many voices as speakers are needed.
                                See here for the available voices:
                        https://platform.openai.com/docs/guides/text-to-speech#voice-options
    :param output_directory:    Path to the directory to save the generated audio files to. If not given, a temporary
                                directory is created.
    :param model:               Which model to use for the generation.
    :param sample_rate:         The sampling rate of the generated audio.
    :param file_format:         The format of the generated audio files. It is used both as the format requested
                                from the OpenAI API and as the format of the saved files.
    :param verbose:             Whether to print the progress of the generation.
    :param bits_per_sample:     Changes the bit depth for the supported formats.
                                Supported only in "wav" or "flac" formats.
    :param speed:               The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default.

    :returns:                   A tuple of:
                                - The output directory path.
                                - The generated audio files dataframe.
                                - The errors' dictionary.
    """

    global _LOGGER
    _LOGGER = _get_logger()
    # Get the input text files to turn to audio:
    data_path = pathlib.Path(data_path).absolute()
    text_files = _get_text_files(data_path=data_path)

    # connect to openai client:
    client = _get_openai_client()

    # Check for per channel generation:
    if isinstance(speakers, dict):
        speaker_per_channel = True
        # Sort the given speakers by channels:
        speakers = {
            speaker: channel
            for speaker, channel in sorted(speakers.items(), key=lambda item: item[1])
        }
    else:
        speaker_per_channel = False

    # Prepare the resampling module:
    resampler = torchaudio.transforms.Resample(
        orig_freq=SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
    )

    # Prepare the gap between each speaker (half a second of silence in the generation sample rate):
    gap_between_speakers = np.zeros(int(0.5 * SAMPLE_RATE))

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Create the output directory:
    if output_directory is None:
        output_directory = tempfile.mkdtemp()
    output_directory = pathlib.Path(output_directory)
    if not output_directory.exists():
        output_directory.mkdir(exist_ok=True, parents=True)

    # Start generating audio:
    # Go over the text files and generate audio:
    for text_file in tqdm.tqdm(
        text_files, desc="Generating", unit="file", disable=not verbose
    ):

        try:
            # Randomize voices for each speaker (a chosen voice is removed so it won't be used twice in a file):
            chosen_voices = {}
            available_voices_copy = available_voices.copy()
            for speaker in speakers:
                voice = random.choice(available_voices_copy)
                chosen_voices[speaker] = voice
                available_voices_copy.remove(voice)
            # Read text:
            with open(text_file, "r") as fp:
                text = fp.read()
            # Prepare a holder for all the generated pieces (if per channel each speaker will have its own):
            audio_pieces = (
                {speaker: [] for speaker in speakers}
                if speaker_per_channel
                else {"all": []}
            )

            # Generate audio per line:
            for line in text.splitlines():
                # Validate line is in correct speaker format:

                if ": " not in line:
                    if verbose:
                        _LOGGER.warning(f"Skipping line: {line}")
                    continue
                # Split line to speaker and his words:
                current_speaker, sentences = line.split(": ", 1)
                # Validate speaker is known:
                if current_speaker not in speakers:
                    raise ValueError(
                        f"Unknown speaker: {current_speaker}. Given speakers are: {speakers}"
                    )
                for sentence in _split_line(line=sentences):
                    # Generate words audio:
                    response = client.audio.speech.create(
                        model=model,
                        input=sentence,
                        voice=chosen_voices[current_speaker],
                        response_format=file_format,
                        speed=speed,
                    )
                    # Decode the returned bytes into an array of samples:
                    audio = _bytes_to_np_array(
                        audio=response.content, file_format=file_format
                    )

                    if speaker_per_channel:
                        # Keep all channels aligned: the speaking channel gets the audio, the rest get silence of
                        # the same length.
                        silence = np.zeros_like(audio)
                        for speaker in audio_pieces:
                            if speaker == current_speaker:
                                audio_pieces[speaker] += [audio, gap_between_speakers]
                            else:
                                audio_pieces[speaker] += [silence, gap_between_speakers]
                    else:
                        audio_pieces["all"] += [audio, gap_between_speakers]
            # Construct a single audio array from all the pieces and channels. The holder's own keys are iterated
            # (and not the speakers) because in the single channel mode the only key is "all" - indexing it by the
            # speakers' names would raise a `KeyError`. In the per channel mode the keys are the sorted speakers.
            audio = np.vstack(
                [np.concatenate(pieces) for pieces in audio_pieces.values()]
            ).astype(dtype=np.float32)
            # Resample:
            audio = torch.from_numpy(audio)
            audio = resampler(audio)
            # Save to audio file:
            audio_file = output_directory / f"{text_file.stem}.{file_format}"

            torchaudio.save(
                uri=str(audio_file),
                src=audio,
                sample_rate=sample_rate,
                format=file_format,
                bits_per_sample=bits_per_sample,
            )

            # Collect to the successes:
            successes.append([text_file.name, audio_file.name])
        except Exception as exception:
            # Note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{text_file.name}'")
            print(exception)
            errors[text_file.name] = str(exception)

    # Construct the successes dataframe:
    successes = pd.DataFrame(
        successes,
        columns=["text_file", "audio_file"],
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Translations summary:\n"
            f"{successes.head()}"
        )
    return str(output_directory), successes, errors


def _get_openai_client():
    """
    Create an OpenAI client out of the configured API key and base URL.

    Both values are read from the environment variables named by `OPENAI_API_KEY` and `OPENAI_BASE_URL`. If any of
    them is missing, the missing ones are looked up in the secrets of the MLRun context. Values that were found in
    the environment are kept and never replaced by the secrets lookup.

    :returns: The OpenAI client.

    :raises EnvironmentError: If a value is missing from the environment variables and MLRun is not installed, so
                              the secrets cannot be read.
    """
    api_key = os.getenv(OPENAI_API_KEY)
    base_url = os.getenv(OPENAI_BASE_URL)
    # Check if the values are already in the environment variables:
    if not api_key or not base_url:
        try:
            import mlrun

            context = mlrun.get_or_create_ctx(name="context")
            # Look in the secrets only for what the environment did not provide:
            api_key = api_key or context.get_secret(OPENAI_API_KEY)
            base_url = base_url or context.get_secret(OPENAI_BASE_URL)
        except ModuleNotFoundError as error:
            raise EnvironmentError(
                f"One or more of the OpenAI required environment variables ('{OPENAI_API_KEY}', "
                f"'{OPENAI_BASE_URL}') are missing. "
                f"Please set them as environment variables or install mlrun (`pip install mlrun`) "
                f"and set them as project secrets using `project.set_secrets`."
            ) from error
    return openai.OpenAI(api_key=api_key, base_url=base_url)


def _bytes_to_np_array(audio: bytes, file_format: str):
    if file_format == "mp3":
        audio_segment = AudioSegment.from_mp3(io.BytesIO(audio))

        # Convert to raw PCM audio data
        samples = audio_segment.get_array_of_samples()

        # Convert to numpy array
        audio_array = np.array(samples)

        # Normalize to float between -1 and 1
        return audio_array.astype(np.float32) / np.iinfo(samples.typecode).max
    else:
        return np.frombuffer(audio, dtype=np.int16) / 32768.0


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _split_line(line: str, max_length: int = 250) -> List[str]:
    if len(line) < max_length:
        return [line]

    sentences = [
        f"{sentence.strip()}." for sentence in line.split(".") if sentence.strip()
    ]

    splits = []
    current_length = len(sentences[0])
    split = sentences[0]
    for sentence in sentences[1:]:
        if current_length + len(sentence) > max_length:
            splits.append(split)
            split = sentence
            current_length = len(sentence)
        else:
            current_length += len(sentence)
            split += " " + sentence
    if split:
        splits.append(split)

    return splits


def _get_logger():
    """
    Get the logger to use: the MLRun context logger if MLRun is installed, otherwise the module's global logger.

    :returns: The logger of the MLRun context named "mlrun", or `_LOGGER` when importing MLRun fails with a
              `ModuleNotFoundError`.
    """
    try:
        import mlrun

        # MLRun is available, use its context logger:
        return mlrun.get_or_create_ctx(name="mlrun").logger
    except ModuleNotFoundError:
        pass
    # MLRun is not installed, fall back to the global logger:
    return _LOGGER
 requirements: - - bark + - openai - torchaudio + - pydub + origin_filename: '' + base_image: mlrun/mlrun + code_origin: '' + command: '' + disable_auto_mount: false + description: Generate audio file from text using different speakers entry_points: generate_multi_speakers_audio: - name: generate_multi_speakers_audio + has_varargs: false doc: Generate audio files from text files. + name: generate_multi_speakers_audio + outputs: + - doc: 'A tuple of: - The output directory path. - The generated audio files + dataframe. - The errors'' dictionary.' + type: Tuple[str, pd.DataFrame, dict] + has_kwargs: false parameters: - name: data_path type: str @@ -40,24 +36,15 @@ spec: - name: available_voices type: List[str] doc: 'List of available voices to use for the generation. See here for the - available voices: https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c' + available voices: https://platform.openai.com/docs/guides/text-to-speech#voice-options' - name: output_directory type: str doc: Path to the directory to save the generated audio files to. default: null - - name: use_gpu - type: bool - doc: Whether to use the GPU for the generation. - default: true - - name: use_small_models - type: bool - doc: Whether to use the small models for the generation. - default: false - - name: offload_cpu - type: bool - doc: To reduce the memory footprint, the models can be offloaded to the CPU - after loading. - default: false + - name: model + type: str + doc: Which model to use for the generation. + default: tts-1 - name: sample_rate type: int doc: The sampling rate of the generated audio. @@ -75,21 +62,18 @@ spec: doc: Changes the bit depth for the supported formats. Supported only in "wav" or "flac" formats. default: null - outputs: - - doc: 'A tuple of: - The output directory path. - The generated audio files - dataframe. - The errors dictionary.' 
- type: Tuple[str, pd.DataFrame, dict] - lineno: 31 - has_varargs: false - has_kwargs: false - description: Generate audio file from text using different speakers - default_handler: generate_multi_speakers_audio - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} + - name: speed + type: float + doc: The speed of the generated audio. Select a value from `0.25` to `4.0`. + `1.0` is the default. + default: 1.0 + lineno: 38 +kind: job +metadata: + categories: + - data-preparation + - machine-learning + - pytorch + tag: '' + name: text-to-audio-generator verbose: false diff --git a/text_to_audio_generator/item.yaml b/text_to_audio_generator/item.yaml index efa8afc90..3a6af1e7e 100644 --- a/text_to_audio_generator/item.yaml +++ b/text_to_audio_generator/item.yaml @@ -13,7 +13,7 @@ labels: author: yonatans maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.1 +mlrunVersion: 1.7.1 name: text_to_audio_generator platformVersion: 3.5.3 spec: @@ -22,8 +22,9 @@ spec: image: mlrun/mlrun kind: job requirements: - - bark + - openai - torchaudio + - pydub url: '' -version: 1.2.0 +version: 1.3.0 test_valid: True diff --git a/text_to_audio_generator/requirements.txt b/text_to_audio_generator/requirements.txt index 36f17cd61..63dee64df 100644 --- a/text_to_audio_generator/requirements.txt +++ b/text_to_audio_generator/requirements.txt @@ -1,2 +1,3 @@ -bark -torchaudio>=2.1.0 \ No newline at end of file +openai>=1.58.0 +torchaudio>=2.1.0 +pydub \ No newline at end of file diff --git a/text_to_audio_generator/test_text_to_audio_generator.py b/text_to_audio_generator/test_text_to_audio_generator.py index 87ffe1496..94fd8c098 100644 --- a/text_to_audio_generator/test_text_to_audio_generator.py +++ b/text_to_audio_generator/test_text_to_audio_generator.py @@ -12,11 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the 
License. -import mlrun +import os import tempfile + +import mlrun import pytest +@pytest.mark.skipif( + condition=os.getenv("OPENAI_BASE_URL") is None + and os.getenv("OPENAI_API_KEY") is None, + reason="OpenAI API key and base URL are required to run this test", +) @pytest.mark.parametrize("file_format,bits_per_sample", [("wav", 8), ("mp3", None)]) def test_generate_multi_speakers_audio(file_format, bits_per_sample): text_to_audio_generator_function = mlrun.import_function("function.yaml") @@ -28,12 +35,9 @@ def test_generate_multi_speakers_audio(file_format, bits_per_sample): "output_directory": test_directory, "speakers": {"Agent": 0, "Client": 1}, "available_voices": [ - "v2/en_speaker_0", - "v2/en_speaker_1", + "alloy", + "echo", ], - "use_small_models": True, - "use_gpu": False, - "offload_cpu": True, "file_format": file_format, "bits_per_sample": bits_per_sample, }, @@ -45,6 +49,6 @@ def test_generate_multi_speakers_audio(file_format, bits_per_sample): ], artifact_path=test_directory, ) - assert function_run.error == "Run state (completed) is not in error state" + assert function_run.error == "" for key in ["audio_files", "audio_files_dataframe", "text_to_speech_errors"]: assert key in function_run.outputs and function_run.outputs[key] is not None diff --git a/text_to_audio_generator/text_to_audio_generator.ipynb b/text_to_audio_generator/text_to_audio_generator.ipynb index 268fe2efb..a70882a44 100644 --- a/text_to_audio_generator/text_to_audio_generator.ipynb +++ b/text_to_audio_generator/text_to_audio_generator.ipynb @@ -31,10 +31,7 @@ "id": "bb20c4a6-f362-40e6-8f73-9145953959ec", "metadata": {}, "outputs": [], - "source": [ - "import mlrun\n", - "import tempfile" - ] + "source": "import mlrun" }, { "cell_type": "code", @@ -322,12 +319,9 @@ " \"output_directory\": \"./out\",\n", " \"speakers\": {\"Agent\": 0, \"Client\": 1},\n", " \"available_voices\": [\n", - " \"v2/en_speaker_0\",\n", - " \"v2/en_speaker_1\",\n", + " \"alloy\",\n", + " \"echo\",\n", " 
],\n", - " \"use_small_models\": True,\n", - " \"use_gpu\": False,\n", - " \"offload_cpu\": True,\n", " \"file_format\": \"mp3\",\n", " # \"bits_per_sample\": 8,\n", " },\n", diff --git a/text_to_audio_generator/text_to_audio_generator.py b/text_to_audio_generator/text_to_audio_generator.py index 7602745ee..d47d6b865 100644 --- a/text_to_audio_generator/text_to_audio_generator.py +++ b/text_to_audio_generator/text_to_audio_generator.py @@ -11,35 +11,41 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import io import logging +import os import pathlib import random import tempfile from typing import Dict, List, Optional, Tuple, Union -import bark import numpy as np +import openai import pandas as pd import torch import torchaudio import tqdm +from pydub import AudioSegment # Get the global logger: _LOGGER = logging.getLogger() +OPENAI_API_KEY = "OPENAI_API_KEY" +OPENAI_BASE_URL = "OPENAI_BASE_URL" +SAMPLE_RATE = 24000 + def generate_multi_speakers_audio( data_path: str, speakers: Union[List[str], Dict[str, int]], available_voices: List[str], output_directory: str = None, - use_gpu: bool = True, - use_small_models: bool = False, - offload_cpu: bool = False, + model: str = "tts-1", sample_rate: int = 16000, file_format: str = "wav", verbose: bool = True, bits_per_sample: Optional[int] = None, + speed: float = 1.0, ) -> Tuple[str, pd.DataFrame, dict]: """ Generate audio files from text files. @@ -50,21 +56,20 @@ def generate_multi_speakers_audio( If dictionary, the keys will be the speakers and the values will be the channels. :param available_voices: List of available voices to use for the generation. 
See here for the available voices: - https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c + https://platform.openai.com/docs/guides/text-to-speech#voice-options :param output_directory: Path to the directory to save the generated audio files to. - :param use_gpu: Whether to use the GPU for the generation. - :param use_small_models: Whether to use the small models for the generation. - :param offload_cpu: To reduce the memory footprint, the models can be offloaded to the CPU after loading. + :param model: Which model to use for the generation. :param sample_rate: The sampling rate of the generated audio. :param file_format: The format of the generated audio files. :param verbose: Whether to print the progress of the generation. :param bits_per_sample: Changes the bit depth for the supported formats. Supported only in "wav" or "flac" formats. + :param speed: The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default. :returns: A tuple of: - The output directory path. - The generated audio files dataframe. - - The errors dictionary. + - The errors' dictionary. 
""" global _LOGGER @@ -73,17 +78,8 @@ def generate_multi_speakers_audio( data_path = pathlib.Path(data_path).absolute() text_files = _get_text_files(data_path=data_path) - # Load the bark models according to the given configurations: - bark.preload_models( - text_use_gpu=use_gpu, - text_use_small=use_small_models, - coarse_use_gpu=use_gpu, - coarse_use_small=use_small_models, - fine_use_gpu=use_gpu, - fine_use_small=use_small_models, - codec_use_gpu=use_gpu, - force_reload=offload_cpu, - ) + # connect to openai client: + client = _get_openai_client() # Check for per channel generation: if isinstance(speakers, dict): @@ -98,11 +94,11 @@ def generate_multi_speakers_audio( # Prepare the resampling module: resampler = torchaudio.transforms.Resample( - orig_freq=bark.SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32 + orig_freq=SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32 ) # Prepare the gap between each speaker: - gap_between_speakers = np.zeros(int(0.5 * bark.SAMPLE_RATE)) + gap_between_speakers = np.zeros(int(0.5 * SAMPLE_RATE)) # Prepare the successes dataframe and errors dictionary to be returned: successes = [] @@ -156,11 +152,16 @@ def generate_multi_speakers_audio( ) for sentence in _split_line(line=sentences): # Generate words audio: - audio = bark.generate_audio( - sentence, - history_prompt=chosen_voices[current_speaker], - silent=True, + audio = client.audio.speech.create( + model=model, + input=sentence, + voice=chosen_voices[current_speaker], + response_format=file_format, + speed=speed, ) + audio = audio.content + audio = _bytes_to_np_array(audio=audio, file_format=file_format) + if speaker_per_channel: silence = np.zeros_like(audio) for speaker in audio_pieces.keys(): @@ -214,6 +215,43 @@ def generate_multi_speakers_audio( return str(output_directory), successes, errors +def _get_openai_client(): + api_key = os.getenv(OPENAI_API_KEY) + base_url = os.getenv(OPENAI_BASE_URL) + # Check if the key is already in the environment variables: + if 
not api_key or not base_url: + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="context") + # Check if the key is in the secrets: + api_key = context.get_secret(OPENAI_API_KEY) + base_url = context.get_secret(OPENAI_BASE_URL) + except ModuleNotFoundError: + raise EnvironmentError( + f"One or more of the OpenAI required environment variables ('{OPENAI_API_KEY}', '{OPENAI_BASE_URL}') are missing." + f"Please set them as environment variables or install mlrun (`pip install mlrun`)" + f"and set them as project secrets using `project.set_secrets`." + ) + return openai.OpenAI(api_key=api_key, base_url=base_url) + + +def _bytes_to_np_array(audio: bytes, file_format: str): + if file_format == "mp3": + audio_segment = AudioSegment.from_mp3(io.BytesIO(audio)) + + # Convert to raw PCM audio data + samples = audio_segment.get_array_of_samples() + + # Convert to numpy array + audio_array = np.array(samples) + + # Normalize to float between -1 and 1 + return audio_array.astype(np.float32) / np.iinfo(samples.typecode).max + else: + return np.frombuffer(audio, dtype=np.int16) / 32768.0 + + def _get_text_files( data_path: pathlib.Path, ) -> List[pathlib.Path]: @@ -261,6 +299,7 @@ def _get_logger(): global _LOGGER try: import mlrun + # Check if MLRun is available: context = mlrun.get_or_create_ctx(name="mlrun") return context.logger