[text to audio generator] Replaced bark with OpenAI's TTS models #836

Merged · 2 commits · Dec 31, 2024
88 changes: 36 additions & 52 deletions text_to_audio_generator/function.yaml

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions text_to_audio_generator/item.yaml
@@ -13,7 +13,7 @@ labels:
author: yonatans
maintainers: []
marketplaceType: ''
mlrunVersion: 1.5.1
mlrunVersion: 1.7.1
name: text_to_audio_generator
platformVersion: 3.5.3
spec:
@@ -22,8 +22,9 @@ spec:
image: mlrun/mlrun
kind: job
requirements:
- bark
- openai
- torchaudio
- pydub
url: ''
version: 1.2.0
version: 1.3.0
test_valid: True
5 changes: 3 additions & 2 deletions text_to_audio_generator/requirements.txt
@@ -1,2 +1,3 @@
bark
torchaudio>=2.1.0
openai>=1.58.0
torchaudio>=2.1.0
pydub
18 changes: 11 additions & 7 deletions text_to_audio_generator/test_text_to_audio_generator.py
@@ -12,11 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import mlrun
import os
import tempfile

import mlrun
import pytest


@pytest.mark.skipif(
    condition=os.getenv("OPENAI_BASE_URL") is None
    or os.getenv("OPENAI_API_KEY") is None,
    reason="OpenAI API key and base URL are required to run this test",
)
@pytest.mark.parametrize("file_format,bits_per_sample", [("wav", 8), ("mp3", None)])
def test_generate_multi_speakers_audio(file_format, bits_per_sample):
text_to_audio_generator_function = mlrun.import_function("function.yaml")
@@ -28,12 +35,9 @@ def test_generate_multi_speakers_audio(file_format, bits_per_sample):
"output_directory": test_directory,
"speakers": {"Agent": 0, "Client": 1},
"available_voices": [
"v2/en_speaker_0",
"v2/en_speaker_1",
"alloy",
"echo",
],
"use_small_models": True,
"use_gpu": False,
"offload_cpu": True,
"file_format": file_format,
"bits_per_sample": bits_per_sample,
},
@@ -45,6 +49,6 @@ def test_generate_multi_speakers_audio(file_format, bits_per_sample):
],
artifact_path=test_directory,
)
assert function_run.error == "Run state (completed) is not in error state"
assert function_run.error == ""
for key in ["audio_files", "audio_files_dataframe", "text_to_speech_errors"]:
assert key in function_run.outputs and function_run.outputs[key] is not None
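
Note: the test above is skipped unless OpenAI credentials are available. A minimal sketch of supplying them before running pytest (both values are placeholders, not working credentials):

import os

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key
os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1"  # placeholder endpoint

The same variables are also read by _get_openai_client in text_to_audio_generator.py, with mlrun project secrets as a fallback.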
12 changes: 3 additions & 9 deletions text_to_audio_generator/text_to_audio_generator.ipynb
@@ -31,10 +31,7 @@
"id": "bb20c4a6-f362-40e6-8f73-9145953959ec",
"metadata": {},
"outputs": [],
"source": [
"import mlrun\n",
"import tempfile"
]
"source": "import mlrun"
},
{
"cell_type": "code",
@@ -322,12 +319,9 @@
" \"output_directory\": \"./out\",\n",
" \"speakers\": {\"Agent\": 0, \"Client\": 1},\n",
" \"available_voices\": [\n",
" \"v2/en_speaker_0\",\n",
" \"v2/en_speaker_1\",\n",
" \"alloy\",\n",
" \"echo\",\n",
" ],\n",
" \"use_small_models\": True,\n",
" \"use_gpu\": False,\n",
" \"offload_cpu\": True,\n",
" \"file_format\": \"mp3\",\n",
" # \"bits_per_sample\": 8,\n",
" },\n",
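For reference, a minimal sketch of the run call as the updated notebook assembles it (the handler name and data_path value are assumptions; only the params shown in the hunk above come from the notebook):

import mlrun

fn = mlrun.import_function("function.yaml")
run = fn.run(
    handler="generate_multi_speakers_audio",  # assumed handler name, matching the function below
    params={
        "data_path": "./data",  # assumption: a directory of conversation text files
        "output_directory": "./out",
        "speakers": {"Agent": 0, "Client": 1},
        "available_voices": ["alloy", "echo"],
        "file_format": "mp3",
    },
    local=True,
)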
91 changes: 65 additions & 26 deletions text_to_audio_generator/text_to_audio_generator.py
@@ -11,35 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import os
import pathlib
import random
import tempfile
from typing import Dict, List, Optional, Tuple, Union

import bark
import numpy as np
import openai
import pandas as pd
import torch
import torchaudio
import tqdm
from pydub import AudioSegment

# Get the global logger:
_LOGGER = logging.getLogger()

OPENAI_API_KEY = "OPENAI_API_KEY"
OPENAI_BASE_URL = "OPENAI_BASE_URL"
SAMPLE_RATE = 24000


def generate_multi_speakers_audio(
data_path: str,
speakers: Union[List[str], Dict[str, int]],
available_voices: List[str],
output_directory: str = None,
use_gpu: bool = True,
use_small_models: bool = False,
offload_cpu: bool = False,
model: str = "tts-1",
sample_rate: int = 16000,
file_format: str = "wav",
verbose: bool = True,
bits_per_sample: Optional[int] = None,
speed: float = 1.0,
) -> Tuple[str, pd.DataFrame, dict]:
"""
Generate audio files from text files.
@@ -50,21 +56,20 @@ def generate_multi_speakers_audio(
If dictionary, the keys will be the speakers and the values will be the channels.
:param available_voices: List of available voices to use for the generation.
See here for the available voices:
https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
https://platform.openai.com/docs/guides/text-to-speech#voice-options
:param output_directory: Path to the directory to save the generated audio files to.
:param use_gpu: Whether to use the GPU for the generation.
:param use_small_models: Whether to use the small models for the generation.
:param offload_cpu: To reduce the memory footprint, the models can be offloaded to the CPU after loading.
:param model: Which model to use for the generation.
:param sample_rate: The sampling rate of the generated audio.
:param file_format: The format of the generated audio files.
:param verbose: Whether to print the progress of the generation.
:param bits_per_sample: Changes the bit depth for the supported formats.
Supported only in "wav" or "flac" formats.
:param speed: The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default.

:returns: A tuple of:
- The output directory path.
- The generated audio files dataframe.
- The errors dictionary.
"""

global _LOGGER
@@ -73,17 +78,8 @@
data_path = pathlib.Path(data_path).absolute()
text_files = _get_text_files(data_path=data_path)

# Load the bark models according to the given configurations:
bark.preload_models(
text_use_gpu=use_gpu,
text_use_small=use_small_models,
coarse_use_gpu=use_gpu,
coarse_use_small=use_small_models,
fine_use_gpu=use_gpu,
fine_use_small=use_small_models,
codec_use_gpu=use_gpu,
force_reload=offload_cpu,
)
    # Connect to the OpenAI client:
client = _get_openai_client()

# Check for per channel generation:
if isinstance(speakers, dict):
@@ -98,11 +94,11 @@

# Prepare the resampling module:
resampler = torchaudio.transforms.Resample(
orig_freq=bark.SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
orig_freq=SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
)

# Prepare the gap between each speaker:
gap_between_speakers = np.zeros(int(0.5 * bark.SAMPLE_RATE))
gap_between_speakers = np.zeros(int(0.5 * SAMPLE_RATE))

# Prepare the successes dataframe and errors dictionary to be returned:
successes = []
@@ -156,11 +152,16 @@
)
for sentence in _split_line(line=sentences):
# Generate words audio:
audio = bark.generate_audio(
sentence,
history_prompt=chosen_voices[current_speaker],
silent=True,
audio = client.audio.speech.create(
model=model,
input=sentence,
voice=chosen_voices[current_speaker],
response_format=file_format,
speed=speed,
)
audio = audio.content
audio = _bytes_to_np_array(audio=audio, file_format=file_format)

if speaker_per_channel:
silence = np.zeros_like(audio)
for speaker in audio_pieces.keys():
@@ -214,6 +215,43 @@
return str(output_directory), successes, errors


def _get_openai_client():
api_key = os.getenv(OPENAI_API_KEY)
base_url = os.getenv(OPENAI_BASE_URL)
# Check if the key is already in the environment variables:
if not api_key or not base_url:
try:
import mlrun

context = mlrun.get_or_create_ctx(name="context")
# Check if the key is in the secrets:
api_key = context.get_secret(OPENAI_API_KEY)
base_url = context.get_secret(OPENAI_BASE_URL)
except ModuleNotFoundError:
raise EnvironmentError(
f"One or more of the OpenAI required environment variables ('{OPENAI_API_KEY}', '{OPENAI_BASE_URL}') are missing."
f"Please set them as environment variables or install mlrun (`pip install mlrun`)"
f"and set them as project secrets using `project.set_secrets`."
)
return openai.OpenAI(api_key=api_key, base_url=base_url)


def _bytes_to_np_array(audio: bytes, file_format: str):
if file_format == "mp3":
audio_segment = AudioSegment.from_mp3(io.BytesIO(audio))

# Convert to raw PCM audio data
samples = audio_segment.get_array_of_samples()

# Convert to numpy array
audio_array = np.array(samples)

# Normalize to float between -1 and 1
return audio_array.astype(np.float32) / np.iinfo(samples.typecode).max
else:
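        # Assumption: treat the response bytes as raw 16-bit little-endian PCM and normalize to [-1.0, 1.0]: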
return np.frombuffer(audio, dtype=np.int16) / 32768.0


def _get_text_files(
data_path: pathlib.Path,
) -> List[pathlib.Path]:
@@ -261,6 +299,7 @@ def _get_logger():
global _LOGGER
try:
import mlrun

# Check if MLRun is available:
context = mlrun.get_or_create_ctx(name="mlrun")
return context.logger
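
For context, the core of this change is the swap from bark.generate_audio to OpenAI's speech endpoint. A minimal standalone sketch of that call, using the model and voice values from the diff above (the input text is a placeholder):

import openai

client = openai.OpenAI()  # reads OPENAI_API_KEY (and OPENAI_BASE_URL, if set) from the environment

response = client.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input="Hello from the text to audio generator.",  # placeholder text
    response_format="mp3",
    speed=1.0,
)
audio_bytes = response.content  # raw mp3 bytes, decoded downstream by _bytes_to_np_array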